!pip install folium
!pip install nltk
import numpy as np
import pandas as pd
import time
from datetime import datetime
import calendar
import chardet
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import re
import folium
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
import warnings
warnings.filterwarnings("ignore")
start_time = time.time()
print('\n\nPandas',pd.__version__)
print('Matplotlib',matplotlib.__version__)
print('Seaborn',sns.__version__)
print('File Size In KB for New_York_State_Fallen_Firefighters_Memorial CSV: ',(os.path.getsize('New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv')/1000),' KB')
print('File Size In KB for Fire_Department CSV : ',(os.path.getsize('Fire_Department_Directory_for_New_York_State.csv')/1000),' KB')
NY = 'New York State'
Encoding check for the input CSV files to ensure the data is in the right format
with open('New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv','rb') as fraw:
file_content = fraw.read()
chardet.detect(file_content)
The character encoding of this CSV file is utf-8 with a confidence level of 0.99 (99%). Hence I will pass encoding='utf-8' while loading the file into a Pandas DataFrame using read_csv()
with open('Fire_Department_Directory_for_New_York_State.csv','rb') as fraw:
file_content = fraw.read()
chardet.detect(file_content)
The character encoding of this CSV file is ascii with a confidence level of 1 (100%).
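Since the same detection code runs for both files, a small helper can avoid the repetition. This is a minimal sketch using only the chardet call already shown above; detect_encoding is a name introduced here purely for illustration.
'''
Helper (sketch): read a file in binary mode and return chardet's encoding guess
'''
def detect_encoding(path):
    with open(path, 'rb') as fraw:
        return chardet.detect(fraw.read())
print(detect_encoding('New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv'))
print(detect_encoding('Fire_Department_Directory_for_New_York_State.csv'))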
Exploring file contents from the CSVs:
!head -n 3 New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv
!head -n 3 Fire_Department_Directory_for_New_York_State.csv
Next, I will load the data from the CSV files into DataFrames for processing
pd.options.display.max_rows = 40
start_time_before_load_file1 = time.time()
fire_fighters_df = pd.read_csv("New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv",encoding ='utf-8')
print('Time taken to load fire fighters data : ',time.time() - start_time_before_load_file1,'seconds')
print('Before Shape of fire_fighters_df : ',fire_fighters_df.shape)
print('Dropping rows with duplicate data from fire_fighters_df.')
fire_fighters_df = fire_fighters_df.drop_duplicates()
print('After Shape of fire_fighters_df : ',fire_fighters_df.shape)
start_time_before_load_file2 = time.time()
fire_department_df = pd.read_csv("Fire_Department_Directory_for_New_York_State.csv")
print('\n\nTime taken to load fire department data : ',time.time() - start_time_before_load_file2,'seconds')
print('Before Shape of fire_department_df : ',fire_department_df.shape)
print('Dropping rows with duplicate data from fire_department_df.')
fire_department_df = fire_department_df.drop_duplicates()
print('After Shape of fire_department_df : ',fire_department_df.shape)
The New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor.csv/fire_fighters_df dataframe contains 2549 rows and 5 columns.
The Fire_Department_Directory_for_New_York_State.csv/fire_department_df dataframe contains 1773 rows and 12 columns.
Let us explore the data a bit using head(), tail(), info(), describe() for both dataframes
fire_fighters_df.head()
fire_fighters_df.tail()
fire_fighters_df.info()
fire_fighters_df.describe()
fire_department_df.head()
fire_department_df.tail()
fire_department_df.info()
fire_department_df.describe()
fire_department_df.describe(include='all')
fire_department_df.describe(include='object')
print('Number of firefighter names in fire_fighters_df : ',fire_fighters_df['Name'].count())
print('Number of unique firefighter names in fire_fighters_df : ',fire_fighters_df['Name'].nunique())
print('\n')
print('Number of unique Fire Department Names in fire_department_df : ',fire_department_df['Fire Department Name'].nunique())
print('Number of unique Fire Department Codes in fire_department_df : ',fire_department_df['Fire Department Code'].nunique())
print('Number of rows in fire_department_df : ',fire_department_df.shape[0])
Observation 1
Observation 2
Observation 3
Step 1
'''
Dropping rows so that only one entry remains for fire departments that have the same department name but different department codes in fire_department_df.
'''
print('Index with same department name but different department code in fire_department_df:')
for department in list(fire_department_df['Fire Department Name'].value_counts().index):
    # More than one row shares this department name
    if fire_department_df[fire_department_df['Fire Department Name']==department].shape[0] > 1:
        index_list_of_same_department_name=list(fire_department_df[fire_department_df['Fire Department Name']==department].index)
        print(index_list_of_same_department_name)
        # Keep the last entry and drop the rest
        for indx in index_list_of_same_department_name[:-1]:
            fire_department_df.drop(index=indx,inplace=True)
Step 2
fire_department_df['Fire Department Code'].value_counts().head()
'''
Looking for the rows with same department code
'''
fire_department_df[fire_department_df['Fire Department Code']==42004]
'''
Checking whether the department name with the duplicate department code exists in fire_fighters_df
'''
fighters_department_list=list(fire_fighters_df['Fire Department'].apply(lambda x : x.split(' ')[0].lower()))
exists='N'
for fighters_department in fighters_department_list:
    if (fighters_department == 'brunswick') or (fighters_department == 'brushton'):
        exists = 'Y'
        break
if exists=='Y':
    print('Yes, the department name with the duplicate department code exists in fire_fighters_df.')
else:
    print('No, the department name with the duplicate department code does not exist in fire_fighters_df.')
'''
As the department name with the duplicate department code does not exist in fire_fighters_df, dropping this data from fire_department_df will not
affect the analysis. Therefore, I am dropping it.
'''
for department_code in list(fire_department_df['Fire Department Code'].value_counts().index):
    # More than one row shares this department code
    if fire_department_df[fire_department_df['Fire Department Code']==department_code].shape[0] > 1:
        index_list_of_same_department_code=list(fire_department_df[fire_department_df['Fire Department Code']==department_code].index)
        print('Index list of same department code : ',index_list_of_same_department_code)
        for indx in index_list_of_same_department_code:
            fire_department_df.drop(index=indx,inplace=True)
            print('Dropped index : ',indx)
print('After cleanup shape of fire_department_df : ',fire_department_df.shape)
print('Number of unique Fire Department Name in fire_department_df : ',fire_department_df['Fire Department Name'].nunique())
print('Number of unique Fire Department Code in fire_department_df : ',fire_department_df['Fire Department Code'].nunique())
Now, the fire_department_df dataframe contains 1753 rows and 12 columns.
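As a quick sanity check (a minimal sketch using only columns already present), both the department name and the department code should now uniquely identify a row in fire_department_df:
# After the cleanup, every department name and every department code should appear exactly once
assert fire_department_df['Fire Department Name'].nunique() == fire_department_df.shape[0]
assert fire_department_df['Fire Department Code'].nunique() == fire_department_df.shape[0]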
def department_preprocess(department):
    """
    Takes in a string holding a department name, then performs the following:
    1. Convert the department name to lowercase
    2. Split the department name into words
    3. Remove all punctuation
    4. Remove empty strings from the department name
    5. Remove all stopwords
    6. Remove repetitive filler words from the department name
    7. Join the cleaned department name tokens back into a phrase
    8. Apply spelling corrections, replacing words that can cause issues when the join is performed on department name
    9. Return the clean department name
    """
    en_stops = set(stopwords.words('english'))
    # Convert the department name to lowercase
    department_lowercase = department.lower()
    # Split the department name into words
    list_of_words = word_tokenize(department_lowercase)
    # Remove all punctuation: keep only the characters that are not in string.punctuation
    list_of_words_without_punctuation = [''.join(this_char for this_char in this_string
                                                 if this_char not in string.punctuation)
                                         for this_string in list_of_words]
    # Remove empty strings left behind after punctuation removal
    list_of_words_without_punctuation = list(filter(None, list_of_words_without_punctuation))
    # Remove any stopwords
    filtered_word_list = [w for w in list_of_words_without_punctuation if w not in en_stops]
    # Remove repetitive filler words from the department name
    ignore_lst = ['fire', 'department', 'city', 'vol', 'volunteer', 'no', 'district',
                  'dept', 'deparment', 'company', 'co', 'engine']
    filtered_word_list = [w for w in filtered_word_list if w not in ignore_lst]
    # Join the cleaned department name tokens back into a phrase
    clean_department = ' '.join(filtered_word_list)
    # Apply spelling corrections to the department name
    clean_department = clean_department.replace('centre', 'center')
    clean_department = clean_department.replace('ctr', 'center')
    clean_department = clean_department.replace('cliff', 'clift')
    clean_department = clean_department.replace('springs', 'spring')
    clean_department = clean_department.replace('sq', 'square')
    clean_department = clean_department.replace('tact', 'tract')
    # Return the clean department name
    return clean_department
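To illustrate what the cleaning does, here is a quick check on a made-up department name (the input string is hypothetical and chosen only to exercise the steps above):
'''
Example (sketch): lowercasing, punctuation/stopword/filler removal and the 'centre' -> 'center' correction
'''
print(department_preprocess('The Albany Centre Fire Dept. No. 3'))
# should print something like: 'albany center 3'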
'''
For fire_fighters_df :
Convert column data to lowercase
Create new column with 'Department' as column name
'''
fire_fighters_df = fire_fighters_df.applymap(lambda s:s.lower() if type(s) == str else s)
fire_fighters_df['Department']=fire_fighters_df['Fire Department']
fire_fighters_df.head()
'''
Applying department_preprocess() to the new key column of fire_fighters_df
'''
fire_fighters_df['Department']=fire_fighters_df['Department'].apply(department_preprocess)
fire_fighters_df.head()
'''
For fire_department_df :
Convert column data to lowercase
Create new column with 'Department' as column name
'''
fire_department_df = fire_department_df.applymap(lambda s:s.lower() if type(s) == str else s)
fire_department_df['Department']=fire_department_df['Fire Department Name']
fire_department_df.head()
'''
Applying department_preprocess() to the new key column of fire_department_df
'''
fire_department_df['Department']=fire_department_df['Department'].apply(department_preprocess)
fire_department_df.head()
'''
Check after creation and key column preprocess for fire_department_df if there are departments that have same key.
'''
for department in list(fire_department_df['Department'].value_counts().index):
    if fire_department_df[fire_department_df['Department']==department].shape[0] > 1:
        print(department)
'''
Viewing rows of fire_department_df with same Department key
'''
fire_department_df[fire_department_df['Department'].isin(['lincoln','johnson','lakeside'])]
If the above departments create duplicate rows in merge_df after the join, I will need to take care of that.
Merging the two dataframes into one dataframe
merge_df = pd.merge(fire_fighters_df,fire_department_df,how='inner',on='Department')
print('Shape of merge_df : ',merge_df.shape)
print('Number of unique name in merge_df : ',merge_df['Name'].nunique())
print('Number of unique Fire Department Name in merge_df : ',merge_df['Fire Department Name'].nunique())
print('Percentage of records lost in merge_df after processing data from fire_fighters_df : ',
((fire_fighters_df.shape[0]-merge_df.shape[0])/fire_fighters_df.shape[0])*100)
print('\n\n')
merge_df.head()
Next, I will check for duplicate records in merge_df and remove them in a meaningful way
'''
Function to concatenate the values passed as parameters
'''
def new_col(cols):
    return cols[0]+cols[1]+cols[2]+str(cols[3])+cols[4]
'''
Applying the new_col function to those columns of merge_df that come from fire_fighters_df
'''
merge_df['firefighter']=merge_df[['Name','Rank','Fire Department','Date of Death','Location on Memorial']].apply(new_col,axis=1)
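As a side note, the same composite key could be built without a helper by casting the columns to strings and concatenating them row-wise. This is an equivalent sketch shown only for comparison; the apply() call above has already created the column.
merge_df['firefighter'] = (merge_df['Name'].astype(str) + merge_df['Rank'].astype(str)
                           + merge_df['Fire Department'].astype(str)
                           + merge_df['Date of Death'].astype(str)
                           + merge_df['Location on Memorial'].astype(str))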
"""
Checking whether any duplicate entries from fire_fighter_df exists or not in merge_df after merging.
If it exists I will delete the duplicate after matching with original Fire Department and Fire Department Name column
"""
for this_firefighter in list(merge_df['firefighter'].value_counts().index):
if merge_df[merge_df['firefighter']==this_firefighter]['firefighter'].value_counts().values > 1:
index_list_of_repeated_firefighter=list(merge_df[merge_df['firefighter']==this_firefighter].index)
print('Index list of repeated fire fighter data in merge_df : ',index_list_of_repeated_firefighter)
remove_index=merge_df[(merge_df['firefighter']==this_firefighter)
& (merge_df['Fire Department']!=merge_df['Fire Department Name'])].index
print('Index to remove : ',remove_index.values)
merge_df.drop(index=remove_index.values,inplace=True)
Resetting the index of merge_df
'''
Resetting the index of merge_df after dropping rows
'''
merge_df=merge_df.reset_index(drop=True)
'''
The firefighter column is no longer needed, so I am dropping it.
'''
merge_df.drop(['firefighter'], axis=1,inplace=True)
print('After cleanup the shape of merge_df is : ',merge_df.shape)
After cleanup, merge_df has 2461 rows and 18 columns.
I have my final data analysis dataframe now.
Let's explore the data a bit using head(), tail(), info(), describe() on merge_df
merge_df.head()
merge_df.tail()
merge_df.info()
merge_df.describe()
merge_df.describe(include='all')
merge_df.describe(include='object')
Next, I will explore the column metadata...
first_n_entries=5
print('Total rows in the dataframe:', merge_df.shape[0])
for col, col_type in merge_df.dtypes.items():
    if col_type == 'object':
        print(col, 'has', merge_df[col].nunique(), 'unique entries')
        print('First', first_n_entries, 'entries are')
        print(merge_df[col][0:first_n_entries])
        print('')
for this_column in merge_df.columns:
    print('====', this_column, 'has', merge_df[this_column].nunique(), 'unique entries ====')
    print(merge_df[this_column].value_counts().head(5))
    print('')
"""
Next, I transform the object data type for Rank to 'category' data type
"""
merge_df['Rank'] = merge_df['Rank'].astype('category')
merge_df['Rank'].dtype
"""
Next, I transform the object data type for Fire Department to 'category' data type
"""
merge_df['Fire Department'] = merge_df['Fire Department'].astype('category')
merge_df['Fire Department'].dtype
"""
Next, I transform the object data type for Location on Memorial to 'category' data type
"""
merge_df['Location on Memorial'] = merge_df['Location on Memorial'].astype('category')
merge_df['Location on Memorial'].dtype
"""
Next, I transform the object data type for Department to 'category' data type
"""
merge_df['Department'] = merge_df['Department'].astype('category')
merge_df['Department'].dtype
"""
Next, I transform the object data type for Fire Department Name to 'category' data type
"""
merge_df['Fire Department Name'] = merge_df['Fire Department Name'].astype('category')
merge_df['Fire Department Name'].dtype
"""
Next, I transform the object data type for City to 'category' data type
"""
merge_df['City'] = merge_df['City'].astype('category')
merge_df['City'].dtype
"""
Next, I transform the object data type for State to 'category' data type
"""
merge_df['State'] = merge_df['State'].astype('category')
merge_df['State'].dtype
"""
Next, I transform the object data type for Zip Code to 'category' data type
"""
merge_df['Zip Code'] = merge_df['Zip Code'].astype('category')
merge_df['Zip Code'].dtype
"""
Next, I transform the object data type for County Name to 'category' data type
"""
merge_df['County Name'] = merge_df['County Name'].astype('category')
merge_df['County Name'].dtype
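As a side note, the repeated conversions above could be written as a single loop over the relevant columns; the following sketch is equivalent to the per-column cells above.
# Convert the object columns listed above to the 'category' dtype in one pass
for cat_col in ['Rank', 'Fire Department', 'Location on Memorial', 'Department',
                'Fire Department Name', 'City', 'State', 'Zip Code', 'County Name']:
    merge_df[cat_col] = merge_df[cat_col].astype('category')
merge_df.dtypes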
"""
Next, I transform the object data type for Date of Death to 'datetime' data type
"""
merge_df['Date of Death']=pd.to_datetime(
merge_df['Date of Death'],
format='%m/%d/%Y')
merge_df.dtypes
In the missingno matrix plots below, black = filled; white = empty
"""
Searching for missing data in sample set of 200 randomly selected data points
"""
_=msno.matrix(merge_df.sample(200))
plt.xlabel('Features in data',fontsize=16)
plt.ylabel('Gaps in data',fontsize=16)
plt.show()
"""
Searching for missing data in sample set of 2000 randomly selected data points
"""
_=msno.matrix(merge_df.sample(2000))
plt.xlabel('Features in data',fontsize=16)
plt.ylabel('Gaps in data',fontsize=16)
plt.show()
From the graphs one can see that the data has missing values in a few columns: Date of Death, Phone Number, Latitude, Longitude, and Location 1.
I will perform the following tasks:
print('Shape of data frame before Cleanup :',merge_df.shape)
print('Drop all rows and columns where entire row or column is NaN.')
merge_df.dropna(how='all',axis=0,inplace=True) # rows
merge_df.dropna(how='all',axis=1,inplace=True) # columns
print('Drop columns with more than 50% missing values and drop duplicate rows.')
half_count = len(merge_df)*.5
merge_df = merge_df.dropna(thresh=half_count, axis=1)
merge_df = merge_df.drop_duplicates()
print('Drop columns where all rows have the same value.')
for this_column in merge_df.columns:
    if (merge_df[this_column].nunique()==1):
        unique_entry=merge_df.iloc[0][this_column]
        print('Drop column ',this_column,' where all rows have the same value : ', unique_entry)
        merge_df.drop([this_column],axis=1,inplace=True)
print('Shape of data frame after cleanup :',merge_df.shape)
That is, I will fill the remaining missing values using data sampled from rows with the same County Name.
merge_df.head().T
"""
Counting null data per column
"""
merge_df.isnull().sum()
"""
Percentage of missing data per column
"""
(merge_df.isnull().sum()/len(merge_df)).sort_values(ascending=False)
Filling in missing Location 1, Latitude and Longitude data by sampling from rows with the same County Name
print("Data index for missing Location 1 : ",list(merge_df[merge_df['Location 1'].isnull()].index))
print("Data index for missing Latitude : ",list(merge_df[merge_df['Latitude'].isnull()].index))
print("Data index for missing Longitude : ",list(merge_df[merge_df['Longitude'].isnull()].index))
'''
County Name based sampling for Location 1, Latitude and Longitude data
'''
location1_sampling_dict={}
latitude_sampling_dict={}
longitude_sampling_dict={}
null_index=list(merge_df[merge_df['Location 1'].isnull()].index)
print(null_index)
for indx in null_index:
    print('index :',indx)
    this_county=merge_df.loc[indx,'County Name']
    print(this_county)
    # Randomly pick a Location 1 value from a row in the same county that has one
    sample_location1=random.choice(list(merge_df[(merge_df['County Name']==this_county)
                                                 & (merge_df['Location 1'].notnull())]['Location 1']))
    sample_latitude=float(sample_location1.split(',')[0][1:])
    sample_longitude=float(sample_location1.split(',')[1][:-1].strip())
    location1_sampling_dict[indx]=sample_location1
    latitude_sampling_dict[indx]=sample_latitude
    longitude_sampling_dict[indx]=sample_longitude
print(location1_sampling_dict)
print(latitude_sampling_dict)
print(longitude_sampling_dict)
'''
Filling up the missing values of Location 1, Latitude and Longitude with the sampled data
'''
merge_df['Location 1']=merge_df['Location 1'].fillna(location1_sampling_dict)
merge_df['Latitude']=merge_df['Latitude'].fillna(latitude_sampling_dict)
merge_df['Longitude']=merge_df['Longitude'].fillna(longitude_sampling_dict)
Filling in missing Phone Number data by sampling from rows with the same County Name
print("Data index for missing Phone Number : ",list(merge_df[merge_df['Phone Number'].isnull()].index))
'''
County Name based sampling for Phone Number data
'''
phone_number_sampling_dict={}
null_phone_number_index=list(merge_df[merge_df['Phone Number'].isnull()].index)
print(null_phone_number_index)
for indx in null_phone_number_index:
    print('index :',indx)
    this_county=merge_df.loc[indx,'County Name']
    print(this_county)
    # Randomly pick a Phone Number from a row in the same county that has one
    sample_phone_number=random.choice(list(merge_df[(merge_df['County Name']==this_county)
                                                    & (merge_df['Phone Number'].notnull())]['Phone Number']))
    phone_number_sampling_dict[indx]=sample_phone_number
print(phone_number_sampling_dict)
'''
Filling up the missing values of Phone Number with the sampled data
'''
merge_df['Phone Number']=merge_df['Phone Number'].fillna(phone_number_sampling_dict)
Filling in missing Date of Death data by sampling from rows with the same County Name
print("Data index for missing Date of Death : ",list(merge_df[merge_df['Date of Death'].isnull()].index))
'''
County Name based sampling for Date of Death data
'''
null_date_of_death_index=list(merge_df[merge_df['Date of Death'].isnull()].index)
print('null_date_of_death_index : ',null_date_of_death_index)
# Take the county of the row with the missing Date of Death as a scalar value
this_county=merge_df.loc[null_date_of_death_index,'County Name'].values[0]
print('this_county : ',this_county)
sample_date_of_death=datetime.strftime(random.choice(list(merge_df[(merge_df['County Name']==this_county) &
                                                                   (merge_df['Date of Death'].notnull())]['Date of Death'])),'%m/%d/%Y')
print('sample_date_of_death : ',sample_date_of_death)
sample_date_of_death_in_datetime=datetime.strptime(sample_date_of_death,'%m/%d/%Y')
print('sample_date_of_death_in_datetime : ',sample_date_of_death_in_datetime)
'''
Filling up the missing values of Date of Death with the sampled data
'''
merge_df['Date of Death']=merge_df['Date of Death'].fillna(sample_date_of_death_in_datetime)
merge_df.isnull().sum()
Missing data has been filled in successfully for the Location 1, Latitude, Longitude, Phone Number and Date of Death columns
... for merged data of New_York_State_Fallen_Firefighters_Memorial_Roll_of_Honor, Fire_Department_Directory_for_New_York_State in New York State
Let's ask our data some questions about fire fighter fatalities in New York State.
Which county and/or city had a high fatality count?
Top ten fatality count by county for New York State fire departments
merge_df['County Name']=merge_df['County Name'].apply(lambda x : x.strip())
merge_df.groupby('County Name')['Name'].count().sort_values(ascending=False).head(10)
merge_df['City']=merge_df['City'].apply(lambda x : x.strip())
merge_df.groupby('City')['Name'].count().sort_values(ascending=False).head(10)
The answer to the question is that Brooklyn and New York County, New York, have had very high fatality counts. I will now dive deeper into the data to find more patterns in fatalities in New York State's fire departments.
Methods defined to compute the normalized value of a column of a dataframe.
The formula used for normalization is as follows:
$X_{new} = \frac{X - X_{min}}{X_{max} - X_{min}}$
'''
This method returns a value normalized between 0 and 1, given a number and the maximum and minimum values of the column in the dataframe
'''
def compute_norm(number, max_val, min_val):
    return (number - min_val)/(max_val - min_val)
'''
This method takes a dataframe and returns it with one extra column of normalized values for the column that
needs to be normalized. Created as a function because I will reuse it a number of times.
'''
def get_normalized_value_df(df_to_process, name_of_column_to_normalize):
    norm_df = df_to_process
    # Compute the column maximum and minimum once, outside the loop
    max_val = float(norm_df[name_of_column_to_normalize].max())
    min_val = float(norm_df[name_of_column_to_normalize].min())
    normalized_value_list = []
    for num in np.array(norm_df[name_of_column_to_normalize]):
        normalized_value_list.append(compute_norm(number=float(num),
                                                  max_val=max_val,
                                                  min_val=min_val))
    norm_df['normalized_'+name_of_column_to_normalize] = normalized_value_list
    return norm_df
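A quick illustration of the helper on a tiny throwaway DataFrame (the values are hypothetical and used only to show the 0-1 scaling):
demo_df = pd.DataFrame({'death_count': [2, 5, 11]})
get_normalized_value_df(df_to_process=demo_df, name_of_column_to_normalize='death_count')
# normalized_death_count should come out as [0.0, 0.333..., 1.0]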
Processing the Date of Death to extract the year and month of death
'''
Counting the number of fatalities in each of the top five fatal years
'''
merge_df['Year'] = merge_df['Date of Death'].apply(lambda time: time.year)
merge_df['Year'].value_counts().head()
'''
Counting the number of fatalities by month
'''
merge_df['Month'] = (merge_df['Date of Death'].dt.month).apply(lambda x : calendar.month_abbr[int(x)])
months=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
merge_df['Month'] = pd.Categorical(merge_df['Month'],
categories=months,
ordered=True)
merge_df['Month'].value_counts()
The above two analyses indicate that a significant incident happened in the month of September, and that 2001 was a particularly dangerous year for firefighters.
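To combine the two views directly, a cross-tabulation of fatalities by year and month (a minimal sketch restricted to the five most fatal years, using only the Year and Month columns created above) should make the September 2001 spike visible:
top_years = merge_df['Year'].value_counts().head().index
pd.crosstab(merge_df[merge_df['Year'].isin(top_years)]['Year'],
            merge_df[merge_df['Year'].isin(top_years)]['Month'])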
top_counties_of_fire_fighter_death = merge_df.groupby(['County Name'])['Name'].count().sort_values(ascending=False).head().index.values
top_counties_of_fire_fighter_death
top_counties_of_fire_fighter_death_df = merge_df[merge_df['County Name'].isin(top_counties_of_fire_fighter_death)]
top_five_year_of_fire_fighter_death = merge_df.groupby(['Year'])['Name'].count().sort_values(ascending=False).head().index.values
top_five_year_of_fire_fighter_death
top_death_county_year_df = top_counties_of_fire_fighter_death_df[
top_counties_of_fire_fighter_death_df['Year'].isin(top_five_year_of_fire_fighter_death)]
top_date_of_death_county_df=pd.DataFrame(list(top_death_county_year_df.groupby(['Date of Death','County Name'])['Name']
.count().sort_values(ascending=False).head(10).items()),
columns=['date_of_death_county_combo','death_count'])
top_date_of_death_county_df.plot.bar(x='date_of_death_county_combo',y='death_count',figsize=(15,10))
plt.title('Number of fatalities by date of death in top five fatal counties of '+ NY,fontsize=18)
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death and county',fontsize=16)
plt.ylabel('Number of fatalities',fontsize=16)
plt.show()
'''
Computing the normalized count of fatalities by date of death for top five fatal years in top five fatal counties
I am computing normalized count to detect if 2001-09-11 combined with New York County is an outlier in terms of number of fatalities.
'''
top_date_of_death_county_norm_df = get_normalized_value_df(df_to_process=top_date_of_death_county_df,
name_of_column_to_normalize='death_count')
top_date_of_death_county_norm_df
top_date_of_death_county_norm_df.plot.bar(x='date_of_death_county_combo', y='normalized_death_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death and county',fontsize=16)
plt.ylabel('Normalized count of fatalities',fontsize=16)
plt.title('Normalized count of fatalities by date in top five fatal years and top five fatal counties',fontsize=18)
plt.show()
By combining the two datasets I am able to infer that on September 11, 2001, New York County faced a severe crisis in which hundreds of firefighters gave their lives.
Answering our question 1 - Everyone knows about September 11th, 2001, and that people died, but can it be determined from the combined data which county most of the fallen firefighters came from?
Story time
9/11 was one of the biggest terrorist attacks on American soil in American history, killing thousands of people.
This link specifies that on that day:
Thus, by combining the two datasets, I am able to show that data analysis can produce fairly accurate inferences about real-life incidents.
Next, I will try to find the answer to my second question - During 9/11 was there a fire department where most of the fatalities occurred?
Top ten fire department location for incident
top_death_year_df=merge_df[merge_df['Year'].isin(top_five_year_of_fire_fighter_death)]
top_death_date_address_wise_df=top_death_year_df.groupby(['Date of Death','Address'])['Name'].count().sort_values(ascending=False).head(21)
top_death_date_address_df=pd.DataFrame(list(top_death_date_address_wise_df.items()),
columns=['date_of_death_address_combo','death_count'])
top_death_date_address_df.plot.bar(x='date_of_death_address_combo',y='death_count',figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death and address',fontsize=16)
plt.ylabel('Number of fatalities',fontsize=16)
plt.title('Number of fatalities by date of death and address',fontsize=18)
plt.show()
'''
Computing the normalized count of fatalities for addresses
I am computing normalized counts to detect whether any address combined with 2001-09-11 is an outlier in the number of fatalities.
'''
top_death_date_address_norm_df = get_normalized_value_df(df_to_process=top_death_date_address_df,
name_of_column_to_normalize='death_count')
top_death_date_address_norm_df.plot.bar(x='date_of_death_address_combo', y='normalized_death_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death and address',fontsize=16)
plt.ylabel('Normalized count of fatalities',fontsize=16)
plt.title('Normalized count of fatalities by date of death and address',fontsize=20)
plt.show()
Our third question is about the ranks of the firefighters who passed away on 9/11 - It is understandable that only a few high-ranking officials would be directly involved in rescue operations. Does the combined data tell us the ranks of the firefighters who sacrificed their lives?
Analyzing fatalities by county and firefighter ranks
top_five_rank_of_fire_fighter_death = merge_df.groupby(['Rank'])['Name'].count().sort_values(ascending=False).head().index.values
top_five_rank_of_fire_fighter_death
top_death_county_year_rank_df = top_death_county_year_df[
top_death_county_year_df['Rank'].isin(top_five_rank_of_fire_fighter_death)]
top_death_date_of_death_county_rank_df=pd.DataFrame(list(top_death_county_year_rank_df.groupby(['Date of Death','County Name','Rank'])['Name']
.count().sort_values(ascending=False).head(10).items()),
columns=['date_of_death_county_rank_combo','death_count'])
top_death_date_of_death_county_rank_df.plot.bar(x='date_of_death_county_rank_combo',y='death_count',figsize=(15,10))
plt.title('Number of fatalities by date of top five fatal years, counties and ranks in ' + NY,fontsize=18)
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death, county and rank',fontsize=16)
plt.ylabel('Number of fatalities',fontsize=16)
plt.show()
'''
Computing the normalized count of fatalities for top five fatal counties and rank
I am computing normalized counts to detect whether the (new york, 2001-09-11, firefighter) combination is an outlier in the number of fatalities.
'''
top_death_date_of_death_county_rank_norm_df = get_normalized_value_df(df_to_process=top_death_date_of_death_county_rank_df,
name_of_column_to_normalize='death_count')
top_death_date_of_death_county_rank_norm_df.plot.bar(x='date_of_death_county_rank_combo', y='normalized_death_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Combination of date of death, county and rank',fontsize=16)
plt.ylabel('Normalized count of fatalities',fontsize=16)
plt.title('Normalized count of fatalities by date of death, county and rank',fontsize=20)
plt.show()
This answers my third question.
The chart above can also help answer the fourth question I had - Is there any incident comparable to 9/11 in terms of fire fighter fatalities? Where did such an incident occur?
'''
For this visualization I am using the previously created dataframe of the top fatal locations to get the address, date_of_death and death_count
values in separate columns. I am breaking up the tuple in the date_of_death_address_combo column to obtain the individual columns mentioned.
'''
top_death_count_date_address_list=[]
for date_address_tuple in list(top_death_date_address_df['date_of_death_address_combo']):
    death_count_date_address_dict={}
    death_count_date_address_dict['death_count']=top_death_date_address_df[top_death_date_address_df['date_of_death_address_combo']==date_address_tuple].iloc[0]['death_count']
    death_count_date_address_dict['date_of_death']=date_address_tuple[0]
    death_count_date_address_dict['address']=date_address_tuple[1]
    top_death_count_date_address_list.append(death_count_date_address_dict)
top_death_count_date_address_df=pd.DataFrame(top_death_count_date_address_list)
top_death_count_date_address_df
location_list=[]
all_loc_dict = {}
indx = 0
for address in list(top_death_count_date_address_df['address']):
    county=merge_df[merge_df['Address']==address].iloc[0]['County Name']
    city=merge_df[merge_df['Address']==address].iloc[0]['City']
    zip_code=merge_df[merge_df['Address']==address].iloc[0]['Zip Code']
    full_address=address + ", " + county + " county, " + city + " city, " + NY + ", " + zip_code
    latitude=merge_df[merge_df['Address']==address].iloc[0]['Latitude']
    longitude=merge_df[merge_df['Address']==address].iloc[0]['Longitude']
    location1=merge_df[merge_df['Address']==address].iloc[0]['Location 1']
    date_of_death=top_death_count_date_address_df.iloc[indx]['date_of_death']
    death_count=top_death_count_date_address_df.iloc[indx]['death_count']
    # Ignore a location if it repeats in the list.
    # As top_death_count_date_address_df is sorted by fatality count, the first occurrence of an address
    # carries its highest fatality count.
    if address not in all_loc_dict:
        location_dict={}
        location_dict['latitude']=latitude
        location_dict['longitude']=longitude
        location_dict['location']=location1
        location_dict['lat_long']=[latitude,longitude]
        location_dict['full_address']=full_address
        location_dict['date_of_death']=date_of_death
        location_dict['death_count']=death_count
        all_loc_dict[address] = location_dict
    indx += 1
for address_key, location_value in all_loc_dict.items():
    location_list.append(location_value)
location_df = pd.DataFrame(location_list)
location_df
def regioncolors(num_of_death):
    # Color locations with more than 100 deaths red; all others dark green
    if num_of_death > 100:
        return 'red'
    else:
        return 'darkgreen'
location_df["color"] = location_df['death_count'].apply(regioncolors)
location_df
Using the Folium library and Link let's take a look at where the incident locations are in New York State!
folium_map = folium.Map(location=location_df.iloc[0]['lat_long'],
zoom_start=7,
tiles='Stamen Terrain')
for curr_loc in list(location_df.index):
    folium.Marker(location=location_df.iloc[curr_loc]['lat_long'],
                  popup="Date of most fatal incident : "+
                        datetime.strftime(location_df.iloc[curr_loc]['date_of_death'],'%B,%d,%Y')+
                        " Number of deaths on that date : "+
                        str(location_df.iloc[curr_loc]['death_count'])+
                        " Location : "+
                        location_df.iloc[curr_loc]['full_address'],
                  icon=folium.Icon(color=location_df.iloc[curr_loc]['color'],
                                   icon_color='white',
                                   icon='male',
                                   angle=0,
                                   prefix='fa')
                  ).add_to(folium_map)
folium_map
print('Total Time taken:',time.time() - start_time,'seconds')