The obtained data shows film permits granted for New York City. Permits are generally required when asserting the exclusive use of city property, like a sidewalk, a street, or a park. I found this data through the suggestions for project 2 ideas on Blackboard.
Growing up, I watched a lot of American movies and TV shows, and many of them were set in New York City. After I came to the USA I visited many of those places in New York City (NYC) myself and pictured the movies and shows I had watched as a kid. I never got to see an actual film shoot, though. So when I saw this data, the data scientist in me decided to figure out when and where shoots actually happen in NYC. The following questions came to my mind:
- Can this data tell me the popular day of the week when shooting activities occur?
- Winter in New York City is very beautiful due to all the snow, but are shoots really happening in the harsh winter conditions of NYC?
- I know some Bollywood movies have been shot in Staten Island because of the large Indian community there, but is it a popular location in general?
- Given the popularity of YouTube in recent times, are web shoots rising in the city?
- Which locations in New York City are popular for movie shoots?
!pip install geopy
!pip install humanize
!pip install folium
import numpy as np
import pandas as pd
import time
from datetime import datetime
import calendar
import chardet
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import os
import random
import re
from geopy.geocoders import Nominatim
import json
import humanize
import folium
import warnings
warnings.filterwarnings("ignore")
start_time = time.time()
print('Pandas',pd.__version__)
print('Matplotlib',matplotlib.__version__)
print('Seaborn',sns.__version__)
print('File Size In MB : ',(os.path.getsize('Film_Permits.csv')/1048576),' MB')
NYC = 'New York City'
Encoding check for the input CSV file to ensure data is in the right format
with open('Film_Permits.csv','rb') as fraw:
    file_content = fraw.read()
chardet.detect(file_content)
The character encoding of the CSV file is ASCII and the confidence level is 1 (100%).
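Since ASCII is a subset of UTF-8, pandas' default decoding handles this file as-is; still, as a minimal defensive sketch (the detected variable below is mine, not part of the original flow), chardet's guess can be passed straight to read_csv:
detected = chardet.detect(file_content)
pd.read_csv('Film_Permits.csv', encoding=detected['encoding']).head(2)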
Exploring file contents from the CSV:
!head -n 3 Film_Permits.csv
Next, I will load the data from the CSV file into a dataframe for processing
pd.options.display.max_rows = 40
start_time_before_load = time.time()
film_permits_df = pd.read_csv("Film_Permits.csv")
print('Time taken to load the data : ',time.time() - start_time_before_load,'seconds')
film_permits_df.shape
The CSV/dataframe contains 40682 rows and 14 columns
Let us explore the data a bit using head(), tail(), info(), describe()
film_permits_df.head()
film_permits_df.tail()
film_permits_df.info()
film_permits_df.describe()
film_permits_df.describe(include='all')
film_permits_df.describe(include='object')
Next, I will explore the column metadata...
first_n_entries=5
print('Total rows in the dataframe:', film_permits_df.shape[0])
for col, col_type in film_permits_df.dtypes.iteritems():
    if(col_type=='object'):
        print(col, 'has', film_permits_df[col].nunique(), 'unique entries')
        print('First', first_n_entries, 'entries are')
        print(film_permits_df[col][0:first_n_entries])
        print('')
for this_column in film_permits_df.columns:
    print('====', this_column, 'has', film_permits_df[this_column].nunique(), 'unique entries ====')
    print(film_permits_df[this_column].value_counts().head(5))
    print('')
"""
Next, I transform the object data type for EventType to 'category' data type
"""
film_permits_df['EventType'] = film_permits_df['EventType'].astype('category')
film_permits_df['EventType'].dtype
"""
Next, I transform the object data type for EventAgency to 'category' data type
"""
film_permits_df['EventAgency'] = film_permits_df['EventAgency'].astype('category')
film_permits_df['EventAgency'].dtype
"""
Next, I transform the object data type for Borough to 'category' data type
"""
film_permits_df['Borough'] = film_permits_df['Borough'].astype('category')
film_permits_df['Borough'].dtype
"""
Next, I transform the object data type for Category to 'category' data type
"""
film_permits_df['Category'] = film_permits_df['Category'].astype('category')
film_permits_df['Category'].dtype
"""
Next, I transform the object data type for SubCategoryName to 'category' data type
"""
film_permits_df['SubCategoryName'] = film_permits_df['SubCategoryName'].astype('category')
film_permits_df['SubCategoryName'].dtype
"""
Next, I transform the object data type for Country to 'category' data type
"""
film_permits_df['Country'] = film_permits_df['Country'].astype('category')
film_permits_df['Country'].dtype
def get_date(d1):
    return datetime.strptime(d1,"%m/%d/%Y %I:%M:%S %p").strftime('%m/%d/%Y %H:%M:%S')
"""
Next, I transform the object data type for StartDateTime to 'datetime' data type
"""
film_permits_df['StartDateTime']=film_permits_df['StartDateTime'].astype(str)
film_permits_df['StartDateTime']=film_permits_df['StartDateTime'].apply(get_date)
film_permits_df['StartDateTime']=pd.to_datetime(
film_permits_df['StartDateTime'],
format='%m/%d/%Y %H:%M:%S')
"""
Next, I transform the object data type for EndDateTime to 'datetime' data type
"""
film_permits_df['EndDateTime']=film_permits_df['EndDateTime'].astype(str)
film_permits_df['EndDateTime']=film_permits_df['EndDateTime'].apply(get_date)
film_permits_df['EndDateTime']=pd.to_datetime(
film_permits_df['EndDateTime'],
format='%m/%d/%Y %H:%M:%S')
"""
Next, I transform the object data type for EnteredOn to 'datetime' data type
"""
film_permits_df['EnteredOn']=film_permits_df['EnteredOn'].astype(str)
film_permits_df['EnteredOn']=film_permits_df['EnteredOn'].apply(get_date)
film_permits_df['EnteredOn']=pd.to_datetime(
film_permits_df['EnteredOn'],
format='%m/%d/%Y %H:%M:%S')
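As an aside, the astype(str)/strftime round-trip could likely be collapsed into a single call by giving pandas the 12-hour clock format directly; a minimal sketch on a fresh copy of the raw data (raw_df is a hypothetical name, not used elsewhere):
raw_df = pd.read_csv('Film_Permits.csv')
raw_df['StartDateTime'] = pd.to_datetime(raw_df['StartDateTime'], format='%m/%d/%Y %I:%M:%S %p')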
film_permits_df.dtypes
In the missingno matrix plots below, black = filled and white = missing.
"""
Searching for missing data in sample set of 300 randomly selected data points
"""
_=msno.matrix(film_permits_df.sample(300))
plt.xlabel('Features in data',fontsize=16)
plt.ylabel('Gaps in data',fontsize=16)
plt.show()
"""
Searching for missing data in sample set of 3000 randomly selected data points
"""
_=msno.matrix(film_permits_df.sample(3000))
plt.xlabel('Features in data',fontsize=16)
plt.ylabel('Gaps in data',fontsize=16)
plt.show()
The data looks fairly clean to me from the graphs above, but just to make sure, I will perform the following tasks:
print('Shape of data frame before Cleanup :',film_permits_df.shape)
print('Drop all rows and columns where entire row or column is NaN.')
film_permits_df.dropna(how='all',axis=0,inplace=True) # rows
film_permits_df.dropna(how='all',axis=1,inplace=True) # columns
print('Drop duplicate rows and columns with more than 50% missing values.')
half_count = len(film_permits_df)*.5
film_permits_df = film_permits_df.dropna(thresh=half_count, axis=1)
film_permits_df = film_permits_df.drop_duplicates()
print('Drop columns where all rows have the same value.')
for this_column in film_permits_df.columns:
    if (film_permits_df[this_column].nunique()==1):
        unique_entry=film_permits_df.iloc[0][this_column]
        print('Drop column ',this_column,' where all rows have the same value : ', unique_entry)
        film_permits_df.drop([this_column],axis=1,inplace=True)
print('Shape of data frame after cleanup :',film_permits_df.shape)
film_permits_df.head().T
"""
Counting null data per column
"""
film_permits_df.isnull().sum()
"""
Percentage of missing data per column
"""
(film_permits_df.isnull().sum()/len(film_permits_df)).sort_values(ascending=False)
Filling in the missing data by sampling values from other permits in the same borough; for example, a missing ZipCode on a Brooklyn permit is filled with a ZipCode drawn at random from another Brooklyn permit.
print("Data index for missing ZipCode(s)",list(film_permits_df[film_permits_df['ZipCode(s)'].isnull()].index))
print("Data index for missing CommunityBoard(s)",list(film_permits_df[film_permits_df['CommunityBoard(s)'].isnull()].index))
print("Data index for missing PolicePrecinct(s)",list(film_permits_df[film_permits_df['PolicePrecinct(s)'].isnull()].index))
'''
Viewing the missing data
'''
film_permits_df.iloc[[1138, 6038, 17714, 20833, 23054, 26856, 39837]]
'''
Borough-based sampling for ZipCode(s), PolicePrecinct(s), CommunityBoard(s) data
'''
zipcode_sampling_dict={}
communityboard_sampling_dict={}
policeprecinct_sampling_dict={}
null_index=list(film_permits_df[film_permits_df['ZipCode(s)'].isnull()].index)
print(null_index)
for indx in null_index:
    print('index :',indx)
    # use loc (label-based) rather than iloc here, since null_index holds index labels
    this_borough=film_permits_df.loc[indx]['Borough']
    print(this_borough)
    sample_zipcode=random.choice(list(film_permits_df[(film_permits_df['Borough']==this_borough)
                                                      & (film_permits_df['ZipCode(s)'].notnull())]['ZipCode(s)']))
    sample_communityboard=random.choice(list(film_permits_df[(film_permits_df['Borough']==this_borough)
                                                             & (film_permits_df['CommunityBoard(s)'].notnull())]['CommunityBoard(s)']))
    sample_policeprecinct=random.choice(list(film_permits_df[(film_permits_df['Borough']==this_borough)
                                                             & (film_permits_df['PolicePrecinct(s)'].notnull())]['PolicePrecinct(s)']))
    zipcode_sampling_dict[indx]=sample_zipcode
    communityboard_sampling_dict[indx]=sample_communityboard
    policeprecinct_sampling_dict[indx]=sample_policeprecinct
print(zipcode_sampling_dict)
print(communityboard_sampling_dict)
print(policeprecinct_sampling_dict)
'''
Filling up the missing values with sampled data
'''
film_permits_df['ZipCode(s)'].fillna(zipcode_sampling_dict,inplace=True)
film_permits_df['CommunityBoard(s)'].fillna(communityboard_sampling_dict,inplace=True)
film_permits_df['PolicePrecinct(s)'].fillna(policeprecinct_sampling_dict,inplace=True)
'''
Checking filled up data
'''
film_permits_df.iloc[[1138, 6038, 17714, 20833, 23054, 26856, 39837]]
film_permits_df.isnull().sum()
Missing data has been filled in successfully for the ZipCode(s), PolicePrecinct(s) and CommunityBoard(s) columns
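As a quick sanity check (a minimal sketch I am adding here, not part of the original flow), we can assert that no nulls remain in the three imputed columns:
assert film_permits_df[['ZipCode(s)','CommunityBoard(s)','PolicePrecinct(s)']].isnull().sum().sum() == 0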
Let's ask our data some questions about film permits in New York City.
print("There are",film_permits_df['Category'].nunique(),
"kinds of \"shooting\" activities happening in",NYC)
for shoot_category in film_permits_df['Category'].unique():
    print(shoot_category)
film_permits_df['Category'].value_counts()
plt.figure(figsize=(15,10))
sns.countplot(x='Category',data=film_permits_df,order=film_permits_df['Category'].value_counts().index)
plt.title("Number of permits granted in each category of \"shooting\" activity in New York",fontsize=20)
plt.xlabel("Category",fontsize=16)
plt.ylabel("Number of permits",fontsize=16)
plt.show()
print("There are",film_permits_df['EventType'].nunique(),
"kinds of events that are being granted permits in",NYC)
for permit_category in film_permits_df['EventType'].unique():
    print(permit_category)
film_permits_df['EventType'].value_counts()
plt.figure(figsize=(15,10))
sns.countplot(x='EventType',data=film_permits_df,order=film_permits_df['EventType'].value_counts().index)
plt.title("Number of permits granted per event type in New York",fontsize=20)
plt.xlabel("Event type",fontsize=16)
plt.ylabel("Number of permits",fontsize=16)
plt.show()
Are shooting permits being granted in all five boroughs of NYC?
if film_permits_df['Borough'].nunique() == 5:
    print("Yes, shoot permits are being granted for:")
else:
    print("No, shoot permits are being granted for:")
for boroughs in film_permits_df['Borough'].unique():
    print(boroughs)
film_permits_df['Borough'].value_counts()
I assume that a lot of foreign movies are shot in New York City; it's not just movies from Hollywood/the USA. Is that true?
if film_permits_df['Country'].nunique() == 1 and film_permits_df['Country'].unique()[0] == 'United States of America':
    print("No, it is not true. Only US based shoots are happening in",NYC)
else:
    print("Yes, it is true. All the following countries come to shoot in",NYC)
for countries in film_permits_df['Country'].unique():
    print(countries)
How many shoots are happening per country?
film_permits_df['Country'].value_counts()
A method is defined to compute the normalized value for a series.
The formula used for min-max normalization is as follows:
$X_{new} = \frac{X - X_{min}}{X_{max} - X_{min}}$
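For example, with $X_{min} = 2$ and $X_{max} = 10$, a count of $5$ normalizes to $(5 - 2)/(10 - 2) = 0.375$.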
'''
This method will return the value normalized between 0 and 1, for a number in a series
given the number, maximum value and minimum value in the series
'''
def compute_norm(number, max_val, min_val):
    return (number - min_val)/(max_val - min_val)
'''
This method will take a series and return a df with the normalized values for that series.
Created as we will reuse this a number of times.
'''
def get_normalized_value_df(series_to_process, category_col_name, count_col_name):
    column_list = [category_col_name, count_col_name]
    series_to_df = pd.DataFrame(list(series_to_process.items()), columns=column_list)
    normalized_value_list = []
    for num in np.array(series_to_df[count_col_name]):
        normalized_value_list.append(compute_norm(number=float(num),
                                                  max_val=float(series_to_process.nlargest(1)),
                                                  min_val=float(series_to_process.nsmallest(1))))
    series_to_df['norm_'+count_col_name] = normalized_value_list
    return series_to_df
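A quick usage sketch on a toy series (the numbers are made up, purely to show the output shape):
toy_counts = pd.Series({'Mon': 10, 'Tue': 2, 'Wed': 6})
get_normalized_value_df(series_to_process=toy_counts,
                        category_col_name='day', count_col_name='permit_count')
# norm_permit_count comes out as 1.0 for Mon, 0.0 for Tue and 0.5 for Wed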
'''
Computing the number of shooting permits per year
'''
film_permits_df['Year'] = film_permits_df['StartDateTime'].apply(lambda time: time.year)
film_permits_df['Month'] = (film_permits_df['StartDateTime'].dt.month).apply(lambda x : calendar.month_abbr[x])
film_permits_df['Hour'] = film_permits_df['StartDateTime'].apply(lambda time: time.hour)
film_permits_df['Year'].value_counts()
'''
Computing the number of shooting permits per month
'''
months=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
film_permits_df['Month'] = pd.Categorical(
film_permits_df['Month'],
categories=months,
ordered=True)
film_permits_df['Month'].value_counts()
'''
Computing the number of shooting permits per weekday
'''
weekdays=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
film_permits_df["Weekday"] = film_permits_df['StartDateTime'].dt.weekday_name
film_permits_df['Weekday'] = pd.Categorical(
film_permits_df['Weekday'],
categories=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
ordered=True)
film_permits_df['Weekday'].value_counts()
Extracting the top five categories of shooting activity for processing
top_category = film_permits_df['Category'].value_counts().head(5).index.values
top_category
top_category_df = film_permits_df[film_permits_df['Category'].isin(top_category)]
top_category_pivot_df=top_category_df.pivot_table(values='EventID', index='Month', columns='Year', aggfunc=np.size)
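The pivot table above is not visualized in the original flow; as a minimal sketch (assuming the Month-by-Year counts just computed), a seaborn heatmap is one way to eyeball it:
plt.figure(figsize=(15,10))
sns.heatmap(top_category_pivot_df)
plt.title('Permits per month and year for the top five categories',fontsize=20)
plt.show()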
First on the list, we have:
"Can this data tell me the popular day of the week when shooting activities occur?"
Before looking at weekdays, let's find out the hour of the day at which events in the top five categories of shooting activity take place
top_category_hour_pivot = top_category_df.pivot_table(values='EventID',
index='Category',
columns=top_category_df['StartDateTime'].dt.hour,
aggfunc=np.size)
top_category_df.groupby([top_category_df['StartDateTime'].dt.hour,
'Category',])['EventID'].count().unstack().plot(marker='o',figsize=(15,10))
plt.title('Number of permits at hours of the day for top five category',fontsize=20)
plt.ylabel('Number of permits',fontsize=16)
plt.xlabel('Hours of the day',fontsize=16)
plt.xticks(np.arange(24))
plt.show()
'''
Computing the normalized value of total number of shooting permits per hour of day
We are computing normalized values to determine the outlier hours for shooting activities.
'''
hourly_permits_df = get_normalized_value_df(
series_to_process=film_permits_df['StartDateTime'].dt.hour.value_counts(),
category_col_name='hour',count_col_name='permit_count')
hourly_permits_df.plot.bar(x='hour', y='norm_permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Hour of day',fontsize=16)
plt.ylabel('Normalized number of permits',fontsize=16)
plt.title('Normalized value of total permits per hour of a day',fontsize=20)
plt.show()
From the above two graphs we can see the popular shooting hours of the day. Next, let's look at the weekday trend for the top five categories:
top_category_df.groupby(['Weekday','Category',])['EventID'].count().unstack().plot(marker='o',figsize=(15,10))
plt.title('Weekly trend for permits acquired in top-five category of shooting activities',fontsize=20)
plt.xticks(np.arange(7),weekdays)
plt.xlabel('Week Day',fontsize=16)
plt.ylabel('Number of permits',fontsize=16)
plt.show()
'''
Computing the normalized value of number of shooting permits per weekday
We are computing normalized values to detect if weekends are outliers for number of shooting activities.
'''
weekday_df = get_normalized_value_df(series_to_process=film_permits_df['Weekday'].value_counts(),
category_col_name='weekday',count_col_name='permit_count')
weekday_df.plot.bar(x='weekday', y='norm_permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Week Day',fontsize=16)
plt.ylabel('Normalized number of permits',fontsize=16)
plt.title('Normalized value of total number of permits per weekday',fontsize=20)
plt.show()
Next, we look at our data to find out:
"Winter in New York city is very beautiful due to all the snow but are the shoots really happening in the harsh winter conditions of NYC?"
To answer the third question, let's find out the monthly trend for permits acquired per month in top five category of shooting activities
top_category_df.groupby(['Month','Category',])['EventID'].count().unstack().plot(marker='o',figsize=(15,10))
plt.title('Number of permits per month for top five category of shooting activity',fontsize=20)
plt.xticks(np.arange(12),months)
plt.xlabel('Month',fontsize=16)
plt.ylabel('Number of permits',fontsize=16)
plt.show()
'''
Computing the normalized value of total number of shooting permits per month
We are computing normalized values to detect if Winter months are outliers for number of shooting activities.
'''
month_df = get_normalized_value_df(series_to_process=film_permits_df['Month'].value_counts(),
category_col_name='month',count_col_name='permit_count')
month_df.plot.bar(x='month', y='norm_permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Month',fontsize=16)
plt.ylabel('Normalized number of permits',fontsize=16)
plt.title('Normalized value of total permits per month',fontsize=20)
plt.show()
The two graphs above show the month-wise number of shooting permits in each category and the normalized value of total shooting permits per month.
Next, I would like to use the permit data to find out: "I know some Bollywood movies have been shot in Staten Island because of the large Indian community in that area, but is it a popular location in general?"
'''
Computing the normalized value of number of shooting permits per borough and event combo
We are computing normalized values to detect if Staten Island is an outlier for number of shooting activities.
'''
borough_df = get_normalized_value_df(
series_to_process=film_permits_df.groupby(['Borough','EventType'])['EventID'].count(),
category_col_name='borough_and_event',
count_col_name='permit_count')
borough_df.plot.bar(x='borough_and_event', y='norm_permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Borough and Event combination',fontsize=16)
plt.ylabel('Normalized number of permits',fontsize=16)
plt.title('Normalized value of total permits per borough and event combination',fontsize=20)
plt.show()
Next, we take a look at some of the less popular events that acquire shooting permits in New York City. We would like to find out the answer to: "I like a lot of web series and watch YouTube stars like Casey Neistat who film in New York City. Given the popularity of YouTube in recent times, are web shoots rising in the city?"
film_permits_df.groupby(['Year','Category'])['EventID'].count().unstack().plot(kind='bar',figsize=(15,10))
plt.title('Year wise number of permits for each category of shooting activity',fontsize=20)
plt.setp(plt.gca().get_xticklabels(), rotation=0, fontsize=12)
plt.xlabel('Year',fontsize=16)
plt.ylabel('Number of permits',fontsize=16)
plt.show()
'''
Computing the normalized value of the number of shooting permits per category and year combination
'''
year_permit_df = get_normalized_value_df(
series_to_process=film_permits_df.groupby(['Category','Year'])['EventID'].count(),
category_col_name='category_year',
count_col_name='permit_count')
year_permit_df.plot.bar(x='category_year', y='norm_permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Category and Year',fontsize=16)
plt.ylabel('Normalized number of permits',fontsize=16)
plt.title('Normalized value of total permits per category over the years',fontsize=20)
plt.show()
web_df = film_permits_df[~film_permits_df['Category'].isin(top_category)]
web_df.groupby(['Year','Category'])['EventID'].count().unstack().plot(kind='bar',figsize=(15,10))
plt.title('Year wise number of permits for each low popularity shooting activity category',fontsize=20)
plt.setp(plt.gca().get_xticklabels(), rotation=0, fontsize=12)
plt.xlabel('Year',fontsize=16)
plt.ylabel('Number of permits',fontsize=16)
plt.show()
No further normalization is required in this case, as we are just looking for an up or down trend and not detecting outliers or comparing numerical values.
From the above graph we can see a clear rising trend of WEB shoot activity in New York City!
Lastly, we seek the answer to "Which locations in New York City are popular for movie shoots?" We determine this using the areas where parking was held for a shoot, the assumption being that crews don't want to walk too far from their parked vehicles to shoot their movies/shows.
Top ten parking held locations for shooting activities
# Newer geopy versions require a user_agent string; the value here is arbitrary
geolocator = Nominatim(user_agent="nyc_film_permits_eda")
street_address_list = []
lat_long_list = []
parking_series = film_permits_df['ParkingHeld'].value_counts().head(10)
parking_df = pd.DataFrame(list(parking_series.items()), columns=['ParkingHeld','permit_count'])
for street_info in parking_df['ParkingHeld']:
    street_address = street_info.split('between')[0].strip()
    found_numbers = re.search(r'\d+', street_address)
    if found_numbers is not None:
        indices = list(found_numbers.span())
        street_number = street_address[indices[0]:indices[1]]
        street_parts = street_address.split(street_number)
        street_address = street_parts[0] + humanize.ordinal(street_number) + street_parts[1] + ', New York City, New York'
    else:
        street_address = street_address + ', New York City, New York'
    location_dict = geolocator.geocode(street_address).raw
    latitude = float(location_dict['lat'])
    longitude = float(location_dict['lon'])
    street_address_list.append(street_address)
    lat_long_list.append([latitude,longitude])
new_df = pd.DataFrame({'ParkingHeld':street_address_list})
parking_df.update(new_df)
parking_df['lat_long'] = lat_long_list
parking_df
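To make the address munging above concrete, here is a sketch on a made-up ParkingHeld value (the string is illustrative, not taken from the data):
sample = 'WEST 48 STREET between 6 AVENUE and 7 AVENUE'
print(sample.split('between')[0].strip())   # 'WEST 48 STREET'
print(humanize.ordinal('48'))               # '48th', giving 'WEST 48th STREET, New York City, New York'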
parking_df.plot.bar(x='ParkingHeld', y='permit_count', figsize=(15,10))
plt.setp(plt.gca().get_xticklabels(), rotation=90, fontsize=12)
plt.setp(plt.gca().get_yticklabels(), fontsize=12)
plt.xlabel('Top ten shooting locations',fontsize=16)
plt.ylabel('Number of permits',fontsize=16)
plt.title('Number of permits for top ten shooting locations',fontsize=20)
plt.show()
Using the Folium library, let's take a look at where the popular shooting locations are in New York City!
folium_map = folium.Map(location=parking_df.iloc[0]['lat_long'],
zoom_start=11,
tiles='Stamen Terrain')
for curr_loc in list(parking_df.index):
    folium.Marker(location=parking_df.iloc[curr_loc]['lat_long'],
                  popup=parking_df.iloc[curr_loc]['ParkingHeld']
                  ).add_to(folium_map)
folium_map.add_child(folium.ClickForMarker(popup='Waypoint'))
folium_map
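Outside a notebook the map object will not render inline; as a minimal sketch, folium can write it to a standalone HTML file instead (the filename is arbitrary):
folium_map.save('nyc_film_shoot_locations.html')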
print('Total Time taken:',time.time() - start_time,'seconds')