import warnings
warnings.filterwarnings('ignore')
import operator
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
import scipy
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
rng = np.random.default_rng()
import pip
import os
from tqdm.notebook import tqdm, trange
tqdm.pandas()
import seaborn as sns
sns.set_theme()
sns.set_style("whitegrid")
import matplotlib as mpl
import matplotlib.pyplot as plt
import folium
from folium import plugins
from pygam.pygam import LinearGAM, s, f, te, PoissonGAM
from scipy.stats import mannwhitneyu

pd.options.display.max_columns = None
pd.options.display.max_rows = None

# location of data 
data_folder = '../data/calls_for_service'
# paths for all csv files in data_folder
csv_files = [f for f in os.listdir(data_folder) if f.endswith('csv')]
# if compiled csv file does not already exist
if 'calls_master.csv' not in csv_files:
    # make empty dataframe
    calls_for_service = pd.DataFrame()
    # combine all files in folder into one large dataframe
    for f in tqdm(csv_files, desc = "Combining Files"):
        file_path = os.path.join(data_folder, f)
        df = pd.read_csv(file_path)
        calls_for_service = pd.concat([calls_for_service, df], ignore_index = True)
    # export to combined csv file
    calls_for_service.to_csv('../data/calls_for_service/calls_master.csv')
else:
    # if compiled csv already exists, just load that
    calls_for_service = pd.read_csv(os.path.join(data_folder, 'calls_master.csv'))

calls_for_service.head()

calls_for_service.head()

# size of calls_for_service
calls_for_service.shape

(5109233, 24)

# list first 50 unique type labels for NOPD incidents
calls_for_service["TypeText"].unique()[:50]

array(['AREA CHECK', 'COMPLAINT OTHER', 'AGGRAVATED BATTERY BY SHOOTING',
       'AUTO ACCIDENT', 'RECOVERY OF REPORTED STOLEN VEHICLE',
       'DISTURBANCE (OTHER)', 'SHOPLIFTING', 'BICYCLE THEFT', 'HIT & RUN',
       'TRAFFIC STOP', 'BURGLAR ALARM, SILENT', 'DISCHARGING FIREARM',
       'SIMPLE BURGLARY VEHICLE', 'MEDICAL', 'SUSPICIOUS PERSON',
       'DOMESTIC DISTURBANCE', 'FIREWORKS', 'MENTAL PATIENT',
       'SUICIDE THREAT', 'PROWLER', 'FIGHT', 'THEFT',
       'SIMPLE CRIMINAL DAMAGE', 'EXTORTION (THREATS)', 'THEFT BY FRAUD',
       'SIMPLE BATTERY', 'RESIDENCE BURGLARY', 'HOMICIDE BY SHOOTING',
       'MISSING JUVENILE', 'RETURN FOR ADDITIONAL INFO',
       'UNAUTHORIZED USE OF VEHICLE', 'LOST PROPERTY',
       'VIOLATION OF PROTECTION ORDER', 'PUBLIC GATHERING',
       'AGGRAVATED RAPE', 'UNCLASSIFIED DEATH',
       'AGGRAVATED ASSAULT DOMESTIC', 'AUTO THEFT', 'TRAFFIC INCIDENT',
       'SIMPLE BATTERY DOMESTIC', 'DRUG VIOLATIONS',
       'SIMPLE ASSAULT DOMESTIC', 'THEFT FROM EXTERIOR OF VEHICLE',
       'ILLEGAL EVICTION', 'SIMPLE BURGLARY', 'ARMED ROBBERY WITH KNIFE',
       'ARMED ROBBERY WITH GUN', 'NOISE COMPLAINT',
       'AGGRAVATED BATTERY BY CUTTING', 'AUTO ACCIDENT WITH INJURY'],
      dtype=object)

# get all data types for calls_for_service
calls_for_service.dtypes

Unnamed: 0           int64
NOPD_Item           object
Type                object
TypeText            object
Priority            object
InitialType         object
InitialTypeText     object
InitialPriority     object
MapX               float64
MapY               float64
TimeCreate          object
TimeDispatch        object
TimeArrive          object
TimeClosed          object
Disposition         object
DispositionText     object
SelfInitiated       object
Beat                object
BLOCK_ADDRESS       object
Zip                float64
PoliceDistrict       int64
Location            object
Type_               object
TimeArrival         object
dtype: object

# convert ZIP column to object
calls_for_service['Zip'] = calls_for_service['Zip'].astype(str)

# convert temporal attributes to datetime objects
calls_for_service['TimeCreate'] = pd.to_datetime(calls_for_service['TimeCreate'])
calls_for_service['TimeDispatch'] = pd.to_datetime(calls_for_service['TimeDispatch'])
calls_for_service['TimeArrive'] = pd.to_datetime(calls_for_service['TimeArrive'])
calls_for_service['TimeClosed'] = pd.to_datetime(calls_for_service['TimeClosed'])

# drop junk index generated during reading
calls_for_service.drop(['Unnamed: 0'], axis =1, inplace = True)

calls_for_service[["Longitude", "Latitude"]] = calls_for_service["Location"].str.extract(r'POINT \((-?\d+\.\d+) (-?\d+\.\d+)\)')

calls_for_service["Longitude"] = calls_for_service["Longitude"].astype(float)
calls_for_service["Latitude"] = calls_for_service["Latitude"].astype(float)

calls_for_service.head()

calls_for_service["DispositionText"].value_counts()

DispositionText
Necessary Action Taken                2511179
REPORT TO FOLLOW                       979928
GONE ON ARRIVAL                        549778
NECESSARY ACTION TAKEN                 527679
VOID                                   236148
UNFOUNDED                              167989
DUPLICATE                              134592
MUNICIPAL NECESSARY ACTION TAK            422
Test incident                             253
Test Incident                             233
Canceled By Complainant                   226
REFERRED TO EXTERNAL AGENCY               185
RTA Related Incident Disposition          130
TRUANCY NECESSARY ACTION TAKEN            129
FALSE ALARM                               100
UNKNOWN                                    81
Clear                                      46
SUPPLEMENTAL                               28
Sobering Center Transport                  28
CURFEW NECESSARY ACTION TAKEN               6
REPORT TO FOLLOW MUNICIPAL                  4
REPORT TO FOLLOW CURFEW                     4
TEST MOTOROLA                               3
REPORT TO FOLLOW TRUANCY                    2
CREATED ON SYS DOWN/RESEARCH                1
Report written incident UnFounded           1
Name: count, dtype: int64

calls_for_service["Disposition"].value_counts()

Disposition
NAT      3038859
RTF       979928
GOA       549777
VOI       236148
UNF       167989
DUP       134592
NATM         422
EST          253
TST          233
CBC          226
REF          185
TRN          130
NATT         129
FAR          100
NO911         50
-13           43
SBC           28
SUPP          28
FDINF         17
NODIS         12
NATC           6
RTFC           4
RTFM           4
CLR            3
TEST           3
RTFT           2
MD/PM          1
1              1
NAT67          1
NAT18          1
NAT71          1
OFFLN          1
RUF            1
Name: count, dtype: int64

disposition_mask = "GONE ON ARRIVAL|VOID|FALSE ALARM|Clear|DUPLICATE|Test incident|Test Incident|Canceled By Complainant"
calls_for_service = calls_for_service[calls_for_service["DispositionText"].str.contains(disposition_mask)==False]

calls_for_service["DispositionText"].value_counts()

DispositionText
Necessary Action Taken                2511179
REPORT TO FOLLOW                       979928
NECESSARY ACTION TAKEN                 527679
UNFOUNDED                              167989
MUNICIPAL NECESSARY ACTION TAK            422
REFERRED TO EXTERNAL AGENCY               185
RTA Related Incident Disposition          130
TRUANCY NECESSARY ACTION TAKEN            129
UNKNOWN                                    81
SUPPLEMENTAL                               28
Sobering Center Transport                  28
CURFEW NECESSARY ACTION TAKEN               6
REPORT TO FOLLOW MUNICIPAL                  4
REPORT TO FOLLOW CURFEW                     4
TEST MOTOROLA                               3
REPORT TO FOLLOW TRUANCY                    2
Report written incident UnFounded           1
CREATED ON SYS DOWN/RESEARCH                1
Name: count, dtype: int64

calls_for_service["TypeText"].unique().shape[0]

430

calls_for_service["TypeText"][0:10]

0                              AREA CHECK
1                         COMPLAINT OTHER
2                              AREA CHECK
4          AGGRAVATED BATTERY BY SHOOTING
5                           AUTO ACCIDENT
6     RECOVERY OF REPORTED STOLEN VEHICLE
10                              HIT & RUN
11                           TRAFFIC STOP
13                             AREA CHECK
15                             AREA CHECK
Name: TypeText, dtype: object

types = ['Accidents/Traffic Safety', 'Alarms', 'Public Assistance', 'Mental Health', 'Complaints/Environment', 'Domestic Violence',
        'Drugs','Fire','Alcohol','Medical Emergencies','Missing Persons','Officer Needs Help', 'Not Crime', 'Other', 'Property',
        'Sex Offenses', 'Status','Suspicion','Violent Crime','Warrants']

file_path = '../data/output_data.xlsx'
mapping_df = pd.read_excel(file_path)

mapping_dict = mapping_df.set_index('TypeText')['Index'].to_dict()

calls_for_service.head()

calls_for_service["SimpleType"] = calls_for_service['TypeText'].progress_apply(lambda x: types[mapping_dict[x]])

calls_for_service.head()

print(calls_for_service.iloc[352]["SimpleType"])

  0%|          | 0/4187799 [00:00<?, ?it/s]

Status

# read in weather data
weather = pd.read_csv('../data/weather/NCEI_CDO.csv', low_memory = False)

weather.head()

station_list = weather["STATION"].unique()
station_list

array(['US1LAOR0006', 'US1LAOR0016', 'USW00012916', 'US1LAOR0003',
       'US1LAOR0014', 'USC00166666', 'US1LAOR0012', 'USW00053917',
       'USW00012930', 'US1LAOR0009', 'US1LAOR0019'], dtype=object)

weather.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'AWND_ATTRIBUTES', 'DAPR', 'DAPR_ATTRIBUTES', 'FMTM', 'FMTM_ATTRIBUTES',
       'MDPR', 'MDPR_ATTRIBUTES', 'PGTM', 'PGTM_ATTRIBUTES', 'PRCP',
       'PRCP_ATTRIBUTES', 'SNOW', 'SNOW_ATTRIBUTES', 'SNWD', 'SNWD_ATTRIBUTES',
       'TAVG', 'TAVG_ATTRIBUTES', 'TMAX', 'TMAX_ATTRIBUTES', 'TMIN',
       'TMIN_ATTRIBUTES', 'TOBS', 'TOBS_ATTRIBUTES', 'WDF2', 'WDF2_ATTRIBUTES',
       'WDF5', 'WDF5_ATTRIBUTES', 'WSF2', 'WSF2_ATTRIBUTES', 'WSF5',
       'WSF5_ATTRIBUTES', 'WT01', 'WT01_ATTRIBUTES', 'WT02', 'WT02_ATTRIBUTES',
       'WT03', 'WT03_ATTRIBUTES', 'WT04', 'WT04_ATTRIBUTES', 'WT05',
       'WT05_ATTRIBUTES', 'WT06', 'WT06_ATTRIBUTES', 'WT08', 'WT08_ATTRIBUTES',
       'WT10', 'WT10_ATTRIBUTES', 'WT11', 'WT11_ATTRIBUTES', 'WT13',
       'WT13_ATTRIBUTES', 'WT14', 'WT14_ATTRIBUTES', 'WT16', 'WT16_ATTRIBUTES',
       'WT18', 'WT18_ATTRIBUTES', 'WT21', 'WT21_ATTRIBUTES'],
      dtype='object')

special_attribute_labels = {"WT01":"Fog","WT02":"Heavy Fog","WT03":"Thunder","WT04":"Ice Pellets", "WT05":"Hail", "WT06":"Rime", 
                            "WT07": "Dust", "WT08":"Smoke", "WT09":"Blowing Snow", "WT10":"Tornado", "WT11":"High Wind", "WT12":"Blowing Spray",
                            "WT13":"Mist", "WT14":"Drizzle", "WT15":"Freezing Drizzle", "WT16":"Rain", "WT17":"Freezing Rain", "WT18":"Snow", "WT19":"Unknown Precipitation",
                           "WT21":"Ground Fog", "WT22":"Ice Fog"}


for col in special_attribute_labels:
    if col in weather.columns:
        weather[col] = weather[col].notnull()
    attribute = col + "_ATTRIBUTES"
    if attribute in weather.columns:
        weather.drop(attribute, inplace = True, axis = 1)

weather.rename(columns = special_attribute_labels, inplace = True)

weather.iloc[[2716]]

weather["DATE"] = pd.to_datetime(weather["DATE"])

general_attribute_labels = {"AWND":"AverageDailyWind", "DAPR":"NumDaysPrecipAvg", "FMTM":"FastestWindTime",
                      "MDPR":"MultidayPrecipTotal", "PGTM":"PeakGustTime", "PRCP":"Precipitation", "SNOW":"Snowfall",
                      "SNWD":"MinSoilTemp", "TAVG":"TimeAvgTemp", "TMAX":"TimeMaxTemp", "TMIN":"TimeMinTemp","TOBS":"TempAtObs", "WDF2":"2MinMaxWindDirection",
                      "WDF5":"5MinMaxWindDirection", "WSF2":"2MinMaxWindSpeed", "WSF5":"5MinMaxWindSpeed"}
                 
                      
for c, col in enumerate(general_attribute_labels):
    attribute = col + "_ATTRIBUTES"
    if attribute in weather.columns:
        weather.drop(attribute, inplace = True, axis = 1)
        
weather.rename(columns = general_attribute_labels, inplace = True)
    
    
decapitalize = {"STATION":"Station", "NAME":"Name", "LATITUDE":"Latitude", "LONGITUDE":"Longitude", "ELEVATION":"Elevation", "DATE":"Date"}

weather.rename(columns = decapitalize, inplace = True)

weather.head()

weather.set_index(["Date", "Station"], inplace = True)
weather.sort_index(ascending = False, inplace = True)

weather.head()

# create copy of dataframe type and time columns, and select only ones where the type includes FIRE BOMB, EXPLOSION, FIREWORKS, or ILLEGAL FIREWORKS
explosions_df = calls_for_service[["TypeText", "TimeCreate"]].copy()[calls_for_service["TypeText"].str.contains('FIRE BOMB|EXPLOSION|FIREWORKS|ILLEGAL FIREWORKS')]

# get the number of each of these kind of incidents overall
explosions_df["TypeText"].value_counts()

TypeText
FIREWORKS            3290
ILLEGAL FIREWORKS      87
EXPLOSION              14
FIRE BOMB               1
Name: count, dtype: int64

# extract just the month and day from each incident
explosions_df["Date"] = explosions_df["TimeCreate"].dt.strftime('%m-%d')
explosions_df.head()

# reindex over date, and then the kind of incident within
explosions_nested_df = pd.DataFrame(explosions_df.groupby(["Date", "TypeText"])["TypeText"].count())
explosions_nested_df.rename(columns = {"Date" : "Date", "TypeText": "TypeText", "TypeText": "Quantity"}, inplace = True)
explosions_nested_df.head()

print("Maximum Incidents on", explosions_nested_df["Quantity"].idxmax()[0])
print("Maximum Number of Incidents is", explosions_nested_df["Quantity"].max())

Maximum Incidents on 07-04
Maximum Number of Incidents is 726

# create stacked bar plot of each kind of explosion-related incident for each day during the year over all of the data
ax = explosions_nested_df.unstack().plot(kind = "bar", stacked = True, figsize = (20,6))
xtick_interval = 30
ax.set_xticks(range(0, 365, xtick_interval));
ax.set_ylabel("Quantity")
ax.set_title("Explosion Related Incidents in New Orleans Throughout the Year from 2011-2023")
ax.legend(["Explosion", "Fire Bomb", "Fireworks", "Illegal Fireworks"]);

# get all incidents that mention 'FLOOD'
floods_df = calls_for_service[["TypeText", "TimeCreate"]].copy()[calls_for_service["TypeText"].str.contains('FLOOD')]
floods_df["TypeText"].value_counts()

TypeText
FLOOD EVENT                     2898
FLOODED STREET                   121
FLOODED VEHICLE                   35
FLOODED VEHICLE (NOT MOVING)       1
Name: count, dtype: int64

# extract month, day from datetime objects
floods_df["Date"] = floods_df["TimeCreate"].dt.strftime('%m-%d')
floods_df.head()

# get quantity of each kind of flood-related events for any given date in a year
floods_nested_df = pd.DataFrame(floods_df.groupby(["Date", "TypeText"])["TypeText"].count())
floods_nested_df.rename(columns = {"Date" : "Date", "TypeText": "TypeText", "TypeText": "Quantity"}, inplace = True)
floods_nested_df.head()

# create stacked bar plot of each kind of flood-related incident for each day during the year over all of the data
ax = floods_nested_df.unstack().plot(kind = "bar", stacked = True, figsize = (20,6))
xtick_interval = 30
ax.set_xticks(range(0, len(floods_nested_df), xtick_interval));
ax.set_ylabel("Quantity")
ax.set_title("Flood Related Incidents in New Orleans Throughout the Year from 2011-2023")
ax.legend(["Flood Event", "Flooded Street", "Flooded Vehicle", "Flooded Vehicle (Not Moving)"]);

# get rate sof null values for all attributes
(weather.isnull().mean() * 100)

Name                     0.000000
Latitude                 0.000000
Longitude                0.000000
Elevation                0.000000
AverageDailyWind        63.303377
NumDaysPrecipAvg        98.658376
FastestWindTime         98.705226
MultidayPrecipTotal     98.679671
PeakGustTime            87.720942
Precipitation            1.499212
Snowfall                80.361174
MinSoilTemp             99.872226
TimeAvgTemp             83.670514
TimeMaxTemp             43.017164
TimeMinTemp             43.064015
TempAtObs               81.042634
2MinMaxWindDirection    63.239491
5MinMaxWindDirection    63.367264
2MinMaxWindSpeed        63.239491
5MinMaxWindSpeed        63.367264
Fog                      0.000000
Heavy Fog                0.000000
Thunder                  0.000000
Ice Pellets              0.000000
Hail                     0.000000
Rime                     0.000000
Smoke                    0.000000
Tornado                  0.000000
High Wind                0.000000
Mist                     0.000000
Drizzle                  0.000000
Rain                     0.000000
Snow                     0.000000
Ground Fog               0.000000
dtype: float64

#entity matching

# warning, can take >30 mins
def match_weather(crime_row):
    # extract date, latitude, and longitude
    c_date = crime_row["DateCreate"]
    c_lat = crime_row["Latitude"]
    c_long = crime_row["Longitude"]
    # try to find weather on that day
    
    try:
        weather_by_day = weather.loc[c_date]
    except KeyError:
        return np.nan
    
    # if weather exists, get closest station identifier
    euc_distances = np.sqrt((weather_by_day['Latitude'] - c_lat) ** 2 + (weather_by_day['Longitude'] - c_long) ** 2)
    closest_station = euc_distances.idxmin()
    
    return(closest_station)

match_table_path = '../data/match_table.csv'

calls_for_service["DateCreate"] = calls_for_service["TimeCreate"].dt.floor('D')

if os.path.exists(match_table_path):
    print("Loading Cached Entity Matching...")
    match_table = pd.read_csv(match_table_path)
    calls_for_service = calls_for_service.merge(match_table, on = "NOPD_Item", how = "outer")
    
else:
    print("Generating Entity Matching...")
    calls_for_service["ClosestStation"] = calls_for_service.progress_apply(match_weather, axis = 1)
    # If the file doesn't exist, save the DataFrame as a CSV
    match_table = calls_for_service[["NOPD_Item", "ClosestStation"]]
    match_table.to_csv(match_table_path, index=False)
    print("Dumping Relational Table to %s" %match_table_path)

Loading Cached Entity Matching...

calls_for_service.head()

# merge datasets together
calls_weather_master = pd.merge(calls_for_service, weather, left_on = ["DateCreate", "PairedStation"], right_on = ["Date", "Station"])
calls_weather_master.head()

# segment data by precipitation being 0 or non-0
precip_data = calls_weather_master.loc[calls_weather_master["Precipitation"]>0]
no_precip_data = calls_weather_master.loc[calls_weather_master["Precipitation"] == 0]

# how many of each kind of incident were there on each day in each data set?
precip_data_counts = precip_data.groupby(by = ["DateCreate"])["SimpleType"].value_counts().to_frame()
no_precip_data_counts = no_precip_data.groupby(by = ["DateCreate"])["SimpleType"].value_counts().to_frame()

# in each dataset, on average, how many were of each type on each day?
avg_precip_data_counts = precip_data_counts.groupby(by = ["SimpleType"]).mean()
avg_no_precip_data_counts = no_precip_data_counts.groupby(by = ["SimpleType"]).mean()
 
# merge data back together
total_avg_precip_counts = pd.merge(avg_precip_data_counts, avg_no_precip_data_counts, on = "SimpleType", suffixes = ("_precip", "_noprecip"))
total_avg_precip_counts.columns = ["PrecipPresent", "PrecipNotPresent"]

total_avg_precip_counts.head()

# melt data back out into duplicate rows, with presence of precipitation as indicator variable
melted_precip_counts = pd.melt(total_avg_precip_counts.reset_index(), id_vars = "SimpleType", value_vars = ["PrecipPresent", "PrecipNotPresent"], var_name = "Precip", value_name = "AvgCount")
melted_precip_counts.head()

# create barplot, segment by 'precip'
plt.figure(figsize = (20,6))
precip_diffs = sns.barplot(melted_precip_counts, x = "SimpleType", y = "AvgCount", hue = "Precip")
# rotate long xtick labels
for item in precip_diffs.get_xticklabels():
    item.set_rotation(45)
precip_diffs.set(xlabel = "Simple Type", ylabel = "Average Count Per Day", title = "Average Frequency of Event Categories, With and Without Precipitation")

plt.legend(title = "Precipitation Present on Day")
plt.show()

# drop date index
precip_data_raw = precip_data_counts.droplevel("DateCreate")
no_precip_data_raw = no_precip_data_counts.droplevel("DateCreate")

precip_data_raw.head(20)

# create dataframe to store results
t_tests = pd.DataFrame(columns = ["SimpleType", "TVal", "PVal"])
t_tests.set_index(["SimpleType"], inplace = True)

# for each category of incident
for i, kind in enumerate(precip_data_raw.index.unique()):
    # subselect category from each dataset
    subset_precip = precip_data_raw.loc[kind]
    subset_noprecip = no_precip_data_raw.loc[kind]
    # do numerical 2-sided ttest between the two, 50,000 perumutations
    ttest = stats.ttest_ind(subset_precip, subset_noprecip, nan_policy = 'omit', alternative = 'two-sided', permutations = 50000)
    # store results in dictionary, append to dataframe
    tval, pval = ttest.statistic[0], ttest.pvalue[0]
    temp_dict = pd.DataFrame({"TVal": tval, "PVal": pval}, index = [kind])
    t_tests = pd.concat([t_tests, temp_dict], ignore_index = False)


display(t_tests)

plt.figure(figsize = (20,6))

# make barplot
t_hist = sns.barplot(data = t_tests, x = t_tests.index, y = "TVal", color = 'b')

# rotate long xtick labels
for item in t_hist.get_xticklabels():
    item.set_rotation(45)

# annotate bars with PVal
for i, value in enumerate(t_tests['TVal']):
    if value < 0:
        plt.text(i, value -0.01, round(t_tests.iloc[i]["PVal"], 6), ha='center', va='top')
    else:
        plt.text(i, value + 0.01, round(t_tests.iloc[i]["PVal"], 6), ha='center', va='bottom')

t_hist.set(xlabel = "Incident Type", ylabel = "T-Statistic for Difference of Means", 
           title = "T Statistic for Difference of Means in Incidents per Day with Precipitation Present vs. Not Present (with p-values, 50,000 permutations)");

# get rate of nans in TimeMaxTemp column
weather["TimeMaxTemp"].isnull().sum() * 100 / len(weather["TimeMaxTemp"])

43.017164274458025

# get count of values by each station
weather.groupby(by = "Station")["TimeMaxTemp"].count()

Station
US1LAOR0003       0
US1LAOR0006       0
US1LAOR0009       0
US1LAOR0012       0
US1LAOR0014       0
US1LAOR0016       0
US1LAOR0019       0
USC00166666       0
USW00012916    4654
USW00012930    4126
USW00053917    4599
Name: TimeMaxTemp, dtype: int64

# create dataframe for temperature data
temp_df = calls_for_service[["NOPD_Item", "Longitude", "Latitude", "DateCreate", "SimpleType"]].copy().dropna(how = 'any')
# select temperature data from station "USW00012930"
temp_subselect = weather.loc[weather.index.get_level_values(1) == "USW00012930"][["TimeAvgTemp", "TimeMaxTemp", "TimeMinTemp"]]

# merge dataframes
temp_df = pd.merge(temp_df, temp_subselect, left_on = "DateCreate", right_on = "Date", how = "outer").progress_apply(lambda x: x)

  0%|          | 0/8 [00:00<?, ?it/s]

temp_df.head()

# plot density of all incidents given each temperature value
plt.figure(figsize = (20,6))
temp_hist = sns.histplot(data = temp_df, x = "TimeMaxTemp", stat = "density")

# create indicator IsViolent, turn into binary classification
temp_df["IsViolent"] = temp_df["SimpleType"] == "Violent Crime"
temp_df.head(10)

# make plots of violent and nonviolent distributions based on max day temperature, normalized separately so that they are comparable
plt.figure(figsize = (20,6))
viol_temp_hist = sns.histplot(data = temp_df, x = "TimeMaxTemp", hue = "IsViolent", 
                              stat = "density", multiple = "layer", common_norm = False, element = "step")
viol_temp_hist.set(xlabel = "Maximum Temperature on Day (Fahrenheit)", ylabel = "Porportion of Incidents", title = "Relative Proportions of Violent and Non-Violent Incidents Across Maximum Temperatures on Day");

# make plots of violent and nonviolent distributions based on min day temperature, normalized separately so that they are comparable
plt.figure(figsize = (20,6))
viol_temp_hist = sns.histplot(data = temp_df, x = "TimeMinTemp", hue = "IsViolent", 
                              stat = "density", multiple = "layer", common_norm = False, element = "step")
viol_temp_hist.set(xlabel = "Minimum Temperature on Day (Fahrenheit)", ylabel = "Porportion of Incidents", title = "Relative Proportions of Violent and Non-Violent Incidents Across Minimum Temperatures on Day");

# create indicator IsHot for if TimeMaxTemp was geq 90 degrees, turn into binary classification
temp_df["IsHot"] = temp_df["TimeMaxTemp"] >= 90.0
temp_df.head()

# get number of incidents for each combination of IsHot and IsViolent
temp_df_hot_gb = temp_df.groupby(by = ["IsHot", "IsViolent"])["NOPD_Item"].count()
temp_df_hot_gb

IsHot  IsViolent
False  False        776335
       True          29198
True   False        257354
       True          10537
Name: NOPD_Item, dtype: int64

# get counts of violent crimes in each classification
hot_c = temp_df_hot_gb[True, True]
not_hot_c = temp_df_hot_gb[False, True]

# get sample size of each classification
hot_n = temp_df_hot_gb[True, True] + temp_df_hot_gb[True, False]
not_hot_n = temp_df_hot_gb[False, True] + temp_df_hot_gb[False, False]

n_array = np.array([hot_n, not_hot_n])
c_array = np.array([hot_c, not_hot_c])

# perform difference of proportions test.
zval, pval = proportions_ztest(c_array, n_array, alternative = 'larger')

print("P-Value:", pval)

P-Value: 1.1556238891735672e-13

# subselect violent incidents
violent_incidents = temp_df.loc[temp_df["SimpleType"] == 'Violent Crime']
violent_incidents.head()

weather.groupby(by = "Station")["TimeMaxTemp"].mean()

Station
US1LAOR0003          NaN
US1LAOR0006          NaN
US1LAOR0009          NaN
US1LAOR0012          NaN
US1LAOR0014          NaN
US1LAOR0016          NaN
US1LAOR0019          NaN
USC00166666          NaN
USW00012916    79.902879
USW00012930    80.475036
USW00053917    78.820613
Name: TimeMaxTemp, dtype: float64

violent_incidents = violent_incidents[violent_incidents["TimeMaxTemp"].isnull()==False]
violent_incidents["TimeMaxTemp"].mean()

80.46237456080483

max_temp_percentiles = np.percentile(violent_incidents['TimeMaxTemp'], [25, 75])
print("25th percentile of Maximum Temperature", max_temp_percentiles[0])
print("75th percentile of Maximum Temperature:", max_temp_percentiles[1])

25th percentile of Maximum Temperature 73.0
75th percentile of Maximum Temperature: 90.0

# select only x,y,z, drop empty values
violent_incidents_clean = violent_incidents[["TimeMaxTemp", "Longitude", "Latitude"]].dropna(how = 'any')
# create folium map entity
combined_map = folium.Map(location=[30, -90], tiles="Cartodb dark_matter", zoom_start=12.8)

# add markers to map object
for index, row in violent_incidents_clean.iterrows():

    # generate marker for either high or low outlier
    if(row['TimeMaxTemp'] < max_temp_percentiles[0]):
        kw = {"color": "blue"}
        icon = folium.Icon(**kw)
        folium.CircleMarker([row['Latitude'], row['Longitude']], color='blue', radius=3, stroke=False,fill=True, fill_opacity=0.5 ,opacity=1).add_to(combined_map)
    elif(row['TimeMaxTemp'] > max_temp_percentiles[1]):
        kw = {"color": "green"}
        icon = folium.Icon(**kw)
        folium.CircleMarker([row['Latitude'], row['Longitude']], color='red', radius=3, stroke=False,fill=True, fill_opacity=0.3 ,opacity=1).add_to(combined_map)

combined_map

# create smaller cfs dataset
cfs_ml = calls_weather_master[["NOPD_Item", "TimeCreate", "Zip", "Longitude_x", "Latitude_x", "SimpleType", "TimeMaxTemp", "PairedStation", "DateCreate", "Precipitation"]]
cfs_ml.head()

# Plot coordinates of violent crime with seabron
sns.scatterplot(cfs_ml, x = "Longitude_x", y = "Latitude_x")
plt.title("Distribution of Crime by Location")

Text(0.5, 1.0, 'Distribution of Crime by Location')

# Filter off those two crimes with likely misinupt coordinates
cfs_ml = cfs_ml[(cfs_ml["Longitude_x"] < -89.5) & (cfs_ml["Latitude_x"] > 29.5)]

# creating geospatial bins, nxn
n = 20

# create bins
long_bins = np.linspace(cfs_ml["Longitude_x"].min() - 0.1, cfs_ml["Longitude_x"].max() + 0.1, n + 1)
lat_bins = np.linspace(cfs_ml["Latitude_x"].min() - 0.1, cfs_ml["Latitude_x"].max() + 0.1, n + 1)

# assign each crime to a latitude and longitude bin
cfs_ml["Long_bin"] = np.digitize(cfs_ml["Longitude_x"], long_bins)
cfs_ml["Lat_bin"] = np.digitize(cfs_ml["Latitude_x"], lat_bins)

cfs_ml.head()

# Create new columns for year, month, and day
cfs_ml["YearCreate"] = cfs_ml["DateCreate"].apply(lambda x: x.year)
cfs_ml["MonthCreate"] = cfs_ml["DateCreate"].apply(lambda x: x.month)
cfs_ml["DayCreate"] = cfs_ml["DateCreate"].apply(lambda x: x.day)


cfs_ml.head()

# filter by violent incidents
v_cfs_ml = cfs_ml[cfs_ml["SimpleType"] == "Violent Crime"]

# get counts for each day in each bin
v_cfs_ml_counts = v_cfs_ml[["DateCreate", "Long_bin", "Lat_bin", "NOPD_Item", "SimpleType"]].groupby(by = ["DateCreate", "Long_bin", "Lat_bin", "SimpleType"]).count().reset_index()

# merge back with rest of information
v_cfs_ml_counts = v_cfs_ml_counts.merge(v_cfs_ml[["DateCreate", "TimeMaxTemp", "YearCreate", "MonthCreate", "DayCreate"]].groupby(by = "DateCreate").max(), how = 'left', on = "DateCreate", 
                                                  indicator = False, validate = "many_to_many").drop(["DateCreate"], axis = 1)

# rename NOPD_count column
v_cfs_ml_counts.columns = ['Long_bin', 'Lat_bin', 'SimpleType', 'Count', 'TimeMaxTemp', 'YearCreate', 'MonthCreate', 'DayCreate']

# drop redundant simpletype column
v_cfs_ml_counts.drop("SimpleType", inplace = True, axis = 1)


# drop missing values
v_cfs_ml_counts.dropna(inplace = True)


v_cfs_ml_counts.head()

# create two new datasets by temperature values
v_cfs_hot = v_cfs_ml_counts[v_cfs_ml_counts["TimeMaxTemp"] > 90.0]
v_cfs_cold = v_cfs_ml_counts[v_cfs_ml_counts["TimeMaxTemp"] <= 90.0]

v_cfs_hot.head()

# split data into attributes and targets
hot_X = pd.get_dummies(v_cfs_hot.drop(["Count"], axis = 1), dtype = 'int')
hot_y = v_cfs_hot["Count"]

# split into seen and unseen data
hot_X_train, hot_X_test, hot_y_train, hot_y_test = train_test_split(hot_X, hot_y, test_size = 0.20, random_state = 0)

# initialize model
hotgam = PoissonGAM()

# hyperparameters to search over
lam_grid = [0, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
splines_grid = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# conduct grid search over hyperparameters
hotgam.gridsearch(hot_X_train.values, hot_y_train.values, n_splines = splines_grid, lam = lam_grid)
print(hotgam.summary())

100% (110 of 110) |######################| Elapsed Time: 0:00:04 Time:  0:00:04

PoissonGAM                                                                                                
=============================================== ==========================================================
Distribution:                       PoissonDist Effective DoF:                                     37.7904
Link Function:                          LogLink Log Likelihood:                                 -4511.3565
Number of Samples:                         2572 AIC:                                             9098.2939
                                                AICc:                                             9099.513
                                                UBRE:                                               2.8734
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.6009
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
s(0)                              [0.001]              9            8.0          0.00e+00     ***         
s(1)                              [0.001]              9            5.9          0.00e+00     ***         
s(2)                              [0.001]              9            8.0          8.21e-01                 
s(3)                              [0.001]              9            3.0          0.00e+00     ***         
s(4)                              [0.001]              9            5.0          9.25e-01                 
s(5)                              [0.001]              9            7.9          1.13e-02     *           
intercept                                              1            0.0          9.18e-01                 
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.
None

# what is the mean squared error of the model on the hot data?
y_pred = hotgam.predict(hot_X_test)

# filter for any broken predictions
inf_indices = np.isinf(y_pred)
y_pred_clean = y_pred[~inf_indices]
hot_y_test_clean = hot_y_test[~inf_indices]

print("Broken Predictions: %d" %(inf_indices.sum()))
mape = mean_absolute_percentage_error(y_pred_clean, hot_y_test_clean)
print(mape)

Broken Predictions: 0
0.4717844727392372

# fit gam to cold data
# can take around 9 mins to fit

# split data into attributes and targets
cold_X = pd.get_dummies(v_cfs_cold.drop(["Count"], axis = 1), dtype = 'int')
cold_y = v_cfs_cold["Count"]

# split into seen and unseen data
cold_X_train, cold_X_test, cold_y_train, cold_y_test = train_test_split(cold_X, cold_y, test_size = 0.20, random_state = 0)

# initialize model
coldgam = PoissonGAM() 

# hyperparameters to search over
lam_grid = [0, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
splines_grid = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# conduct grid search over hyperparameters
coldgam.gridsearch(cold_X_train.values, cold_y_train.values, n_splines = splines_grid, lam = lam_grid)
print(coldgam.summary())

100% (110 of 110) |######################| Elapsed Time: 0:00:07 Time:  0:00:07

PoissonGAM                                                                                                
=============================================== ==========================================================
Distribution:                       PoissonDist Effective DoF:                                     43.9985
Link Function:                          LogLink Log Likelihood:                                -14807.9279
Number of Samples:                         8427 AIC:                                            29703.8528
                                                AICc:                                           29704.3468
                                                UBRE:                                               2.8862
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.5389
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
s(0)                              [0]                  9            9.0          0.00e+00     ***         
s(1)                              [0]                  9            8.0          0.00e+00     ***         
s(2)                              [0]                  9            8.0          2.25e-02     *           
s(3)                              [0]                  9            3.0          0.00e+00     ***         
s(4)                              [0]                  9            8.0          0.00e+00     ***         
s(5)                              [0]                  9            8.0          7.09e-02     .           
intercept                                              1            0.0          7.93e-02     .           
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.
None

# what is the mean squared error of the model on the cold data?
y_pred = coldgam.predict(cold_X_test)

inf_indices = np.isinf(y_pred)
y_pred_clean = y_pred[~inf_indices]
cold_y_test_clean = cold_y_test[~inf_indices]

print("Broken Predictions: %d" %(inf_indices.sum()))
mape = mean_absolute_error(y_pred_clean, cold_y_test_clean)
print(mape)

Broken Predictions: 0
1.2703372027921234

# how does each model perform on the entire dataset?
X = v_cfs_ml_counts.drop("Count", axis = 1)
y = v_cfs_ml_counts["Count"]

cold_y_pred = coldgam.predict(X)
hot_y_pred = hotgam.predict(X)


# large imbalance of 0 - we can have low mean absolute error, but mean absolute percentage error is more useful in 
# examining how much the models differ from each otehr in terms of forecasting

hotmape = mean_absolute_percentage_error(hot_y_pred, y)
hotmae = mean_absolute_error(hot_y_pred, y)
print("'Hot' Mean Absolute Percentage Error: %.3f\n'Hot' Mean Absolute Error: %.3f\n" %(hotmape, hotmae))

coldmape = mean_absolute_percentage_error(cold_y_pred,y)
coldmae = mean_absolute_error(cold_y_pred, y)
print("'Cold' Mean Absolute Percentage Error: %.3f\n'Cold' Mean Absolute Error: %.3f" %(coldmape, coldmae))

'Hot' Mean Absolute Percentage Error: 0.505
'Hot' Mean Absolute Error: 2.518

'Cold' Mean Absolute Percentage Error: 327534518354.673
'Cold' Mean Absolute Error: 1.279

# get predictions for each model
hot_pred = hotgam.predict(X)
cold_pred = coldgam.predict(X)

# create dataframe of unique bins
pred_diffs_df = X[["Long_bin", "Lat_bin"]].drop_duplicates()

# add predictions to dataframe
X["hot_pred_count"] = hot_pred
X["cold_pred_count"] = cold_pred


pred_diffs_df.head()

# given a row of coordinate values, are the predictions between the models significantly different?
def mann_whitney_bin(row):
    # extract coordinate values
    target_long, target_lat = row["Long_bin"], row["Lat_bin"]
    # select data based on coordinates
    target_rows = X[(X["Long_bin"] == target_long) & (X["Lat_bin"] == target_lat)]
    # select predictions from model
    hot_preds = target_rows["hot_pred_count"]
    cold_preds = target_rows["cold_pred_count"]
    # runn mannwhitney U test
    ustat, pval = mannwhitneyu(hot_preds, cold_preds, alternative = 'greater')
    return ustat, pval

# apply to generated list of unique bin coordinates
pred_diffs_df[["UStat", "PVal"]] = pred_diffs_df.progress_apply(mann_whitney_bin, axis = 1, result_type = 'expand')

pred_diffs_df.head()

  0%|          | 0/37 [00:00<?, ?it/s]

# Label the significant rows as True
pred_diffs_df["Significant"] = (pred_diffs_df["PVal"] < 0.05)
pred_diffs_df.head(20)

# Calculate the boundaries of the plot
pred_diffs_df["Long_1"] = long_bins[pred_diffs_df["Long_bin"] - 2]
pred_diffs_df["Long_2"] = long_bins[pred_diffs_df["Long_bin"] - 1]
pred_diffs_df["Lat_1"] = lat_bins[pred_diffs_df["Lat_bin"] - 2]
pred_diffs_df["Lat_2"] = lat_bins[pred_diffs_df["Lat_bin"] - 1]

# Normalize the UStat values for plotting. Not a stastistical measure, just for visualization
pred_diffs_df["hue"] = (pred_diffs_df["UStat"]-pred_diffs_df["UStat"].min() * 1.5) / ((pred_diffs_df["UStat"].max() * 1.5) - pred_diffs_df["UStat"].min() * 1.5)

# initialize folium map
significance_map = folium.Map(location=[30, -90], zoom_start=10.5)

# for row in data where test was significant, render and plot rectangle
for i, row in pred_diffs_df[pred_diffs_df["Significant"] == True].iterrows():

    folium.Rectangle(
        bounds = [[row["Lat_1"], row["Long_1"]],[row["Lat_2"], row["Long_2"]]],
        color = 'red',
        fill_opacity = row['hue'],
        fill = True,
        weight = 0.5,
        popup = f"U: {row['UStat']}, p: {row['PVal']}",
    ).add_to(significance_map)


significance_map

	Unnamed: 0	NOPD_Item	Type	TypeText	Priority	InitialType	InitialTypeText	InitialPriority	MapX	MapY	TimeCreate	TimeDispatch	TimeArrive	TimeClosed	Disposition	DispositionText	SelfInitiated	Beat	BLOCK_ADDRESS	Zip	PoliceDistrict	Location	Type_	TimeArrival
0	0	A3472220	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3688756.0	528696.0	01/28/2020 01:37:20 AM	01/28/2020 01:37:20 AM	01/28/2020 01:37:28 AM	01/28/2020 02:25:50 AM	NAT	Necessary Action Taken	N	4G04	Atlantic Ave & Slidell St	70114.0	4	POINT (-90.04525645 29.94750953)	NaN	NaN
1	1	A0000220	21	COMPLAINT OTHER	1J	21	COMPLAINT OTHER	1J	3668710.0	533007.0	01/01/2020 12:00:42 AM	01/01/2020 12:00:42 AM	01/01/2020 12:00:42 AM	01/01/2020 01:37:16 AM	NAT	Necessary Action Taken	Y	2U04	034XX Broadway St	70125.0	2	POINT (-90.10840522 29.95996774)	NaN	NaN
2	2	A2190820	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3682445.0	530709.0	01/17/2020 09:18:41 PM	01/17/2020 09:18:41 PM	01/17/2020 09:18:47 PM	01/17/2020 09:18:54 PM	NAT	Necessary Action Taken	N	8B02	N Peters St & Bienville St	70130.0	8	POINT (-90.065113 29.95323762)	NaN	NaN
3	3	A2874820	21	COMPLAINT OTHER	2A	21	COMPLAINT OTHER	1J	3737616.0	590067.0	01/23/2020 10:19:48 AM	01/23/2020 10:22:05 AM	01/23/2020 10:31:11 AM	01/23/2020 10:34:35 AM	GOA	GONE ON ARRIVAL	N	7L08	I-10 E	70129.0	7	POINT (-89.88854843 30.11465463)	NaN	NaN
4	4	A2029120	34S	AGGRAVATED BATTERY BY SHOOTING	2C	34S	AGGRAVATED BATTERY BY SHOOTING	2C	3696210.0	551411.0	01/16/2020 05:09:05 PM	01/16/2020 05:09:43 PM	01/16/2020 05:16:07 PM	01/16/2020 10:49:37 PM	RTF	REPORT TO FOLLOW	N	7A01	Chef Menteur Hwy & Downman Rd	70126.0	7	POINT (-90.02090137 30.00973449)	NaN	NaN

	Unnamed: 0	NOPD_Item	Type	TypeText	Priority	InitialType	InitialTypeText	InitialPriority	MapX	MapY	TimeCreate	TimeDispatch	TimeArrive	TimeClosed	Disposition	DispositionText	SelfInitiated	Beat	BLOCK_ADDRESS	Zip	PoliceDistrict	Location	Type_	TimeArrival
0	0	A3472220	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3688756.0	528696.0	01/28/2020 01:37:20 AM	01/28/2020 01:37:20 AM	01/28/2020 01:37:28 AM	01/28/2020 02:25:50 AM	NAT	Necessary Action Taken	N	4G04	Atlantic Ave & Slidell St	70114.0	4	POINT (-90.04525645 29.94750953)	NaN	NaN
1	1	A0000220	21	COMPLAINT OTHER	1J	21	COMPLAINT OTHER	1J	3668710.0	533007.0	01/01/2020 12:00:42 AM	01/01/2020 12:00:42 AM	01/01/2020 12:00:42 AM	01/01/2020 01:37:16 AM	NAT	Necessary Action Taken	Y	2U04	034XX Broadway St	70125.0	2	POINT (-90.10840522 29.95996774)	NaN	NaN
2	2	A2190820	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3682445.0	530709.0	01/17/2020 09:18:41 PM	01/17/2020 09:18:41 PM	01/17/2020 09:18:47 PM	01/17/2020 09:18:54 PM	NAT	Necessary Action Taken	N	8B02	N Peters St & Bienville St	70130.0	8	POINT (-90.065113 29.95323762)	NaN	NaN
3	3	A2874820	21	COMPLAINT OTHER	2A	21	COMPLAINT OTHER	1J	3737616.0	590067.0	01/23/2020 10:19:48 AM	01/23/2020 10:22:05 AM	01/23/2020 10:31:11 AM	01/23/2020 10:34:35 AM	GOA	GONE ON ARRIVAL	N	7L08	I-10 E	70129.0	7	POINT (-89.88854843 30.11465463)	NaN	NaN
4	4	A2029120	34S	AGGRAVATED BATTERY BY SHOOTING	2C	34S	AGGRAVATED BATTERY BY SHOOTING	2C	3696210.0	551411.0	01/16/2020 05:09:05 PM	01/16/2020 05:09:43 PM	01/16/2020 05:16:07 PM	01/16/2020 10:49:37 PM	RTF	REPORT TO FOLLOW	N	7A01	Chef Menteur Hwy & Downman Rd	70126.0	7	POINT (-90.02090137 30.00973449)	NaN	NaN

	NOPD_Item	Type	TypeText	Priority	InitialType	InitialTypeText	InitialPriority	MapX	MapY	TimeCreate	TimeDispatch	TimeArrive	TimeClosed	Disposition	DispositionText	SelfInitiated	Beat	BLOCK_ADDRESS	Zip	PoliceDistrict	Location	Type_	TimeArrival	Longitude	Latitude
0	A3472220	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3688756.0	528696.0	2020-01-28 01:37:20	2020-01-28 01:37:20	2020-01-28 01:37:28	2020-01-28 02:25:50	NAT	Necessary Action Taken	N	4G04	Atlantic Ave & Slidell St	70114.0	4	POINT (-90.04525645 29.94750953)	NaN	NaN	-90.045256	29.947510
1	A0000220	21	COMPLAINT OTHER	1J	21	COMPLAINT OTHER	1J	3668710.0	533007.0	2020-01-01 00:00:42	2020-01-01 00:00:42	2020-01-01 00:00:42	2020-01-01 01:37:16	NAT	Necessary Action Taken	Y	2U04	034XX Broadway St	70125.0	2	POINT (-90.10840522 29.95996774)	NaN	NaN	-90.108405	29.959968
2	A2190820	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3682445.0	530709.0	2020-01-17 21:18:41	2020-01-17 21:18:41	2020-01-17 21:18:47	2020-01-17 21:18:54	NAT	Necessary Action Taken	N	8B02	N Peters St & Bienville St	70130.0	8	POINT (-90.065113 29.95323762)	NaN	NaN	-90.065113	29.953238
3	A2874820	21	COMPLAINT OTHER	2A	21	COMPLAINT OTHER	1J	3737616.0	590067.0	2020-01-23 10:19:48	2020-01-23 10:22:05	2020-01-23 10:31:11	2020-01-23 10:34:35	GOA	GONE ON ARRIVAL	N	7L08	I-10 E	70129.0	7	POINT (-89.88854843 30.11465463)	NaN	NaN	-89.888548	30.114655
4	A2029120	34S	AGGRAVATED BATTERY BY SHOOTING	2C	34S	AGGRAVATED BATTERY BY SHOOTING	2C	3696210.0	551411.0	2020-01-16 17:09:05	2020-01-16 17:09:43	2020-01-16 17:16:07	2020-01-16 22:49:37	RTF	REPORT TO FOLLOW	N	7A01	Chef Menteur Hwy & Downman Rd	70126.0	7	POINT (-90.02090137 30.00973449)	NaN	NaN	-90.020901	30.009734

	NOPD_Item	Type	TypeText	Priority	InitialType	InitialTypeText	InitialPriority	MapX	MapY	TimeCreate	TimeDispatch	TimeArrive	TimeClosed	Disposition	DispositionText	SelfInitiated	Beat	BLOCK_ADDRESS	Zip	PoliceDistrict	Location	Type_	TimeArrival	Longitude	Latitude
0	A3472220	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3688756.0	528696.0	2020-01-28 01:37:20	2020-01-28 01:37:20	2020-01-28 01:37:28	2020-01-28 02:25:50	NAT	Necessary Action Taken	N	4G04	Atlantic Ave & Slidell St	70114.0	4	POINT (-90.04525645 29.94750953)	NaN	NaN	-90.045256	29.947510
1	A0000220	21	COMPLAINT OTHER	1J	21	COMPLAINT OTHER	1J	3668710.0	533007.0	2020-01-01 00:00:42	2020-01-01 00:00:42	2020-01-01 00:00:42	2020-01-01 01:37:16	NAT	Necessary Action Taken	Y	2U04	034XX Broadway St	70125.0	2	POINT (-90.10840522 29.95996774)	NaN	NaN	-90.108405	29.959968
2	A2190820	22A	AREA CHECK	1K	22A	AREA CHECK	1K	3682445.0	530709.0	2020-01-17 21:18:41	2020-01-17 21:18:41	2020-01-17 21:18:47	2020-01-17 21:18:54	NAT	Necessary Action Taken	N	8B02	N Peters St & Bienville St	70130.0	8	POINT (-90.065113 29.95323762)	NaN	NaN	-90.065113	29.953238
4	A2029120	34S	AGGRAVATED BATTERY BY SHOOTING	2C	34S	AGGRAVATED BATTERY BY SHOOTING	2C	3696210.0	551411.0	2020-01-16 17:09:05	2020-01-16 17:09:43	2020-01-16 17:16:07	2020-01-16 22:49:37	RTF	REPORT TO FOLLOW	N	7A01	Chef Menteur Hwy & Downman Rd	70126.0	7	POINT (-90.02090137 30.00973449)	NaN	NaN	-90.020901	30.009734
5	A3444420	20	AUTO ACCIDENT	1E	20	AUTO ACCIDENT	1E	3666298.0	529693.0	2020-01-27 19:59:59	2020-01-27 20:02:05	2020-01-27 20:14:58	2020-01-27 21:19:56	RTF	REPORT TO FOLLOW	N	2L04	Broadway St & S Claiborne Ave	70125.0	2	POINT (-90.11613127 29.95092657)	NaN	NaN	-90.116131	29.950927

	TypeText	TimeCreate	Date
26	FIREWORKS	2020-01-01 00:00:34	01-01
27	FIREWORKS	2020-01-01 00:01:05	01-01
2502	FIREWORKS	2020-01-01 00:03:46	01-01
2503	FIREWORKS	2020-01-01 00:03:52	01-01
4494	FIREWORKS	2020-04-20 20:22:27	04-20

Investigating Relationship Between Weather and Criminal Offense Trends in New Orleans¶

Hayden Outlaw, Joe Wagner | Tulane CMPS 6790 Data Science | Fall 2023¶

https://outlawhayden.github.io/weather-crime ¶

Project Outline¶

Collaboration Plan¶

New Orleans Police Department Calls for Service¶

Cleaning Calls for Service Dataframe¶

Location Extraction¶

Getting Rid of Duplicates and False Calls¶

Categorizing Call Types¶

NOAA Weather Station Data¶

Cleaning Weather Dataframe¶

EDA¶

Data Matching¶

Analysis¶

Effect of Precipitation¶

Effect of Temperature on the Rates of Violent Crime¶

Locational Distribution of Violent Crimes on days with Extreme Maximum or Minimum Daily Temperature¶

Modeling¶

Background¶

Preprocessing for GAM¶

Temperature vs Violence¶

Checking Significance By Grid Location¶

Plotting Bin Count Prediction Difference¶

	STATION	NAME	LATITUDE	LONGITUDE	ELEVATION	DATE	AWND	AWND_ATTRIBUTES	DAPR	DAPR_ATTRIBUTES	FMTM	FMTM_ATTRIBUTES	MDPR	MDPR_ATTRIBUTES	PGTM	PGTM_ATTRIBUTES	PRCP	PRCP_ATTRIBUTES	SNOW	SNOW_ATTRIBUTES	SNWD	SNWD_ATTRIBUTES	TAVG	TAVG_ATTRIBUTES	TMAX	TMAX_ATTRIBUTES	TMIN	TMIN_ATTRIBUTES	TOBS	TOBS_ATTRIBUTES	WDF2	WDF2_ATTRIBUTES	WDF5	WDF5_ATTRIBUTES	WSF2	WSF2_ATTRIBUTES	WSF5	WSF5_ATTRIBUTES	WT01	WT01_ATTRIBUTES	WT02	WT02_ATTRIBUTES	WT03	WT03_ATTRIBUTES	WT04	WT04_ATTRIBUTES	WT05	WT05_ATTRIBUTES	WT06	WT06_ATTRIBUTES	WT08	WT08_ATTRIBUTES	WT10	WT10_ATTRIBUTES	WT11	WT11_ATTRIBUTES	WT13	WT13_ATTRIBUTES	WT14	WT14_ATTRIBUTES	WT16	WT16_ATTRIBUTES	WT18	WT18_ATTRIBUTES	WT21	WT21_ATTRIBUTES
0	US1LAOR0006	NEW ORLEANS 2.1 ENE, LA US	29.961679	-90.038803	2.4	2015-02-01	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.03	,,N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	US1LAOR0006	NEW ORLEANS 2.1 ENE, LA US	29.961679	-90.038803	2.4	2015-02-02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.04	,,N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	US1LAOR0006	NEW ORLEANS 2.1 ENE, LA US	29.961679	-90.038803	2.4	2015-02-03	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.00	T,,N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	US1LAOR0006	NEW ORLEANS 2.1 ENE, LA US	29.961679	-90.038803	2.4	2015-02-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.50	,,N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	US1LAOR0006	NEW ORLEANS 2.1 ENE, LA US	29.961679	-90.038803	2.4	2015-02-05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.59	,,N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

		Name	Latitude	Longitude	Elevation	AverageDailyWind	NumDaysPrecipAvg	FastestWindTime	MultidayPrecipTotal	PeakGustTime	Precipitation	Snowfall	MinSoilTemp	TimeAvgTemp	TimeMaxTemp	TimeMinTemp	TempAtObs	2MinMaxWindDirection	5MinMaxWindDirection	2MinMaxWindSpeed	5MinMaxWindSpeed	Fog	Heavy Fog	Thunder	Ice Pellets	Hail	Rime	Smoke	Tornado	High Wind	Mist	Drizzle	Rain	Snow	Ground Fog
Date	Station
2023-09-29	USW00012930	NEW ORLEANS AUDUBON, LA US	29.91660	-90.130200	6.1	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	85.0	73.0	73.0	NaN	NaN	NaN	NaN	False	False	False	False	False	False	False	False	False	False	False	False	False	False
	USW00012916	NEW ORLEANS AIRPORT, LA US	29.99755	-90.277720	-1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	78.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	False	False	False	False	False	False	False	False	False	False	False	False	False	False
	US1LAOR0014	NEW ORLEANS 3.8 WSW, LA US	29.93772	-90.131310	2.1	NaN	NaN	NaN	NaN	NaN	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	False	False	False	False	False	False	False	False	False	False	False	False	False	False
	US1LAOR0009	NEW ORLEANS 5.0 N, LA US	30.01515	-90.065586	0.6	NaN	NaN	NaN	NaN	NaN	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	False	False	False	False	False	False	False	False	False	False	False	False	False	False
2023-09-28	USW00053917	NEW ORLEANS LAKEFRONT AIRPORT, LA US	30.04934	-90.028990	0.9	15.43	NaN	NaN	NaN	1815.0	0.0	NaN	NaN	NaN	87.0	77.0	NaN	60.0	90.0	21.9	25.9	False	False	False	False	False	False	False	False	False	False	False	False	False	False

	TypeText	TimeCreate	Date
14553	FLOOD EVENT	2020-05-23 21:26:31	05-23
35718	FLOOD EVENT	2020-05-15 00:45:38	05-15
35725	FLOOD EVENT	2020-05-15 00:49:22	05-15
35744	FLOOD EVENT	2020-05-15 01:08:16	05-15
35747	FLOOD EVENT	2020-05-15 01:12:54	05-15

		Quantity
Date	TypeText
01-04	FLOOD EVENT	1
01-07	FLOOD EVENT	4
01-10	FLOOD EVENT	5
01-12	FLOOD EVENT	1
01-23	FLOODED STREET	1

	PrecipPresent	PrecipNotPresent
SimpleType
Accidents/Traffic Safety	49.928854	65.570949
Alarms	26.324538	34.521739
Alcohol	1.398551	1.350427
Complaints/Environment	119.301837	157.387823
Domestic Violence	17.045213	22.742857

	SimpleType	Precip	AvgCount
0	Accidents/Traffic Safety	PrecipPresent	49.928854
1	Alarms	PrecipPresent	26.324538
2	Alcohol	PrecipPresent	1.398551
3	Complaints/Environment	PrecipPresent	119.301837
4	Domestic Violence	PrecipPresent	17.045213

	count
SimpleType
Property	54
Complaints/Environment	53
Accidents/Traffic Safety	29
Status	25
Fire	22
Alarms	13
Domestic Violence	11
Violent Crime	11
Suspicion	6
Other	2
Drugs	2
Status	265
Complaints/Environment	195
Accidents/Traffic Safety	149
Property	97
Alarms	44
Violent Crime	30
Domestic Violence	25
Suspicion	17
Drugs	4

	TVal	PVal
Property	-13.718642	0.000020
Complaints/Environment	-9.611537	0.000020
Accidents/Traffic Safety	-9.314315	0.000020
Status	-4.030017	0.000020
Fire	0.015970	0.989140
Alarms	-8.525947	0.000020
Domestic Violence	-10.065440	0.000020
Violent Crime	-13.872960	0.000020
Suspicion	-9.708032	0.000020
Other	-7.481751	0.000020
Drugs	-4.156151	0.000120
Alcohol	0.547275	0.598588
Sex Offenses	-2.225532	0.025499
Warrants	-0.125015	0.920742
Missing Persons	-4.199223	0.000060
Not Crime	-0.433415	0.674607
Officer Needs Help	-4.012472	0.000100
Public Assistance	-2.071430	0.040339
Medical Emergencies	-1.813930	0.069979
Mental Health	-6.251554	0.000020

	NOPD_Item	Longitude	Latitude	DateCreate	SimpleType	TimeAvgTemp	TimeMaxTemp	TimeMinTemp	IsViolent	IsHot
24	A3548720	-90.106546	29.969927	2020-01-28	Violent Crime	NaN	56.0	45.0	True	False
50	A3571420	-90.098663	29.973679	2020-01-28	Violent Crime	NaN	56.0	45.0	True	False
118	A3500320	-90.081004	29.938222	2020-01-28	Violent Crime	NaN	56.0	45.0	True	False
120	A3587520	-90.050751	30.001026	2020-01-28	Violent Crime	NaN	56.0	45.0	True	False
188	A3481720	-90.050159	30.002987	2020-01-28	Violent Crime	NaN	56.0	45.0	True	False

	Long_bin	Lat_bin	Count	TimeMaxTemp	YearCreate	MonthCreate	DayCreate
0	8	9	2	61.0	2020	1	1
1	9	8	5	61.0	2020	1	1
2	9	9	10	61.0	2020	1	1
3	9	10	2	61.0	2020	1	1
4	10	8	1	61.0	2020	1	1

	Long_bin	Lat_bin	UStat	PVal
0	8	9	651199.0	3.978217e-45
1	9	8	1227946.0	1.093721e-111
2	9	9	1398408.0	1.560836e-156
3	9	10	460483.0	1.522088e-67
4	10	8	679479.0	2.344271e-83

	Long_bin	Lat_bin	UStat	PVal	Significant
0	8	9	651199.0	3.978217e-45	True
1	9	8	1227946.0	1.093721e-111	True
2	9	9	1398408.0	1.560836e-156	True
3	9	10	460483.0	1.522088e-67	True
4	10	8	679479.0	2.344271e-83	True
5	10	9	1284353.0	5.377456e-150	True
6	10	10	964719.0	9.588653e-89	True
7	11	8	568437.0	1.126920e-62	True
8	11	10	856716.0	3.022422e-67	True
9	11	11	415552.0	2.818562e-52	True
10	12	11	552305.0	2.105193e-65	True
27	13	11	120830.0	2.687330e-17	True
28	8	8	196070.0	2.928040e-18	True
40	11	7	4361.0	1.486019e-04	True
42	11	9	181126.0	2.507103e-47	True
55	12	10	94327.0	2.814925e-27	True
71	10	11	9115.0	3.341455e-10	True
78	8	10	44445.0	8.378286e-10	True
98	13	10	1590.0	1.659153e-05	True
120	12	8	318.0	7.341460e-03	True

	Long_bin	Lat_bin	Count	TimeMaxTemp	YearCreate	MonthCreate	DayCreate
1397	9	8	2	93.0	2020	5	21
1398	9	9	4	93.0	2020	5	21
1399	10	8	1	93.0	2020	5	21
1400	10	9	2	93.0	2020	5	21
1401	10	10	5	93.0	2020	5	21

Investigating Relationship Between Weather and Criminal Offense Trends in New Orleans¶

Hayden Outlaw, Joe Wagner | Tulane CMPS 6790 Data Science | Fall 2023¶

https://outlawhayden.github.io/weather-crime¶

Project Outline¶

Collaboration Plan¶

New Orleans Police Department Calls for Service¶

Cleaning Calls for Service Dataframe¶

Location Extraction¶

Getting Rid of Duplicates and False Calls¶

Categorizing Call Types¶

NOAA Weather Station Data¶

Cleaning Weather Dataframe¶

EDA¶

Data Matching¶

Analysis¶

Effect of Precipitation¶

Effect of Temperature on the Rates of Violent Crime¶

Locational Distribution of Violent Crimes on days with Extreme Maximum or Minimum Daily Temperature¶

Modeling¶

Background¶

Preprocessing for GAM¶

Temperature vs Violence¶

Checking Significance By Grid Location¶

Plotting Bin Count Prediction Difference¶

https://outlawhayden.github.io/weather-crime ¶