Hack for Heat #7: Heat complaints over time
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import psycopg2
from datetime import datetime
from datetime import date
pd.options.display.max_columns = 40
Hack for Heat #7: Heat complaints over time¶
In the last post, I plotted how the number of complaints differd by borough over time. This time around, I'm going to revisit this process, but this time focusing on heating complaints only (this is Heat Seek, after all).
Loading data:¶
connection = psycopg2.connect('dbname = threeoneone user=threeoneoneadmin password=threeoneoneadmin')
cursor = connection.cursor()
cursor.execute('''SELECT DISTINCT complainttype FROM service;''')
complainttypes = cursor.fetchall()
cursor.execute('''SELECT createddate, borough, complainttype FROM service;''')
borodata = cursor.fetchall()
borodata = pd.DataFrame(borodata)
borodata.head()
borodata.columns = ['Date', 'Boro', 'Comptype']
heatdata = borodata.loc[(borodata['Comptype'] == 'HEATING') | (borodata['Comptype'] == 'HEAT/HOT WATER')]
I removed entires earlier than 2011 because there are data quality issues (substantially fewer cases):
heatdata = heatdata.loc[heatdata['Date'] > date(2011,3,1)]
Heat complaints by borough:¶
len(heatdata)
There were about a million heating complaints over the 3+ years we have data for.
heatbydate = heatdata.groupby(by='Boro').count()
heatbydate
Per capita:¶
Again, let's look at how many heat complaints each boro generates per person:
boropop = {
'MANHATTAN': 1636268,
'BRONX': 1438159,
'BROOKLYN': 2621793,
'QUEENS': 2321580,
'STATEN ISLAND': 473279,
}
heatbydate['Pop'] = [boropop.get(x) for x in heatbydate.index]
heatbydate['CompPerCap'] = heatbydate['Date']/heatbydate['Pop']
heatbydate
Complaints by borough over months¶
First, let's recreate the graph from before:
heatdata['Year'] = [x.year for x in heatdata['Date']]
heatdata['Month'] = [x.month for x in heatdata['Date']]
heatdata['Day'] = [x.day for x in heatdata['Date']]
heatdata.head()
We remove data where it was unspecified what the boro was
heatdata = heatdata.loc[heatdata['Boro'] != 'Unspecified']
heatplotdata = heatdata.groupby(by=['Boro', 'Year','Month']).count()
heatplotdata
boros = heatbydate.index
borodict = {x:[] for x in boros}
borodict.pop('Unspecified')
for boro in borodict:
borodict[boro] = list(heatplotdata.xs(boro).Date)
plotdata = np.zeros(len(borodict['BROOKLYN']))
for boro in sorted(borodict.keys()):
plotdata = np.row_stack((plotdata, borodict[boro]))
plotdata = np.delete(plotdata, (0), axis=0)
from matplotlib import patches as mpatches
x = np.arange(len(plotdata[0]))
#crude xlabels
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
years = ['2011', '2012', '2013', '2014', '2015', '2016']
xlabels = []
for year in years:
for month in months:
xlabels.append("{0} {1}".format(month,year))
xlabels = xlabels[2:-7] #start from march 2011, end may2016
A non-normalized plot:¶
The plot below is of raw complaint numbers, and is what we might expect: complaints about eat matter the most during the heating season!
plotcolors = [(1,0,103),(213,255,0),(255,0,86),(158,0,142),(14,76,161),(255,229,2),(0,95,57),\
(0,255,0),(149,0,58),(255,147,126),(164,36,0),(0,21,68),(145,208,203),(98,14,0)]
#rescaling rgb from 0-255 to 0 to 1
plotcolors = [(color[0]/float(255),color[1]/float(255),color[2]/float(255)) for color in plotcolors]
legendcolors = [mpatches.Patch(color = color) for color in plotcolors]
plt.figure(figsize = (15,10));
plt.stackplot(x,plotdata, colors = plotcolors);
plt.xticks(x,xlabels,rotation=90);
plt.xlim(0,len(xlabels))
plt.legend(legendcolors,sorted(borodict.keys()), bbox_to_anchor=(0.2, 1));
plt.title('Heating Complaints by Borough', size = 24)
plt.ylabel('Number of Complaints',size = 14)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().yaxis.set_ticks_position('left')
plt.gca().xaxis.set_ticks_position('bottom')
Normalizing data:¶
Next, we're going to normalize these data. This allows us to better visualize if the proportion of heating complaints changed by borough, over time (e.g., did one borough generate more/less complaints vs. others over time?).
What we want to do is divide each of the 5 rows by the total number of complaints by that column (the month)
totalcounts = heatdata.groupby(by=['Year', 'Month']).count().Date.values
boros = heatbydate.index
normdict = {x:[] for x in boros}
normdict.pop('Unspecified')
for boro in normdict:
for i in range(0,len(plotdata[1])): # for all the values in each row
normp = float(borodict[boro][i])/float(totalcounts[i])
normdict[boro].append(normp*100)
normplotdata = np.zeros(len(borodict['BROOKLYN']))
for boro in sorted(normdict.keys()):
normplotdata = np.row_stack((normplotdata, normdict[boro]))
normplotdata = np.delete(normplotdata,(0),axis = 0)
plotcolors = [(1,0,103),(213,255,0),(255,0,86),(158,0,142),(14,76,161),(255,229,2),(0,95,57),\
(0,255,0),(149,0,58),(255,147,126),(164,36,0),(0,21,68),(145,208,203),(98,14,0)]
#rescaling rgb from 0-255 to 0 to 1
plotcolors = [(color[0]/float(255),color[1]/float(255),color[2]/float(255)) for color in plotcolors]
legendcolors = [mpatches.Patch(color = color) for color in plotcolors]
plt.figure(figsize = (15,10));
plt.stackplot(x,normplotdata, colors = plotcolors);
plt.xticks(x,xlabels,rotation=90);
plt.xlim(0,len(xlabels))
plt.ylim(0,100)
plt.legend(legendcolors,sorted(normdict.keys()), bbox_to_anchor=(0.2, 1));
plt.title('Heating Complaints by Borough (normalized)', size = 24)
plt.ylabel('% of Complaints',size = 14)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().yaxis.set_ticks_position('left')
plt.gca().xaxis.set_ticks_position('bottom')
Overall, it looks like heating complaints were pretty consistent by borough. In the next post, I'm going to take a look at one way of addressing the afore-mentioned censoring problems.