import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from scipy.stats import linregress
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("Demand_All.csv")
df.head()
UTC | Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | DST | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 06:00:00+00:00 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 0 | 97098.0 | 33667.0 | 27333 | 14393.0 | 8060.0 | 180550 | 0 |
1 | 2015-01-01 07:00:00+00:00 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 0 | 96620.0 | 33223.0 | 27048 | 14306.0 | 8307.0 | 179504 | 0 |
2 | 2015-01-01 08:00:00+00:00 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 0 | 96084.0 | 32849.0 | 27008 | 14270.0 | 8174.0 | 178385 | 0 |
3 | 2015-01-01 09:00:00+00:00 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 0 | 95829.0 | 33053.0 | 26821 | 14285.0 | 8333.0 | 178320 | 0 |
4 | 2015-01-01 10:00:00+00:00 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 0 | 98489.0 | 33686.0 | 26924 | 14450.0 | 8396.0 | 181946 | 0 |
data_type = df.dtypes
data_type
UTC               object
Eastern           object
Year               int64
OrdDay             int64
OrdHr              int64
Weekday            int64
Month              int64
Day                int64
HourEnd            int64
DST                int64
Concord          float64
Greenwood        float64
NewRiver           int64
KingsMountain    float64
Winterville      float64
Total              int64
Peak               int64
dtype: object
nan_count = df.isna().sum()
nan_count
UTC                 0
Eastern             0
Year                0
OrdDay              0
OrdHr               0
Weekday             0
Month               0
Day                 0
HourEnd             0
DST                 0
Concord          7102
Greenwood        7102
NewRiver            0
KingsMountain    7102
Winterville      7102
Total               0
Peak                0
dtype: int64
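Four of the five regions are missing the same 7,102 hourly readings. Before imputing, it helps to see what fraction of the series that represents; a quick check (a sketch, not part of the original run):

# Share of rows missing per column, as a percentage; columns with no gaps are filtered out
missing_pct = (df.isna().mean() * 100).round(2)
print(missing_pct[missing_pct > 0])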
# Checking whether each region's values look roughly bell-shaped, to judge whether filling
# missing values with the mean is appropriate
# Creating a new DataFrame with just "Year" and "Concord"
concord_data = df[['Year', 'Concord']]
concord_data.head()
Year | Concord | |
---|---|---|
0 | 2015 | 97098.0 |
1 | 2015 | 96620.0 |
2 | 2015 | 96084.0 |
3 | 2015 | 95829.0 |
4 | 2015 | 98489.0 |
greenwood_data = df[['Year', 'Greenwood']]
greenwood_data.head()
Year | Greenwood | |
---|---|---|
0 | 2015 | 33667.0 |
1 | 2015 | 33223.0 |
2 | 2015 | 32849.0 |
3 | 2015 | 33053.0 |
4 | 2015 | 33686.0 |
KingsMountain_data = df[['Year', 'KingsMountain']]
KingsMountain_data.head()
Year | KingsMountain | |
---|---|---|
0 | 2015 | 14393.0 |
1 | 2015 | 14306.0 |
2 | 2015 | 14270.0 |
3 | 2015 | 14285.0 |
4 | 2015 | 14450.0 |
Winterville_data = df[['Year', 'Winterville']]
Winterville_data.head()
Year | Winterville | |
---|---|---|
0 | 2015 | 8060.0 |
1 | 2015 | 8307.0 |
2 | 2015 | 8174.0 |
3 | 2015 | 8333.0 |
4 | 2015 | 8396.0 |
# Filling in Missing values with the mean
df['Concord'] = df['Concord'].fillna(df['Concord'].mean())
df['Greenwood'] = df['Greenwood'].fillna(df['Greenwood'].mean())
df['KingsMountain'] = df['KingsMountain'].fillna(df['KingsMountain'].mean())
df['Winterville'] = df['Winterville'].fillna(df['Winterville'].mean())
nan_count2 = df.isna().sum()
nan_count2
UTC              0
Eastern          0
Year             0
OrdDay           0
OrdHr            0
Weekday          0
Month            0
Day              0
HourEnd          0
DST              0
Concord          0
Greenwood        0
NewRiver         0
KingsMountain    0
Winterville      0
Total            0
Peak             0
dtype: int64
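The fill above uses a single overall mean per region. Because demand has strong monthly and hourly cycles, a group-wise mean is a reasonable alternative; the sketch below is not what was run for the outputs shown (it would have to replace the fillna calls above), and imputes each gap with the mean for the same Month and HourEnd:

# Alternative imputation: fill each gap with the mean of the same month-of-year and hour-of-day
for col in ['Concord', 'Greenwood', 'KingsMountain', 'Winterville']:
    df[col] = df[col].fillna(df.groupby(['Month', 'HourEnd'])[col].transform('mean'))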
data_type = df.dtypes
data_type
UTC               object
Eastern           object
Year               int64
OrdDay             int64
OrdHr              int64
Weekday            int64
Month              int64
Day                int64
HourEnd            int64
DST                int64
Concord          float64
Greenwood        float64
NewRiver           int64
KingsMountain    float64
Winterville      float64
Total              int64
Peak               int64
dtype: object
df['Concord'] = df['Concord'].astype('int64')
df['Greenwood'] = df['Greenwood'].astype('int64')
df['KingsMountain'] = df['KingsMountain'].astype('int64')
df['Winterville'] = df['Winterville'].astype('int64')
data_type = df.dtypes
data_type
UTC              object
Eastern          object
Year              int64
OrdDay            int64
OrdHr             int64
Weekday           int64
Month             int64
Day               int64
HourEnd           int64
DST               int64
Concord           int64
Greenwood         int64
NewRiver          int64
KingsMountain     int64
Winterville       int64
Total             int64
Peak              int64
dtype: object
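One caveat with the cast above: astype('int64') truncates toward zero, so the imputed mean values lose their fractional part rather than being rounded (the original readings are already whole numbers, so only the filled rows are affected). A variant that rounds first, if that behavior is preferred:

# Round to the nearest whole unit before casting so imputed means are not truncated
for col in ['Concord', 'Greenwood', 'KingsMountain', 'Winterville']:
    df[col] = df[col].round().astype('int64')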
Concord_sum = df['Concord'].sum()
print("Concord total power usage:", Concord_sum)
Greenwood_sum = df['Greenwood'].sum()
print("Greenwood total power usage: ", Greenwood_sum)
NewRiver_sum = df['NewRiver'].sum()
print("NewRiver total power usage: ", NewRiver_sum)
KingsMountain_sum = df['KingsMountain'].sum()
print("KingsMountain total power usage: ", KingsMountain_sum)
Winterville_sum = df['Winterville'].sum()
print("Winterville total power usage: ", Winterville_sum)
Concord total power usage: 5587949911
Greenwood total power usage:  1873460604
NewRiver total power usage:  1266075128
KingsMountain total power usage:  902715964
Winterville total power usage:  324750027
total_power_by_region = df[['Concord', 'Greenwood', 'NewRiver', 'KingsMountain', 'Winterville']].sum()
total_power_df = total_power_by_region.reset_index(name='Total Power')
sns.barplot(x='index', y='Total Power', data=total_power_df)
<AxesSubplot:xlabel='index', ylabel='Total Power'>
# The region totals are already collected in total_power_df above; sort them from largest to smallest
total_power_df_sorted = total_power_df.sort_values(by = 'Total Power', ascending = False)
total_power_df_sorted
index | Total Power | |
---|---|---|
0 | Concord | 5587949911 |
1 | Greenwood | 1873460604 |
2 | NewRiver | 1266075128 |
3 | KingsMountain | 902715964 |
4 | Winterville | 324750027 |
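As a follow-up, the sorted totals can also be expressed as a share of the combined load across the five regions (the 'Share (%)' column name below is just an illustrative choice):

# Each region's percentage of the combined total power usage
total_power_df_sorted['Share (%)'] = (total_power_df_sorted['Total Power']
                                      / total_power_df_sorted['Total Power'].sum() * 100).round(1)
print(total_power_df_sorted)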
# All of January Data
Jan_df = df[(df['Month'] == 1)]
Jan_df
UTC | Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | DST | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 06:00:00+00:00 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 0 | 97098 | 33667 | 27333 | 14393 | 8060 | 180550 | 0 |
1 | 2015-01-01 07:00:00+00:00 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 0 | 96620 | 33223 | 27048 | 14306 | 8307 | 179504 | 0 |
2 | 2015-01-01 08:00:00+00:00 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 0 | 96084 | 32849 | 27008 | 14270 | 8174 | 178385 | 0 |
3 | 2015-01-01 09:00:00+00:00 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 0 | 95829 | 33053 | 26821 | 14285 | 8333 | 178320 | 0 |
4 | 2015-01-01 10:00:00+00:00 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 0 | 98489 | 33686 | 26924 | 14450 | 8396 | 181946 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
44563 | 2020-02-01 01:00:00+00:00 | 2020-01-31 20:00:00-05:00 | 2020 | 31 | 740 | 5 | 1 | 31 | 20 | 0 | 113208 | 42034 | 31668 | 20362 | 7303 | 214574 | 0 |
44564 | 2020-02-01 02:00:00+00:00 | 2020-01-31 21:00:00-05:00 | 2020 | 31 | 741 | 5 | 1 | 31 | 21 | 0 | 109198 | 40802 | 30294 | 19678 | 7341 | 207313 | 0 |
44565 | 2020-02-01 03:00:00+00:00 | 2020-01-31 22:00:00-05:00 | 2020 | 31 | 742 | 5 | 1 | 31 | 22 | 0 | 103821 | 39485 | 28952 | 18814 | 7226 | 198297 | 0 |
44566 | 2020-02-01 04:00:00+00:00 | 2020-01-31 23:00:00-05:00 | 2020 | 31 | 743 | 5 | 1 | 31 | 23 | 0 | 97146 | 37034 | 27495 | 17582 | 6911 | 186169 | 0 |
44567 | 2020-02-01 05:00:00+00:00 | 2020-02-01 00:00:00-05:00 | 2020 | 31 | 744 | 5 | 1 | 31 | 24 | 0 | 91061 | 35146 | 26037 | 15977 | 6339 | 174559 | 0 |
4464 rows × 17 columns
Jan_2015 = df[(df['Month'] == 1) & (df['Year'] == 2015)]
Jan_2015_NewRiver_Total = Jan_2015.groupby('HourEnd')['NewRiver'].sum()
Jan_2015_NewRiver_Total
HourEnd
1      830939
2      806077
3      791975
4      788556
5      800197
6      833628
7      899553
8      968696
9     1005189
10    1019562
11    1020847
12    1005319
13     981670
14     963095
15     947010
16     939390
17     939369
18     974770
19    1012672
20    1007196
21     988856
22     954404
23     911466
24     866841
Name: NewRiver, dtype: int64
df_result_Jan_2015 = Jan_2015_NewRiver_Total.reset_index(name ='TotalPowerUsage')
#df_result_Jan_2015
df_Jan_2015_sorted = df_result_Jan_2015.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Jan_2015_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
10 | 11 | 1020847 |
9 | 10 | 1019562 |
18 | 19 | 1012672 |
19 | 20 | 1007196 |
11 | 12 | 1005319 |
8 | 9 | 1005189 |
20 | 21 | 988856 |
12 | 13 | 981670 |
17 | 18 | 974770 |
7 | 8 | 968696 |
13 | 14 | 963095 |
21 | 22 | 954404 |
14 | 15 | 947010 |
15 | 16 | 939390 |
16 | 17 | 939369 |
22 | 23 | 911466 |
6 | 7 | 899553 |
23 | 24 | 866841 |
5 | 6 | 833628 |
0 | 1 | 830939 |
1 | 2 | 806077 |
4 | 5 | 800197 |
2 | 3 | 791975 |
3 | 4 | 788556 |
Jan_NewRiver = df[(df['Month'] == 1)]
Jan_NewRiver_Total = Jan_NewRiver.groupby('HourEnd')['NewRiver'].sum()
Jan_NewRiver_Total
HourEnd
1     4877637
2     4741598
3     4671303
4     4653754
5     4726696
6     4917476
7     5327374
8     5719311
9     5939928
10    6032503
11    6041263
12    5954354
13    5846188
14    5733195
15    5631853
16    5566556
17    5578212
18    5767909
19    5981854
20    5941348
21    5834807
22    5631257
23    5366839
24    5109415
Name: NewRiver, dtype: int64
## Total electricity usage for each hour in January
df_result_Jan = Jan_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_Jan_sorted = df_result_Jan.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Jan_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
10 | 11 | 6041263 |
9 | 10 | 6032503 |
18 | 19 | 5981854 |
11 | 12 | 5954354 |
19 | 20 | 5941348 |
8 | 9 | 5939928 |
12 | 13 | 5846188 |
20 | 21 | 5834807 |
17 | 18 | 5767909 |
13 | 14 | 5733195 |
7 | 8 | 5719311 |
14 | 15 | 5631853 |
21 | 22 | 5631257 |
16 | 17 | 5578212 |
15 | 16 | 5566556 |
22 | 23 | 5366839 |
6 | 7 | 5327374 |
23 | 24 | 5109415 |
5 | 6 | 4917476 |
0 | 1 | 4877637 |
1 | 2 | 4741598 |
4 | 5 | 4726696 |
2 | 3 | 4671303 |
3 | 4 | 4653754 |
Jan_NewRiver_Avg = Jan_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Jan = Jan_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Jan_sorted = df_result_Avg_Jan.sort_values(by ='AvgTotalPowerUsage', ascending = False)
df_Avg_Jan_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
10 | 11 | 32479.91 |
9 | 10 | 32432.81 |
18 | 19 | 32160.51 |
11 | 12 | 32012.66 |
19 | 20 | 31942.73 |
8 | 9 | 31935.10 |
12 | 13 | 31431.12 |
20 | 21 | 31369.93 |
17 | 18 | 31010.26 |
13 | 14 | 30823.63 |
7 | 8 | 30748.98 |
14 | 15 | 30278.78 |
21 | 22 | 30275.58 |
16 | 17 | 29990.39 |
15 | 16 | 29927.72 |
22 | 23 | 28853.97 |
6 | 7 | 28641.80 |
23 | 24 | 27469.97 |
5 | 6 | 26438.04 |
0 | 1 | 26223.85 |
1 | 2 | 25492.46 |
4 | 5 | 25412.34 |
2 | 3 | 25114.53 |
3 | 4 | 25020.18 |
# Added a Time Frame column to make the table more intuitive: each row is the average power
# usage over the hour ending at that time
# e.g. HourEnd 1 = 26223.85 --> the hour from midnight to 1:00 AM
df_Avg_Jan_sorted['Time Frame'] = df_Avg_Jan_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Jan_sorted['Time Frame'] = pd.to_datetime(df_Avg_Jan_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Jan_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 32479.91 | 11:00:00 |
9 | 10 | 32432.81 | 10:00:00 |
18 | 19 | 32160.51 | 19:00:00 |
11 | 12 | 32012.66 | 12:00:00 |
19 | 20 | 31942.73 | 20:00:00 |
8 | 9 | 31935.10 | 09:00:00 |
12 | 13 | 31431.12 | 13:00:00 |
20 | 21 | 31369.93 | 21:00:00 |
17 | 18 | 31010.26 | 18:00:00 |
13 | 14 | 30823.63 | 14:00:00 |
7 | 8 | 30748.98 | 08:00:00 |
14 | 15 | 30278.78 | 15:00:00 |
21 | 22 | 30275.58 | 22:00:00 |
16 | 17 | 29990.39 | 17:00:00 |
15 | 16 | 29927.72 | 16:00:00 |
22 | 23 | 28853.97 | 23:00:00 |
6 | 7 | 28641.80 | 07:00:00 |
23 | 24 | 27469.97 | 00:00:00 |
5 | 6 | 26438.04 | 06:00:00 |
0 | 1 | 26223.85 | 01:00:00 |
1 | 2 | 25492.46 | 02:00:00 |
4 | 5 | 25412.34 | 05:00:00 |
2 | 3 | 25114.53 | 03:00:00 |
3 | 4 | 25020.18 | 04:00:00 |
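The same time-of-day label can be built arithmetically instead of through the string replace; a sketch of an equivalent approach (not the code that produced the table above):

import datetime

# Map HourEnd 1-24 to a time-of-day, with hour 24 wrapping back to midnight
df_Avg_Jan_sorted['Time Frame'] = (df_Avg_Jan_sorted['HourEnd'] % 24).apply(lambda h: datetime.time(int(h)))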
Jan_max = df_Avg_Jan_sorted['AvgTotalPowerUsage'].idxmax()
Jan_max_df = df_Avg_Jan_sorted.loc[[Jan_max]]
Jan_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 32479.91 | 11:00:00 |
line_Jan_sorted = df_result_Avg_Jan.sort_values(by = 'HourEnd', ascending = True)
line_Jan_sorted.set_index('HourEnd', inplace = True)
line_plot_Jan = df_result_Avg_Jan.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("January Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
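The cells below repeat the same recipe for every month: filter by Month, sum and average NewRiver by HourEnd, sort, and plot the average power curve. A small helper could produce the hourly-average table and plot for any month in one call; this is a sketch with illustrative names, not the code that produced the outputs below:

def monthly_hour_profile(data, month_number, month_name):
    """Average NewRiver demand by HourEnd for one month, plotted as a power curve."""
    month_df = data[data['Month'] == month_number]
    avg_by_hour = (month_df.groupby('HourEnd')['NewRiver'].mean().round(2)
                   .reset_index(name = 'AvgTotalPowerUsage'))
    avg_by_hour.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
    plt.title(f"{month_name} Avg. Power Curve")
    plt.xticks(range(0, 25, 4))
    plt.xlabel("End of the hour (Military Time)")
    plt.grid(True)
    plt.show()
    return avg_by_hour

# Example: reproduce the January curve above
# jan_profile = monthly_hour_profile(df, 1, "January")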
Feb_NewRiver = df[(df['Month'] == 2)]
Feb_NewRiver_Total = Feb_NewRiver.groupby('HourEnd')['NewRiver'].sum()
Feb_NewRiver_Total
HourEnd
1     4230578
2     4069790
3     3984407
4     3958556
5     4015610
6     4199592
7     4602717
8     4952400
9     5210838
10    5312829
11    5347523
12    5290742
13    5211663
14    5118994
15    5030838
16    4967112
17    4951879
18    5019501
19    5247533
20    5266013
21    5159570
22    4972692
23    4721096
24    4448769
Name: NewRiver, dtype: int64
df_result_Feb = Feb_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Feb_sorted = df_result_Feb.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Feb_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
10 | 11 | 5347523 |
9 | 10 | 5312829 |
11 | 12 | 5290742 |
19 | 20 | 5266013 |
18 | 19 | 5247533 |
12 | 13 | 5211663 |
8 | 9 | 5210838 |
20 | 21 | 5159570 |
13 | 14 | 5118994 |
14 | 15 | 5030838 |
17 | 18 | 5019501 |
21 | 22 | 4972692 |
15 | 16 | 4967112 |
7 | 8 | 4952400 |
16 | 17 | 4951879 |
22 | 23 | 4721096 |
6 | 7 | 4602717 |
23 | 24 | 4448769 |
0 | 1 | 4230578 |
5 | 6 | 4199592 |
1 | 2 | 4069790 |
4 | 5 | 4015610 |
2 | 3 | 3984407 |
3 | 4 | 3958556 |
Feb_NewRiver_Avg = Feb_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Feb = Feb_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Feb_sorted = df_result_Avg_Feb.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_Feb_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
10 | 11 | 31456.02 |
9 | 10 | 31251.94 |
11 | 12 | 31122.01 |
19 | 20 | 30976.55 |
18 | 19 | 30867.84 |
12 | 13 | 30656.84 |
8 | 9 | 30651.99 |
20 | 21 | 30350.41 |
13 | 14 | 30111.73 |
14 | 15 | 29593.16 |
17 | 18 | 29526.48 |
21 | 22 | 29251.13 |
15 | 16 | 29218.31 |
7 | 8 | 29131.76 |
16 | 17 | 29128.70 |
22 | 23 | 27771.15 |
6 | 7 | 27074.81 |
23 | 24 | 26169.23 |
0 | 1 | 24885.75 |
5 | 6 | 24703.48 |
1 | 2 | 23939.94 |
4 | 5 | 23621.24 |
2 | 3 | 23437.69 |
3 | 4 | 23285.62 |
df_Avg_Feb_sorted['Time Frame'] = df_Avg_Feb_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Feb_sorted['Time Frame'] = pd.to_datetime(df_Avg_Feb_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Feb_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 31456.02 | 11:00:00 |
9 | 10 | 31251.94 | 10:00:00 |
11 | 12 | 31122.01 | 12:00:00 |
19 | 20 | 30976.55 | 20:00:00 |
18 | 19 | 30867.84 | 19:00:00 |
12 | 13 | 30656.84 | 13:00:00 |
8 | 9 | 30651.99 | 09:00:00 |
20 | 21 | 30350.41 | 21:00:00 |
13 | 14 | 30111.73 | 14:00:00 |
14 | 15 | 29593.16 | 15:00:00 |
17 | 18 | 29526.48 | 18:00:00 |
21 | 22 | 29251.13 | 22:00:00 |
15 | 16 | 29218.31 | 16:00:00 |
7 | 8 | 29131.76 | 08:00:00 |
16 | 17 | 29128.70 | 17:00:00 |
22 | 23 | 27771.15 | 23:00:00 |
6 | 7 | 27074.81 | 07:00:00 |
23 | 24 | 26169.23 | 00:00:00 |
0 | 1 | 24885.75 | 01:00:00 |
5 | 6 | 24703.48 | 06:00:00 |
1 | 2 | 23939.94 | 02:00:00 |
4 | 5 | 23621.24 | 05:00:00 |
2 | 3 | 23437.69 | 03:00:00 |
3 | 4 | 23285.62 | 04:00:00 |
Feb_max = df_Avg_Feb_sorted['AvgTotalPowerUsage'].idxmax()
Feb_max_df = df_Avg_Feb_sorted.loc[[Feb_max]]
Feb_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 31456.02 | 11:00:00 |
line_Feb_sorted = df_result_Avg_Feb.sort_values(by = 'HourEnd', ascending = True)
line_Feb_sorted.set_index('HourEnd', inplace = True)
line_plot_Feb = df_result_Avg_Feb.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("February Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
March_NewRiver = df[(df['Month'] == 3)]
March_NewRiver_Total = March_NewRiver.groupby('HourEnd')['NewRiver'].sum()
df_result_March = March_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_March_sorted = df_result_March.sort_values(by ='TotalPowerUsage', ascending = False)
df_March_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
10 | 11 | 5092940 |
11 | 12 | 5059432 |
9 | 10 | 5043124 |
12 | 13 | 4981787 |
8 | 9 | 4925356 |
13 | 14 | 4892627 |
20 | 21 | 4825025 |
14 | 15 | 4799177 |
19 | 20 | 4753232 |
15 | 16 | 4713838 |
7 | 8 | 4688765 |
18 | 19 | 4652419 |
16 | 17 | 4652194 |
21 | 22 | 4639607 |
17 | 18 | 4609341 |
22 | 23 | 4381901 |
6 | 7 | 4334765 |
23 | 24 | 4118565 |
0 | 1 | 3922978 |
5 | 6 | 3918544 |
4 | 5 | 3734432 |
2 | 3 | 3697771 |
3 | 4 | 3676591 |
1 | 2 | 3652509 |
March_NewRiver_Avg = March_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_March = March_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_March_sorted = df_result_Avg_March.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_March_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
10 | 11 | 27381.40 |
11 | 12 | 27201.25 |
9 | 10 | 27113.57 |
12 | 13 | 26783.80 |
8 | 9 | 26480.41 |
13 | 14 | 26304.45 |
20 | 21 | 25940.99 |
14 | 15 | 25802.03 |
19 | 20 | 25555.01 |
15 | 16 | 25343.22 |
7 | 8 | 25208.41 |
18 | 19 | 25013.01 |
16 | 17 | 25011.80 |
21 | 22 | 24944.12 |
17 | 18 | 24781.40 |
22 | 23 | 23558.61 |
6 | 7 | 23305.19 |
23 | 24 | 22142.82 |
0 | 1 | 21091.28 |
5 | 6 | 21067.44 |
1 | 2 | 20291.72 |
4 | 5 | 20077.59 |
2 | 3 | 19880.49 |
3 | 4 | 19766.62 |
df_Avg_March_sorted['Time Frame'] = df_Avg_March_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_March_sorted['Time Frame'] = pd.to_datetime(df_Avg_March_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_March_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 27381.40 | 11:00:00 |
11 | 12 | 27201.25 | 12:00:00 |
9 | 10 | 27113.57 | 10:00:00 |
12 | 13 | 26783.80 | 13:00:00 |
8 | 9 | 26480.41 | 09:00:00 |
13 | 14 | 26304.45 | 14:00:00 |
20 | 21 | 25940.99 | 21:00:00 |
14 | 15 | 25802.03 | 15:00:00 |
19 | 20 | 25555.01 | 20:00:00 |
15 | 16 | 25343.22 | 16:00:00 |
7 | 8 | 25208.41 | 08:00:00 |
18 | 19 | 25013.01 | 19:00:00 |
16 | 17 | 25011.80 | 17:00:00 |
21 | 22 | 24944.12 | 22:00:00 |
17 | 18 | 24781.40 | 18:00:00 |
22 | 23 | 23558.61 | 23:00:00 |
6 | 7 | 23305.19 | 07:00:00 |
23 | 24 | 22142.82 | 00:00:00 |
0 | 1 | 21091.28 | 01:00:00 |
5 | 6 | 21067.44 | 06:00:00 |
1 | 2 | 20291.72 | 02:00:00 |
4 | 5 | 20077.59 | 05:00:00 |
2 | 3 | 19880.49 | 03:00:00 |
3 | 4 | 19766.62 | 04:00:00 |
March_max = df_Avg_March_sorted['AvgTotalPowerUsage'].idxmax()
March_max_df = df_Avg_March_sorted.loc[[March_max]]
March_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 27381.4 | 11:00:00 |
line_March_sorted = df_result_Avg_March.sort_values(by = 'HourEnd', ascending = True)
line_March_sorted.set_index('HourEnd', inplace=True)
line_plot_March = df_result_Avg_March.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("March Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
April_NewRiver = df[(df['Month'] == 4)]
April_NewRiver_Total = April_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# April_NewRiver_Total
df_result_April = April_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_April_sorted = df_result_April.sort_values(by = 'TotalPowerUsage', ascending = False)
df_April_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
11 | 12 | 4435441 |
12 | 13 | 4420712 |
10 | 11 | 4406145 |
13 | 14 | 4386029 |
14 | 15 | 4351748 |
15 | 16 | 4295107 |
9 | 10 | 4279043 |
20 | 21 | 4256999 |
16 | 17 | 4239153 |
17 | 18 | 4168634 |
18 | 19 | 4122328 |
19 | 20 | 4114828 |
21 | 22 | 4102228 |
8 | 9 | 4094587 |
22 | 23 | 3822150 |
7 | 8 | 3793255 |
23 | 24 | 3532353 |
6 | 7 | 3521216 |
0 | 1 | 3294988 |
5 | 6 | 3179018 |
1 | 2 | 3133484 |
2 | 3 | 3030580 |
4 | 5 | 3029341 |
3 | 4 | 2989220 |
April_NewRiver_Avg = April_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_April = April_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_April_sorted = df_result_Avg_April.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_April_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
11 | 12 | 24641.34 |
12 | 13 | 24559.51 |
10 | 11 | 24478.58 |
13 | 14 | 24366.83 |
14 | 15 | 24176.38 |
15 | 16 | 23861.71 |
9 | 10 | 23772.46 |
20 | 21 | 23649.99 |
16 | 17 | 23550.85 |
17 | 18 | 23159.08 |
18 | 19 | 22901.82 |
19 | 20 | 22860.16 |
21 | 22 | 22790.16 |
8 | 9 | 22747.71 |
22 | 23 | 21234.17 |
7 | 8 | 21073.64 |
23 | 24 | 19624.18 |
6 | 7 | 19562.31 |
0 | 1 | 18305.49 |
5 | 6 | 17661.21 |
1 | 2 | 17408.24 |
2 | 3 | 16836.56 |
4 | 5 | 16829.67 |
3 | 4 | 16606.78 |
df_Avg_April_sorted['Time Frame'] = df_Avg_April_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_April_sorted['Time Frame'] = pd.to_datetime(df_Avg_April_sorted['Time Frame'], format ='%H:%M').dt.time
df_Avg_April_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
11 | 12 | 24641.34 | 12:00:00 |
12 | 13 | 24559.51 | 13:00:00 |
10 | 11 | 24478.58 | 11:00:00 |
13 | 14 | 24366.83 | 14:00:00 |
14 | 15 | 24176.38 | 15:00:00 |
15 | 16 | 23861.71 | 16:00:00 |
9 | 10 | 23772.46 | 10:00:00 |
20 | 21 | 23649.99 | 21:00:00 |
16 | 17 | 23550.85 | 17:00:00 |
17 | 18 | 23159.08 | 18:00:00 |
18 | 19 | 22901.82 | 19:00:00 |
19 | 20 | 22860.16 | 20:00:00 |
21 | 22 | 22790.16 | 22:00:00 |
8 | 9 | 22747.71 | 09:00:00 |
22 | 23 | 21234.17 | 23:00:00 |
7 | 8 | 21073.64 | 08:00:00 |
23 | 24 | 19624.18 | 00:00:00 |
6 | 7 | 19562.31 | 07:00:00 |
0 | 1 | 18305.49 | 01:00:00 |
5 | 6 | 17661.21 | 06:00:00 |
1 | 2 | 17408.24 | 02:00:00 |
2 | 3 | 16836.56 | 03:00:00 |
4 | 5 | 16829.67 | 05:00:00 |
3 | 4 | 16606.78 | 04:00:00 |
April_max = df_Avg_April_sorted['AvgTotalPowerUsage'].idxmax()
April_max_df = df_Avg_April_sorted.loc[[April_max]]
April_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
11 | 12 | 24641.34 | 12:00:00 |
line_April_sorted = df_result_Avg_April.sort_values(by = 'HourEnd', ascending = True)
line_April_sorted.set_index('HourEnd', inplace = True)
line_plot_April = df_result_Avg_April.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("April Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
May_NewRiver = df[(df['Month'] == 5)]
May_NewRiver_Total = May_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# May_NewRiver_Total
df_result_May = May_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_May_sorted = df_result_May.sort_values(by = 'TotalPowerUsage', ascending = False)
df_May_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
13 | 14 | 4537118 |
14 | 15 | 4536494 |
12 | 13 | 4524032 |
15 | 16 | 4500636 |
11 | 12 | 4485113 |
16 | 17 | 4450890 |
10 | 11 | 4372849 |
17 | 18 | 4331935 |
18 | 19 | 4222954 |
9 | 10 | 4156285 |
19 | 20 | 4122795 |
20 | 21 | 4121261 |
21 | 22 | 4006964 |
8 | 9 | 3867544 |
22 | 23 | 3710501 |
7 | 8 | 3514059 |
23 | 24 | 3427684 |
6 | 7 | 3203711 |
0 | 1 | 3194444 |
1 | 2 | 3032832 |
5 | 6 | 2989258 |
2 | 3 | 2924384 |
4 | 5 | 2877959 |
3 | 4 | 2867289 |
May_NewRiver_Avg = May_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_May = May_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_May_sorted = df_result_Avg_May.sort_values(by ='AvgTotalPowerUsage', ascending = False)
df_Avg_May_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
13 | 14 | 24393.11 |
14 | 15 | 24389.75 |
12 | 13 | 24322.75 |
15 | 16 | 24196.97 |
11 | 12 | 24113.51 |
16 | 17 | 23929.52 |
10 | 11 | 23509.94 |
17 | 18 | 23289.97 |
18 | 19 | 22704.05 |
9 | 10 | 22345.62 |
19 | 20 | 22165.56 |
20 | 21 | 22157.32 |
21 | 22 | 21542.82 |
8 | 9 | 20793.25 |
22 | 23 | 19948.93 |
7 | 8 | 18892.79 |
23 | 24 | 18428.41 |
6 | 7 | 17224.25 |
0 | 1 | 17174.43 |
1 | 2 | 16305.55 |
5 | 6 | 16071.28 |
2 | 3 | 15722.49 |
4 | 5 | 15472.90 |
3 | 4 | 15415.53 |
df_Avg_May_sorted['Time Frame'] = df_Avg_May_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_May_sorted['Time Frame'] = pd.to_datetime(df_Avg_May_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_May_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
13 | 14 | 24393.11 | 14:00:00 |
14 | 15 | 24389.75 | 15:00:00 |
12 | 13 | 24322.75 | 13:00:00 |
15 | 16 | 24196.97 | 16:00:00 |
11 | 12 | 24113.51 | 12:00:00 |
16 | 17 | 23929.52 | 17:00:00 |
10 | 11 | 23509.94 | 11:00:00 |
17 | 18 | 23289.97 | 18:00:00 |
18 | 19 | 22704.05 | 19:00:00 |
9 | 10 | 22345.62 | 10:00:00 |
19 | 20 | 22165.56 | 20:00:00 |
20 | 21 | 22157.32 | 21:00:00 |
21 | 22 | 21542.82 | 22:00:00 |
8 | 9 | 20793.25 | 09:00:00 |
22 | 23 | 19948.93 | 23:00:00 |
7 | 8 | 18892.79 | 08:00:00 |
23 | 24 | 18428.41 | 00:00:00 |
6 | 7 | 17224.25 | 07:00:00 |
0 | 1 | 17174.43 | 01:00:00 |
1 | 2 | 16305.55 | 02:00:00 |
5 | 6 | 16071.28 | 06:00:00 |
2 | 3 | 15722.49 | 03:00:00 |
4 | 5 | 15472.90 | 05:00:00 |
3 | 4 | 15415.53 | 04:00:00 |
May_max = df_Avg_May_sorted['AvgTotalPowerUsage'].idxmax()
May_max_df = df_Avg_May_sorted.loc[[May_max]]
May_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
13 | 14 | 24393.11 | 14:00:00 |
line_May_sorted = df_result_Avg_May.sort_values(by = 'HourEnd', ascending = True)
line_May_sorted.set_index('HourEnd', inplace=True)
line_plot_May = df_result_Avg_May.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("May Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
June_NewRiver = df[(df['Month'] == 6)]
June_NewRiver_Total = June_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# June_NewRiver_Total
df_result_June = June_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_June_sorted = df_result_June.sort_values(by = 'TotalPowerUsage', ascending = False)
df_June_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
14 | 15 | 4775708 |
15 | 16 | 4751685 |
13 | 14 | 4750091 |
16 | 17 | 4713692 |
12 | 13 | 4710142 |
11 | 12 | 4633166 |
17 | 18 | 4580174 |
10 | 11 | 4475726 |
18 | 19 | 4434029 |
19 | 20 | 4284271 |
9 | 10 | 4225169 |
20 | 21 | 4170072 |
21 | 22 | 4090353 |
8 | 9 | 3897867 |
22 | 23 | 3781411 |
7 | 8 | 3495794 |
23 | 24 | 3475879 |
0 | 1 | 3201895 |
6 | 7 | 3137094 |
1 | 2 | 3030709 |
5 | 6 | 2963153 |
2 | 3 | 2917942 |
4 | 5 | 2866555 |
3 | 4 | 2858986 |
June_NewRiver_Avg =June_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_June = June_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_June_sorted = df_result_Avg_June.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_June_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
14 | 15 | 26531.71 |
15 | 16 | 26398.25 |
13 | 14 | 26389.39 |
16 | 17 | 26187.18 |
12 | 13 | 26167.46 |
11 | 12 | 25739.81 |
17 | 18 | 25445.41 |
10 | 11 | 24865.14 |
18 | 19 | 24633.49 |
19 | 20 | 23801.51 |
9 | 10 | 23473.16 |
20 | 21 | 23167.07 |
21 | 22 | 22724.18 |
8 | 9 | 21654.82 |
22 | 23 | 21007.84 |
7 | 8 | 19421.08 |
23 | 24 | 19310.44 |
0 | 1 | 17788.31 |
6 | 7 | 17428.30 |
1 | 2 | 16837.27 |
5 | 6 | 16461.96 |
2 | 3 | 16210.79 |
4 | 5 | 15925.31 |
3 | 4 | 15883.26 |
df_Avg_June_sorted['Time Frame'] = df_Avg_June_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_June_sorted['Time Frame'] = pd.to_datetime(df_Avg_June_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_June_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 26531.71 | 15:00:00 |
15 | 16 | 26398.25 | 16:00:00 |
13 | 14 | 26389.39 | 14:00:00 |
16 | 17 | 26187.18 | 17:00:00 |
12 | 13 | 26167.46 | 13:00:00 |
11 | 12 | 25739.81 | 12:00:00 |
17 | 18 | 25445.41 | 18:00:00 |
10 | 11 | 24865.14 | 11:00:00 |
18 | 19 | 24633.49 | 19:00:00 |
19 | 20 | 23801.51 | 20:00:00 |
9 | 10 | 23473.16 | 10:00:00 |
20 | 21 | 23167.07 | 21:00:00 |
21 | 22 | 22724.18 | 22:00:00 |
8 | 9 | 21654.82 | 09:00:00 |
22 | 23 | 21007.84 | 23:00:00 |
7 | 8 | 19421.08 | 08:00:00 |
23 | 24 | 19310.44 | 00:00:00 |
0 | 1 | 17788.31 | 01:00:00 |
6 | 7 | 17428.30 | 07:00:00 |
1 | 2 | 16837.27 | 02:00:00 |
5 | 6 | 16461.96 | 06:00:00 |
2 | 3 | 16210.79 | 03:00:00 |
4 | 5 | 15925.31 | 05:00:00 |
3 | 4 | 15883.26 | 04:00:00 |
June_max = df_Avg_June_sorted['AvgTotalPowerUsage'].idxmax()
June_max_df = df_Avg_June_sorted.loc[[June_max]]
June_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 26531.71 | 15:00:00 |
line_June_sorted = df_result_Avg_June.sort_values(by = 'HourEnd', ascending = True)
line_June_sorted.set_index('HourEnd', inplace = True)
line_plot_June = df_result_Avg_June.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("June Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
July_NewRiver = df[(df['Month'] == 7)]
July_NewRiver_Total = July_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# July_NewRiver_Total
df_result_July = July_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_July_sorted = df_result_July.sort_values(by ='TotalPowerUsage', ascending = False)
df_July_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
14 | 15 | 5279510 |
15 | 16 | 5255347 |
13 | 14 | 5242181 |
16 | 17 | 5209818 |
12 | 13 | 5186374 |
11 | 12 | 5085027 |
17 | 18 | 5072601 |
18 | 19 | 4915637 |
10 | 11 | 4898724 |
19 | 20 | 4725883 |
9 | 10 | 4598746 |
20 | 21 | 4584799 |
21 | 22 | 4478258 |
8 | 9 | 4219758 |
22 | 23 | 4141986 |
23 | 24 | 3812131 |
7 | 8 | 3789518 |
0 | 1 | 3522968 |
6 | 7 | 3464410 |
1 | 2 | 3343746 |
5 | 6 | 3248298 |
2 | 3 | 3220884 |
3 | 4 | 3148959 |
4 | 5 | 3146936 |
July_NewRiver_Avg =July_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_July = July_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_July_sorted = df_result_Avg_July.sort_values(by ='AvgTotalPowerUsage', ascending = False)
df_Avg_July_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
14 | 15 | 28384.46 |
15 | 16 | 28254.55 |
13 | 14 | 28183.77 |
16 | 17 | 28009.77 |
12 | 13 | 27883.73 |
11 | 12 | 27338.85 |
17 | 18 | 27272.05 |
18 | 19 | 26428.16 |
10 | 11 | 26337.23 |
19 | 20 | 25407.97 |
9 | 10 | 24724.44 |
20 | 21 | 24649.46 |
21 | 22 | 24076.66 |
8 | 9 | 22686.87 |
22 | 23 | 22268.74 |
23 | 24 | 20495.33 |
7 | 8 | 20373.75 |
0 | 1 | 18940.69 |
6 | 7 | 18625.86 |
1 | 2 | 17977.13 |
5 | 6 | 17463.97 |
2 | 3 | 17316.58 |
3 | 4 | 16929.89 |
4 | 5 | 16919.01 |
df_Avg_July_sorted['Time Frame'] = df_Avg_July_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_July_sorted['Time Frame'] = pd.to_datetime(df_Avg_July_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_July_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 28384.46 | 15:00:00 |
15 | 16 | 28254.55 | 16:00:00 |
13 | 14 | 28183.77 | 14:00:00 |
16 | 17 | 28009.77 | 17:00:00 |
12 | 13 | 27883.73 | 13:00:00 |
11 | 12 | 27338.85 | 12:00:00 |
17 | 18 | 27272.05 | 18:00:00 |
18 | 19 | 26428.16 | 19:00:00 |
10 | 11 | 26337.23 | 11:00:00 |
19 | 20 | 25407.97 | 20:00:00 |
9 | 10 | 24724.44 | 10:00:00 |
20 | 21 | 24649.46 | 21:00:00 |
21 | 22 | 24076.66 | 22:00:00 |
8 | 9 | 22686.87 | 09:00:00 |
22 | 23 | 22268.74 | 23:00:00 |
23 | 24 | 20495.33 | 00:00:00 |
7 | 8 | 20373.75 | 08:00:00 |
0 | 1 | 18940.69 | 01:00:00 |
6 | 7 | 18625.86 | 07:00:00 |
1 | 2 | 17977.13 | 02:00:00 |
5 | 6 | 17463.97 | 06:00:00 |
2 | 3 | 17316.58 | 03:00:00 |
3 | 4 | 16929.89 | 04:00:00 |
4 | 5 | 16919.01 | 05:00:00 |
July_max = df_Avg_July_sorted['AvgTotalPowerUsage'].idxmax()
July_max_df = df_Avg_July_sorted.loc[[July_max]]
July_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 28384.46 | 15:00:00 |
line_July_sorted = df_result_Avg_July.sort_values(by = 'HourEnd', ascending = True)
line_July_sorted.set_index('HourEnd', inplace = True)
line_plot_July = df_result_Avg_July.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("July Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
August_NewRiver = df[(df['Month'] == 8)]
August_NewRiver_Total = August_NewRiver.groupby('HourEnd')['NewRiver'].sum()
df_result_August = August_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_August_sorted = df_result_August.sort_values(by ='TotalPowerUsage', ascending = False)
df_August_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
14 | 15 | 5455978 |
15 | 16 | 5434155 |
13 | 14 | 5413379 |
16 | 17 | 5384494 |
12 | 13 | 5343976 |
17 | 18 | 5245833 |
11 | 12 | 5231457 |
18 | 19 | 5104080 |
10 | 11 | 5033108 |
19 | 20 | 4945743 |
20 | 21 | 4906699 |
9 | 10 | 4718279 |
21 | 22 | 4690815 |
8 | 9 | 4334715 |
22 | 23 | 4318253 |
23 | 24 | 3942678 |
7 | 8 | 3890285 |
0 | 1 | 3610968 |
6 | 7 | 3581665 |
1 | 2 | 3397087 |
5 | 6 | 3280723 |
2 | 3 | 3257721 |
3 | 4 | 3178692 |
4 | 5 | 3176919 |
August_NewRiver_Avg =August_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_August = August_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_August_sorted = df_result_Avg_August.sort_values(by = 'AvgTotalPowerUsage', ascending=False)
df_Avg_August_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
14 | 15 | 29333.22 |
15 | 16 | 29215.89 |
13 | 14 | 29104.19 |
16 | 17 | 28948.89 |
12 | 13 | 28731.05 |
17 | 18 | 28203.40 |
11 | 12 | 28126.11 |
18 | 19 | 27441.29 |
10 | 11 | 27059.72 |
19 | 20 | 26590.02 |
20 | 21 | 26380.10 |
9 | 10 | 25367.09 |
21 | 22 | 25219.44 |
8 | 9 | 23304.92 |
22 | 23 | 23216.41 |
23 | 24 | 21197.19 |
7 | 8 | 20915.51 |
0 | 1 | 19413.81 |
6 | 7 | 19256.26 |
1 | 2 | 18263.91 |
5 | 6 | 17638.30 |
2 | 3 | 17514.63 |
3 | 4 | 17089.74 |
4 | 5 | 17080.21 |
df_Avg_August_sorted['Time Frame'] = df_Avg_August_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_August_sorted['Time Frame'] = pd.to_datetime(df_Avg_August_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_August_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 29333.22 | 15:00:00 |
15 | 16 | 29215.89 | 16:00:00 |
13 | 14 | 29104.19 | 14:00:00 |
16 | 17 | 28948.89 | 17:00:00 |
12 | 13 | 28731.05 | 13:00:00 |
17 | 18 | 28203.40 | 18:00:00 |
11 | 12 | 28126.11 | 12:00:00 |
18 | 19 | 27441.29 | 19:00:00 |
10 | 11 | 27059.72 | 11:00:00 |
19 | 20 | 26590.02 | 20:00:00 |
20 | 21 | 26380.10 | 21:00:00 |
9 | 10 | 25367.09 | 10:00:00 |
21 | 22 | 25219.44 | 22:00:00 |
8 | 9 | 23304.92 | 09:00:00 |
22 | 23 | 23216.41 | 23:00:00 |
23 | 24 | 21197.19 | 00:00:00 |
7 | 8 | 20915.51 | 08:00:00 |
0 | 1 | 19413.81 | 01:00:00 |
6 | 7 | 19256.26 | 07:00:00 |
1 | 2 | 18263.91 | 02:00:00 |
5 | 6 | 17638.30 | 06:00:00 |
2 | 3 | 17514.63 | 03:00:00 |
3 | 4 | 17089.74 | 04:00:00 |
4 | 5 | 17080.21 | 05:00:00 |
August_max = df_Avg_August_sorted['AvgTotalPowerUsage'].idxmax()
August_max_df = df_Avg_August_sorted.loc[[August_max]]
August_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 29333.22 | 15:00:00 |
line_August_sorted = df_result_Avg_August.sort_values(by = 'HourEnd', ascending = True)
line_August_sorted.set_index('HourEnd', inplace = True)
line_plot_August = df_result_Avg_August.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("August Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
Sep_NewRiver = df[(df['Month'] == 9)]
Sep_NewRiver_Total = Sep_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# Sep_NewRiver_Total
df_result_Sep = Sep_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Sep_sorted = df_result_Sep.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Sep_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
14 | 15 | 5154458 |
15 | 16 | 5144799 |
13 | 14 | 5118838 |
16 | 17 | 5104255 |
12 | 13 | 5068615 |
17 | 18 | 4989817 |
11 | 12 | 4969910 |
18 | 19 | 4875350 |
19 | 20 | 4835675 |
20 | 21 | 4807395 |
10 | 11 | 4781701 |
21 | 22 | 4511835 |
9 | 10 | 4481845 |
22 | 23 | 4153896 |
8 | 9 | 4133731 |
23 | 24 | 3791515 |
7 | 8 | 3763669 |
0 | 1 | 3484119 |
6 | 7 | 3458690 |
1 | 2 | 3254355 |
5 | 6 | 3128533 |
2 | 3 | 3109610 |
3 | 4 | 3025872 |
4 | 5 | 3019986 |
Sep_NewRiver_Avg =Sep_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Sep = Sep_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Sep_sorted = df_result_Avg_Sep.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_Sep_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
14 | 15 | 28635.88 |
15 | 16 | 28582.22 |
13 | 14 | 28437.99 |
16 | 17 | 28356.97 |
12 | 13 | 28158.97 |
17 | 18 | 27721.21 |
11 | 12 | 27610.61 |
18 | 19 | 27085.28 |
19 | 20 | 26864.86 |
20 | 21 | 26707.75 |
10 | 11 | 26565.01 |
21 | 22 | 25065.75 |
9 | 10 | 24899.14 |
22 | 23 | 23077.20 |
8 | 9 | 22965.17 |
23 | 24 | 21063.97 |
7 | 8 | 20909.27 |
0 | 1 | 19356.22 |
6 | 7 | 19214.94 |
1 | 2 | 18079.75 |
5 | 6 | 17380.74 |
2 | 3 | 17275.61 |
3 | 4 | 16810.40 |
4 | 5 | 16777.70 |
df_Avg_Sep_sorted['Time Frame'] = df_Avg_Sep_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Sep_sorted['Time Frame'] = pd.to_datetime(df_Avg_Sep_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Sep_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 28635.88 | 15:00:00 |
15 | 16 | 28582.22 | 16:00:00 |
13 | 14 | 28437.99 | 14:00:00 |
16 | 17 | 28356.97 | 17:00:00 |
12 | 13 | 28158.97 | 13:00:00 |
17 | 18 | 27721.21 | 18:00:00 |
11 | 12 | 27610.61 | 12:00:00 |
18 | 19 | 27085.28 | 19:00:00 |
19 | 20 | 26864.86 | 20:00:00 |
20 | 21 | 26707.75 | 21:00:00 |
10 | 11 | 26565.01 | 11:00:00 |
21 | 22 | 25065.75 | 22:00:00 |
9 | 10 | 24899.14 | 10:00:00 |
22 | 23 | 23077.20 | 23:00:00 |
8 | 9 | 22965.17 | 09:00:00 |
23 | 24 | 21063.97 | 00:00:00 |
7 | 8 | 20909.27 | 08:00:00 |
0 | 1 | 19356.22 | 01:00:00 |
6 | 7 | 19214.94 | 07:00:00 |
1 | 2 | 18079.75 | 02:00:00 |
5 | 6 | 17380.74 | 06:00:00 |
2 | 3 | 17275.61 | 03:00:00 |
3 | 4 | 16810.40 | 04:00:00 |
4 | 5 | 16777.70 | 05:00:00 |
Sep_max = df_Avg_Sep_sorted['AvgTotalPowerUsage'].idxmax()
Sep_max_df = df_Avg_Sep_sorted.loc[[Sep_max]]
Sep_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
14 | 15 | 28635.88 | 15:00:00 |
line_Sep_sorted = df_result_Avg_Sep.sort_values(by = 'HourEnd', ascending = True)
line_Sep_sorted.set_index('HourEnd', inplace = True)
line_plot_Sep = df_result_Avg_Sep.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("September Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
Oct_NewRiver = df[(df['Month'] == 10)]
Oct_NewRiver_Total = Oct_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# Oct_NewRiver_Total
df_result_Oct = Oct_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Oct_sorted = df_result_Oct.sort_values(by ='TotalPowerUsage', ascending = False)
df_Oct_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
12 | 13 | 4827029 |
13 | 14 | 4823857 |
14 | 15 | 4810720 |
11 | 12 | 4793565 |
15 | 16 | 4772443 |
19 | 20 | 4766087 |
16 | 17 | 4723308 |
10 | 11 | 4698497 |
17 | 18 | 4646941 |
18 | 19 | 4646055 |
20 | 21 | 4633197 |
9 | 10 | 4499015 |
21 | 22 | 4356992 |
8 | 9 | 4259317 |
22 | 23 | 4029804 |
7 | 8 | 3983510 |
23 | 24 | 3709803 |
6 | 7 | 3590751 |
0 | 1 | 3433807 |
1 | 2 | 3237594 |
5 | 6 | 3214255 |
2 | 3 | 3113159 |
4 | 5 | 3073695 |
3 | 4 | 3051473 |
Oct_NewRiver_Avg = Oct_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Oct = Oct_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Oct_sorted = df_result_Avg_Oct.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_Oct_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
12 | 13 | 25951.77 |
13 | 14 | 25934.72 |
14 | 15 | 25864.09 |
11 | 12 | 25771.85 |
15 | 16 | 25658.30 |
19 | 20 | 25624.12 |
16 | 17 | 25394.13 |
10 | 11 | 25260.74 |
17 | 18 | 24983.55 |
18 | 19 | 24978.79 |
20 | 21 | 24909.66 |
9 | 10 | 24188.25 |
21 | 22 | 23424.69 |
8 | 9 | 22899.55 |
22 | 23 | 21665.61 |
7 | 8 | 21416.72 |
23 | 24 | 19945.18 |
6 | 7 | 19305.11 |
0 | 1 | 18461.33 |
1 | 2 | 17406.42 |
5 | 6 | 17280.94 |
2 | 3 | 16737.41 |
4 | 5 | 16525.24 |
3 | 4 | 16405.77 |
df_Avg_Oct_sorted['Time Frame'] = df_Avg_Oct_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Oct_sorted['Time Frame'] = pd.to_datetime(df_Avg_Oct_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Oct_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
12 | 13 | 25951.77 | 13:00:00 |
13 | 14 | 25934.72 | 14:00:00 |
14 | 15 | 25864.09 | 15:00:00 |
11 | 12 | 25771.85 | 12:00:00 |
15 | 16 | 25658.30 | 16:00:00 |
19 | 20 | 25624.12 | 20:00:00 |
16 | 17 | 25394.13 | 17:00:00 |
10 | 11 | 25260.74 | 11:00:00 |
17 | 18 | 24983.55 | 18:00:00 |
18 | 19 | 24978.79 | 19:00:00 |
20 | 21 | 24909.66 | 21:00:00 |
9 | 10 | 24188.25 | 10:00:00 |
21 | 22 | 23424.69 | 22:00:00 |
8 | 9 | 22899.55 | 09:00:00 |
22 | 23 | 21665.61 | 23:00:00 |
7 | 8 | 21416.72 | 08:00:00 |
23 | 24 | 19945.18 | 00:00:00 |
6 | 7 | 19305.11 | 07:00:00 |
0 | 1 | 18461.33 | 01:00:00 |
1 | 2 | 17406.42 | 02:00:00 |
5 | 6 | 17280.94 | 06:00:00 |
2 | 3 | 16737.41 | 03:00:00 |
4 | 5 | 16525.24 | 05:00:00 |
3 | 4 | 16405.77 | 04:00:00 |
Oct_max = df_Avg_Oct_sorted['AvgTotalPowerUsage'].idxmax()
Oct_max_df = df_Avg_Oct_sorted.loc[[Oct_max]]
Oct_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
12 | 13 | 25951.77 | 13:00:00 |
line_Oct_sorted = df_result_Avg_Oct.sort_values(by = 'HourEnd', ascending = True)
line_Oct_sorted.set_index('HourEnd', inplace = True)
line_plot_Oct = df_result_Avg_Oct.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("October Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
Nov_NewRiver = df[(df['Month'] == 11)]
Nov_NewRiver_Total = Nov_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# Nov_NewRiver_Total
df_result_Nov = Nov_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Nov_sorted = df_result_Nov.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Nov_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
18 | 19 | 4912292 |
10 | 11 | 4872718 |
19 | 20 | 4838577 |
11 | 12 | 4833155 |
9 | 10 | 4818854 |
17 | 18 | 4811326 |
12 | 13 | 4769385 |
20 | 21 | 4712320 |
8 | 9 | 4699467 |
13 | 14 | 4695477 |
14 | 15 | 4631004 |
16 | 17 | 4610766 |
15 | 16 | 4590451 |
21 | 22 | 4520246 |
7 | 8 | 4453251 |
22 | 23 | 4271181 |
6 | 7 | 4148493 |
23 | 24 | 4014141 |
0 | 1 | 3911201 |
5 | 6 | 3776155 |
1 | 2 | 3653773 |
4 | 5 | 3602723 |
2 | 3 | 3571408 |
3 | 4 | 3544014 |
Nov_NewRiver_Avg = Nov_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Nov = Nov_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Nov_sorted = df_result_Avg_Nov.sort_values(by = 'AvgTotalPowerUsage', ascending = False)
df_Avg_Nov_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
18 | 19 | 27290.51 |
10 | 11 | 27070.66 |
19 | 20 | 26880.98 |
11 | 12 | 26850.86 |
9 | 10 | 26771.41 |
17 | 18 | 26729.59 |
12 | 13 | 26496.58 |
20 | 21 | 26179.56 |
8 | 9 | 26108.15 |
13 | 14 | 26085.98 |
14 | 15 | 25727.80 |
16 | 17 | 25615.37 |
15 | 16 | 25502.51 |
21 | 22 | 25112.48 |
7 | 8 | 24740.28 |
22 | 23 | 23728.78 |
6 | 7 | 23047.18 |
23 | 24 | 22300.78 |
0 | 1 | 21027.96 |
5 | 6 | 20978.64 |
1 | 2 | 20298.74 |
4 | 5 | 20015.13 |
2 | 3 | 19841.16 |
3 | 4 | 19688.97 |
df_Avg_Nov_sorted['Time Frame'] = df_Avg_Nov_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Nov_sorted['Time Frame'] = pd.to_datetime(df_Avg_Nov_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Nov_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
18 | 19 | 27290.51 | 19:00:00 |
10 | 11 | 27070.66 | 11:00:00 |
19 | 20 | 26880.98 | 20:00:00 |
11 | 12 | 26850.86 | 12:00:00 |
9 | 10 | 26771.41 | 10:00:00 |
17 | 18 | 26729.59 | 18:00:00 |
12 | 13 | 26496.58 | 13:00:00 |
20 | 21 | 26179.56 | 21:00:00 |
8 | 9 | 26108.15 | 09:00:00 |
13 | 14 | 26085.98 | 14:00:00 |
14 | 15 | 25727.80 | 15:00:00 |
16 | 17 | 25615.37 | 17:00:00 |
15 | 16 | 25502.51 | 16:00:00 |
21 | 22 | 25112.48 | 22:00:00 |
7 | 8 | 24740.28 | 08:00:00 |
22 | 23 | 23728.78 | 23:00:00 |
6 | 7 | 23047.18 | 07:00:00 |
23 | 24 | 22300.78 | 00:00:00 |
0 | 1 | 21027.96 | 01:00:00 |
5 | 6 | 20978.64 | 06:00:00 |
1 | 2 | 20298.74 | 02:00:00 |
4 | 5 | 20015.13 | 05:00:00 |
2 | 3 | 19841.16 | 03:00:00 |
3 | 4 | 19688.97 | 04:00:00 |
Nov_max = df_Avg_Nov_sorted['AvgTotalPowerUsage'].idxmax()
Nov_max_df = df_Avg_Nov_sorted.loc[[Nov_max]]
Nov_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
18 | 19 | 27290.51 | 19:00:00 |
line_Nov_sorted = df_result_Avg_Nov.sort_values(by = 'HourEnd', ascending = True)
line_Nov_sorted.set_index('HourEnd', inplace = True)
line_plot_Nov = df_result_Avg_Nov.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("November Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
Dec_NewRiver = df[(df['Month'] == 12)]
Dec_NewRiver_Total = Dec_NewRiver.groupby('HourEnd')['NewRiver'].sum()
# Dec_NewRiver_Total
df_result_Dec = Dec_NewRiver_Total.reset_index(name='TotalPowerUsage')
df_Dec_sorted = df_result_Dec.sort_values(by='TotalPowerUsage', ascending = False)
df_Dec_sorted
HourEnd | TotalPowerUsage | |
---|---|---|
10 | 11 | 5242411 |
9 | 10 | 5223392 |
18 | 19 | 5190862 |
11 | 12 | 5176594 |
8 | 9 | 5137713 |
19 | 20 | 5131540 |
17 | 18 | 5114881 |
12 | 13 | 5075601 |
20 | 21 | 5036919 |
13 | 14 | 4989454 |
7 | 8 | 4960518 |
14 | 15 | 4909678 |
16 | 17 | 4882546 |
21 | 22 | 4880097 |
15 | 16 | 4853640 |
6 | 7 | 4665050 |
22 | 23 | 4658048 |
23 | 24 | 4442213 |
5 | 6 | 4315304 |
0 | 1 | 4257858 |
1 | 2 | 4151315 |
4 | 5 | 4150074 |
2 | 3 | 4095161 |
3 | 4 | 4088404 |
Dec_NewRiver_Avg = Dec_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)
df_result_Avg_Dec = Dec_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Dec_sorted = df_result_Avg_Dec.sort_values(by ='AvgTotalPowerUsage', ascending = False)
df_Avg_Dec_sorted
HourEnd | AvgTotalPowerUsage | |
---|---|---|
10 | 11 | 28185.01 |
9 | 10 | 28082.75 |
18 | 19 | 27907.86 |
11 | 12 | 27831.15 |
8 | 9 | 27622.11 |
19 | 20 | 27588.92 |
17 | 18 | 27499.36 |
12 | 13 | 27288.18 |
20 | 21 | 27080.21 |
13 | 14 | 26825.02 |
7 | 8 | 26669.45 |
14 | 15 | 26396.12 |
16 | 17 | 26250.25 |
21 | 22 | 26237.08 |
15 | 16 | 26094.84 |
6 | 7 | 25080.91 |
22 | 23 | 25043.27 |
23 | 24 | 23882.87 |
5 | 6 | 23200.56 |
0 | 1 | 22891.71 |
1 | 2 | 22318.90 |
4 | 5 | 22312.23 |
2 | 3 | 22016.99 |
3 | 4 | 21980.67 |
df_Avg_Dec_sorted['Time Frame'] = df_Avg_Dec_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'
# Use the dt.time accessor to drop the default 1900-01-01 date added by to_datetime
df_Avg_Dec_sorted['Time Frame'] = pd.to_datetime(df_Avg_Dec_sorted['Time Frame'], format = '%H:%M').dt.time
df_Avg_Dec_sorted
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 28185.01 | 11:00:00 |
9 | 10 | 28082.75 | 10:00:00 |
18 | 19 | 27907.86 | 19:00:00 |
11 | 12 | 27831.15 | 12:00:00 |
8 | 9 | 27622.11 | 09:00:00 |
19 | 20 | 27588.92 | 20:00:00 |
17 | 18 | 27499.36 | 18:00:00 |
12 | 13 | 27288.18 | 13:00:00 |
20 | 21 | 27080.21 | 21:00:00 |
13 | 14 | 26825.02 | 14:00:00 |
7 | 8 | 26669.45 | 08:00:00 |
14 | 15 | 26396.12 | 15:00:00 |
16 | 17 | 26250.25 | 17:00:00 |
21 | 22 | 26237.08 | 22:00:00 |
15 | 16 | 26094.84 | 16:00:00 |
6 | 7 | 25080.91 | 07:00:00 |
22 | 23 | 25043.27 | 23:00:00 |
23 | 24 | 23882.87 | 00:00:00 |
5 | 6 | 23200.56 | 06:00:00 |
0 | 1 | 22891.71 | 01:00:00 |
1 | 2 | 22318.90 | 02:00:00 |
4 | 5 | 22312.23 | 05:00:00 |
2 | 3 | 22016.99 | 03:00:00 |
3 | 4 | 21980.67 | 04:00:00 |
Dec_max = df_Avg_Dec_sorted['AvgTotalPowerUsage'].idxmax()
Dec_max_df = df_Avg_Dec_sorted.loc[[Dec_max]]
Dec_max_df
HourEnd | AvgTotalPowerUsage | Time Frame | |
---|---|---|---|
10 | 11 | 28185.01 | 11:00:00 |
line_Dec_sorted = df_result_Avg_Dec.sort_values(by = 'HourEnd', ascending = True)
line_Dec_sorted.set_index('HourEnd', inplace = True)
line_plot_Dec = df_result_Avg_Dec.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
plt.title("December Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
# DataFrame to use for the power usage patterns of different months
Months_comparison = pd.DataFrame({
'January': df_Avg_Jan_sorted['AvgTotalPowerUsage'],
'February': df_Avg_Feb_sorted['AvgTotalPowerUsage'],
'March': df_Avg_March_sorted['AvgTotalPowerUsage'],
'April': df_Avg_April_sorted['AvgTotalPowerUsage'],
'May': df_Avg_May_sorted['AvgTotalPowerUsage'],
'June': df_Avg_June_sorted['AvgTotalPowerUsage'],
'July': df_Avg_July_sorted['AvgTotalPowerUsage'],
'August': df_Avg_August_sorted['AvgTotalPowerUsage'],
'September': df_Avg_Sep_sorted['AvgTotalPowerUsage'],
'October': df_Avg_Oct_sorted['AvgTotalPowerUsage'],
'November': df_Avg_Nov_sorted['AvgTotalPowerUsage'],
'December': df_Avg_Dec_sorted['AvgTotalPowerUsage']
})
correlation_matrix = Months_comparison.corr()
correlation_matrix
January | February | March | April | May | June | July | August | September | October | November | December | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
January | 1.000000 | 0.995385 | 0.973206 | 0.934845 | 0.866108 | 0.838219 | 0.825292 | 0.832789 | 0.843728 | 0.913792 | 0.986630 | 0.997897 |
February | 0.995385 | 1.000000 | 0.983460 | 0.960998 | 0.902131 | 0.876297 | 0.864381 | 0.871848 | 0.881830 | 0.942957 | 0.991596 | 0.992169 |
March | 0.973206 | 0.983460 | 1.000000 | 0.968246 | 0.903988 | 0.873539 | 0.859544 | 0.862919 | 0.867933 | 0.930582 | 0.964714 | 0.970005 |
April | 0.934845 | 0.960998 | 0.968246 | 1.000000 | 0.978875 | 0.962321 | 0.954468 | 0.958678 | 0.961230 | 0.986439 | 0.965883 | 0.937263 |
May | 0.866108 | 0.902131 | 0.903988 | 0.978875 | 1.000000 | 0.997358 | 0.994905 | 0.994552 | 0.991866 | 0.988450 | 0.926692 | 0.874771 |
June | 0.838219 | 0.876297 | 0.873539 | 0.962321 | 0.997358 | 1.000000 | 0.999425 | 0.997793 | 0.993667 | 0.981104 | 0.908416 | 0.849230 |
July | 0.825292 | 0.864381 | 0.859544 | 0.954468 | 0.994905 | 0.999425 | 1.000000 | 0.998385 | 0.994079 | 0.977371 | 0.899741 | 0.837295 |
August | 0.832789 | 0.871848 | 0.862919 | 0.958678 | 0.994552 | 0.997793 | 0.998385 | 1.000000 | 0.998469 | 0.982696 | 0.906434 | 0.843269 |
September | 0.843728 | 0.881830 | 0.867933 | 0.961230 | 0.991866 | 0.993667 | 0.994079 | 0.998469 | 1.000000 | 0.987353 | 0.914976 | 0.852549 |
October | 0.913792 | 0.942957 | 0.930582 | 0.986439 | 0.988450 | 0.981104 | 0.977371 | 0.982696 | 0.987353 | 1.000000 | 0.961754 | 0.918340 |
November | 0.986630 | 0.991596 | 0.964714 | 0.965883 | 0.926692 | 0.908416 | 0.899741 | 0.906434 | 0.914976 | 0.961754 | 1.000000 | 0.990057 |
December | 0.997897 | 0.992169 | 0.970005 | 0.937263 | 0.874771 | 0.849230 | 0.837295 | 0.843269 | 0.852549 | 0.918340 | 0.990057 | 1.000000 |
cluster_grid = sns.clustermap(correlation_matrix, method = 'average', cmap = 'coolwarm', figsize = (11, 9))
# plt.title() after clustermap attaches to whichever axes is current, so title the whole figure instead
cluster_grid.fig.suptitle('Hierarchical Clustering of Monthly Correlation')
plt.show()
import numpy as np
# List of month names
months = [
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
corr_threshold = 0.97
mask = np.triu(np.ones_like(correlation_matrix), k = 1)
indices = np.where((correlation_matrix * mask) >= corr_threshold)
groupings = [(months[month1], months[month2]) for month1, month2 in zip(indices[0], indices[1])]
print("Groupings based on correlation above", corr_threshold)
print(groupings)
Groupings based on correlation above 0.97 [('January', 'February'), ('January', 'March'), ('January', 'November'), ('January', 'December'), ('February', 'March'), ('February', 'November'), ('February', 'December'), ('March', 'December'), ('April', 'May'), ('April', 'October'), ('May', 'June'), ('May', 'July'), ('May', 'August'), ('May', 'September'), ('May', 'October'), ('June', 'July'), ('June', 'August'), ('June', 'September'), ('June', 'October'), ('July', 'August'), ('July', 'September'), ('July', 'October'), ('August', 'September'), ('August', 'October'), ('September', 'October'), ('November', 'December')]
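The clustermap above already builds a hierarchy over the months, so the groups can also be read off by cutting a dendrogram instead of hand-picking a correlation threshold. A sketch using average linkage on 1 - correlation distances (note the clustermap's default metric is euclidean, and the cut height of 0.1 is an assumption to tune):

from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Convert correlations to distances, build an average-linkage tree, and cut it into flat groups
distance_matrix = 1 - correlation_matrix
linkage_matrix = linkage(squareform(distance_matrix.values, checks = False), method = 'average')
group_ids = fcluster(linkage_matrix, t = 0.1, criterion = 'distance')
for group in sorted(set(group_ids)):
    members = [month for month, g in zip(correlation_matrix.columns, group_ids) if g == group]
    print(f"Group {group}: {members}")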
winter_months_to_plot = ['January', 'February', 'March', 'April', 'November', 'December']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_April, df_result_Avg_Nov,df_result_Avg_Dec]
plt.figure(figsize = (9, 6))
for month, df_month in zip(winter_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Winter Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
winter_months_to_plot1 = ['January', 'February', 'March', 'November', 'December']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_Nov, df_result_Avg_Dec]
plt.figure(figsize = (9, 6))
for month, df_month in zip(winter_months_to_plot1, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Cold Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
summer_months_to_plot = ['May', 'June', 'July', 'August', 'September']
dataframes = [df_result_Avg_May, df_result_Avg_June, df_result_Avg_July, df_result_Avg_August ,df_result_Avg_Sep]
plt.figure(figsize = (9, 6))
for month, df_month in zip(summer_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Warm Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
# May -> June -> July -> Aug (Increases)
# Aug -> Sep (Starts to decrease)
summer_months_to_plot = ['May', 'June', 'July', 'August', 'September', 'October']
dataframes = [df_result_Avg_May, df_result_Avg_June, df_result_Avg_July, df_result_Avg_August ,df_result_Avg_Sep, df_result_Avg_Oct]
plt.figure(figsize = (9, 6))
for month, df_month in zip(summer_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Summer Months (Including October)')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
# Months with a second spike
spike_months_to_plot = ['January', 'February', 'March', 'April', 'November', 'December', 'October']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_April, df_result_Avg_Nov,df_result_Avg_Dec, df_result_Avg_Oct]
plt.figure(figsize = (9, 6))
for month, df_month in zip(spike_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Second Spiked Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
# Transition months (April and October), each showing a second evening spike
spike_months_to_plot = ['April', 'October']
dataframes = [df_result_Avg_April, df_result_Avg_Oct]
plt.figure(figsize = (9, 6))
for month, df_month in zip(spike_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')
plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Transition Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
# Create a DataFrame for K-means clustering
months_comparison = pd.concat([
df_Avg_Jan_sorted['AvgTotalPowerUsage'],
df_Avg_Feb_sorted['AvgTotalPowerUsage'],
df_Avg_March_sorted['AvgTotalPowerUsage'],
df_Avg_April_sorted['AvgTotalPowerUsage'],
df_Avg_May_sorted['AvgTotalPowerUsage'],
df_Avg_June_sorted['AvgTotalPowerUsage'],
df_Avg_July_sorted['AvgTotalPowerUsage'],
df_Avg_August_sorted['AvgTotalPowerUsage'],
df_Avg_Sep_sorted['AvgTotalPowerUsage'],
df_Avg_Oct_sorted['AvgTotalPowerUsage'],
df_Avg_Nov_sorted['AvgTotalPowerUsage'],
df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)
X = months_comparison.transpose()
num_clusters = 12
kmeans = KMeans(n_clusters = num_clusters, random_state = 42)
clusters = kmeans.fit_predict(X)
clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})
plt.figure(figsize = (9, 6))
for i in range(num_clusters):
cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]
plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label = f'Cluster {i + 1}')
plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()
months_comparison = pd.concat([
df_Avg_Jan_sorted['AvgTotalPowerUsage'],
df_Avg_Feb_sorted['AvgTotalPowerUsage'],
df_Avg_March_sorted['AvgTotalPowerUsage'],
df_Avg_April_sorted['AvgTotalPowerUsage'],
df_Avg_May_sorted['AvgTotalPowerUsage'],
df_Avg_June_sorted['AvgTotalPowerUsage'],
df_Avg_July_sorted['AvgTotalPowerUsage'],
df_Avg_August_sorted['AvgTotalPowerUsage'],
df_Avg_Sep_sorted['AvgTotalPowerUsage'],
df_Avg_Oct_sorted['AvgTotalPowerUsage'],
df_Avg_Nov_sorted['AvgTotalPowerUsage'],
df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)
X = months_comparison.transpose()
num_clusters = 12
kmeans = KMeans(n_clusters = num_clusters, random_state = 42)
clusters = kmeans.fit_predict(X)
clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})
month_labels = {0: 'January', 1: 'February', 2: 'March', 3: 'April', 4: 'May', 5: 'June',
6: 'July', 7: 'August', 8: 'September', 9: 'October', 10: 'November', 11: 'December'}
plt.figure(figsize = (10, 6))
for i in range(num_clusters):
cluster_data = X[clusters == i]
plt.scatter(cluster_data.index, cluster_data.transpose().values[0], label = f'Cluster {i + 1}: {month_labels[i]}')
plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()
# Create a DataFrame for K-means clustering
months_comparison = pd.concat([
df_Avg_Jan_sorted['AvgTotalPowerUsage'],
df_Avg_Feb_sorted['AvgTotalPowerUsage'],
df_Avg_March_sorted['AvgTotalPowerUsage'],
df_Avg_April_sorted['AvgTotalPowerUsage'],
df_Avg_May_sorted['AvgTotalPowerUsage'],
df_Avg_June_sorted['AvgTotalPowerUsage'],
df_Avg_July_sorted['AvgTotalPowerUsage'],
df_Avg_August_sorted['AvgTotalPowerUsage'],
df_Avg_Sep_sorted['AvgTotalPowerUsage'],
df_Avg_Oct_sorted['AvgTotalPowerUsage'],
df_Avg_Nov_sorted['AvgTotalPowerUsage'],
df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)
X = months_comparison.transpose()
num_clusters = 12
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(X)
# create a DataFrame for K-means clustering with the cluster labels
clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})
plt.figure(figsize = (10, 6))
for i in range(num_clusters):
cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]
# Calculating average power usage for the cluster
avg_power = cluster_data['AvgTotalPowerUsage'].mean()
plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label = f'Cluster {i + 1} (Avg Power: {avg_power:.2f})')
plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize=13)
plt.legend()
plt.show()
### create a DataFrame for K-means clustering
months_comparison = pd.concat([
df_Avg_Jan_sorted['AvgTotalPowerUsage'],
df_Avg_Feb_sorted['AvgTotalPowerUsage'],
df_Avg_March_sorted['AvgTotalPowerUsage'],
df_Avg_April_sorted['AvgTotalPowerUsage'],
df_Avg_May_sorted['AvgTotalPowerUsage'],
df_Avg_June_sorted['AvgTotalPowerUsage'],
df_Avg_July_sorted['AvgTotalPowerUsage'],
df_Avg_August_sorted['AvgTotalPowerUsage'],
df_Avg_Sep_sorted['AvgTotalPowerUsage'],
df_Avg_Oct_sorted['AvgTotalPowerUsage'],
df_Avg_Nov_sorted['AvgTotalPowerUsage'],
df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)
# Transpose the DataFrame for K-means clustering
X = months_comparison.transpose()
num_clusters = 12
kmeans = KMeans(n_clusters = num_clusters, random_state=42)
clusters = kmeans.fit_predict(X)
clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})
plt.figure(figsize = (10, 6))
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for i in range(num_clusters):
cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]
plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label=f'Cluster {i + 1} ({month_names[i]})')
plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()
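With num_clusters set to 12 and only 12 monthly curves, every month ends up in its own cluster, so the cluster labels above simply enumerate the months. As a rough check on how many genuinely distinct seasonal profiles there are, the sketch below (reusing months_comparison from the cell above; not part of the original analysis) prints the KMeans inertia for smaller values of k and looks for an elbow.
# Sketch: within-cluster sum of squares (inertia) for k = 1..6 on the 12 monthly
# average power curves; a pronounced bend ("elbow") suggests a more informative
# cluster count than k = 12, which trivially isolates each month
X_months = months_comparison.transpose()
for k in range(1, 7):
    km = KMeans(n_clusters = k, random_state = 42).fit(X_months)
    print(f"k = {k}: inertia = {km.inertia_:.0f}")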
newriver = df[['Year', 'Weekday', 'Month', 'Day', 'HourEnd', 'NewRiver']]
newriver.head()
Year | Weekday | Month | Day | HourEnd | NewRiver | |
---|---|---|---|---|---|---|
0 | 2015 | 4 | 1 | 1 | 1 | 27333 |
1 | 2015 | 4 | 1 | 1 | 2 | 27048 |
2 | 2015 | 4 | 1 | 1 | 3 | 27008 |
3 | 2015 | 4 | 1 | 1 | 4 | 26821 |
4 | 2015 | 4 | 1 | 1 | 5 | 26924 |
monthly_hourly_avg_demand_newriver = newriver.groupby(['Month', 'HourEnd'])['NewRiver'].mean().reset_index()
peak_hours = monthly_hourly_avg_demand_newriver.loc[monthly_hourly_avg_demand_newriver.groupby('Month')['NewRiver'].idxmax()]
peak_hours
Month | HourEnd | NewRiver | |
---|---|---|---|
10 | 1 | 11 | 32479.908602 |
34 | 2 | 11 | 31456.017647 |
58 | 3 | 11 | 27381.397849 |
83 | 4 | 12 | 24641.338889 |
109 | 5 | 14 | 24393.107527 |
134 | 6 | 15 | 26531.711111 |
158 | 7 | 15 | 28384.462366 |
182 | 8 | 15 | 29333.215054 |
206 | 9 | 15 | 28635.877778 |
228 | 10 | 13 | 25951.768817 |
258 | 11 | 19 | 27290.511111 |
274 | 12 | 11 | 28185.005376 |
X = newriver[['Year', 'Weekday', 'Month', 'Day', 'HourEnd']]
y = newriver['NewRiver']
# Splitting data in training and testing sets (80 /20 split)
train_data = newriver.iloc[:42088]
test_data = newriver.iloc[42088:]
X_train, y_train = train_data[['Year', 'Month', 'Day', 'HourEnd', 'Weekday']], train_data['NewRiver']
X_test, y_test = test_data[['Year', 'Month', 'Day', 'HourEnd', 'Weekday']], test_data['NewRiver']
# Verifying the shapes of the training and testing sets
X_train.shape, X_test.shape
((42088, 5), (10520, 5))
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
rf_regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf_regressor.fit(X_train, y_train) # training the model
# Predicting on the test set
y_pred = rf_regressor.predict(X_test)
# Calculating performance metrics here
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae, rmse
(3228.395100285171, 4173.730170727415)
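For reference, the sketch below (not part of the original analysis) prints the relative feature importances of the fitted rf_regressor, using the column order of X_train.
# Sketch: relative importance of each input feature in the fitted random forest
for name, importance in sorted(zip(X_train.columns, rf_regressor.feature_importances_),
                               key = lambda pair: pair[1], reverse = True):
    print(f"{name}: {importance:.3f}")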
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf_model.fit(X_train, y_train)
y_pred_test = rf_model.predict(X_test)
X_test_with_predictions = X_test.copy()
X_test_with_predictions['PredictedDemand'] = y_pred_test
monthly_hourly_demand = X_test_with_predictions.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index()
# this will find the maximum predicted demand for each month to identify the peak hour
peak_hours_per_month = monthly_hourly_demand.loc[monthly_hourly_demand.groupby('Month')['PredictedDemand'].idxmax()]
rf_df = peak_hours_per_month[['Month', 'HourEnd', 'PredictedDemand']]
rf_df
Month | HourEnd | PredictedDemand | |
---|---|---|---|
10 | 1 | 11 | 32993.610000 |
34 | 2 | 11 | 29406.464138 |
58 | 3 | 11 | 29205.801613 |
84 | 4 | 13 | 25170.978667 |
110 | 5 | 15 | 25441.357742 |
134 | 6 | 15 | 25900.754667 |
158 | 7 | 15 | 28210.920000 |
182 | 8 | 15 | 28868.415484 |
207 | 9 | 16 | 30382.872667 |
227 | 10 | 12 | 26412.976429 |
259 | 11 | 20 | 27795.040167 |
274 | 12 | 11 | 29177.830161 |
monthly_peak_dfs = [Jan_max_df, Feb_max_df, March_max_df, April_max_df, May_max_df, June_max_df, July_max_df, August_max_df, Sep_max_df, Oct_max_df, Nov_max_df, Dec_max_df]
monthly_peaks_df = pd.concat(monthly_peak_dfs).reset_index(drop = True)
# Note: merging Month (1-12) against the 0-based index shifts every month by one and drops December;
# the corrected merge on an explicit Month column is done in the next cells
final_df = pd.merge(rf_df, monthly_peaks_df, left_on = 'Month', right_index = True)
final_df = final_df.rename(columns = {
'HourEnd_x' : 'HourEnd(RandomForest)',
'PredictedDemand' : 'PredictedDemand(RandomForest)',
'HourEnd_y' : 'HourEnd(Actual)',
})
final_df = final_df.drop(columns = 'Time Frame')
final_df
Month | HourEnd(RandomForest) | PredictedDemand(RandomForest) | HourEnd(Actual) | AvgTotalPowerUsage | |
---|---|---|---|---|---|
10 | 1 | 11 | 32993.610000 | 11 | 31456.02 |
34 | 2 | 11 | 29406.464138 | 11 | 27381.40 |
58 | 3 | 11 | 29205.801613 | 12 | 24641.34 |
84 | 4 | 13 | 25170.978667 | 14 | 24393.11 |
110 | 5 | 15 | 25441.357742 | 15 | 26531.71 |
134 | 6 | 15 | 25900.754667 | 15 | 28384.46 |
158 | 7 | 15 | 28210.920000 | 15 | 29333.22 |
182 | 8 | 15 | 28868.415484 | 15 | 28635.88 |
207 | 9 | 16 | 30382.872667 | 13 | 25951.77 |
227 | 10 | 12 | 26412.976429 | 19 | 27290.51 |
259 | 11 | 20 | 27795.040167 | 11 | 28185.01 |
Jan_max_df['Month'] = 1
Feb_max_df['Month'] = 2
March_max_df['Month'] = 3
April_max_df['Month'] = 4
May_max_df['Month'] = 5
June_max_df['Month'] = 6
July_max_df['Month'] = 7
August_max_df['Month'] = 8
Sep_max_df['Month'] = 9
Oct_max_df['Month'] = 10
Nov_max_df['Month'] = 11
Dec_max_df['Month'] = 12
monthly_peak_dfs = [Jan_max_df, Feb_max_df, March_max_df, April_max_df, May_max_df, June_max_df, July_max_df, August_max_df, Sep_max_df, Oct_max_df, Nov_max_df, Dec_max_df]
monthly_peaks_df = pd.concat(monthly_peak_dfs).reset_index(drop = True)
final_df = pd.merge(rf_df, monthly_peaks_df, left_on = 'Month', right_on = 'Month')
final_df = final_df.rename(columns = {
'HourEnd_x' : 'HourEnd(RandomForest)',
'PredictedDemand' : 'PredictedDemand(RandomForest)',
'HourEnd_y' : 'HourEnd(Actual)',
})
final_df = final_df.drop(columns = 'Time Frame')
final_df
Month | HourEnd(RandomForest) | PredictedDemand(RandomForest) | HourEnd(Actual) | AvgTotalPowerUsage | |
---|---|---|---|---|---|
0 | 1 | 11 | 32993.610000 | 11 | 32479.91 |
1 | 2 | 11 | 29406.464138 | 11 | 31456.02 |
2 | 3 | 11 | 29205.801613 | 11 | 27381.40 |
3 | 4 | 13 | 25170.978667 | 12 | 24641.34 |
4 | 5 | 15 | 25441.357742 | 14 | 24393.11 |
5 | 6 | 15 | 25900.754667 | 15 | 26531.71 |
6 | 7 | 15 | 28210.920000 | 15 | 28384.46 |
7 | 8 | 15 | 28868.415484 | 15 | 29333.22 |
8 | 9 | 16 | 30382.872667 | 15 | 28635.88 |
9 | 10 | 12 | 26412.976429 | 13 | 25951.77 |
10 | 11 | 20 | 27795.040167 | 19 | 27290.51 |
11 | 12 | 11 | 29177.830161 | 11 | 28185.01 |
pivot_df = final_df.pivot_table(index='Month', values=['PredictedDemand(RandomForest)', 'AvgTotalPowerUsage'])
pivot_df.columns = ['Predicted Demand (Random Forest Regression)', 'Actual Average Power Usage (Power Curves)']
visual = pivot_df.plot(kind = 'bar', figsize = (14, 7))
visual.set_xticklabels(['11 11\nJan', '11 11 \nFeb', '11 11 \nMar', '13 12 \nApr', '15 14 \nMay',
'15 15 \nJun', '15 15 \nJul', '15 15 \nAug', '16 15 \nSep',
'12 13 \nOct', '20 19 \nNov', '11 11 \nDec'], rotation = 360)
plt.xlabel('\nMonth and Hour End')
plt.ylabel('Power Usage (kWh)')
plt.title('Comparison of Predicted Demand vs Actual Average Power Usage')
plt.legend(title = 'Type of Model')
plt.show()
avg_peak_usage = final_df['AvgTotalPowerUsage'].values
predicted_peak_usage = final_df['PredictedDemand(RandomForest)'].values
from sklearn.metrics import r2_score
r_squared = r2_score(avg_peak_usage, predicted_peak_usage)
print("R-squared value:", r_squared)
R-squared value: 0.7822025378274747
plt.scatter(avg_peak_usage, predicted_peak_usage)
plt.title('Actual vs Predicted Peak Hour Demand')
plt.xlabel('Actual Peak Hour Demand')
plt.ylabel('Predicted Peak Hour Demand')
plt.plot([min(avg_peak_usage), max(avg_peak_usage)], [min(avg_peak_usage), max(avg_peak_usage)], color = 'red')
plt.show()
from sklearn.metrics import mean_absolute_error, mean_squared_error
# use avg_peak_usage and predicted_peak_usage from above
mae = mean_absolute_error(avg_peak_usage, predicted_peak_usage)
print("Mean Absolute Error (MAE):", mae)
mse = mean_squared_error(avg_peak_usage, predicted_peak_usage)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
Mean Absolute Error (MAE): 911.6994296859826 Root Mean Squared Error (RMSE): 1092.4570900789852
from scipy import stats
t_stat, p_value = stats.ttest_rel(avg_peak_usage, predicted_peak_usage)
print("t-statistic:", t_stat, "p-value:", p_value)
t-statistic: -1.1523913939997472 p-value: 0.2735894453508184
alpha = 0.05
if p_value < alpha:
print("We reject the null hypothesis (Ho), suggesting a significant difference between the actual and predicted peak demand.")
else:
print("We fail to reject the null hypothesis (Ho), suggesting no significant difference between the actual and predicted peak demand.")
We fail to reject the null hypothesis (Ho), suggesting no significant difference between the actual and predicted peak demand.
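With only 12 paired monthly values, the t-test's normality assumption is hard to verify; as an optional robustness check (a sketch, not part of the original analysis), the non-parametric Wilcoxon signed-rank test can be run on the same pairs.
# Sketch: non-parametric counterpart to the paired t-test above (12 paired values)
from scipy.stats import wilcoxon
w_stat, w_p = wilcoxon(avg_peak_usage, predicted_peak_usage)
print("Wilcoxon statistic:", w_stat, "p-value:", w_p)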
#pip install xgboost
import xgboost as xgb
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_6624\2785909336.py in <module> ----> 1 import xgboost as xgb ModuleNotFoundError: No module named 'xgboost'
xgb_regressor = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 5, random_state = 42)
xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"MAE: {mae_xgb}, RMSE: {rmse_xgb}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_6624\2220971069.py in <module> ----> 1 xgb_regressor = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 5, random_state = 42) 2 xgb_regressor.fit(X_train, y_train) 3 4 y_pred_xgb = xgb_regressor.predict(X_test) 5 NameError: name 'xgb' is not defined
X_test['PredictedDemand'] = y_pred_xgb
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_6624\3308208585.py in <module> ----> 1 X_test['PredictedDemand'] = y_pred_xgb NameError: name 'y_pred_xgb' is not defined
monthly_hourly_demand_xgb = X_test.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index()
# Finding the hour with the highest average predicted demand for each month
peak_hours_per_month_xgb = monthly_hourly_demand_xgb.loc[monthly_hourly_demand_xgb.groupby('Month')['PredictedDemand'].idxmax()]
peak_hours_per_month_xgb[['Month', 'HourEnd', 'PredictedDemand']]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_6624\1349816840.py in <module> ----> 1 monthly_hourly_demand_xgb = X_test.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index() 2 3 # Finding th hour with the highest average predicted demand for each month 4 peak_hours_per_month_xgb = monthly_hourly_demand_xgb.loc[monthly_hourly_demand_xgb.groupby('Month')['PredictedDemand'].idxmax()] 5 ~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in __getitem__(self, key) 1336 stacklevel=find_stack_level(), 1337 ) -> 1338 return super().__getitem__(key) 1339 1340 def _gotitem(self, key, ndim: int, subset=None): ~\anaconda3\lib\site-packages\pandas\core\base.py in __getitem__(self, key) 248 else: 249 if key not in self.obj: --> 250 raise KeyError(f"Column not found: {key}") 251 subset = self.obj[key] 252 ndim = subset.ndim KeyError: 'Column not found: PredictedDemand'
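The three XGBoost cells above fail only because the xgboost package is not installed in this environment. A minimal sketch (assuming pip and network access are available) that installs the package into the active interpreter and retries the import, so those cells could be rerun:
# Sketch: install xgboost into the current Python environment and re-import it
import sys
import subprocess
try:
    import xgboost as xgb
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    import xgboost as xgb
print(xgb.__version__)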
temp = pd.read_csv("AvgBooneTemp.csv")
temp.head()
Date | Month | Day | Year | AverageAirTemp(F) | |
---|---|---|---|---|---|
0 | 1/1/2015 | 1 | 1 | 2015 | 32.0 |
1 | 1/2/2015 | 1 | 2 | 2015 | 40.6 |
2 | 1/3/2015 | 1 | 3 | 2015 | 42.3 |
3 | 1/4/2015 | 1 | 4 | 2015 | 49.2 |
4 | 1/5/2015 | 1 | 5 | 2015 | 28.5 |
temp_dtype = temp.dtypes
temp_dtype
Date object Month int64 Day int64 Year int64 AverageAirTemp(F) float64 dtype: object
new_df = pd.merge(df, temp, on = ['Year', 'Month', 'Day'])
new_df.head()
UTC | Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | DST | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | Date | AverageAirTemp(F) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 06:00:00+00:00 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 0 | 97098 | 33667 | 27333 | 14393 | 8060 | 180550 | 0 | 1/1/2015 | 32.0 |
1 | 2015-01-01 07:00:00+00:00 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 0 | 96620 | 33223 | 27048 | 14306 | 8307 | 179504 | 0 | 1/1/2015 | 32.0 |
2 | 2015-01-01 08:00:00+00:00 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 0 | 96084 | 32849 | 27008 | 14270 | 8174 | 178385 | 0 | 1/1/2015 | 32.0 |
3 | 2015-01-01 09:00:00+00:00 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 0 | 95829 | 33053 | 26821 | 14285 | 8333 | 178320 | 0 | 1/1/2015 | 32.0 |
4 | 2015-01-01 10:00:00+00:00 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 0 | 98489 | 33686 | 26924 | 14450 | 8396 | 181946 | 0 | 1/1/2015 | 32.0 |
#new_df = new_df.drop(columns = ['UTC', 'DST', 'Date_x', 'Date_y'])
new_df = new_df.drop(columns = ['UTC', 'DST'])
new_df.head()
Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | Date | AverageAirTemp(F) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 97098 | 33667 | 27333 | 14393 | 8060 | 180550 | 0 | 1/1/2015 | 32.0 |
1 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 96620 | 33223 | 27048 | 14306 | 8307 | 179504 | 0 | 1/1/2015 | 32.0 |
2 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 96084 | 32849 | 27008 | 14270 | 8174 | 178385 | 0 | 1/1/2015 | 32.0 |
3 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 95829 | 33053 | 26821 | 14285 | 8333 | 178320 | 0 | 1/1/2015 | 32.0 |
4 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 98489 | 33686 | 26924 | 14450 | 8396 | 181946 | 0 | 1/1/2015 | 32.0 |
len(new_df)
52608
January = new_df[(new_df['Month'] == 1)]
February = new_df[(new_df['Month'] == 2)]
March = new_df[(new_df['Month'] == 3)]
April = new_df[(new_df['Month'] == 4)]
May = new_df[(new_df['Month'] == 5)]
June = new_df[(new_df['Month'] == 6)]
July = new_df[(new_df['Month'] == 7)]
August = new_df[(new_df['Month'] == 8)]
September = new_df[(new_df['Month'] == 9)]
October = new_df[(new_df['Month'] == 10)]
November = new_df[(new_df['Month'] == 11)]
December = new_df[(new_df['Month'] == 12)]
# AverageAirTemp(F) is already float64 (see temp.dtypes above), so no per-month type conversion is needed
Jan_Avg = January['AverageAirTemp(F)'].mean()
Feb_Avg = February['AverageAirTemp(F)'].mean()
March_Avg = March['AverageAirTemp(F)'].mean()
April_Avg = April['AverageAirTemp(F)'].mean()
May_Avg = May['AverageAirTemp(F)'].mean()
June_Avg = June['AverageAirTemp(F)'].mean()
July_Avg = July['AverageAirTemp(F)'].mean()
Aug_Avg = August['AverageAirTemp(F)'].mean()
Sep_Avg = September['AverageAirTemp(F)'].mean()
Oct_Avg = October['AverageAirTemp(F)'].mean()
Nov_Avg = November['AverageAirTemp(F)'].mean()
Dec_Avg = December['AverageAirTemp(F)'].mean()
temp_data = {
'Month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
'Average Temperature': [Jan_Avg, Feb_Avg, March_Avg, April_Avg, May_Avg, June_Avg, July_Avg, Aug_Avg, Sep_Avg, Oct_Avg, Nov_Avg, Dec_Avg]
}
avg_monthly_temp = pd.DataFrame(temp_data)
avg_monthly_temp
Month | Average Temperature | |
---|---|---|
0 | January | 33.338172 |
1 | February | 37.728824 |
2 | March | 44.034141 |
3 | April | 52.006667 |
4 | May | 60.470430 |
5 | June | 66.946111 |
6 | July | 70.131720 |
7 | August | 67.834409 |
8 | September | 63.590556 |
9 | October | 54.049462 |
10 | November | 43.353560 |
11 | December | 38.182796 |
# final_df was assembled above; display it again for reference alongside the temperature averages
final_df
Month | HourEnd(RandomForest) | PredictedDemand(RandomForest) | HourEnd(Actual) | AvgTotalPowerUsage | |
---|---|---|---|---|---|
0 | 1 | 11 | 32993.610000 | 11 | 32479.91 |
1 | 2 | 11 | 29406.464138 | 11 | 31456.02 |
2 | 3 | 11 | 29205.801613 | 11 | 27381.40 |
3 | 4 | 13 | 25170.978667 | 12 | 24641.34 |
4 | 5 | 15 | 25441.357742 | 14 | 24393.11 |
5 | 6 | 15 | 25900.754667 | 15 | 26531.71 |
6 | 7 | 15 | 28210.920000 | 15 | 28384.46 |
7 | 8 | 15 | 28868.415484 | 15 | 29333.22 |
8 | 9 | 16 | 30382.872667 | 15 | 28635.88 |
9 | 10 | 12 | 26412.976429 | 13 | 25951.77 |
10 | 11 | 20 | 27795.040167 | 19 | 27290.51 |
11 | 12 | 11 | 29177.830161 | 11 | 28185.01 |
temp_forest_df = pd.concat([final_df, avg_monthly_temp], axis = 1)
temp_forest_df = temp_forest_df[['Month', 'HourEnd(RandomForest)', 'PredictedDemand(RandomForest)', 'HourEnd(Actual)', 'AvgTotalPowerUsage', 'Average Temperature']]
temp_forest_df = temp_forest_df.iloc[:, [0, 2, 3, 4, 5, 6]]
temp_forest_df
Month | HourEnd(RandomForest) | PredictedDemand(RandomForest) | HourEnd(Actual) | AvgTotalPowerUsage | Average Temperature | |
---|---|---|---|---|---|---|
0 | 1 | 11 | 32993.610000 | 11 | 32479.91 | 33.338172 |
1 | 2 | 11 | 29406.464138 | 11 | 31456.02 | 37.728824 |
2 | 3 | 11 | 29205.801613 | 11 | 27381.40 | 44.034141 |
3 | 4 | 13 | 25170.978667 | 12 | 24641.34 | 52.006667 |
4 | 5 | 15 | 25441.357742 | 14 | 24393.11 | 60.470430 |
5 | 6 | 15 | 25900.754667 | 15 | 26531.71 | 66.946111 |
6 | 7 | 15 | 28210.920000 | 15 | 28384.46 | 70.131720 |
7 | 8 | 15 | 28868.415484 | 15 | 29333.22 | 67.834409 |
8 | 9 | 16 | 30382.872667 | 15 | 28635.88 | 63.590556 |
9 | 10 | 12 | 26412.976429 | 13 | 25951.77 | 54.049462 |
10 | 11 | 20 | 27795.040167 | 19 | 27290.51 | 43.353560 |
11 | 12 | 11 | 29177.830161 | 11 | 28185.01 | 38.182796 |
# The monthly average temperatures above range from about 33°F (January) to 70°F (July)
temp_bins = [min(temp_forest_df['Average Temperature']), 40, 45, 50, 55, 60, 65, max(temp_forest_df['Average Temperature'])]
temp_bin_labels = ['<40°F', '40-45°F', '45-50°F', '50-55°F', '55-60°F', '60-65°F', '>65°F']
temp_forest_df['Temperature Interval'] = pd.cut(temp_forest_df['Average Temperature'], bins = temp_bins, labels = temp_bin_labels, include_lowest = True)
peak_usage_by_temp_interval = temp_forest_df.groupby('Temperature Interval')[['AvgTotalPowerUsage', 'PredictedDemand(RandomForest)']].mean().reset_index()
peak_usage_by_temp_interval
Temperature Interval | AvgTotalPowerUsage | PredictedDemand(RandomForest) | |
---|---|---|---|
0 | <40°F | 30706.980 | 30525.968100 |
1 | 40-45°F | 27335.955 | 28500.420890 |
2 | 45-50°F | NaN | NaN |
3 | 50-55°F | 25296.555 | 25791.977548 |
4 | 55-60°F | NaN | NaN |
5 | 60-65°F | 26514.495 | 27912.115204 |
6 | >65°F | 28083.130 | 27660.030050 |
temp_bins2 = [min(new_df['AverageAirTemp(F)']), 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, max(new_df['AverageAirTemp(F)'])]
temp_bin_labels2 = ['<5°F', '5-10°F', '10-15°F', '15-20°F', '20-25°F', '25-30°F', '30-35°F', '35-40°F', '40-45°F', '45-50°F', '50-55°F', '55-60°F', '60-65°F', '>65°F']
new_df['Temperature Interval'] = pd.cut(new_df['AverageAirTemp(F)'], bins = temp_bins2, labels = temp_bin_labels2, include_lowest = True)
peak_usage_by_temp_interval2 = new_df.groupby('Temperature Interval')['NewRiver'].mean().reset_index()
peak_usage_by_temp_interval2
Temperature Interval | NewRiver | |
---|---|---|
0 | <5°F | 48143.458333 |
1 | 5-10°F | 40638.104167 |
2 | 10-15°F | 38916.022727 |
3 | 15-20°F | 37433.886667 |
4 | 20-25°F | 33989.489583 |
5 | 25-30°F | 31665.983043 |
6 | 30-35°F | 28597.559711 |
7 | 35-40°F | 25964.040308 |
8 | 40-45°F | 23664.123180 |
9 | 45-50°F | 22257.352887 |
10 | 50-55°F | 21275.019806 |
11 | 55-60°F | 21143.176656 |
12 | 60-65°F | 21664.816699 |
13 | >65°F | 23531.379942 |
plt.figure(figsize = (13, 8))
sns.barplot(x = 'Temperature Interval', y = 'NewRiver', data = peak_usage_by_temp_interval2)
plt.title('Average Power Usage by Temperature Category')
plt.xlabel('Temperature Category')
plt.ylabel('Average Power Usage')
plt.grid(True)
plt.show()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
peak_usage_by_temp_interval2['NormalizedPowerUsage'] = scaler.fit_transform(peak_usage_by_temp_interval2[['NewRiver']])
kmeans = KMeans(n_clusters = 5, random_state = 0)
peak_usage_by_temp_interval2['Cluster'] = kmeans.fit_predict(peak_usage_by_temp_interval2[['NormalizedPowerUsage']])
sns.scatterplot(data = peak_usage_by_temp_interval2, x = 'Temperature Interval', y = 'NormalizedPowerUsage', hue = 'Cluster', palette = 'viridis')
plt.xticks(rotation = 45) # rotate the x-axis labels for readability
plt.title('Average Power Usage by Temperature Interval (Clustered)')
plt.xlabel('Temperature Interval')
plt.ylabel('Normalized Average Power Usage')
plt.show()
# Uses Random Forest Peak Demand Predictions
sns.lineplot(data=temp_forest_df, x = 'Average Temperature', y = 'PredictedDemand(RandomForest)', color = 'purple', marker = 'o')
plt.title("Average Total Power Usage (Random Forest Predictions) vs. Average Temperature\n\n(Each Data Point Corresponds to Each Month's Predictions)\n")
plt.xlabel('Average Temperature (°F)')
plt.ylabel('Average Total Power Usage')
plt.legend(labels = ['Predicted Demand (Random Forest)'])
plt.grid(True)
plt.show()
print("----------------------------------------------------------------------------------------")
# Using Power Curve Peaks
sns.lineplot(data=temp_forest_df, x = 'Average Temperature', y = 'AvgTotalPowerUsage', color = 'green', marker = 'o')
plt.title("Average Total Power Usage (Power Curves) vs. Average Temperature\n\n(Each Data Point Corresponds to Each Month's Predictions)\n")
plt.xlabel('Average Temperature (°F)')
plt.ylabel('Average Total Power Usage')
plt.legend(labels = ['Actual Average Usage (Power Curves)'])
plt.grid(True)
plt.show()
----------------------------------------------------------------------------------------
correlation_forest = temp_forest_df['Average Temperature'].corr(temp_forest_df['PredictedDemand(RandomForest)'])
correlation_curves = temp_forest_df['Average Temperature'].corr(temp_forest_df['AvgTotalPowerUsage'])
print(f'Correlation coefficient (Avg. Temp vs. Random Forest Regression Predictions): {correlation_forest}')
print("")
print(f'Correlation coefficient (Avg. Temp vs. Power Curve Peaks): {correlation_curves}')
Correlation coefficient (Avg. Temp vs. Random Forest Regression Predictions): -0.4582138897627224 Correlation coefficient (Avg. Temp vs. Power Curve Peaks): -0.41090289030943566
Higher temperatures do not necessarily lead to higher energy consumption.
The negative correlation instead suggests that warmer months tend to have somewhat lower peak electricity usage than the coldest months.
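With only 12 monthly points, these correlations are fairly noisy. As a quick significance check (a sketch, not part of the original analysis), scipy's pearsonr returns both the Pearson coefficient and a two-sided p-value:
# Sketch: Pearson correlation with p-values for the two monthly series above
from scipy.stats import pearsonr
r_rf, p_rf = pearsonr(temp_forest_df['Average Temperature'], temp_forest_df['PredictedDemand(RandomForest)'])
r_pc, p_pc = pearsonr(temp_forest_df['Average Temperature'], temp_forest_df['AvgTotalPowerUsage'])
print(f"Random forest predictions: r = {r_rf:.3f}, p = {p_rf:.3f}")
print(f"Power curve peaks:         r = {r_pc:.3f}, p = {p_pc:.3f}")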
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
daily_power_usage = df.groupby('Date')['NewRiver'].mean().reset_index()
daily_power_usage.columns = ['Date', 'AverageDailyPowerUsage']
daily_power_usage['Date'] = daily_power_usage['Date'].dt.strftime('%m/%d/%y')
daily_power_usage
Date | AverageDailyPowerUsage | |
---|---|---|
0 | 01/01/15 | 26971.583333 |
1 | 01/02/15 | 24511.125000 |
2 | 01/03/15 | 21908.208333 |
3 | 01/04/15 | 20010.958333 |
4 | 01/05/15 | 28633.000000 |
... | ... | ... |
2187 | 12/27/20 | 27481.583333 |
2188 | 12/28/20 | 26964.791667 |
2189 | 12/29/20 | 25295.166667 |
2190 | 12/30/20 | 26725.916667 |
2191 | 12/31/20 | 21952.875000 |
2192 rows × 2 columns
daily_avg_power_df = pd.concat([daily_power_usage, temp], axis = 1)
daily_avg_power_df = daily_avg_power_df[['Date', 'AverageDailyPowerUsage', 'AverageAirTemp(F)']]
daily_avg_power_df.head()
daily_avg_power_df = daily_avg_power_df.iloc[:, [0, 2, 3]]
daily_avg_power_df
Date | AverageDailyPowerUsage | AverageAirTemp(F) | |
---|---|---|---|
0 | 01/01/15 | 26971.583333 | 32.0 |
1 | 01/02/15 | 24511.125000 | 40.6 |
2 | 01/03/15 | 21908.208333 | 42.3 |
3 | 01/04/15 | 20010.958333 | 49.2 |
4 | 01/05/15 | 28633.000000 | 28.5 |
... | ... | ... | ... |
2187 | 12/27/20 | 27481.583333 | 31.0 |
2188 | 12/28/20 | 26964.791667 | 36.2 |
2189 | 12/29/20 | 25295.166667 | 37.7 |
2190 | 12/30/20 | 26725.916667 | 36.1 |
2191 | 12/31/20 | 21952.875000 | 49.3 |
2192 rows × 3 columns
len(daily_avg_power_df)
2192
bins = [2.6, 32, 50, 65, 77]
labels = ['Very Cold\n (< 32°F)', 'Cold\n (32°F - 50°F)', 'Moderately Warm\n (50°F - 65°F)', 'Warm\n (> 65°F)']
daily_avg_power_df['TempCategory'] = pd.cut(daily_avg_power_df['AverageAirTemp(F)'], bins = bins, labels = labels, include_lowest = True)
plt.figure(figsize = (10, 6))
sns.barplot(x = 'TempCategory', y = 'AverageDailyPowerUsage', data = daily_avg_power_df)
plt.title('Average Daily Power Usage by Temperature Category')
plt.xlabel('Temperature Category')
plt.ylabel('Average Daily Power Usage')
plt.grid(True)
plt.show()
slope, intercept, r_value, p_value, std_err = linregress(daily_avg_power_df['AverageAirTemp(F)'], daily_avg_power_df['AverageDailyPowerUsage'])
print("Slope:", slope, "\nIntercept:", intercept, "\nR-squared:", r_value**2, "\np-value:", p_value)
Slope: -166.63618365376936 Intercept: 32848.935907272906 R-squared: 0.3678369444070465 p-value: 2.2718415607098826e-220
bins = [2.6, 32, 50, 65, 77]
temp_labels = ['Very Cold\n (< 32°F)', 'Cold\n (32°F - 50°F)', 'Moderately Warm\n (50°F - 65°F)', 'Warm\n (> 65°F)']
new_df['TempCategory'] = pd.cut(new_df['AverageAirTemp(F)'], bins = bins, labels = temp_labels, include_lowest = True)
pivot_table = new_df.pivot_table(
values = 'NewRiver',
index = 'HourEnd',
columns = 'TempCategory',
aggfunc = 'mean'
)
plt.figure(figsize = (12, 9))
ax = sns.heatmap(pivot_table, annot = True, fmt = ".1f", cmap = "coolwarm")
plt.title('Heatmap of Average Electricity Usage by Hour and Temperature Category')
plt.xlabel('\nTemperature Category')
plt.ylabel('Hour of Day')
color_bar = ax.collections[0].colorbar
color_bar.set_label('Average Power Usage\n\n (Blue: Least Power Usage)\n (Red: Most Power Usage)')
plt.show()
This heatmap visualizes average electricity usage by hour of day across the four temperature categories (Very Cold, Cold, Moderately Warm, Warm)
Note: each cell is the average usage for a given hour across all days falling within that temperature category
The color encodes the magnitude of the power usage
correlation = new_df['NewRiver'].corr(new_df['AverageAirTemp(F)'])
correlation
-0.4504009638977822
corr = daily_avg_power_df['AverageDailyPowerUsage'].corr(daily_avg_power_df['AverageAirTemp(F)'])
corr
-0.60649562604115
# This line plot is too chaotic to read, so the regression plot below is used instead
daily_avg_line = daily_avg_power_df.plot.line(x = 'AverageAirTemp(F)', y = 'AverageDailyPowerUsage')
sns.lmplot(x = 'AverageAirTemp(F)', y = 'AverageDailyPowerUsage', data = daily_avg_power_df, aspect = 1.5, line_kws = {"color": "red"})
plt.title('Correlation between Average Temperature and Average Power Usage')
plt.xlabel('Average Air Temperature (°F)')
plt.ylabel('Average Daily Power Usage')
# show the correlation coefficient that was calculated earlier
plt.text(85, 20000, 'Correlation:' + str(round(corr,3)), fontsize = 12, ha = 'left', va = 'center')
plt.show()
concord = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Concord.csv")
demand = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Demand_All.csv")
concord.head()
constant_columns = [col for col in concord.columns if concord[col].nunique() == 1]
print(f"Constant columns: {constant_columns}")
Constant columns: ['WT01', 'WT03', 'WT04', 'WT05', 'WT06', 'WT11']
df.head()
UTC | Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | DST | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | Date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 06:00:00+00:00 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 0 | 97098 | 33667 | 27333 | 14393 | 8060 | 180550 | 0 | 2015-01-01 |
1 | 2015-01-01 07:00:00+00:00 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 0 | 96620 | 33223 | 27048 | 14306 | 8307 | 179504 | 0 | 2015-01-01 |
2 | 2015-01-01 08:00:00+00:00 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 0 | 96084 | 32849 | 27008 | 14270 | 8174 | 178385 | 0 | 2015-01-01 |
3 | 2015-01-01 09:00:00+00:00 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 0 | 95829 | 33053 | 26821 | 14285 | 8333 | 178320 | 0 | 2015-01-01 |
4 | 2015-01-01 10:00:00+00:00 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 0 | 98489 | 33686 | 26924 | 14450 | 8396 | 181946 | 0 | 2015-01-01 |
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']]).dt.strftime('%m-%d-%Y')
df['Date'] = pd.to_datetime(df['Date'])
daily_energy_usage_concord = df.groupby(df['Date'].dt.date)['Concord'].sum()
daily_energy_usage_concord_df = daily_energy_usage_concord.reset_index()
daily_energy_usage_concord_df.drop(columns='Date', inplace=True)
daily_energy_usage_concord_series = daily_energy_usage_concord_df['Concord']
print(daily_energy_usage_concord_series.head())
0 2333478 1 2260705 2 2049778 3 1944530 4 2384060 Name: Concord, dtype: int64
df.head()
UTC | Eastern | Year | OrdDay | OrdHr | Weekday | Month | Day | HourEnd | DST | Concord | Greenwood | NewRiver | KingsMountain | Winterville | Total | Peak | Date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 06:00:00+00:00 | 2015-01-01 01:00:00-05:00 | 2015 | 1 | 1 | 4 | 1 | 1 | 1 | 0 | 97098 | 33667 | 27333 | 14393 | 8060 | 180550 | 0 | 2015-01-01 |
1 | 2015-01-01 07:00:00+00:00 | 2015-01-01 02:00:00-05:00 | 2015 | 1 | 2 | 4 | 1 | 1 | 2 | 0 | 96620 | 33223 | 27048 | 14306 | 8307 | 179504 | 0 | 2015-01-01 |
2 | 2015-01-01 08:00:00+00:00 | 2015-01-01 03:00:00-05:00 | 2015 | 1 | 3 | 4 | 1 | 1 | 3 | 0 | 96084 | 32849 | 27008 | 14270 | 8174 | 178385 | 0 | 2015-01-01 |
3 | 2015-01-01 09:00:00+00:00 | 2015-01-01 04:00:00-05:00 | 2015 | 1 | 4 | 4 | 1 | 1 | 4 | 0 | 95829 | 33053 | 26821 | 14285 | 8333 | 178320 | 0 | 2015-01-01 |
4 | 2015-01-01 10:00:00+00:00 | 2015-01-01 05:00:00-05:00 | 2015 | 1 | 5 | 4 | 1 | 1 | 5 | 0 | 98489 | 33686 | 26924 | 14450 | 8396 | 181946 | 0 | 2015-01-01 |
concord['DATE'] = pd.to_datetime(concord['DATE'])
# HourEnd is already an integer hour (1-24), so no datetime conversion is needed here
concord['Month'] = concord['DATE'].dt.month
concord['Day'] = concord['DATE'].dt.day
concord['Year'] = concord['DATE'].dt.year
# The weather file has more daily rows (4,975) than the demand data (2,192), so rows beyond
# the demand range are left as NaN by this index-aligned assignment
concord['Concord'] = daily_energy_usage_concord_series
print(concord.head())
STATION NAME DATE AWND DAPR MDPR PRCP SNOW SNWD \ 0 USC00311975 CONCORD, NC US 2015-01-01 0.00 NaN NaN 0.00 0.0 0.0 1 USC00311975 CONCORD, NC US 2015-01-02 2.91 NaN NaN 0.00 0.0 0.0 2 USC00311975 CONCORD, NC US 2015-01-03 2.91 NaN NaN 0.18 0.0 0.0 3 USC00311975 CONCORD, NC US 2015-01-04 2.91 NaN NaN 0.17 0.0 0.0 4 USC00311975 CONCORD, NC US 2015-01-05 2.91 NaN NaN 0.22 0.0 0.0 TMAX ... WT01 WT03 WT04 WT05 WT06 WT11 Month Day Year Concord 0 47.0 ... NaN NaN NaN NaN NaN NaN 1 1 2015 2333478.0 1 54.0 ... NaN NaN NaN NaN NaN NaN 1 2 2015 2260705.0 2 52.0 ... 1.0 NaN NaN NaN NaN NaN 1 3 2015 2049778.0 3 51.0 ... 1.0 NaN NaN NaN NaN NaN 1 4 2015 1944530.0 4 65.0 ... 1.0 NaN NaN NaN NaN NaN 1 5 2015 2384060.0 [5 rows x 22 columns]
weather_data_types = concord.dtypes
weather_data_types
STATION object NAME object DATE datetime64[ns] AWND float64 DAPR float64 MDPR float64 PRCP float64 SNOW float64 SNWD float64 TMAX float64 TMIN float64 TOBS float64 WT01 float64 WT03 float64 WT04 float64 WT05 float64 WT06 float64 WT11 float64 Month int64 Day int64 Year int64 Concord float64 dtype: object
concord = concord.drop(columns=['STATION', 'NAME', 'DATE','WT01', 'WT03', 'WT04', 'WT05', 'WT06', 'WT11'])
concord.head()
AWND | DAPR | MDPR | PRCP | SNOW | SNWD | TMAX | TMIN | TOBS | Month | Day | Year | Concord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00 | NaN | NaN | 0.00 | 0.0 | 0.0 | 47.0 | 24.0 | 25.0 | 1 | 1 | 2015 | 2333478.0 |
1 | 2.91 | NaN | NaN | 0.00 | 0.0 | 0.0 | 54.0 | 25.0 | 41.0 | 1 | 2 | 2015 | 2260705.0 |
2 | 2.91 | NaN | NaN | 0.18 | 0.0 | 0.0 | 52.0 | 41.0 | 46.0 | 1 | 3 | 2015 | 2049778.0 |
3 | 2.91 | NaN | NaN | 0.17 | 0.0 | 0.0 | 51.0 | 46.0 | 48.0 | 1 | 4 | 2015 | 1944530.0 |
4 | 2.91 | NaN | NaN | 0.22 | 0.0 | 0.0 | 65.0 | 38.0 | 39.0 | 1 | 5 | 2015 | 2384060.0 |
# Fill missing values with the mean
columns_to_fill = ['DAPR', 'MDPR', 'AWND', 'PRCP']
for col in columns_to_fill:
concord[col] = concord[col].fillna(concord[col].mean())
# Check for any remaining missing values
missing_values = concord.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after filling: \n", missing_columns)
Missing values after filling: SNOW 1546 SNWD 2779 TMAX 2782 TMIN 2782 TOBS 2782 Concord 2783 dtype: int64
columns_to_check = ['AWND', 'DAPR', 'MDPR', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS']
for col in columns_to_check:
concord[col] = concord[col].fillna(concord[col].mean())
# Check again for any remaining missing values
missing_values = concord.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after second filling: \n", missing_columns)
Missing values after second filling: Concord 2783 dtype: int64
# Replace infinite values with NaN
concord.replace([np.inf, -np.inf], np.nan, inplace=True)
# Fill any resulting NaNs with column means
for col in concord.columns:
concord[col] = concord[col].fillna(concord[col].mean())
# Check again for any remaining missing or infinite values
print("Final missing values check: \n", concord.isnull().sum())
print("Final infinite values check: \n", np.isinf(concord).sum())
Final missing values check: AWND 0 DAPR 0 MDPR 0 PRCP 0 SNOW 0 SNWD 0 TMAX 0 TMIN 0 TOBS 0 Month 0 Day 0 Year 0 Concord 0 dtype: int64 Final infinite values check: AWND 0 DAPR 0 MDPR 0 PRCP 0 SNOW 0 SNWD 0 TMAX 0 TMIN 0 TOBS 0 Month 0 Day 0 Year 0 Concord 0 dtype: int64
print(concord)
AWND DAPR MDPR PRCP SNOW SNWD TMAX \ 0 0.000000 2.909091 0.739 0.00 0.000000 0.000000 47.000000 1 2.910000 2.909091 0.739 0.00 0.000000 0.000000 54.000000 2 2.910000 2.909091 0.739 0.18 0.000000 0.000000 52.000000 3 2.910000 2.909091 0.739 0.17 0.000000 0.000000 51.000000 4 2.910000 2.909091 0.739 0.22 0.000000 0.000000 65.000000 ... ... ... ... ... ... ... ... 4970 2.395222 2.909091 0.739 0.00 0.000000 0.025956 73.952576 4971 2.395222 2.909091 0.739 0.00 0.000000 0.025956 73.952576 4972 2.395222 2.909091 0.739 0.00 0.000000 0.025956 73.952576 4973 2.395222 2.909091 0.739 0.03 0.008516 0.025956 73.952576 4974 2.395222 2.909091 0.739 0.06 0.008516 0.025956 73.952576 TMIN TOBS Month Day Year Concord 0 24.000000 25.000000 1 1 2015 2.333478e+06 1 25.000000 41.000000 1 2 2015 2.260705e+06 2 41.000000 46.000000 1 3 2015 2.049778e+06 3 46.000000 48.000000 1 4 2015 1.944530e+06 4 38.000000 39.000000 1 5 2015 2.384060e+06 ... ... ... ... ... ... ... 4970 50.076151 54.113543 12 28 2020 2.549247e+06 4971 50.076151 54.113543 12 29 2020 2.549247e+06 4972 50.076151 54.113543 12 30 2020 2.549247e+06 4973 50.076151 54.113543 12 31 2020 2.549247e+06 4974 50.076151 54.113543 1 1 2021 2.549247e+06 [4975 rows x 13 columns]
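Filling the weather columns with their overall means flattens the seasonal signal (for example, every missing TMAX becomes roughly 74°F even in winter). A gentler alternative, sketched below but not what was run above, is to fill each column with its month-specific mean; it would need to be applied in place of the overall-mean fills.
# Sketch (alternative imputation): replace each missing value with the mean of its
# own calendar month rather than the overall column mean
cols_to_fill = ['AWND', 'DAPR', 'MDPR', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS']
for col in cols_to_fill:
    concord[col] = concord[col].fillna(concord.groupby('Month')[col].transform('mean'))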
concord_2 = concord.copy()  # pd.concat of a single frame is a no-op; an explicit copy is clearer
#Create the correlation matrix
correlation_matrix = concord_2.corr()
# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix for Concord')
plt.show()
import statsmodels.api as sm
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]
# Define the independent variables (all columns except 'Concord')
X = df_summer.drop(['Concord'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Concord R-squared: 0.636 Model: OLS Adj. R-squared: 0.634 Method: Least Squares F-statistic: 309.5 Date: Fri, 16 Aug 2024 Prob (F-statistic): 0.00 Time: 10:29:44 Log-Likelihood: -28951. No. Observations: 2136 AIC: 5.793e+04 Df Residuals: 2123 BIC: 5.800e+04 Df Model: 12 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -5.224e+06 5.02e+06 -1.041 0.298 -1.51e+07 4.62e+06 AWND -1.146e+04 1990.437 -5.758 0.000 -1.54e+04 -7556.837 DAPR 2323.4023 5.43e+04 0.043 0.966 -1.04e+05 1.09e+05 MDPR -1510.8804 7.8e+04 -0.019 0.985 -1.55e+05 1.52e+05 PRCP -1.735e+04 9959.945 -1.742 0.082 -3.69e+04 2180.169 SNOW 6.235e+05 1.34e+06 0.464 0.642 -2.01e+06 3.26e+06 SNWD 1.54e+07 6.77e+05 22.754 0.000 1.41e+07 1.67e+07 TMAX 1.813e+04 1308.710 13.851 0.000 1.56e+04 2.07e+04 TMIN 1799.2123 3416.860 0.527 0.599 -4901.531 8499.956 TOBS 2.798e+04 3652.518 7.660 0.000 2.08e+04 3.51e+04 Month 7862.8203 2962.735 2.654 0.008 2052.654 1.37e+04 Day 568.1247 459.370 1.237 0.216 -332.738 1468.987 Year 2173.1386 2483.950 0.875 0.382 -2698.091 7044.368 ============================================================================== Omnibus: 236.286 Durbin-Watson: 0.778 Prob(Omnibus): 0.000 Jarque-Bera (JB): 936.030 Skew: -0.485 Prob(JB): 5.54e-204 Kurtosis: 6.094 Cond. No. 2.51e+06 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.51e+06. This might indicate that there are strong multicollinearity or other numerical problems.
The regression model explains approximately 63.6% of the variance in summer Concord demand, with a statistically significant F-statistic indicating that the model as a whole is effective. Key predictors include AWND (average wind speed), SNWD (snow depth), TMAX (maximum temperature), and TOBS (temperature at observation), all statistically significant: higher wind speeds are associated with lower peak demand, while higher maximum and observed temperatures are linked to higher demand. The apparent SNWD effect should be treated with caution, since summer snow depth is essentially zero and the column's missing values were filled with its overall mean. Precipitation shows a marginal but non-significant effect, and variables such as DAPR, MDPR, and TMIN do not contribute meaningfully. The large condition number and the low Durbin-Watson statistic point to multicollinearity and autocorrelation, so the model might be refined by addressing these issues and exploring additional predictors.
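Given the low Durbin-Watson statistic, one optional refinement (a sketch reusing X and y from the summer model above, not part of the original analysis) is to keep the same OLS specification but report heteroskedasticity- and autocorrelation-consistent (Newey-West) standard errors:
# Sketch: same summer OLS fit, but with HAC (Newey-West) standard errors; the
# maxlags value of 7 (about one week) is an arbitrary choice for illustration
model_hac = sm.OLS(y, X.astype(float)).fit(cov_type = 'HAC', cov_kwds = {'maxlags': 7})
print(model_hac.summary())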
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]
selected_columns = ['AWND', 'SNWD', 'TMAX', 'TOBS', 'Month']
X = df_summer[selected_columns]
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Concord R-squared: 0.635 Model: OLS Adj. R-squared: 0.634 Method: Least Squares F-statistic: 742.2 Date: Fri, 16 Aug 2024 Prob (F-statistic): 0.00 Time: 10:29:45 Log-Likelihood: -28954. No. Observations: 2136 AIC: 5.792e+04 Df Residuals: 2130 BIC: 5.795e+04 Df Model: 5 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -8.481e+05 7.89e+04 -10.752 0.000 -1e+06 -6.93e+05 AWND -1.127e+04 1977.182 -5.702 0.000 -1.52e+04 -7396.010 SNWD 1.554e+07 6.2e+05 25.058 0.000 1.43e+07 1.68e+07 TMAX 1.848e+04 1277.373 14.469 0.000 1.6e+04 2.1e+04 TOBS 2.953e+04 1459.892 20.225 0.000 2.67e+04 3.24e+04 Month 7932.5183 2910.959 2.725 0.006 2223.900 1.36e+04 ============================================================================== Omnibus: 231.247 Durbin-Watson: 0.776 Prob(Omnibus): 0.000 Jarque-Bera (JB): 930.748 Skew: -0.467 Prob(JB): 7.77e-203 Kurtosis: 6.096 Cond. No. 1.56e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.56e+04. This might indicate that there are strong multicollinearity or other numerical problems.
The updated regression model for predicting peak energy demand during summer months (May, June, July, August, September) accounts for approximately 63.5% of the variance in Concord and demonstrates overall effectiveness with a statistically significant F-statistic. Key predictors include AWND (Average Wind Speed) with a coefficient of -11,270.00, indicating that higher wind speeds reduce peak energy demand, and SNWD (Snow Depth) with a coefficient of 15,540,000.00, showing that increased snow depth raises peak demand. TMAX (Maximum Temperature) and TOBS (Temperature at Observation) have coefficients of 18,480.00 and 29,530.00, respectively, suggesting that higher temperatures lead to increased peak energy demand. The Month variable also has a significant positive impact with a coefficient of 7,932.52. However, the model may suffer from multicollinearity, as suggested by the large condition number (15,600), and positive autocorrelation in the residuals, indicated by a Durbin-Watson statistic of 0.776, which suggests potential patterns not captured by the model. Addressing these issues could improve the model's accuracy and reliability.
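The Durbin-Watson value quoted in the summary can also be computed directly from the residuals of the fitted model above (a small sketch using statsmodels' durbin_watson helper):
# Sketch: Durbin-Watson statistic from the OLS residuals; values well below 2
# indicate positive autocorrelation
from statsmodels.stats.stattools import durbin_watson
print("Durbin-Watson:", durbin_watson(model.resid))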
import pandas as pd
import numpy as np
import statsmodels.api as sm
# Filter the DataFrame to the cold months captured by this condition (November and January-March; note that December is excluded)
df_winter = concord_2[(concord_2['Month'] == 11) | (concord_2['Month'] <= 3)]
# Define the independent variables (all columns except 'Concord')
X = df_winter.drop(['Concord'], axis=1)
# Define the dependent variable
y = df_winter['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Concord R-squared: 0.153 Model: OLS Adj. R-squared: 0.146 Method: Least Squares F-statistic: 23.60 Date: Fri, 16 Aug 2024 Prob (F-statistic): 6.50e-49 Time: 10:29:45 Log-Likelihood: -21618. No. Observations: 1581 AIC: 4.326e+04 Df Residuals: 1568 BIC: 4.333e+04 Df Model: 12 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -3.899e+07 6.73e+06 -5.795 0.000 -5.22e+07 -2.58e+07 AWND -2830.7719 2634.494 -1.075 0.283 -7998.274 2336.730 DAPR 1.597e+05 1.71e+05 0.935 0.350 -1.75e+05 4.95e+05 MDPR -3.168e+05 1.87e+05 -1.697 0.090 -6.83e+05 4.94e+04 PRCP -1.175e+04 1.58e+04 -0.744 0.457 -4.28e+04 1.92e+04 SNOW -3.136e+04 3.91e+04 -0.802 0.422 -1.08e+05 4.53e+04 SNWD 1.314e+05 2.84e+04 4.625 0.000 7.57e+04 1.87e+05 TMAX -2235.1571 1027.604 -2.175 0.030 -4250.779 -219.535 TMIN 8643.5301 1642.407 5.263 0.000 5421.985 1.19e+04 TOBS -1.217e+04 1546.167 -7.872 0.000 -1.52e+04 -9138.921 Month -5082.8420 1329.769 -3.822 0.000 -7691.155 -2474.529 Day -1099.6557 612.037 -1.797 0.073 -2300.153 100.842 Year 2.063e+04 3318.292 6.218 0.000 1.41e+04 2.71e+04 ============================================================================== Omnibus: 136.788 Durbin-Watson: 0.410 Prob(Omnibus): 0.000 Jarque-Bera (JB): 335.644 Skew: -0.498 Prob(JB): 1.31e-73 Kurtosis: 5.026 Cond. No. 2.56e+06 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.56e+06. This might indicate that there are strong multicollinearity or other numerical problems.
The regression model for the cold-month subset (November and January through March) explains only about 15.3% of the variance in Concord demand, although the overall fit is statistically significant (F-statistic 23.60). SNWD (snow depth) is a strong positive predictor, with a coefficient of roughly 131,400, and the temperature variables are significant but carry mixed signs: TOBS (temperature at observation) and TMAX are negative, consistent with colder conditions driving demand up, while TMIN is positive, which likely reflects collinearity among the three temperature measures. Month and Year are also significant, while AWND (average wind speed), DAPR, MDPR, PRCP (precipitation), and SNOW are not. The very large condition number (about 2.56e+06) indicates potential multicollinearity, and the Durbin-Watson statistic of 0.410 suggests strong positive autocorrelation in the residuals, so the model could benefit from addressing multicollinearity and exploring alternative predictors or modeling techniques.
# Filter the DataFrame to include only the Spring month used here (April)
df_spring = concord_2[concord_2['Month'] == 4]
# Define the independent variables (all columns except 'Concord')
X = df_spring.drop(['Concord'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_spring['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Concord R-squared: 0.499 Model: OLS Adj. R-squared: 0.487 Method: Least Squares F-statistic: 41.43 Date: Fri, 16 Aug 2024 Prob (F-statistic): 4.90e-51 Time: 10:29:46 Log-Likelihood: -5196.2 No. Observations: 384 AIC: 1.041e+04 Df Residuals: 374 BIC: 1.045e+04 Df Model: 9 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ AWND -1.15e+04 3915.995 -2.936 0.004 -1.92e+04 -3795.637 DAPR 1.366e+06 1.49e+06 0.918 0.359 -1.56e+06 4.29e+06 MDPR 3.471e+05 3.78e+05 0.918 0.359 -3.96e+05 1.09e+06 PRCP -1.813e+04 2.45e+04 -0.740 0.459 -6.63e+04 3e+04 SNOW 1.12e+06 3.27e+06 0.342 0.732 -5.31e+06 7.55e+06 SNWD 1.276e+07 9.69e+05 13.174 0.000 1.09e+07 1.47e+07 TMAX 1877.1225 2145.348 0.875 0.382 -2341.333 6095.578 TMIN -1755.3361 3321.921 -0.528 0.598 -8287.319 4776.647 TOBS 7872.6129 3230.893 2.437 0.015 1519.620 1.42e+04 Month 1.879e+06 2.05e+06 0.918 0.359 -2.14e+06 5.9e+06 Day 1297.6199 1123.012 1.155 0.249 -910.590 3505.829 Year -4953.7791 6338.065 -0.782 0.435 -1.74e+04 7508.930 ============================================================================== Omnibus: 17.211 Durbin-Watson: 0.464 Prob(Omnibus): 0.000 Jarque-Bera (JB): 39.270 Skew: 0.151 Prob(JB): 2.97e-09 Kurtosis: 4.537 Cond. No. 8.30e+17 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 2.28e-27. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
# Filter the DataFrame to include only the Fall month used here (October)
df_fall = concord_2[concord_2['Month'] == 10]
# Define the independent variables (all columns except 'Concord')
X = df_fall.drop(['Concord'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_fall['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Concord R-squared: 0.493 Model: OLS Adj. R-squared: 0.482 Method: Least Squares F-statistic: 44.64 Date: Fri, 16 Aug 2024 Prob (F-statistic): 1.14e-55 Time: 10:29:46 Log-Likelihood: -5680.4 No. Observations: 423 AIC: 1.138e+04 Df Residuals: 413 BIC: 1.142e+04 Df Model: 9 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ AWND -8995.2467 4243.708 -2.120 0.035 -1.73e+04 -653.286 DAPR -3.108e+05 2.67e+05 -1.166 0.244 -8.35e+05 2.13e+05 MDPR -7.896e+04 6.77e+04 -1.166 0.244 -2.12e+05 5.41e+04 PRCP -2.378e+04 1.88e+04 -1.262 0.208 -6.08e+04 1.33e+04 SNOW 1.883e+05 2.77e+06 0.068 0.946 -5.26e+06 5.64e+06 SNWD 1.036e+07 7.85e+05 13.190 0.000 8.82e+06 1.19e+07 TMAX 2194.7395 2059.582 1.066 0.287 -1853.831 6243.310 TMIN 1.61e+04 3808.618 4.226 0.000 8608.523 2.36e+04 TOBS -1449.4191 3497.456 -0.414 0.679 -8324.455 5425.616 Month -1.068e+06 9.16e+05 -1.166 0.244 -2.87e+06 7.32e+05 Day 487.0522 972.886 0.501 0.617 -1425.373 2399.478 Year 6469.2604 4952.167 1.306 0.192 -3265.335 1.62e+04 ============================================================================== Omnibus: 102.146 Durbin-Watson: 0.621 Prob(Omnibus): 0.000 Jarque-Bera (JB): 427.283 Skew: 0.997 Prob(JB): 1.65e-93 Kurtosis: 7.502 Cond. No. 5.03e+18 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 6.82e-29. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The OLS regression for October explains about 49.3% of the variation in Concord demand (R-squared = 0.493). Snow depth (SNWD) and minimum temperature (TMIN) are strongly significant and average wind speed (AWND) is significant at the 5% level, while precipitation and the remaining temperature variables are not. The enormous condition number and the smallest-eigenvalue warning point to a nearly singular design, which is expected here: Month is constant in an October-only subset and DAPR/MDPR move together, so individual coefficients should be read with caution. Different predictors or alternative modeling approaches may still improve the prediction of peak energy demand, as sketched below.
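One option, shown here as a minimal sketch on the same October subset rather than as part of the original analysis, is ridge regression: the L2 penalty shrinks correlated coefficients and keeps the fit stable even when the design matrix is nearly singular. The penalty strength alpha=1.0 is an arbitrary illustrative value, not a tuned one.
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Rebuild the October design matrix without the statsmodels constant
ridge_X = df_fall.drop(['Concord'], axis=1).replace([np.inf, -np.inf], np.nan).dropna()
ridge_y = df_fall.loc[ridge_X.index, 'Concord']
# Standardize, then shrink the coefficients with an L2 penalty
ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_model.fit(ridge_X, ridge_y)
print("Ridge in-sample R-squared:", ridge_model.score(ridge_X, ridge_y))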
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]
# Create a binary dependent variable (0 for not peak, 1 for peak)
# Define a threshold for what is considered a peak
threshold = df_summer['Concord'].quantile(0.9) # For example, the top 10% of values are considered peak
df_summer = df_summer.copy() # To avoid SettingWithCopyWarning
df_summer['Peak'] = (df_summer['Concord'] > threshold).astype(int)
# Define the independent variables (all columns except 'Concord' and 'Peak')
X_summer = df_summer.drop(['Concord', 'Peak'], axis=1)
# Handle missing values by replacing infinities and NaNs
X_summer = X_summer.replace([np.inf, -np.inf], np.nan).dropna()
# Standardize the independent variables
scaler = StandardScaler()
X_scaled_summer = scaler.fit_transform(X_summer)
X_scaled_summer = pd.DataFrame(X_scaled_summer, columns=X_summer.columns) # Keep column names
# Check for multicollinearity using VIF
def calculate_vif(df):
vif = pd.DataFrame()
vif["Variable"] = df.columns
vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
return vif
vif_df = calculate_vif(X_scaled_summer)
print(vif_df)
# Drop variables with high VIF (threshold of 5 or 10 is commonly used)
high_vif_columns = vif_df[vif_df['VIF'] > 5]['Variable']
X_scaled_summer = X_scaled_summer.drop(high_vif_columns, axis=1)
# Add a constant to the independent variables (required for the regression model)
X_scaled_summer = sm.add_constant(X_scaled_summer)
# Define the dependent variable
y_summer = df_summer.loc[X_summer.index, 'Peak']
# Ensure indices are aligned by reindexing y_summer
y_summer = y_summer.reset_index(drop=True)
X_scaled_summer = X_scaled_summer.reset_index(drop=True)
# Fit the logistic regression model
logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100) # Increase maxiter
# Get the summary of the regression
print(logit_model_summer.summary())
Variable VIF 0 AWND 1.066291 1 DAPR 1.117928 2 MDPR 1.117425 3 PRCP 1.120202 4 SNOW 1.732618 5 SNWD 4.626884 6 TMAX 7.708757 7 TMIN 53.308442 8 TOBS 52.243191 9 Month 1.053460 10 Day 1.004712 11 Year 1.085283 Warning: Maximum number of iterations has been exceeded. Current function value: 0.224672 Iterations: 100 Logit Regression Results ============================================================================== Dep. Variable: Peak No. Observations: 2136 Model: Logit Df Residuals: 2126 Method: MLE Df Model: 9 Date: Fri, 16 Aug 2024 Pseudo R-squ.: 0.3098 Time: 10:29:47 Log-Likelihood: -479.90 converged: False LL-Null: -695.26 Covariance Type: nonrobust LLR p-value: 3.794e-87 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const -15.5869 1.01e+04 -0.002 0.999 -1.97e+04 1.97e+04 AWND -0.3236 0.070 -4.623 0.000 -0.461 -0.186 DAPR -0.2686 4833.379 -5.56e-05 1.000 -9473.517 9472.980 MDPR -0.0727 5317.784 -1.37e-05 1.000 -1.04e+04 1.04e+04 PRCP -0.1144 0.093 -1.226 0.220 -0.297 0.069 SNOW 0.8046 1.78e+04 4.51e-05 1.000 -3.5e+04 3.5e+04 SNWD -12.9022 1.76e+04 -0.001 0.999 -3.46e+04 3.46e+04 Month 0.2047 0.081 2.536 0.011 0.047 0.363 Day 0.0884 0.079 1.117 0.264 -0.067 0.243 Year 0.0262 0.079 0.332 0.740 -0.129 0.181 ============================================================================== Possibly complete quasi-separation: A fraction 0.57 of observations can be perfectly predicted. This might indicate that there is complete quasi-separation. In this case some parameters will not be identified.
This block builds a logistic regression for the summer Concord data, with a binary dependent variable marking the top 10% of daily demand as peak days. The VIF check flags TMAX, TMIN, and TOBS as collinear (VIF above 5), unsurprising for three closely related temperature measures, and those columns are dropped before fitting, which means the model excludes the predictors most likely to drive summer peaks. In the fitted model the pseudo R-squared of 0.3098 indicates a moderate improvement over the null model, higher wind speed (AWND) is associated with a lower probability of a peak day, and the Month term captures variation within the summer season. However, the optimizer did not converge, the near-constant snow and multiday-precipitation variables (SNOW, SNWD, DAPR, MDPR) have enormous standard errors, and the quasi-separation warning means a large fraction of observations are perfectly predicted, so several coefficients are not reliably identified. Penalizing the fit, rethinking which variables are dropped, or pooling more data would be needed before trusting these estimates.
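As a hedged illustration (not part of the original analysis), an L1-penalized fit keeps the coefficients finite even under quasi-separation; the penalty weight alpha=1.0 below is an arbitrary choice, not a tuned value.
# Penalized refit of the same summer design matrix built above
logit_l1 = sm.Logit(y_summer, X_scaled_summer).fit_regularized(method='l1', alpha=1.0, disp=0)
print(logit_l1.params)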
from sklearn.preprocessing import StandardScaler
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = concord_2['Concord'].quantile(0.9)
concord_2['Peak'] = (concord_2['Concord'] > threshold).astype(int)
# Define the independent variables (all columns except 'Concord')
X = concord_2.drop(['Concord'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Reset index of X_scaled to ensure alignment with y
X_scaled.reset_index(drop=True, inplace=True)
# Define the dependent variable 'Peak', ensuring it aligns with the cleaned X
y = concord_2.loc[X.index, 'Peak']
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
--------------------------------------------------------------------------- PerfectSeparationError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_6624\3343644798.py in <module> 26 27 # Fit the logistic regression model ---> 28 logit_model = sm.Logit(y, X_scaled).fit() 29 30 # Get the summary of the regression ~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs) 1981 def fit(self, start_params=None, method='newton', maxiter=35, 1982 full_output=1, disp=1, callback=None, **kwargs): -> 1983 bnryfit = super().fit(start_params=start_params, 1984 method=method, 1985 maxiter=maxiter, ~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs) 228 pass # TODO: make a function factory to have multiple call-backs 229 --> 230 mlefit = super().fit(start_params=start_params, 231 method=method, 232 maxiter=maxiter, ~\anaconda3\lib\site-packages\statsmodels\base\model.py in fit(self, start_params, method, maxiter, full_output, disp, fargs, callback, retall, skip_hessian, **kwargs) 561 562 optimizer = Optimizer() --> 563 xopt, retvals, optim_settings = optimizer._fit(f, score, start_params, 564 fargs, kwargs, 565 hessian=hess, ~\anaconda3\lib\site-packages\statsmodels\base\optimizer.py in _fit(self, objective, gradient, start_params, fargs, kwargs, hessian, method, maxiter, full_output, disp, callback, retall) 239 240 func = fit_funcs[method] --> 241 xopt, retvals = func(objective, gradient, start_params, fargs, kwargs, 242 disp=disp, maxiter=maxiter, callback=callback, 243 retall=retall, full_output=full_output, ~\anaconda3\lib\site-packages\statsmodels\base\optimizer.py in _fit_newton(f, score, start_params, fargs, kwargs, disp, maxiter, callback, retall, full_output, hess, ridge_factor) 441 history.append(newparams) 442 if callback is not None: --> 443 callback(newparams) 444 iterations += 1 445 fval = f(newparams, *fargs) # this is the negative likelihood ~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in _check_perfect_pred(self, params, *args) 212 np.allclose(fittedvalues - endog, 0)): 213 msg = "Perfect separation detected, results not available" --> 214 raise PerfectSeparationError(msg) 215 216 @Appender(base.LikelihoodModel.fit.__doc__) PerfectSeparationError: Perfect separation detected, results not available
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_summer['Concord'].quantile(0.9)
df_summer['Peak'] = (df_summer['Concord'] > threshold).astype(int)
# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_summer.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_summer.loc[X.index, 'Peak']
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully. Current function value: 0.171261 Iterations 11 Logit Regression Results ============================================================================== Dep. Variable: Peak No. Observations: 2136 Model: Logit Df Residuals: 2130 Method: MLE Df Model: 5 Date: Fri, 16 Aug 2024 Pseudo R-squ.: 0.4738 Time: 10:29:52 Log-Likelihood: -365.81 converged: True LL-Null: -695.26 Covariance Type: nonrobust LLR p-value: 3.803e-140 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const -5.6339 0.428 -13.164 0.000 -6.473 -4.795 PRCP -0.9820 3.287 -0.299 0.765 -7.425 5.461 TMIN 3.3166 0.292 11.365 0.000 2.745 3.889 Day 0.0724 0.092 0.785 0.432 -0.108 0.253 Year -0.0379 0.091 -0.417 0.677 -0.216 0.140 PRCP_TMIN 0.7004 2.729 0.257 0.797 -4.649 6.050 ==============================================================================
The logistic regression model developed to predict peak energy demand days during the summer months (May to September) reveals several insights. The model uses weather-related and temporal variables, including an interaction term between precipitation (PRCP) and minimum temperature (TMIN). Key metrics include a McFadden pseudo R-squared of 0.4738, a substantial improvement over the null model, and a very low LLR p-value (3.803e-140), indicating that the predictors collectively have a significant effect. The coefficient for TMIN is statistically significant and positive (3.3166), demonstrating that higher minimum temperatures are strongly associated with increased peak energy demand, likely due to greater cooling needs. In contrast, PRCP, Day, Year, and the PRCP_TMIN interaction term do not show statistically significant effects on peak demand. The model therefore has reasonable explanatory power, though the insignificance of most predictors other than TMIN warrants cautious interpretation.
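Because the predictors were standardized, exponentiating the coefficients turns them into odds ratios per one-standard-deviation change, which can make the summer model easier to read. A small sketch:
# Odds ratios and their 95% confidence intervals for the summer model
odds_ratios = np.exp(logit_model.params).rename('odds_ratio')
odds_ci = np.exp(logit_model.conf_int())
print(pd.concat([odds_ratios, odds_ci], axis=1))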
# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = concord_2[(concord_2['Month'] >= 11) | (concord_2['Month'] <= 3)].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_winter['Concord'].quantile(0.9)
df_winter['Peak'] = (df_winter['Concord'] > threshold).astype(int)
# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_winter.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_winter.loc[X.index, 'Peak']
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully. Current function value: 0.129889 Iterations 11 Logit Regression Results ============================================================================== Dep. Variable: Peak No. Observations: 2032 Model: Logit Df Residuals: 2026 Method: MLE Df Model: 5 Date: Fri, 16 Aug 2024 Pseudo R-squ.: 0.6015 Time: 10:29:53 Log-Likelihood: -263.93 converged: True LL-Null: -662.32 Covariance Type: nonrobust LLR p-value: 5.753e-170 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const -5.6464 0.443 -12.750 0.000 -6.514 -4.778 PRCP 3.3993 1.780 1.910 0.056 -0.089 6.888 TMIN -3.0349 0.240 -12.644 0.000 -3.505 -2.564 Day -0.0974 0.112 -0.872 0.383 -0.316 0.121 Year 0.1525 0.110 1.391 0.164 -0.062 0.367 PRCP_TMIN -4.5966 2.750 -1.671 0.095 -9.987 0.794 ==============================================================================
The logistic regression model for the winter months (November to March) shows a good fit, with a McFadden pseudo R-squared of 0.6015, the strongest of the seasonal models, and a very low LLR p-value (5.753e-170) confirming overall statistical significance. The intercept of -5.6464 implies low baseline odds of a peak day. Precipitation (PRCP) has a coefficient of 3.3993 that narrowly misses significance at the 5% level (p = 0.056), suggesting it may increase the likelihood of peak days. Minimum temperature (TMIN) has a significant negative coefficient of -3.0349: colder nights raise the probability of a peak day, consistent with heating-driven winter demand. Day and Year are not statistically significant (p-values of 0.383 and 0.164), and the PRCP_TMIN interaction (coefficient -4.5966, p = 0.095) hints at an interaction effect without reaching significance. Overall, TMIN is the dominant predictor, and the roles of precipitation and its interaction with temperature merit further exploration.
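To see how these winter coefficients translate into classifications, the fitted probabilities can be thresholded and tabulated in-sample; the sketch below uses an arbitrary 0.5 cutoff, so it is illustrative rather than a proper out-of-sample evaluation.
from sklearn.metrics import confusion_matrix
# Threshold the in-sample predicted probabilities at 0.5 and tabulate the results
winter_pred = (logit_model.predict(X_scaled) >= 0.5).astype(int)
print(confusion_matrix(np.asarray(y), np.asarray(winter_pred)))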
# Filter the DataFrame to include only the Spring months (April)
df_spring = concord_2[concord_2['Month'] == 4].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_spring['Concord'].quantile(0.9)
df_spring['Peak'] = (df_spring['Concord'] > threshold).astype(int)
# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_spring.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_spring.loc[X.index, 'Peak'].reset_index(drop=True) # Reset index to align with X_scaled
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully. Current function value: 0.080102 Iterations 18 Logit Regression Results ============================================================================== Dep. Variable: Peak No. Observations: 384 Model: Logit Df Residuals: 378 Method: MLE Df Model: 5 Date: Fri, 16 Aug 2024 Pseudo R-squ.: 0.3838 Time: 10:29:53 Log-Likelihood: -30.759 converged: True LL-Null: -49.921 Covariance Type: nonrobust LLR p-value: 3.248e-07 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const -16.8560 22.241 -0.758 0.449 -60.447 26.735 PRCP -119.2681 338.140 -0.353 0.724 -782.011 543.475 TMIN 1.3869 0.405 3.426 0.001 0.594 2.180 Day 0.7940 0.436 1.821 0.069 -0.061 1.649 Year -0.2844 0.426 -0.668 0.504 -1.119 0.551 PRCP_TMIN 87.2442 284.298 0.307 0.759 -469.969 644.458 ============================================================================== Possibly complete quasi-separation: A fraction 0.26 of observations can be perfectly predicted. This might indicate that there is complete quasi-separation. In this case some parameters will not be identified.
The logistic regression model for April shows a McFadden pseudo R-squared of 0.3838, a moderate fit, and a low LLR p-value (3.248e-07) indicating the model is statistically significant overall. The coefficient for precipitation (PRCP) is -119.2681 with an enormous standard error and a p-value of 0.724, so precipitation has no detectable effect here. The coefficient for minimum temperature (TMIN) is 1.3869 and statistically significant (p = 0.001), indicating that warmer April nights are associated with a higher probability of a peak demand day. The interaction term (PRCP_TMIN), Day, and Year are not significant. The quasi-separation warning, with about a quarter of observations perfectly predicted, means some parameters are poorly identified, which also explains the inflated standard errors on the precipitation terms.
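One hedged way to work around the quasi-separation in this small April sample is a penalized fit; scikit-learn's logistic regression applies L2 shrinkage by default, which keeps the coefficients finite even when some outcomes are perfectly predicted. C=1.0 is an illustrative setting, not a tuned one.
from sklearn.linear_model import LogisticRegression
# Shrinkage keeps the April coefficients finite despite quasi-separation
sk_logit = LogisticRegression(C=1.0, max_iter=1000)
sk_logit.fit(X_scaled.drop(columns='const'), y)
print(dict(zip(X_scaled.columns.drop('const'), sk_logit.coef_[0])))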
# Filter the DataFrame to include only the Fall months (October)
df_fall = concord_2[concord_2['Month'] == 10].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_fall['Concord'].quantile(0.9)
df_fall['Peak'] = (df_fall['Concord'] > threshold).astype(int)
# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_fall.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_fall.loc[X.index, 'Peak'].reset_index(drop=True) # Reset index to align with X_scaled
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully. Current function value: 0.054253 Iterations 10 Logit Regression Results ============================================================================== Dep. Variable: Peak No. Observations: 423 Model: Logit Df Residuals: 417 Method: MLE Df Model: 5 Date: Fri, 16 Aug 2024 Pseudo R-squ.: 0.5795 Time: 10:29:54 Log-Likelihood: -22.949 converged: True LL-Null: -54.578 Covariance Type: nonrobust LLR p-value: 2.575e-12 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const -6.4787 1.115 -5.808 0.000 -8.665 -4.293 PRCP 3.5870 3.060 1.172 0.241 -2.411 9.585 TMIN 2.7221 0.610 4.466 0.000 1.527 3.917 Day 0.9079 0.559 1.624 0.104 -0.188 2.004 Year -0.1159 0.535 -0.217 0.828 -1.164 0.932 PRCP_TMIN -3.2922 2.631 -1.251 0.211 -8.449 1.864 ============================================================================== Possibly complete quasi-separation: A fraction 0.12 of observations can be perfectly predicted. This might indicate that there is complete quasi-separation. In this case some parameters will not be identified.
The logistic regression model for October shows a McFadden pseudo R-squared of 0.5795, a strong improvement over the null model, and a very low LLR p-value (2.575e-12) confirming statistical significance. The coefficient for minimum temperature (TMIN) is 2.7221 and highly significant (p < 0.001), suggesting that warmer October nights increase the likelihood of peak days. Precipitation (PRCP) has a positive but insignificant coefficient (p = 0.241), and the interaction term (PRCP_TMIN), Day, and Year are also insignificant. As in April, the quasi-separation warning indicates that a portion of observations can be perfectly predicted, which may make some parameter estimates unreliable.
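For reference, the McFadden pseudo R-squared quoted throughout these summaries is just one minus the ratio of the fitted to the null log-likelihood; it can be reproduced from the October result object directly (a small sanity-check sketch):
# McFadden's pseudo R-squared from the fitted and null log-likelihoods
pseudo_r2 = 1 - logit_model.llf / logit_model.llnull
print("McFadden pseudo R-squared:", pseudo_r2)
print("As reported by statsmodels:", logit_model.prsquared)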
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score  # used for the scores printed below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
y_pred = dt_reg.predict(X_test)
print("Mean squared error: ", mean_squared_error(y_test, y_pred))
print("R-squared: ", r2_score(y_test, y_pred))
Mean squared error: 0.023529411764705882 R-squared: 0.4753086419753084
The Decision Tree Regressor, fit here to the binary October peak indicator rather than to raw demand, achieved an R-squared of about 0.475 on the test split, with a mean squared error of about 0.024. Because the target is 0/1 and a fully grown tree predicts pure leaves, that MSE is effectively the fraction of misclassified test days (2 of 85). A single unconstrained tree is prone to overfitting the training data, so this one split may not reflect how well the model generalizes; limiting the depth and cross-validating gives a more honest estimate, as sketched below.
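A minimal sketch of that check, with max_depth=3 as an arbitrary illustrative cap:
from sklearn.model_selection import cross_val_score
# A shallow tree scored by 5-fold cross-validation instead of one split
dt_shallow = DecisionTreeRegressor(max_depth=3, random_state=42)
cv_scores = cross_val_score(dt_shallow, X, y, cv=5, scoring='r2')
print("Cross-validated R-squared:", cv_scores.mean())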
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)
# Train the model on the training data
svm_model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = svm_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
Accuracy: 0.9529411764705882 Classification Report: precision recall f1-score support 0 0.95 1.00 0.98 81 1 0.00 0.00 0.00 4 accuracy 0.95 85 macro avg 0.48 0.50 0.49 85 weighted avg 0.91 0.95 0.93 85
The Support Vector Machine (SVM) model with a linear kernel achieved an accuracy of about 95.3%, but that number is misleading given the class imbalance: the test set contains 81 non-peak days and only 4 peak days. The classification report shows excellent precision, recall, and F1-score for non-peak days (class 0), while all three metrics are 0.00 for peak days (class 1), meaning the model never correctly identifies a peak day. In effect the classifier predicts "non-peak" for nearly everything, so despite the high headline accuracy it is not yet useful for peak detection; the imbalance needs to be addressed, for example with class weighting, as sketched below.
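A hedged sketch of one such adjustment: class_weight='balanced' reweights the rare peak class so the SVM pays a larger penalty for missing it, usually trading some overall accuracy for non-zero recall on class 1.
# Refit the linear SVM with balanced class weights on the same split
svm_balanced = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_balanced.fit(X_train, y_train)
print(classification_report(y_test, svm_balanced.predict(X_test)))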
# Note: this assignment aligns on index labels from the full concord_2 frame rather than the
# October-only df_fall, so the months printed below are not restricted to October;
# df_fall.loc[X_test.index, 'Month'] would keep them aligned with the test rows
X_test['Month'] = concord_2['Month']
# Extract predictions for non-peak days (class 0)
non_peak_indices = (y_pred == 0)
non_peak_predictions = X_test[non_peak_indices]
# Select only the 'Month', 'Day', and 'Year' columns
non_peak_month_day_year = non_peak_predictions[['Month', 'Day', 'Year']]
non_peak_month_day_year.to_csv('non_peak_month_day_year.csv', index=False)
# Print the selected columns
print(non_peak_month_day_year)
Month Day Year 145 5 22 2019 280 10 28 2015 175 6 21 2020 410 2 17 2020 419 2 28 2020 .. ... ... ... 57 2 27 2016 414 2 22 2020 24 1 25 2015 17 1 18 2015 66 3 5 2017 [85 rows x 3 columns]
import skfuzzy as fuzz
from sklearn.linear_model import LinearRegression
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]
# Define the independent variables (all columns except 'Concord')
X = df_summer.drop(['Concord'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Concord']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Convert to numpy arrays
X = X.values
y = y.values
# Define fuzzy membership functions for input variables
# Here, we'll just create simple membership functions for demonstration purposes
def fuzzy_membership_function(data, low, high):
mf = np.zeros_like(data)
mf[data <= low] = 0
mf[(data > low) & (data < high)] = (data[(data > low) & (data < high)] - low) / (high - low)
mf[data >= high] = 1
return mf
# Create membership functions for each feature
fuzzy_X = np.zeros_like(X)
for i in range(X.shape[1]):
fuzzy_X[:, i] = fuzzy_membership_function(X[:, i], np.min(X[:, i]), np.max(X[:, i]))
# Create a simple fuzzy-style regression model: fit an ordinary linear regression
# on the membership-transformed features and treat its coefficients as the fuzzy coefficients
model = LinearRegression()
model.fit(fuzzy_X, y)
# Get the regression coefficients (these are the fuzzy coefficients)
fuzzy_coefficients = model.coef_
# Get feature names including the constant
feature_names = ['const'] + list(df_summer.drop(['Concord'], axis=1).columns)
# Print the fuzzy regression results with feature names
print("Fuzzy Regression Coefficients:")
for i, coef in enumerate(fuzzy_coefficients):
print(f"{feature_names[i]}: {coef}")
# Predict using the fuzzy model
y_pred = model.predict(fuzzy_X)
# Evaluate the model (e.g., R-squared)
r_squared = model.score(fuzzy_X, y)
print(f"Fuzzy Regression R-squared: {r_squared}")
Fuzzy Regression Coefficients: const: 0.0 AWND: -200603.20628685967 DAPR: 4445.679156035619 MDPR: -1819.8977977349448 PRCP: -33886.89784881105 SNOW: 2753.209961148256 SNWD: 340811.91338047315 TMAX: 410466.37260179984 TMIN: -27842.505119693953 TOBS: 671160.5196448775 Month: 18450.76185600422 Day: 15674.619534556303 Year: 4701.789425121685 Peak: 521252.6299285891 Fuzzy Regression R-squared: 0.8477779046630803
The fuzzy-style regression transforms each summer input feature into a [0, 1] membership value (a simple min-max ramp) and fits an ordinary linear regression on those memberships. The largest coefficients in magnitude belong to TOBS, TMAX, SNWD, the Peak indicator, and AWND, while the constant column necessarily receives a coefficient of zero. The R-squared of about 0.85 indicates a strong in-sample fit, but one caveat matters: Peak is derived from the Concord target itself, so including it leaks information about the outcome and inflates the fit. Setting that aside, the transformed regression still points to temperature-related variables as the main drivers of summer energy usage.
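A quick sketch of the same fit with the leaky column removed, assuming the column order built above (constant first, then the remaining features including Peak):
# Drop the Peak column from the membership-transformed design and refit
peak_col = feature_names.index('Peak')
fuzzy_X_no_peak = np.delete(fuzzy_X, peak_col, axis=1)
model_no_peak = LinearRegression().fit(fuzzy_X_no_peak, y)
print("Fuzzy Regression R-squared without Peak:", model_no_peak.score(fuzzy_X_no_peak, y))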
# Load the data
boone = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Boone.csv")
boone.head()
constant_columns = [col for col in boone.columns if boone[col].nunique() == 1]
print(f"Constant columns: {constant_columns}")
Constant columns: ['STATION', 'DAPR', 'MDPR']
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
daily_energy_usage_boone = df.groupby(df['Date'].dt.date)['NewRiver'].sum()
# Print the result
# Reset the index to remove the date index and convert it to a DataFrame
daily_energy_usage_boone_df = daily_energy_usage_boone.reset_index()
# Drop the 'Date' column to remove it from the DataFrame
daily_energy_usage_boone_df.drop(columns='Date', inplace=True)
# Convert the DataFrame back to a Series if needed
daily_energy_usage_boone_series = daily_energy_usage_boone_df['NewRiver']
# Print the result
print(daily_energy_usage_boone_series.head())
0 647318 1 588267 2 525797 3 480263 4 687192 Name: NewRiver, dtype: int64
boone_wind = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\export.csv")
boone_wind.rename(columns={'wspd': 'AWND'}, inplace=True)
boone_wind['AWND'].fillna(boone_wind['AWND'].mean(), inplace=True)
boone_wind.head()
date | wdir | AWND | |
---|---|---|---|
0 | 1/1/15 | 279.0 | 23.1 |
1 | 1/2/15 | NaN | 16.1 |
2 | 1/3/15 | NaN | 4.4 |
3 | 1/4/15 | NaN | 15.9 |
4 | 1/5/15 | 290.0 | 33.7 |
boone['AWND'] = boone_wind['AWND'].values  # positional assignment; assumes both files cover the same dates in the same order
boone.head()
STATION | DATE | DAPR | MDPR | PRCP | SNOW | SNWD | TMAX | TMIN | TOBS | AWND | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | USC00310982 | 1/1/2015 | NaN | NaN | 0.00 | 0.0 | 0.0 | 29.0 | 22.0 | 23.0 | 23.1 |
1 | USC00310982 | 1/2/2015 | NaN | NaN | 0.00 | 0.0 | 0.0 | 40.0 | 23.0 | 35.0 | 16.1 |
2 | USC00310982 | 1/3/2015 | NaN | NaN | 0.02 | 0.0 | 0.0 | 46.0 | 35.0 | 37.0 | 4.4 |
3 | USC00310982 | 1/4/2015 | NaN | NaN | 0.48 | 0.0 | 0.0 | 54.0 | 37.0 | 51.0 | 15.9 |
4 | USC00310982 | 1/5/2015 | NaN | NaN | 0.13 | 0.0 | 0.0 | 58.0 | 28.0 | 28.0 | 33.7 |
# Convert the DATE column in boone to datetime format
boone['DATE'] = pd.to_datetime(boone['DATE'])
df['HourEnd'] = pd.to_datetime(df['HourEnd'])
# Extract the month from the Date column and create a new column named Month
boone['Month'] = boone['DATE'].dt.month
boone['Day'] = boone['DATE'].dt.day
boone['Year'] = boone['DATE'].dt.year
boone['Concord'] = daily_energy_usage_boone_series  # daily NewRiver totals; renamed to 'Boone' below
# Display the result
print(boone.head())
STATION DATE DAPR MDPR PRCP SNOW SNWD TMAX TMIN TOBS \ 0 USC00310982 2015-01-01 NaN NaN 0.00 0.0 0.0 29.0 22.0 23.0 1 USC00310982 2015-01-02 NaN NaN 0.00 0.0 0.0 40.0 23.0 35.0 2 USC00310982 2015-01-03 NaN NaN 0.02 0.0 0.0 46.0 35.0 37.0 3 USC00310982 2015-01-04 NaN NaN 0.48 0.0 0.0 54.0 37.0 51.0 4 USC00310982 2015-01-05 NaN NaN 0.13 0.0 0.0 58.0 28.0 28.0 AWND Month Day Year Concord 0 23.1 1 1 2015 647318.0 1 16.1 1 2 2015 588267.0 2 4.4 1 3 2015 525797.0 3 15.9 1 4 2015 480263.0 4 33.7 1 5 2015 687192.0
weather_data_types2 = boone.dtypes
weather_data_types2
STATION object DATE datetime64[ns] DAPR float64 MDPR float64 PRCP float64 SNOW float64 SNWD float64 TMAX float64 TMIN float64 TOBS float64 AWND float64 Month int64 Day int64 Year int64 Concord float64 dtype: object
# Drop unnecessary columns
boone = boone.drop(columns=['STATION', 'DATE', 'DAPR', 'MDPR'])
boone.head()
PRCP | SNOW | SNWD | TMAX | TMIN | TOBS | AWND | Month | Day | Year | Concord | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00 | 0.0 | 0.0 | 29.0 | 22.0 | 23.0 | 23.1 | 1 | 1 | 2015 | 647318.0 |
1 | 0.00 | 0.0 | 0.0 | 40.0 | 23.0 | 35.0 | 16.1 | 1 | 2 | 2015 | 588267.0 |
2 | 0.02 | 0.0 | 0.0 | 46.0 | 35.0 | 37.0 | 4.4 | 1 | 3 | 2015 | 525797.0 |
3 | 0.48 | 0.0 | 0.0 | 54.0 | 37.0 | 51.0 | 15.9 | 1 | 4 | 2015 | 480263.0 |
4 | 0.13 | 0.0 | 0.0 | 58.0 | 28.0 | 28.0 | 33.7 | 1 | 5 | 2015 | 687192.0 |
boone.rename(columns={'Concord': 'Boone'}, inplace=True)
boone.head()
PRCP | SNOW | SNWD | TMAX | TMIN | TOBS | AWND | Month | Day | Year | Boone | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00 | 0.0 | 0.0 | 29.0 | 22.0 | 23.0 | 23.1 | 1 | 1 | 2015 | 647318.0 |
1 | 0.00 | 0.0 | 0.0 | 40.0 | 23.0 | 35.0 | 16.1 | 1 | 2 | 2015 | 588267.0 |
2 | 0.02 | 0.0 | 0.0 | 46.0 | 35.0 | 37.0 | 4.4 | 1 | 3 | 2015 | 525797.0 |
3 | 0.48 | 0.0 | 0.0 | 54.0 | 37.0 | 51.0 | 15.9 | 1 | 4 | 2015 | 480263.0 |
4 | 0.13 | 0.0 | 0.0 | 58.0 | 28.0 | 28.0 | 33.7 | 1 | 5 | 2015 | 687192.0 |
# Forward-fill missing values in the wind and precipitation columns
columns_to_fill = ['AWND', 'PRCP']
for col in columns_to_fill:
    boone[col] = boone[col].ffill()
# Check for any remaining missing values
missing_values = boone.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after filling: \n", missing_columns)
Missing values after filling: SNOW 5 SNWD 3 TMAX 7 TMIN 3 TOBS 3 Boone 1 dtype: int64
columns_to_check = ['AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS']
for col in columns_to_check:
    boone[col] = boone[col].ffill()
# Check again for any remaining missing values
missing_values = boone.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after second filling: \n", missing_columns)
Missing values after second filling: Boone 1 dtype: int64
# Replace infinite values with NaN
boone.replace([np.inf, -np.inf], np.nan, inplace=True)
# Forward-fill any NaNs introduced by the replacement
for col in boone.columns:
    boone[col] = boone[col].ffill()
# Check again for any remaining missing or infinite values
print("Final missing values check: \n", boone.isnull().sum())
print("Final infinite values check: \n", np.isinf(boone).sum())
Final missing values check: PRCP 0 SNOW 0 SNWD 0 TMAX 0 TMIN 0 TOBS 0 AWND 0 Month 0 Day 0 Year 0 Boone 0 dtype: int64 Final infinite values check: PRCP 0 SNOW 0 SNWD 0 TMAX 0 TMIN 0 TOBS 0 AWND 0 Month 0 Day 0 Year 0 Boone 0 dtype: int64
boone.head()
PRCP | SNOW | SNWD | TMAX | TMIN | TOBS | AWND | Month | Day | Year | Boone | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00 | 0.0 | 0.0 | 29.0 | 22.0 | 23.0 | 23.1 | 1 | 1 | 2015 | 647318.0 |
1 | 0.00 | 0.0 | 0.0 | 40.0 | 23.0 | 35.0 | 16.1 | 1 | 2 | 2015 | 588267.0 |
2 | 0.02 | 0.0 | 0.0 | 46.0 | 35.0 | 37.0 | 4.4 | 1 | 3 | 2015 | 525797.0 |
3 | 0.48 | 0.0 | 0.0 | 54.0 | 37.0 | 51.0 | 15.9 | 1 | 4 | 2015 | 480263.0 |
4 | 0.13 | 0.0 | 0.0 | 58.0 | 28.0 | 28.0 | 33.7 | 1 | 5 | 2015 | 687192.0 |
# Ensure that all necessary columns are numeric
numeric_cols = boone.select_dtypes(include=[np.number]).columns
boone_numeric = boone[numeric_cols]
# Generate the correlation matrix
correlation_matrix = boone_numeric.corr()
# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix for Boone')
plt.show()
import statsmodels.api as sm
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]
# Define the independent variables (all columns except 'Boone')
X = df_summer.drop(['Boone'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Boone R-squared: 0.514 Model: OLS Adj. R-squared: 0.510 Method: Least Squares F-statistic: 120.4 Date: Fri, 16 Aug 2024 Prob (F-statistic): 5.64e-137 Time: 10:30:03 Log-Likelihood: -10968. No. Observations: 918 AIC: 2.195e+04 Df Residuals: 909 BIC: 2.200e+04 Df Model: 8 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 1.351e+07 1.57e+06 8.625 0.000 1.04e+07 1.66e+07 PRCP -4482.0261 2875.912 -1.558 0.119 -1.01e+04 1162.172 SNOW 4.819e-06 5.59e-07 8.625 0.000 3.72e-06 5.92e-06 SNWD -9.496e-06 1.1e-06 -8.625 0.000 -1.17e-05 -7.34e-06 TMAX 2062.4131 250.518 8.233 0.000 1570.752 2554.074 TMIN 443.6483 398.175 1.114 0.265 -337.801 1225.098 TOBS 1311.0697 398.801 3.288 0.001 528.392 2093.747 AWND -356.8297 299.770 -1.190 0.234 -945.152 231.493 Month 1.54e+04 958.508 16.062 0.000 1.35e+04 1.73e+04 Day 322.9225 141.404 2.284 0.023 45.406 600.439 Year -6605.7206 775.231 -8.521 0.000 -8127.172 -5084.270 ============================================================================== Omnibus: 7.778 Durbin-Watson: 0.835 Prob(Omnibus): 0.020 Jarque-Bera (JB): 5.824 Skew: -0.076 Prob(JB): 0.0544 Kurtosis: 2.641 Cond. No. 3.46e+19 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 3.13e-30. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The OLS results for Boone in the summer months (May to September) explain about 51.4% of the variance in daily demand (R-squared = 0.514). TMAX, TOBS, Month, Day, and Year are statistically significant at the 5% level: warmer maximum and observed temperatures are associated with higher demand, and the negative Year coefficient suggests a downward trend across summers. PRCP, TMIN, and AWND are not significant. SNOW and SNWD show tiny "significant" coefficients on the order of 1e-6, but these columns are nearly constant in a summer-only subset, which is one source of the singular-design warning and the enormous condition number. The Durbin-Watson statistic of 0.835 indicates strong positive autocorrelation in the residuals, so the reported standard errors are likely optimistic. Overall, temperature and seasonal timing drive summer demand in Boone, but the multicollinearity and autocorrelation issues should be addressed before leaning on individual coefficients; one hedge is sketched below.
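As a hedged sketch (the coefficients are unchanged, only the uncertainty estimates differ), refitting with Newey-West (HAC) standard errors widens the standard errors to account for the serial correlation; maxlags=7, roughly one week of daily data, is an illustrative choice.
# Same summer design matrix, but with heteroskedasticity- and autocorrelation-consistent errors
model_hac = sm.OLS(y, X.astype(float)).fit(cov_type='HAC', cov_kwds={'maxlags': 7})
print(model_hac.summary())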
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]
# Define a reduced set of independent variables (the significant predictors from the full summer model)
selected_columns = ['SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month', 'Day','Year']
X = df_summer[selected_columns]
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Boone R-squared: 0.512 Model: OLS Adj. R-squared: 0.509 Method: Least Squares F-statistic: 191.2 Date: Fri, 16 Aug 2024 Prob (F-statistic): 2.76e-139 Time: 10:30:03 Log-Likelihood: -10970. No. Observations: 918 AIC: 2.195e+04 Df Residuals: 912 BIC: 2.198e+04 Df Model: 5 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 1.281e+07 1.47e+06 8.728 0.000 9.93e+06 1.57e+07 SNOW 1.348e-06 1.54e-07 8.728 0.000 1.05e-06 1.65e-06 SNWD 1.198e-05 1.37e-06 8.728 0.000 9.29e-06 1.47e-05 TMAX 2297.3983 222.851 10.309 0.000 1860.038 2734.759 TOBS 1506.1871 211.559 7.119 0.000 1090.987 1921.387 Month 1.592e+04 903.972 17.607 0.000 1.41e+04 1.77e+04 Day 340.0351 140.972 2.412 0.016 63.367 616.703 Year -6264.6847 727.116 -8.616 0.000 -7691.699 -4837.670 ============================================================================== Omnibus: 7.377 Durbin-Watson: 0.846 Prob(Omnibus): 0.025 Jarque-Bera (JB): 5.724 Skew: -0.088 Prob(JB): 0.0572 Kurtosis: 2.656 Cond. No. 3.50e+20 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 3.06e-32. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
Refitting with only the reduced predictor set barely changes the fit: the R-squared drops from 0.514 to 0.512 and all retained variables are significant at the 5% level, with TMAX, TOBS, and Month again the strongest positive effects and Year negative. The SNOW and SNWD coefficients remain essentially zero, reflecting their near-constant summer values, and the condition-number warning persists, so the variable selection has not really resolved the multicollinearity. The Durbin-Watson statistic of 0.846 still signals positive autocorrelation in the residuals. In short, the leaner model performs as well as the full one, but the same diagnostic concerns carry over.
import statsmodels.api as sm
# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = boone[(boone['Month'] >= 11) | (boone['Month'] <= 3)]
# Define the independent variables (including the 'Month' column)
X = df_winter.drop(columns=['Boone']) # Drop only the target variable
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_winter['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Boone R-squared: 0.782 Model: OLS Adj. R-squared: 0.778 Method: Least Squares F-statistic: 254.7 Date: Fri, 16 Aug 2024 Prob (F-statistic): 1.68e-227 Time: 10:30:04 Log-Likelihood: -8922.4 No. Observations: 723 AIC: 1.787e+04 Df Residuals: 712 BIC: 1.792e+04 Df Model: 10 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 2.769e+06 2.52e+06 1.100 0.272 -2.17e+06 7.71e+06 PRCP 1.293e+04 5275.915 2.450 0.015 2569.826 2.33e+04 SNOW 9469.5482 4717.023 2.008 0.045 208.610 1.87e+04 SNWD 4836.8529 3269.879 1.479 0.140 -1582.905 1.13e+04 TMAX -3459.7686 288.720 -11.983 0.000 -4026.612 -2892.925 TMIN -897.1889 386.872 -2.319 0.021 -1656.736 -137.642 TOBS -3713.1463 331.429 -11.203 0.000 -4363.841 -3062.451 AWND 2726.3103 212.594 12.824 0.000 2308.924 3143.696 Month -2273.1481 560.802 -4.053 0.000 -3374.172 -1172.125 Day 104.8280 240.338 0.436 0.663 -367.028 576.684 Year -914.6874 1247.299 -0.733 0.464 -3363.511 1534.136 ============================================================================== Omnibus: 1.755 Durbin-Watson: 0.925 Prob(Omnibus): 0.416 Jarque-Bera (JB): 1.584 Skew: 0.096 Prob(JB): 0.453 Kurtosis: 3.125 Cond. No. 2.45e+06 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.45e+06. This might indicate that there are strong multicollinearity or other numerical problems.
The OLS results for Boone in the winter months explain about 78.2% of the variance in daily demand (R-squared = 0.782), the strongest seasonal fit in this analysis. PRCP, SNOW, TMAX, TMIN, TOBS, AWND, and Month are all significant at the 5% level: colder maximum, minimum, and observed temperatures and higher wind speeds are associated with higher demand, consistent with heating load, while precipitation and snowfall add smaller positive effects. SNWD, Day, and Year are not significant. The condition number of about 2.45e+06 still hints at multicollinearity among the temperature variables, and the Durbin-Watson statistic of 0.925 points to positive autocorrelation in the residuals (it can be recomputed from the residuals as sketched below), so standard errors should be treated cautiously. Overall, winter demand in Boone is driven primarily by temperature and wind.
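A small sketch reproducing that diagnostic directly from the residuals; values well below 2 indicate positive serial correlation between consecutive days.
from statsmodels.stats.stattools import durbin_watson
# Durbin-Watson statistic computed from the winter model's residuals
print("Durbin-Watson:", durbin_watson(model.resid))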
# Filter the DataFrame to include only the Spring months (April)
df_spring = boone[boone['Month'] == 4]
# Define the independent variables (all columns except 'Boone')
X = df_spring.drop(['Boone'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_spring['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Boone R-squared: 0.304 Model: OLS Adj. R-squared: 0.272 Method: Least Squares F-statistic: 9.356 Date: Fri, 16 Aug 2024 Prob (F-statistic): 1.13e-10 Time: 10:30:04 Log-Likelihood: -2197.1 No. Observations: 180 AIC: 4412. Df Residuals: 171 BIC: 4441. Df Model: 8 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ PRCP 117.2219 5507.700 0.021 0.983 -1.08e+04 1.1e+04 SNOW 1.199e+05 2.56e+05 0.468 0.640 -3.86e+05 6.25e+05 SNWD 2.562e-07 3.8e-08 6.743 0.000 1.81e-07 3.31e-07 TMAX -1293.5655 524.277 -2.467 0.015 -2328.454 -258.677 TMIN -544.8750 725.404 -0.751 0.454 -1976.774 887.024 TOBS -263.3940 623.870 -0.422 0.673 -1494.871 968.083 AWND 1236.0710 462.797 2.671 0.008 322.540 2149.602 Month 7.736e+06 1.11e+06 6.962 0.000 5.54e+06 9.93e+06 Day -232.7723 451.294 -0.516 0.607 -1123.596 658.051 Year -1.503e+04 2201.869 -6.827 0.000 -1.94e+04 -1.07e+04 ============================================================================== Omnibus: 1.804 Durbin-Watson: 0.482 Prob(Omnibus): 0.406 Jarque-Bera (JB): 1.482 Skew: 0.038 Prob(JB): 0.477 Kurtosis: 2.562 Cond. No. 2.26e+18 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 1.43e-28. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The OLS results for April explain about 30.4% of the variance in Boone demand (R-squared = 0.304), a noticeably weaker fit than the other seasons. TMAX (negative), AWND (positive), and Year (negative) are significant at the 5% level, while PRCP, SNOW, TMIN, TOBS, and Day are not. The nominally significant Month and SNWD terms should be discounted: Month is constant in an April-only subset and SNWD is nearly constant in spring, so their coefficients are artifacts of the singular design flagged in the notes. The Durbin-Watson statistic of 0.482 indicates strong positive autocorrelation in the residuals. Overall, April demand is only loosely explained by these weather variables, and the multicollinearity and autocorrelation concerns again limit how much weight individual coefficients can bear.
# Filter the DataFrame to include only the Fall months (October)
df_fall = boone[boone['Month'] == 10]
# Define the independent variables (all columns except 'Boone')
X = df_fall.drop(['Boone'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_fall['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()
# Get the summary of the regression
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: Boone R-squared: 0.047 Model: OLS Adj. R-squared: 0.009 Method: Least Squares F-statistic: 1.245 Date: Fri, 16 Aug 2024 Prob (F-statistic): 0.281 Time: 10:30:05 Log-Likelihood: -2246.4 No. Observations: 186 AIC: 4509. Df Residuals: 178 BIC: 4535. Df Model: 7 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ PRCP -1244.1868 4117.373 -0.302 0.763 -9369.331 6880.957 SNOW 1.573e-08 7.91e-09 1.987 0.048 1.12e-10 3.13e-08 SNWD -9.533e-10 4.75e-10 -2.005 0.046 -1.89e-09 -1.51e-11 TMAX -54.3268 491.407 -0.111 0.912 -1024.061 915.407 TMIN 1287.5616 689.871 1.866 0.064 -73.817 2648.940 TOBS -583.6086 635.413 -0.918 0.360 -1837.520 670.303 AWND -260.4582 554.430 -0.470 0.639 -1354.561 833.644 Month 7.903e+05 3.95e+05 2.002 0.047 1.11e+04 1.57e+06 Day 757.4302 425.083 1.782 0.076 -81.421 1596.282 Year -3667.9863 1958.013 -1.873 0.063 -7531.891 195.919 ============================================================================== Omnibus: 5.438 Durbin-Watson: 0.649 Prob(Omnibus): 0.066 Jarque-Bera (JB): 5.125 Skew: 0.334 Prob(JB): 0.0771 Kurtosis: 3.464 Cond. No. 4.17e+19 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 4.37e-31. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The OLS analysis for October explains only about 4.7% of the variance in Boone demand (R-squared = 0.047), and the overall F-test (p = 0.281) indicates the model is not statistically significant as a whole. The nominally significant terms, SNOW, SNWD, and Month, have coefficients that are effectively zero or reflect a constant column (Month is always 10 in this subset), so they carry no practical meaning, while TMIN, Day, and Year are only marginally significant and the remaining weather variables are not. The Durbin-Watson statistic of 0.649 again points to positive autocorrelation in the residuals, and the singular-design warning persists. In short, these weather variables do little to explain October demand in Boone, and a different feature set or modeling approach would be needed for this month.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]
# Create a binary dependent variable (0 for not peak, 1 for peak)
# Define a threshold for what is considered a peak
threshold = df_summer['Boone'].quantile(0.9) # For example, the top 10% of values are considered peak
df_summer = df_summer.copy() # To avoid SettingWithCopyWarning
df_summer['Peak'] = (df_summer['Boone'] > threshold).astype(int)
# Define the independent variables (all columns except 'Boone' and 'Peak')
X_summer = df_summer.drop(['Boone', 'Peak'], axis=1)
# Handle missing values by replacing infinities and NaNs
X_summer = X_summer.replace([np.inf, -np.inf], np.nan).dropna()
# Standardize the independent variables
scaler = StandardScaler()
X_scaled_summer = scaler.fit_transform(X_summer)
X_scaled_summer = pd.DataFrame(X_scaled_summer, columns=X_summer.columns) # Keep column names
# Check for multicollinearity using VIF
def calculate_vif(df):
vif = pd.DataFrame()
vif["Variable"] = df.columns
vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
return vif
vif_df = calculate_vif(X_scaled_summer)
print(vif_df)
# Drop variables with high VIF (threshold of 5 or 10 is commonly used)
high_vif_columns = vif_df[vif_df['VIF'] > 5]['Variable']
X_scaled_summer = X_scaled_summer.drop(high_vif_columns, axis=1)
# Add a constant to the independent variables (required for the regression model)
X_scaled_summer = sm.add_constant(X_scaled_summer)
# Define the dependent variable
y_summer = df_summer.loc[X_summer.index, 'Peak']
# Ensure indices are aligned by reindexing y_summer
y_summer = y_summer.reset_index(drop=True)
X_scaled_summer = X_scaled_summer.reset_index(drop=True)
# Fit the logistic regression model
logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100) # Increase maxiter
# Get the summary of the regression
print(logit_model_summer.summary())
     Variable       VIF
0        PRCP  1.252234
1        SNOW       NaN
2        SNWD       NaN
3        TMAX  1.946952
4        TMIN  5.335008
5        TOBS  5.493790
6        AWND  1.331844
7       Month  1.191884
8         Day  1.015852
9        Year  1.140756
Optimization terminated successfully.
         Current function value: 0.230163
         Iterations 9
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\751608222.py in <module>
     47 # Fit the logistic regression model
---> 48 logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100)  # Increase maxiter
~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(...)
~\anaconda3\lib\site-packages\statsmodels\base\model.py in fit(...)
--> 579             Hinv = np.linalg.inv(-retvals['Hessian']) / nobs
~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_singular(err, flag)
---> 89     raise LinAlgError("Singular matrix")
LinAlgError: Singular matrix
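The NaN VIF values for SNOW and SNWD suggest those columns are essentially constant over the summer months, and because the `VIF > 5` filter leaves NaN rows untouched, the constant columns stay in the design matrix and can make the Hessian singular, which matches the error above. A minimal pre-check, sketched against the X_summer frame defined earlier (the names below are illustrative only):
# Sketch: flag near-constant columns before standardizing; such columns have
# undefined VIFs and can make sm.Logit's Hessian singular.
constant_cols = [c for c in X_summer.columns if X_summer[c].std() < 1e-12]
print("Near-constant columns:", constant_cols)
X_summer_reduced = X_summer.drop(columns=constant_cols)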
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_summer['Boone'].quantile(0.9)
df_summer['Peak'] = (df_summer['Boone'] > threshold).astype(int)
# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_summer.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_summer.loc[X.index, 'Peak']
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.298703
         Iterations 9
                           Logit Regression Results
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  918
Model:                          Logit   Df Residuals:                      912
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                 0.08250
Time:                        10:30:06   Log-Likelihood:                -274.21
converged:                       True   LL-Null:                       -298.87
Covariance Type:            nonrobust   LLR p-value:                 1.916e-09
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.5593      0.164    -15.603      0.000      -2.881      -2.238
PRCP          -3.7158      3.738     -0.994      0.320     -11.041       3.610
TMIN           0.8122      0.169      4.813      0.000       0.481       1.143
Day            0.1012      0.114      0.886      0.376      -0.123       0.325
Year          -0.2596      0.114     -2.276      0.023      -0.483      -0.036
PRCP_TMIN      2.9652      3.500      0.847      0.397      -3.895       9.826
==============================================================================
The logistic regression model examining peak energy demand during the summer months has a pseudo R-squared of about 0.083, only a modest improvement over the null model. The coefficient for TMIN is statistically significant and positive, suggesting that higher minimum temperatures are associated with an increased likelihood of a peak. The coefficient for Year is also significant but negative, indicating a decreasing trend in summer peaks over the study period. The coefficients for PRCP, Day, and the interaction term PRCP_TMIN are not statistically significant, implying that they add little to the prediction of peaks in this specification. Overall explanatory power is limited, as the low pseudo R-squared and log-likelihood indicate, but the model does identify TMIN and Year as notable predictors.
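Because the predictors were standardized, each coefficient is the change in log-odds per one standard deviation of the variable; exponentiating gives odds ratios, which are often easier to read. A short sketch using the fitted summer logit_model:
# Sketch: convert standardized log-odds coefficients to odds ratios;
# e.g. exp(0.8122) is about 2.25 for TMIN, so one standard deviation of warmer
# minimum temperature roughly doubles the odds of a summer peak.
odds_ratios = np.exp(logit_model.params)
print(odds_ratios)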
# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = boone[(boone['Month'] >= 11) | (boone['Month'] <= 3)].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_winter['Boone'].quantile(0.9)
df_winter['Peak'] = (df_winter['Boone'] > threshold).astype(int)
# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_winter.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_winter.loc[X.index, 'Peak']
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.166987
         Iterations 10
                           Logit Regression Results
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  909
Model:                          Logit   Df Residuals:                      903
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.4867
Time:                        10:30:07   Log-Likelihood:                -151.79
converged:                       True   LL-Null:                       -295.72
Covariance Type:            nonrobust   LLR p-value:                 4.079e-60
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.3965      0.345    -12.753      0.000      -5.072      -3.721
PRCP           1.0822      0.962      1.124      0.261      -0.804       2.969
TMIN          -2.9627      0.290    -10.205      0.000      -3.532      -2.394
Day            0.2230      0.153      1.460      0.144      -0.076       0.522
Year          -0.2338      0.157     -1.493      0.136      -0.541       0.073
PRCP_TMIN     -0.9971      1.465     -0.681      0.496      -3.869       1.875
==============================================================================
The logistic regression model analyzing peak energy demand during the winter months (November through March) has a pseudo R-squared of 0.487, a substantial improvement over the null model. The coefficient for TMIN is significant and strongly negative, so colder minimum temperatures are associated with a much higher likelihood of a winter peak. The coefficient for PRCP is not statistically significant, indicating that precipitation does not significantly affect winter peak demand, and the Day coefficient is also not significant. The Year coefficient is negative but not significant at the 5% level, hinting at, but not establishing, a decreasing trend in peak demand over the years. The interaction term PRCP_TMIN is not significant, suggesting that the combined effect of precipitation and minimum temperature adds little explanatory power. Overall, the model performs well in explaining winter peak variability, with minimum temperature doing most of the work.
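To express the winter TMIN effect on the probability scale rather than in log-odds, statsmodels can report marginal effects evaluated at the sample means; a sketch using the fitted winter logit_model:
# Sketch: marginal effect of each (standardized) predictor on the
# probability of a winter peak, evaluated at the means of the data.
margins = logit_model.get_margeff(at='mean')
print(margins.summary())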
# Filter the DataFrame to include only the Spring months (April)
df_spring = boone[boone['Month'] == 4].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_spring['Boone'].quantile(0.9)
df_spring['Peak'] = (df_spring['Boone'] > threshold).astype(int)
# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_spring.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_spring.loc[X.index, 'Peak'].reset_index(drop=True) # Reset index to align with X_scaled
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.225534
         Iterations 9
                           Logit Regression Results
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  180
Model:                          Logit   Df Residuals:                      174
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.3062
Time:                        10:30:07   Log-Likelihood:                -40.596
converged:                       True   LL-Null:                       -58.515
Covariance Type:            nonrobust   LLR p-value:                 1.024e-06
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.5651      0.595     -5.991      0.000      -4.731      -2.399
PRCP           3.4737      3.780      0.919      0.358      -3.936      10.883
TMIN          -1.5910      0.469     -3.396      0.001      -2.509      -0.673
Day           -0.7589      0.385     -1.970      0.049      -1.514      -0.004
Year           0.3057      0.321      0.954      0.340      -0.323       0.934
PRCP_TMIN     -3.4599      4.487     -0.771      0.441     -12.255       5.335
==============================================================================
The logistic regression model for peak energy demand in the spring month of April has a pseudo R-squared of 0.306, a clear improvement over the null model. The coefficient for TMIN is significant and negative, suggesting that colder April nights increase the likelihood of a peak. The PRCP coefficient is not significant, so precipitation shows no notable effect on spring peaks. The Day coefficient is negative and marginally significant (p = 0.049), indicating that the probability of a peak tends to decline as the month progresses. The Year coefficient is not significant, suggesting no clear trend over the years, and the interaction term PRCP_TMIN is likewise not significant. Overall, the model identifies minimum temperature, and to a lesser extent the progression of the month, as the key factors in predicting spring peak demand.
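With only 180 April observations and a 90th-percentile threshold, the model is estimated from roughly 18 peak days, which is thin for five predictors, so the p-values above deserve some caution. A quick check of the class counts, sketched against the spring y defined above:
# Sketch: count peak vs. non-peak days in the April subset; a common rule of
# thumb asks for about 10 events per estimated coefficient.
print(y.value_counts())
print("Peak days per predictor:", y.sum() / 5)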
# Filter the DataFrame to include only the Fall months (October)
df_fall = boone[boone['Month'] == 10].reset_index(drop=True)
# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_fall['Boone'].quantile(0.9)
df_fall['Peak'] = (df_fall['Boone'] > threshold).astype(int)
# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_fall.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)
# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']
# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)
# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_fall.loc[X.index, 'Peak'].reset_index(drop=True) # Reset index to align with X_scaled
# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()
# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.299715
         Iterations 7
                           Logit Regression Results
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  186
Model:                          Logit   Df Residuals:                      180
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                 0.09118
Time:                        10:30:08   Log-Likelihood:                -55.747
converged:                       True   LL-Null:                       -61.340
Covariance Type:            nonrobust   LLR p-value:                   0.04783
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4574      0.305     -8.064      0.000      -3.055      -1.860
PRCP           0.9342      1.397      0.669      0.504      -1.805       3.673
TMIN           0.6754      0.305      2.212      0.027       0.077       1.274
Day           -0.3421      0.297     -1.150      0.250      -0.925       0.241
Year           0.0763      0.260      0.293      0.769      -0.433       0.586
PRCP_TMIN     -0.8971      1.390     -0.646      0.519      -3.621       1.827
==============================================================================
The logistic regression model for predicting peak energy demand in October has a pseudo R-squared of 0.091, only a modest improvement over the null model. The TMIN coefficient is positive and statistically significant (p = 0.027), suggesting that warmer minimum temperatures are associated with a higher likelihood of a peak in October. PRCP is not significant, meaning precipitation does not significantly affect October peaks. The Day and Year coefficients are also not significant, implying that neither the progression of the month nor the year has a substantial effect, and the interaction term PRCP_TMIN is likewise not significant. Overall, the model highlights minimum temperature as the only clear predictor of October peaks, while the other variables and their interaction have limited impact.
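The fitted October model can also be read as a classifier by thresholding its predicted probabilities, which previews the tree and SVM classifiers below; a sketch using the fitted fall logit_model, X_scaled, and y:
# Sketch: convert predicted peak probabilities into 0/1 labels at a 0.5
# cutoff and cross-tabulate them against the observed peaks.
pred_peak = (logit_model.predict(X_scaled) > 0.5).astype(int)
print(pd.crosstab(y, pred_peak, rownames=['observed'], colnames=['predicted']))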
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Reuse X and y from the October peak model above for a tree-based baseline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
y_pred = dt_reg.predict(X_test)
print("Mean squared error: ", mean_squared_error(y_test, y_pred))
print("R-squared: ", r2_score(y_test, y_pred))
Mean squared error:  0.13157894736842105
R-squared:  -0.3970588235294119
The Decision Tree Regressor yields a mean squared error of about 0.132 and an R-squared of about -0.40 on the test set. Because the target here is the binary Peak indicator, the regressor is effectively being scored on 0/1 labels, and the negative R-squared means its predictions are worse than simply predicting the mean peak rate for every test day. In short, a regression tree on this binary target adds little here.
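Since Peak is a 0/1 label, a classification tree is a more natural fit than a regression tree; a sketch reusing the same train/test split (the class_weight setting is an assumption added to offset the rarity of peak days):
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Sketch: fit a classification tree on the same split and report accuracy;
# class_weight='balanced' upweights the scarce peak days.
dt_clf = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, dt_clf.predict(X_test)))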
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)
# Train the model on the training data
svm_model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = svm_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
Accuracy: 0.8947368421052632
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94        34
           1       0.00      0.00      0.00         4

    accuracy                           0.89        38
   macro avg       0.45      0.50      0.47        38
weighted avg       0.80      0.89      0.85        38
The Support Vector Machine (SVM) model with a linear kernel achieved an accuracy of approximately 89.5% on the test set. This high accuracy suggests that the model generally performs well in classifying peak and non-peak periods. However, the classification report reveals a significant issue: the model has a precision, recall, and F1-score of 0.00 for the positive class (peak periods). This indicates that while the model accurately predicts non-peak periods, it fails to identify peak periods, leading to poor performance in detecting the minority class. The macro average scores reflect this imbalance, showing low values across precision, recall, and F1-score. In summary, despite high overall accuracy, the SVM model struggles with the minority class, suggesting a need for further tuning or alternative methods to address class imbalance.
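One low-effort way to address the class imbalance noted above is to reweight the classes inside the SVM itself; a sketch reusing the existing split and imports:
# Sketch: refit the linear SVM with balanced class weights so that the few
# peak days carry more weight in the hinge loss, then re-check recall on
# the minority class.
svm_balanced = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_balanced.fit(X_train, y_train)
print(classification_report(y_test, svm_balanced.predict(X_test)))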
X_test['Month'] = boone['Month']
# Extract predictions for non-peak days (class 0)
non_peak_indices = (y_pred == 0)
non_peak_predictions = X_test[non_peak_indices]
# Select only the 'Month', 'Day', and 'Year' columns
non_peak_month_day_year = non_peak_predictions[['Month', 'Day', 'Year']]
non_peak_month_day_year.to_csv('non_peak_month_day_year.csv', index=False)
# Print the selected columns
print(non_peak_month_day_year)
     Month  Day  Year
113      4   21  2018
164      6   10  2020
169      6   15  2020
101      4    9  2018
100      4    8  2018
15       1   16  2015
177      6   23  2020
35       2    5  2016
119      4   27  2018
152      6   29  2019
24       1   25  2015
76       3   15  2017
156      6    2  2020
118      4   26  2018
68       3    7  2017
16       1   17  2015
122      5   30  2018
30       1   31  2015
136      5   13  2019
19       1   20  2015
75       3   14  2017
115      4   23  2018
126      5    3  2019
147      5   24  2019
69       3    8  2017
9        1   10  2015
128      5    5  2019
137      5   14  2019
18       1   19  2015
165      6   11  2020
176      6   22  2020
171      6   17  2020
42       2   12  2016
111      4   19  2018
45       2   15  2016
67       3    6  2017
161      6    7  2020
29       1   30  2015
import skfuzzy as fuzz
from sklearn.linear_model import LinearRegression
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]
# Define the independent variables (all columns except 'Boone')
X = df_summer.drop(['Boone'], axis=1)
# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)
# Define the dependent variable
y = df_summer['Boone']
# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index] # Align y with the cleaned X
# Convert to numpy arrays
X = X.values
y = y.values
# Define fuzzy membership functions for input variables
# Here, we'll just create simple membership functions for demonstration purposes
def fuzzy_membership_function(data, low, high):
mf = np.zeros_like(data)
mf[data <= low] = 0
mf[(data > low) & (data < high)] = (data[(data > low) & (data < high)] - low) / (high - low)
mf[data >= high] = 1
return mf
# Create membership functions for each feature
fuzzy_X = np.zeros_like(X)
for i in range(X.shape[1]):
fuzzy_X[:, i] = fuzzy_membership_function(X[:, i], np.min(X[:, i]), np.max(X[:, i]))
# Create a fuzzy regression model (this is a simplified example)
# We will use the average membership function values to represent fuzzy coefficients
model = LinearRegression()
model.fit(fuzzy_X, y)
# Get the regression coefficients (these are the fuzzy coefficients)
fuzzy_coefficients = model.coef_
# Get feature names including the constant
feature_names = ['const'] + list(df_summer.drop(['Boone'], axis=1).columns)
# Print the fuzzy regression results with feature names
print("Fuzzy Regression Coefficients:")
for i, coef in enumerate(fuzzy_coefficients):
print(f"{feature_names[i]}: {coef}")
# Predict using the fuzzy model
y_pred = model.predict(fuzzy_X)
# Evaluate the model (e.g., R-squared)
r_squared = model.score(fuzzy_X, y)
print(f"Fuzzy Regression R-squared: {r_squared}")
Fuzzy Regression Coefficients:
const: 0.0
PRCP: -22410.130630034837
SNOW: 0.0
SNWD: 1.4551915228366852e-11
TMAX: 84558.93637056842
TMIN: 17745.930818057222
TOBS: 60309.208029531
AWND: -8599.595588401195
Month: 61580.59487394907
Day: 9687.676065131684
Year: -33028.60314244498
Fuzzy Regression R-squared: 0.5143981770532247
The fuzzy regression on the summer data achieves an R-squared of about 0.51, so roughly half of the variance in Boone demand is explained. The coefficients are estimated on ramp-shaped membership values (effectively min-max-scaled features), so their magnitudes reflect the change in demand across each variable's full observed range rather than per raw unit. On that scale, TMAX, TOBS, and Month carry the largest positive coefficients, TMIN and Day smaller positive ones, while PRCP, AWND, and Year are negative; SNOW and SNWD are essentially zero, as expected for summer months. The fuzzy transformation offers a way to handle uncertainty and differing scales, though the resulting coefficients are less directly interpretable than those from an ordinary linear regression on the raw features. Overall, the model shows moderate explanatory power, with the temperature-related variables carrying most of the predictive weight.
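As noted above, the ramp membership function with each column's minimum and maximum as breakpoints is numerically the same as min-max scaling, so the fuzzy fit can be cross-checked against an ordinary linear regression on min-max-normalized features; a sketch using the summer X and y arrays already in memory:
from sklearn.preprocessing import MinMaxScaler
# Sketch: min-max scaling reproduces the ramp membership values (constant
# columns aside), so this fit should give essentially the same R-squared as
# the fuzzy model above.
mm = MinMaxScaler()
X_mm = mm.fit_transform(X)
check_model = LinearRegression().fit(X_mm, y)
print("R-squared on min-max scaled features:", check_model.score(X_mm, y))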