In [330]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from scipy.stats import linregress
import warnings
warnings.filterwarnings("ignore")
In [331]:
df = pd.read_csv("Demand_All.csv")
In [332]:
df.head()
Out[332]:
UTC Eastern Year OrdDay OrdHr Weekday Month Day HourEnd DST Concord Greenwood NewRiver KingsMountain Winterville Total Peak
0 2015-01-01 06:00:00+00:00 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 0 97098.0 33667.0 27333 14393.0 8060.0 180550 0
1 2015-01-01 07:00:00+00:00 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 0 96620.0 33223.0 27048 14306.0 8307.0 179504 0
2 2015-01-01 08:00:00+00:00 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 0 96084.0 32849.0 27008 14270.0 8174.0 178385 0
3 2015-01-01 09:00:00+00:00 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 0 95829.0 33053.0 26821 14285.0 8333.0 178320 0
4 2015-01-01 10:00:00+00:00 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 0 98489.0 33686.0 26924 14450.0 8396.0 181946 0
In [333]:
data_type = df.dtypes
data_type
Out[333]:
UTC               object
Eastern           object
Year               int64
OrdDay             int64
OrdHr              int64
Weekday            int64
Month              int64
Day                int64
HourEnd            int64
DST                int64
Concord          float64
Greenwood        float64
NewRiver           int64
KingsMountain    float64
Winterville      float64
Total              int64
Peak               int64
dtype: object

Checking for Null Values (They start at row 27964 and go to 35065)

In [334]:
nan_count = df.isna().sum()
nan_count
Out[334]:
UTC                 0
Eastern             0
Year                0
OrdDay              0
OrdHr               0
Weekday             0
Month               0
Day                 0
HourEnd             0
DST                 0
Concord          7102
Greenwood        7102
NewRiver            0
KingsMountain    7102
Winterville      7102
Total               0
Peak                0
dtype: int64
In [335]:
# Checking whether each region's distribution is roughly bell-shaped, to see if filling missing values with the mean is appropriate

# Creating a new DataFrame with just 'Year' and 'Concord'
concord_data = df[['Year', 'Concord']]

concord_data.head()
Out[335]:
Year Concord
0 2015 97098.0
1 2015 96620.0
2 2015 96084.0
3 2015 95829.0
4 2015 98489.0
In [336]:
greenwood_data = df[['Year', 'Greenwood']]

greenwood_data.head()
Out[336]:
Year Greenwood
0 2015 33667.0
1 2015 33223.0
2 2015 32849.0
3 2015 33053.0
4 2015 33686.0
In [337]:
KingsMountain_data = df[['Year', 'KingsMountain']]

KingsMountain_data.head()
Out[337]:
Year KingsMountain
0 2015 14393.0
1 2015 14306.0
2 2015 14270.0
3 2015 14285.0
4 2015 14450.0
In [338]:
Winterville_data = df[['Year', 'Winterville']]

Winterville_data.head()
Out[338]:
Year Winterville
0 2015 8060.0
1 2015 8307.0
2 2015 8174.0
3 2015 8333.0
4 2015 8396.0
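Before imputing, a quick look at each region's distribution helps judge whether a single mean is a reasonable fill value. A minimal sketch using the full columns (the bin count is an arbitrary choice):

In [ ]:
# Sketch: histograms of the four regions that contain missing values
fig, axes = plt.subplots(2, 2, figsize = (10, 6))
for ax, col in zip(axes.flatten(), ['Concord', 'Greenwood', 'KingsMountain', 'Winterville']):
    df[col].dropna().hist(bins = 50, ax = ax)
    ax.set_title(col)
plt.tight_layout()
plt.show()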
In [339]:
# Filling in Missing values with the mean
df['Concord'] = df['Concord'].fillna(df['Concord'].mean())
df['Greenwood'] = df['Greenwood'].fillna(df['Greenwood'].mean())
df['KingsMountain'] = df['KingsMountain'].fillna(df['KingsMountain'].mean())
df['Winterville'] = df['Winterville'].fillna(df['Winterville'].mean())
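Because demand is a strongly seasonal time series, a single global mean flattens the daily and yearly shape across the 7,102-hour gap. A hedged alternative (a sketch only, working on a throwaway copy and not applied to df in the rest of this notebook) is to fill each gap with the average for that Month/HourEnd combination:

In [ ]:
# Sketch: fill missing values with the Month/HourEnd group mean instead of the global mean
df_alt = pd.read_csv("Demand_All.csv")
for col in ['Concord', 'Greenwood', 'KingsMountain', 'Winterville']:
    group_means = df_alt.groupby(['Month', 'HourEnd'])[col].transform('mean')
    df_alt[col] = df_alt[col].fillna(group_means)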
In [340]:
nan_count2 = df.isna().sum()
nan_count2
Out[340]:
UTC              0
Eastern          0
Year             0
OrdDay           0
OrdHr            0
Weekday          0
Month            0
Day              0
HourEnd          0
DST              0
Concord          0
Greenwood        0
NewRiver         0
KingsMountain    0
Winterville      0
Total            0
Peak             0
dtype: int64
In [341]:
data_type = df.dtypes
data_type
Out[341]:
UTC               object
Eastern           object
Year               int64
OrdDay             int64
OrdHr              int64
Weekday            int64
Month              int64
Day                int64
HourEnd            int64
DST                int64
Concord          float64
Greenwood        float64
NewRiver           int64
KingsMountain    float64
Winterville      float64
Total              int64
Peak               int64
dtype: object
In [342]:
df['Concord'] = df['Concord'].astype('int64')
df['Greenwood'] = df['Greenwood'].astype('int64')
df['KingsMountain'] = df['KingsMountain'].astype('int64')
df['Winterville'] = df['Winterville'].astype('int64')
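One caveat: the mean-filled values are floats, and astype('int64') truncates the decimal part rather than rounding it. If that distinction matters, a rounded cast (a sketch, written as an alternative to the cell above) avoids it:

In [ ]:
# Sketch: round before casting so mean-filled values are rounded, not truncated
for col in ['Concord', 'Greenwood', 'KingsMountain', 'Winterville']:
    df[col] = df[col].round().astype('int64')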
In [343]:
data_type = df.dtypes
data_type
Out[343]:
UTC              object
Eastern          object
Year              int64
OrdDay            int64
OrdHr             int64
Weekday           int64
Month             int64
Day               int64
HourEnd           int64
DST               int64
Concord           int64
Greenwood         int64
NewRiver          int64
KingsMountain     int64
Winterville       int64
Total             int64
Peak              int64
dtype: object

Discovering which region is the 'Gorilla' (This is the output with the missing values filled in with the mean)

In [344]:
Concord_sum = df['Concord'].sum()
print("Concord total power usage:", Concord_sum)

Greenwood_sum = df['Greenwood'].sum()
print("Greenwood total power usage: ", Greenwood_sum)

NewRiver_sum = df['NewRiver'].sum()
print("NewRiver total power usage: ", NewRiver_sum)

KingsMountain_sum = df['KingsMountain'].sum()
print("KingsMountain total power usage: ", KingsMountain_sum)

Winterville_sum = df['Winterville'].sum()
print("Winterville total power usage: ", Winterville_sum)
Concord total power usage: 5587949911
Greenwood total power usage:  1873460604
NewRiver total power usage:  1266075128
KingsMountain total power usage:  902715964
Winterville total power usage:  324750027
In [345]:
total_power_by_region = df[['Concord', 'Greenwood', 'NewRiver', 'KingsMountain', 'Winterville']].sum()

total_power_df = total_power_by_region.reset_index(name='Total Power')

sns.barplot(x='index', y='Total Power', data=total_power_df)
Out[345]:
<AxesSubplot:xlabel='index', ylabel='Total Power'>
In [346]:
totals_power_df = pd.DataFrame({
    "Region": ['Concord', 'Greenwood', 'NewRiver', 'KingsMountain', 'Winterville'],
    "Total Power": [Concord_sum, Greenwood_sum, NewRiver_sum, KingsMountain_sum, Winterville_sum]})

totals_power_df_sorted = totals_power_df.sort_values(by = 'Total Power', ascending = False)

totals_power_df_sorted
Out[346]:
Region Total Power
0 Concord 5587949911
1 Greenwood 1873460604
2 NewRiver 1266075128
3 KingsMountain 902715964
4 Winterville 324750027
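To quantify how dominant the 'gorilla' is, each region's share of the combined five-region total can be computed from the sums above (a quick sketch):

In [ ]:
# Sketch: each region's share of the combined five-region total, in percent
region_share = (total_power_by_region / total_power_by_region.sum() * 100).round(1)
region_share.sort_values(ascending = False)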

Use 'HourEnd' and 'Month' to see which hour of January had the most power usage by NewRiver in 2015

In [347]:
# All of January Data

Jan_df = df[(df['Month'] == 1)]
Jan_df
Out[347]:
UTC Eastern Year OrdDay OrdHr Weekday Month Day HourEnd DST Concord Greenwood NewRiver KingsMountain Winterville Total Peak
0 2015-01-01 06:00:00+00:00 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 0 97098 33667 27333 14393 8060 180550 0
1 2015-01-01 07:00:00+00:00 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 0 96620 33223 27048 14306 8307 179504 0
2 2015-01-01 08:00:00+00:00 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 0 96084 32849 27008 14270 8174 178385 0
3 2015-01-01 09:00:00+00:00 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 0 95829 33053 26821 14285 8333 178320 0
4 2015-01-01 10:00:00+00:00 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 0 98489 33686 26924 14450 8396 181946 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44563 2020-02-01 01:00:00+00:00 2020-01-31 20:00:00-05:00 2020 31 740 5 1 31 20 0 113208 42034 31668 20362 7303 214574 0
44564 2020-02-01 02:00:00+00:00 2020-01-31 21:00:00-05:00 2020 31 741 5 1 31 21 0 109198 40802 30294 19678 7341 207313 0
44565 2020-02-01 03:00:00+00:00 2020-01-31 22:00:00-05:00 2020 31 742 5 1 31 22 0 103821 39485 28952 18814 7226 198297 0
44566 2020-02-01 04:00:00+00:00 2020-01-31 23:00:00-05:00 2020 31 743 5 1 31 23 0 97146 37034 27495 17582 6911 186169 0
44567 2020-02-01 05:00:00+00:00 2020-02-01 00:00:00-05:00 2020 31 744 5 1 31 24 0 91061 35146 26037 15977 6339 174559 0

4464 rows × 17 columns

January 2015 Total Power Usage

In [348]:
Jan_2015 = df[(df['Month'] == 1) & (df['Year'] == 2015)]

Jan_2015_NewRiver_Total = Jan_2015.groupby('HourEnd')['NewRiver'].sum()

Jan_2015_NewRiver_Total
Out[348]:
HourEnd
1      830939
2      806077
3      791975
4      788556
5      800197
6      833628
7      899553
8      968696
9     1005189
10    1019562
11    1020847
12    1005319
13     981670
14     963095
15     947010
16     939390
17     939369
18     974770
19    1012672
20    1007196
21     988856
22     954404
23     911466
24     866841
Name: NewRiver, dtype: int64
In [349]:
df_result_Jan_2015 = Jan_2015_NewRiver_Total.reset_index(name ='TotalPowerUsage')

#df_result_Jan_2015

df_Jan_2015_sorted = df_result_Jan_2015.sort_values(by = 'TotalPowerUsage', ascending = False)
df_Jan_2015_sorted
Out[349]:
HourEnd TotalPowerUsage
10 11 1020847
9 10 1019562
18 19 1012672
19 20 1007196
11 12 1005319
8 9 1005189
20 21 988856
12 13 981670
17 18 974770
7 8 968696
13 14 963095
21 22 954404
14 15 947010
15 16 939390
16 17 939369
22 23 911466
6 7 899553
23 24 866841
5 6 833628
0 1 830939
1 2 806077
4 5 800197
2 3 791975
3 4 788556

January Hourly Total Power Usage

In [350]:
Jan_NewRiver = df[(df['Month'] == 1)]

Jan_NewRiver_Total = Jan_NewRiver.groupby('HourEnd')['NewRiver'].sum()

Jan_NewRiver_Total
Out[350]:
HourEnd
1     4877637
2     4741598
3     4671303
4     4653754
5     4726696
6     4917476
7     5327374
8     5719311
9     5939928
10    6032503
11    6041263
12    5954354
13    5846188
14    5733195
15    5631853
16    5566556
17    5578212
18    5767909
19    5981854
20    5941348
21    5834807
22    5631257
23    5366839
24    5109415
Name: NewRiver, dtype: int64
In [351]:
## Total electricity usage for each hour in January

df_result_Jan = Jan_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_Jan_sorted = df_result_Jan.sort_values(by = 'TotalPowerUsage', ascending = False)

df_Jan_sorted
Out[351]:
HourEnd TotalPowerUsage
10 11 6041263
9 10 6032503
18 19 5981854
11 12 5954354
19 20 5941348
8 9 5939928
12 13 5846188
20 21 5834807
17 18 5767909
13 14 5733195
7 8 5719311
14 15 5631853
21 22 5631257
16 17 5578212
15 16 5566556
22 23 5366839
6 7 5327374
23 24 5109415
5 6 4917476
0 1 4877637
1 2 4741598
4 5 4726696
2 3 4671303
3 4 4653754

Average electricity usage for each hour in January in descending order

In [352]:
Jan_NewRiver_Avg = Jan_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Jan = Jan_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Jan_sorted = df_result_Avg_Jan.sort_values(by ='AvgTotalPowerUsage', ascending = False)

df_Avg_Jan_sorted
Out[352]:
HourEnd AvgTotalPowerUsage
10 11 32479.91
9 10 32432.81
18 19 32160.51
11 12 32012.66
19 20 31942.73
8 9 31935.10
12 13 31431.12
20 21 31369.93
17 18 31010.26
13 14 30823.63
7 8 30748.98
14 15 30278.78
21 22 30275.58
16 17 29990.39
15 16 29927.72
22 23 28853.97
6 7 28641.80
23 24 27469.97
5 6 26438.04
0 1 26223.85
1 2 25492.46
4 5 25412.34
2 3 25114.53
3 4 25020.18
In [353]:
# Added a 'Time Frame' column to make the hour labels more intuitive - each value is the average power usage during the hour ending at that time
# Ex: HourEnd 1 = 26223.85 --> from midnight to 1:00 AM


df_Avg_Jan_sorted['Time Frame'] = df_Avg_Jan_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Jan_sorted['Time Frame'] = pd.to_datetime(df_Avg_Jan_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Jan_sorted
Out[353]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 32479.91 11:00:00
9 10 32432.81 10:00:00
18 19 32160.51 19:00:00
11 12 32012.66 12:00:00
19 20 31942.73 20:00:00
8 9 31935.10 09:00:00
12 13 31431.12 13:00:00
20 21 31369.93 21:00:00
17 18 31010.26 18:00:00
13 14 30823.63 14:00:00
7 8 30748.98 08:00:00
14 15 30278.78 15:00:00
21 22 30275.58 22:00:00
16 17 29990.39 17:00:00
15 16 29927.72 16:00:00
22 23 28853.97 23:00:00
6 7 28641.80 07:00:00
23 24 27469.97 00:00:00
5 6 26438.04 06:00:00
0 1 26223.85 01:00:00
1 2 25492.46 02:00:00
4 5 25412.34 05:00:00
2 3 25114.53 03:00:00
3 4 25020.18 04:00:00
In [354]:
Jan_max = df_Avg_Jan_sorted['AvgTotalPowerUsage'].idxmax()
Jan_max_df = df_Avg_Jan_sorted.loc[[Jan_max]]

Jan_max_df
Out[354]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 32479.91 11:00:00
In [355]:
line_Jan_sorted = df_result_Avg_Jan.sort_values(by = 'HourEnd', ascending = True)
line_Jan_sorted.set_index('HourEnd', inplace = True)

line_plot_Jan = df_result_Avg_Jan.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')
#line_plot_Jan = line_Jan_sorted.plot.line(y='AvgTotalPowerUsage')

plt.title("January Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()
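The same total / average / Time Frame / plot pattern is repeated for each remaining month below. For reference, the per-hour monthly averages can also be produced in a single pass (a compact sketch; the per-month cells are kept as-is):

In [ ]:
# Sketch: average NewRiver usage by HourEnd for every month at once
monthly_avg = df.groupby(['Month', 'HourEnd'])['NewRiver'].mean().round(2).unstack(level = 'Month')

fig, ax = plt.subplots(figsize = (9, 6))
monthly_avg.plot(ax = ax)
ax.set_title("Avg. Power Curve by Month")
ax.set_xlabel("End of the hour (Military Time)")
ax.grid(True)
plt.show()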

February Total and Average

In [356]:
Feb_NewRiver = df[(df['Month'] == 2)]

Feb_NewRiver_Total = Feb_NewRiver.groupby('HourEnd')['NewRiver'].sum()

Feb_NewRiver_Total
Out[356]:
HourEnd
1     4230578
2     4069790
3     3984407
4     3958556
5     4015610
6     4199592
7     4602717
8     4952400
9     5210838
10    5312829
11    5347523
12    5290742
13    5211663
14    5118994
15    5030838
16    4967112
17    4951879
18    5019501
19    5247533
20    5266013
21    5159570
22    4972692
23    4721096
24    4448769
Name: NewRiver, dtype: int64
In [357]:
df_result_Feb = Feb_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Feb_sorted = df_result_Feb.sort_values(by = 'TotalPowerUsage', ascending = False)

df_Feb_sorted
Out[357]:
HourEnd TotalPowerUsage
10 11 5347523
9 10 5312829
11 12 5290742
19 20 5266013
18 19 5247533
12 13 5211663
8 9 5210838
20 21 5159570
13 14 5118994
14 15 5030838
17 18 5019501
21 22 4972692
15 16 4967112
7 8 4952400
16 17 4951879
22 23 4721096
6 7 4602717
23 24 4448769
0 1 4230578
5 6 4199592
1 2 4069790
4 5 4015610
2 3 3984407
3 4 3958556
In [358]:
Feb_NewRiver_Avg = Feb_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Feb = Feb_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Feb_sorted = df_result_Avg_Feb.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_Feb_sorted
Out[358]:
HourEnd AvgTotalPowerUsage
10 11 31456.02
9 10 31251.94
11 12 31122.01
19 20 30976.55
18 19 30867.84
12 13 30656.84
8 9 30651.99
20 21 30350.41
13 14 30111.73
14 15 29593.16
17 18 29526.48
21 22 29251.13
15 16 29218.31
7 8 29131.76
16 17 29128.70
22 23 27771.15
6 7 27074.81
23 24 26169.23
0 1 24885.75
5 6 24703.48
1 2 23939.94
4 5 23621.24
2 3 23437.69
3 4 23285.62
In [359]:
df_Avg_Feb_sorted['Time Frame'] = df_Avg_Feb_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Feb_sorted['Time Frame'] = pd.to_datetime(df_Avg_Feb_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Feb_sorted
Out[359]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 31456.02 11:00:00
9 10 31251.94 10:00:00
11 12 31122.01 12:00:00
19 20 30976.55 20:00:00
18 19 30867.84 19:00:00
12 13 30656.84 13:00:00
8 9 30651.99 09:00:00
20 21 30350.41 21:00:00
13 14 30111.73 14:00:00
14 15 29593.16 15:00:00
17 18 29526.48 18:00:00
21 22 29251.13 22:00:00
15 16 29218.31 16:00:00
7 8 29131.76 08:00:00
16 17 29128.70 17:00:00
22 23 27771.15 23:00:00
6 7 27074.81 07:00:00
23 24 26169.23 00:00:00
0 1 24885.75 01:00:00
5 6 24703.48 06:00:00
1 2 23939.94 02:00:00
4 5 23621.24 05:00:00
2 3 23437.69 03:00:00
3 4 23285.62 04:00:00
In [360]:
Feb_max = df_Avg_Feb_sorted['AvgTotalPowerUsage'].idxmax()
Feb_max_df = df_Avg_Feb_sorted.loc[[Feb_max]]

Feb_max_df
Out[360]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 31456.02 11:00:00
In [361]:
line_Feb_sorted = df_result_Avg_Feb.sort_values(by = 'HourEnd', ascending = True)
line_Feb_sorted.set_index('HourEnd', inplace = True)

line_plot_Feb = df_result_Avg_Feb.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("February Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

March Total and Average

In [362]:
March_NewRiver = df[(df['Month'] == 3)]

March_NewRiver_Total = March_NewRiver.groupby('HourEnd')['NewRiver'].sum()

df_result_March = March_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_March_sorted = df_result_March.sort_values(by ='TotalPowerUsage', ascending = False)

df_March_sorted
Out[362]:
HourEnd TotalPowerUsage
10 11 5092940
11 12 5059432
9 10 5043124
12 13 4981787
8 9 4925356
13 14 4892627
20 21 4825025
14 15 4799177
19 20 4753232
15 16 4713838
7 8 4688765
18 19 4652419
16 17 4652194
21 22 4639607
17 18 4609341
22 23 4381901
6 7 4334765
23 24 4118565
0 1 3922978
5 6 3918544
4 5 3734432
2 3 3697771
3 4 3676591
1 2 3652509
In [363]:
March_NewRiver_Avg = March_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_March = March_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_March_sorted = df_result_Avg_March.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_March_sorted
Out[363]:
HourEnd AvgTotalPowerUsage
10 11 27381.40
11 12 27201.25
9 10 27113.57
12 13 26783.80
8 9 26480.41
13 14 26304.45
20 21 25940.99
14 15 25802.03
19 20 25555.01
15 16 25343.22
7 8 25208.41
18 19 25013.01
16 17 25011.80
21 22 24944.12
17 18 24781.40
22 23 23558.61
6 7 23305.19
23 24 22142.82
0 1 21091.28
5 6 21067.44
1 2 20291.72
4 5 20077.59
2 3 19880.49
3 4 19766.62
In [364]:
df_Avg_March_sorted['Time Frame'] = df_Avg_March_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_March_sorted['Time Frame'] = pd.to_datetime(df_Avg_March_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_March_sorted
Out[364]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 27381.40 11:00:00
11 12 27201.25 12:00:00
9 10 27113.57 10:00:00
12 13 26783.80 13:00:00
8 9 26480.41 09:00:00
13 14 26304.45 14:00:00
20 21 25940.99 21:00:00
14 15 25802.03 15:00:00
19 20 25555.01 20:00:00
15 16 25343.22 16:00:00
7 8 25208.41 08:00:00
18 19 25013.01 19:00:00
16 17 25011.80 17:00:00
21 22 24944.12 22:00:00
17 18 24781.40 18:00:00
22 23 23558.61 23:00:00
6 7 23305.19 07:00:00
23 24 22142.82 00:00:00
0 1 21091.28 01:00:00
5 6 21067.44 06:00:00
1 2 20291.72 02:00:00
4 5 20077.59 05:00:00
2 3 19880.49 03:00:00
3 4 19766.62 04:00:00
In [365]:
March_max = df_Avg_March_sorted['AvgTotalPowerUsage'].idxmax()
March_max_df = df_Avg_March_sorted.loc[[March_max]]

March_max_df
Out[365]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 27381.4 11:00:00
In [366]:
line_March_sorted = df_result_Avg_March.sort_values(by = 'HourEnd', ascending = True)
line_March_sorted.set_index('HourEnd', inplace=True)

line_plot_March = df_result_Avg_March.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("March Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

April Total and Average

In [367]:
April_NewRiver = df[(df['Month'] == 4)]

April_NewRiver_Total = April_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# April_NewRiver_Total

df_result_April = April_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_April_sorted = df_result_April.sort_values(by = 'TotalPowerUsage', ascending = False)

df_April_sorted
Out[367]:
HourEnd TotalPowerUsage
11 12 4435441
12 13 4420712
10 11 4406145
13 14 4386029
14 15 4351748
15 16 4295107
9 10 4279043
20 21 4256999
16 17 4239153
17 18 4168634
18 19 4122328
19 20 4114828
21 22 4102228
8 9 4094587
22 23 3822150
7 8 3793255
23 24 3532353
6 7 3521216
0 1 3294988
5 6 3179018
1 2 3133484
2 3 3030580
4 5 3029341
3 4 2989220
In [368]:
April_NewRiver_Avg = April_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_April = April_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_April_sorted = df_result_Avg_April.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_April_sorted
Out[368]:
HourEnd AvgTotalPowerUsage
11 12 24641.34
12 13 24559.51
10 11 24478.58
13 14 24366.83
14 15 24176.38
15 16 23861.71
9 10 23772.46
20 21 23649.99
16 17 23550.85
17 18 23159.08
18 19 22901.82
19 20 22860.16
21 22 22790.16
8 9 22747.71
22 23 21234.17
7 8 21073.64
23 24 19624.18
6 7 19562.31
0 1 18305.49
5 6 17661.21
1 2 17408.24
2 3 16836.56
4 5 16829.67
3 4 16606.78
In [369]:
df_Avg_April_sorted['Time Frame'] = df_Avg_April_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_April_sorted['Time Frame'] = pd.to_datetime(df_Avg_April_sorted['Time Frame'], format ='%H:%M').dt.time

df_Avg_April_sorted
Out[369]:
HourEnd AvgTotalPowerUsage Time Frame
11 12 24641.34 12:00:00
12 13 24559.51 13:00:00
10 11 24478.58 11:00:00
13 14 24366.83 14:00:00
14 15 24176.38 15:00:00
15 16 23861.71 16:00:00
9 10 23772.46 10:00:00
20 21 23649.99 21:00:00
16 17 23550.85 17:00:00
17 18 23159.08 18:00:00
18 19 22901.82 19:00:00
19 20 22860.16 20:00:00
21 22 22790.16 22:00:00
8 9 22747.71 09:00:00
22 23 21234.17 23:00:00
7 8 21073.64 08:00:00
23 24 19624.18 00:00:00
6 7 19562.31 07:00:00
0 1 18305.49 01:00:00
5 6 17661.21 06:00:00
1 2 17408.24 02:00:00
2 3 16836.56 03:00:00
4 5 16829.67 05:00:00
3 4 16606.78 04:00:00
In [370]:
April_max = df_Avg_April_sorted['AvgTotalPowerUsage'].idxmax()
April_max_df = df_Avg_April_sorted.loc[[April_max]]

April_max_df
Out[370]:
HourEnd AvgTotalPowerUsage Time Frame
11 12 24641.34 12:00:00
In [371]:
line_April_sorted = df_result_Avg_April.sort_values(by = 'HourEnd', ascending = True)
line_April_sorted.set_index('HourEnd', inplace = True)

line_plot_April = df_result_Avg_April.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("April Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

May Total and Average

In [372]:
May_NewRiver = df[(df['Month'] == 5)]

May_NewRiver_Total = May_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# May_NewRiver_Total

df_result_May = May_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_May_sorted = df_result_May.sort_values(by = 'TotalPowerUsage', ascending = False)

df_May_sorted
Out[372]:
HourEnd TotalPowerUsage
13 14 4537118
14 15 4536494
12 13 4524032
15 16 4500636
11 12 4485113
16 17 4450890
10 11 4372849
17 18 4331935
18 19 4222954
9 10 4156285
19 20 4122795
20 21 4121261
21 22 4006964
8 9 3867544
22 23 3710501
7 8 3514059
23 24 3427684
6 7 3203711
0 1 3194444
1 2 3032832
5 6 2989258
2 3 2924384
4 5 2877959
3 4 2867289
In [373]:
May_NewRiver_Avg = May_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_May = May_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_May_sorted = df_result_Avg_May.sort_values(by ='AvgTotalPowerUsage', ascending = False)

df_Avg_May_sorted
Out[373]:
HourEnd AvgTotalPowerUsage
13 14 24393.11
14 15 24389.75
12 13 24322.75
15 16 24196.97
11 12 24113.51
16 17 23929.52
10 11 23509.94
17 18 23289.97
18 19 22704.05
9 10 22345.62
19 20 22165.56
20 21 22157.32
21 22 21542.82
8 9 20793.25
22 23 19948.93
7 8 18892.79
23 24 18428.41
6 7 17224.25
0 1 17174.43
1 2 16305.55
5 6 16071.28
2 3 15722.49
4 5 15472.90
3 4 15415.53
In [374]:
df_Avg_May_sorted['Time Frame'] = df_Avg_May_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_May_sorted['Time Frame'] = pd.to_datetime(df_Avg_May_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_May_sorted
Out[374]:
HourEnd AvgTotalPowerUsage Time Frame
13 14 24393.11 14:00:00
14 15 24389.75 15:00:00
12 13 24322.75 13:00:00
15 16 24196.97 16:00:00
11 12 24113.51 12:00:00
16 17 23929.52 17:00:00
10 11 23509.94 11:00:00
17 18 23289.97 18:00:00
18 19 22704.05 19:00:00
9 10 22345.62 10:00:00
19 20 22165.56 20:00:00
20 21 22157.32 21:00:00
21 22 21542.82 22:00:00
8 9 20793.25 09:00:00
22 23 19948.93 23:00:00
7 8 18892.79 08:00:00
23 24 18428.41 00:00:00
6 7 17224.25 07:00:00
0 1 17174.43 01:00:00
1 2 16305.55 02:00:00
5 6 16071.28 06:00:00
2 3 15722.49 03:00:00
4 5 15472.90 05:00:00
3 4 15415.53 04:00:00
In [375]:
May_max = df_Avg_May_sorted['AvgTotalPowerUsage'].idxmax()
May_max_df = df_Avg_May_sorted.loc[[May_max]]

May_max_df
Out[375]:
HourEnd AvgTotalPowerUsage Time Frame
13 14 24393.11 14:00:00
In [376]:
line_May_sorted = df_result_Avg_May.sort_values(by = 'HourEnd', ascending = True)
line_May_sorted.set_index('HourEnd', inplace=True)

line_plot_May = df_result_Avg_May.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("May Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

June Total and Average

In [377]:
June_NewRiver = df[(df['Month'] == 6)]

June_NewRiver_Total = June_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# June_NewRiver_Total

df_result_June = June_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_June_sorted = df_result_June.sort_values(by = 'TotalPowerUsage', ascending = False)

df_June_sorted
Out[377]:
HourEnd TotalPowerUsage
14 15 4775708
15 16 4751685
13 14 4750091
16 17 4713692
12 13 4710142
11 12 4633166
17 18 4580174
10 11 4475726
18 19 4434029
19 20 4284271
9 10 4225169
20 21 4170072
21 22 4090353
8 9 3897867
22 23 3781411
7 8 3495794
23 24 3475879
0 1 3201895
6 7 3137094
1 2 3030709
5 6 2963153
2 3 2917942
4 5 2866555
3 4 2858986
In [378]:
June_NewRiver_Avg =June_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_June = June_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_June_sorted = df_result_Avg_June.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_June_sorted
Out[378]:
HourEnd AvgTotalPowerUsage
14 15 26531.71
15 16 26398.25
13 14 26389.39
16 17 26187.18
12 13 26167.46
11 12 25739.81
17 18 25445.41
10 11 24865.14
18 19 24633.49
19 20 23801.51
9 10 23473.16
20 21 23167.07
21 22 22724.18
8 9 21654.82
22 23 21007.84
7 8 19421.08
23 24 19310.44
0 1 17788.31
6 7 17428.30
1 2 16837.27
5 6 16461.96
2 3 16210.79
4 5 15925.31
3 4 15883.26
In [379]:
df_Avg_June_sorted['Time Frame'] = df_Avg_June_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_June_sorted['Time Frame'] = pd.to_datetime(df_Avg_June_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_June_sorted
Out[379]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 26531.71 15:00:00
15 16 26398.25 16:00:00
13 14 26389.39 14:00:00
16 17 26187.18 17:00:00
12 13 26167.46 13:00:00
11 12 25739.81 12:00:00
17 18 25445.41 18:00:00
10 11 24865.14 11:00:00
18 19 24633.49 19:00:00
19 20 23801.51 20:00:00
9 10 23473.16 10:00:00
20 21 23167.07 21:00:00
21 22 22724.18 22:00:00
8 9 21654.82 09:00:00
22 23 21007.84 23:00:00
7 8 19421.08 08:00:00
23 24 19310.44 00:00:00
0 1 17788.31 01:00:00
6 7 17428.30 07:00:00
1 2 16837.27 02:00:00
5 6 16461.96 06:00:00
2 3 16210.79 03:00:00
4 5 15925.31 05:00:00
3 4 15883.26 04:00:00
In [380]:
June_max = df_Avg_June_sorted['AvgTotalPowerUsage'].idxmax()
June_max_df = df_Avg_June_sorted.loc[[June_max]]

June_max_df
Out[380]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 26531.71 15:00:00
In [381]:
line_June_sorted = df_result_Avg_June.sort_values(by = 'HourEnd', ascending = True)
line_June_sorted.set_index('HourEnd', inplace = True)

line_plot_June = df_result_Avg_June.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("June Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

July Total and Average

In [382]:
July_NewRiver = df[(df['Month'] == 7)]

July_NewRiver_Total = July_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# July_NewRiver_Total

df_result_July = July_NewRiver_Total.reset_index(name = 'TotalPowerUsage')
df_July_sorted = df_result_July.sort_values(by ='TotalPowerUsage', ascending = False)

df_July_sorted
Out[382]:
HourEnd TotalPowerUsage
14 15 5279510
15 16 5255347
13 14 5242181
16 17 5209818
12 13 5186374
11 12 5085027
17 18 5072601
18 19 4915637
10 11 4898724
19 20 4725883
9 10 4598746
20 21 4584799
21 22 4478258
8 9 4219758
22 23 4141986
23 24 3812131
7 8 3789518
0 1 3522968
6 7 3464410
1 2 3343746
5 6 3248298
2 3 3220884
3 4 3148959
4 5 3146936
In [383]:
July_NewRiver_Avg =July_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_July = July_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_July_sorted = df_result_Avg_July.sort_values(by ='AvgTotalPowerUsage', ascending = False)

df_Avg_July_sorted
Out[383]:
HourEnd AvgTotalPowerUsage
14 15 28384.46
15 16 28254.55
13 14 28183.77
16 17 28009.77
12 13 27883.73
11 12 27338.85
17 18 27272.05
18 19 26428.16
10 11 26337.23
19 20 25407.97
9 10 24724.44
20 21 24649.46
21 22 24076.66
8 9 22686.87
22 23 22268.74
23 24 20495.33
7 8 20373.75
0 1 18940.69
6 7 18625.86
1 2 17977.13
5 6 17463.97
2 3 17316.58
3 4 16929.89
4 5 16919.01
In [384]:
df_Avg_July_sorted['Time Frame'] = df_Avg_July_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_July_sorted['Time Frame'] = pd.to_datetime(df_Avg_July_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_July_sorted
Out[384]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 28384.46 15:00:00
15 16 28254.55 16:00:00
13 14 28183.77 14:00:00
16 17 28009.77 17:00:00
12 13 27883.73 13:00:00
11 12 27338.85 12:00:00
17 18 27272.05 18:00:00
18 19 26428.16 19:00:00
10 11 26337.23 11:00:00
19 20 25407.97 20:00:00
9 10 24724.44 10:00:00
20 21 24649.46 21:00:00
21 22 24076.66 22:00:00
8 9 22686.87 09:00:00
22 23 22268.74 23:00:00
23 24 20495.33 00:00:00
7 8 20373.75 08:00:00
0 1 18940.69 01:00:00
6 7 18625.86 07:00:00
1 2 17977.13 02:00:00
5 6 17463.97 06:00:00
2 3 17316.58 03:00:00
3 4 16929.89 04:00:00
4 5 16919.01 05:00:00
In [385]:
July_max = df_Avg_July_sorted['AvgTotalPowerUsage'].idxmax()
July_max_df = df_Avg_July_sorted.loc[[July_max]]

July_max_df
Out[385]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 28384.46 15:00:00
In [386]:
line_July_sorted = df_result_Avg_July.sort_values(by = 'HourEnd', ascending = True)
line_July_sorted.set_index('HourEnd', inplace = True)

line_plot_July = df_result_Avg_July.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("July Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

August Total and Average

In [387]:
August_NewRiver = df[(df['Month'] == 8)]

August_NewRiver_Total = August_NewRiver.groupby('HourEnd')['NewRiver'].sum()


df_result_August = August_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_August_sorted = df_result_August.sort_values(by ='TotalPowerUsage', ascending = False)

df_August_sorted
Out[387]:
HourEnd TotalPowerUsage
14 15 5455978
15 16 5434155
13 14 5413379
16 17 5384494
12 13 5343976
17 18 5245833
11 12 5231457
18 19 5104080
10 11 5033108
19 20 4945743
20 21 4906699
9 10 4718279
21 22 4690815
8 9 4334715
22 23 4318253
23 24 3942678
7 8 3890285
0 1 3610968
6 7 3581665
1 2 3397087
5 6 3280723
2 3 3257721
3 4 3178692
4 5 3176919
In [388]:
August_NewRiver_Avg =August_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_August = August_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_August_sorted = df_result_Avg_August.sort_values(by = 'AvgTotalPowerUsage', ascending=False)

df_Avg_August_sorted
Out[388]:
HourEnd AvgTotalPowerUsage
14 15 29333.22
15 16 29215.89
13 14 29104.19
16 17 28948.89
12 13 28731.05
17 18 28203.40
11 12 28126.11
18 19 27441.29
10 11 27059.72
19 20 26590.02
20 21 26380.10
9 10 25367.09
21 22 25219.44
8 9 23304.92
22 23 23216.41
23 24 21197.19
7 8 20915.51
0 1 19413.81
6 7 19256.26
1 2 18263.91
5 6 17638.30
2 3 17514.63
3 4 17089.74
4 5 17080.21
In [389]:
df_Avg_August_sorted['Time Frame'] = df_Avg_August_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_August_sorted['Time Frame'] = pd.to_datetime(df_Avg_August_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_August_sorted
Out[389]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 29333.22 15:00:00
15 16 29215.89 16:00:00
13 14 29104.19 14:00:00
16 17 28948.89 17:00:00
12 13 28731.05 13:00:00
17 18 28203.40 18:00:00
11 12 28126.11 12:00:00
18 19 27441.29 19:00:00
10 11 27059.72 11:00:00
19 20 26590.02 20:00:00
20 21 26380.10 21:00:00
9 10 25367.09 10:00:00
21 22 25219.44 22:00:00
8 9 23304.92 09:00:00
22 23 23216.41 23:00:00
23 24 21197.19 00:00:00
7 8 20915.51 08:00:00
0 1 19413.81 01:00:00
6 7 19256.26 07:00:00
1 2 18263.91 02:00:00
5 6 17638.30 06:00:00
2 3 17514.63 03:00:00
3 4 17089.74 04:00:00
4 5 17080.21 05:00:00
In [390]:
August_max = df_Avg_August_sorted['AvgTotalPowerUsage'].idxmax()
August_max_df = df_Avg_August_sorted.loc[[August_max]]

August_max_df
Out[390]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 29333.22 15:00:00
In [391]:
line_August_sorted = df_result_Avg_August.sort_values(by = 'HourEnd', ascending = True)
line_August_sorted.set_index('HourEnd', inplace = True)

line_plot_August = df_result_Avg_August.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("August Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

September Total and Average

In [392]:
Sep_NewRiver = df[(df['Month'] == 9)]

Sep_NewRiver_Total = Sep_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# Sep_NewRiver_Total

df_result_Sep = Sep_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Sep_sorted = df_result_Sep.sort_values(by = 'TotalPowerUsage', ascending = False)

df_Sep_sorted
Out[392]:
HourEnd TotalPowerUsage
14 15 5154458
15 16 5144799
13 14 5118838
16 17 5104255
12 13 5068615
17 18 4989817
11 12 4969910
18 19 4875350
19 20 4835675
20 21 4807395
10 11 4781701
21 22 4511835
9 10 4481845
22 23 4153896
8 9 4133731
23 24 3791515
7 8 3763669
0 1 3484119
6 7 3458690
1 2 3254355
5 6 3128533
2 3 3109610
3 4 3025872
4 5 3019986
In [393]:
Sep_NewRiver_Avg =Sep_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Sep = Sep_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Sep_sorted = df_result_Avg_Sep.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_Sep_sorted
Out[393]:
HourEnd AvgTotalPowerUsage
14 15 28635.88
15 16 28582.22
13 14 28437.99
16 17 28356.97
12 13 28158.97
17 18 27721.21
11 12 27610.61
18 19 27085.28
19 20 26864.86
20 21 26707.75
10 11 26565.01
21 22 25065.75
9 10 24899.14
22 23 23077.20
8 9 22965.17
23 24 21063.97
7 8 20909.27
0 1 19356.22
6 7 19214.94
1 2 18079.75
5 6 17380.74
2 3 17275.61
3 4 16810.40
4 5 16777.70
In [394]:
df_Avg_Sep_sorted['Time Frame'] = df_Avg_Sep_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Sep_sorted['Time Frame'] = pd.to_datetime(df_Avg_Sep_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Sep_sorted
Out[394]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 28635.88 15:00:00
15 16 28582.22 16:00:00
13 14 28437.99 14:00:00
16 17 28356.97 17:00:00
12 13 28158.97 13:00:00
17 18 27721.21 18:00:00
11 12 27610.61 12:00:00
18 19 27085.28 19:00:00
19 20 26864.86 20:00:00
20 21 26707.75 21:00:00
10 11 26565.01 11:00:00
21 22 25065.75 22:00:00
9 10 24899.14 10:00:00
22 23 23077.20 23:00:00
8 9 22965.17 09:00:00
23 24 21063.97 00:00:00
7 8 20909.27 08:00:00
0 1 19356.22 01:00:00
6 7 19214.94 07:00:00
1 2 18079.75 02:00:00
5 6 17380.74 06:00:00
2 3 17275.61 03:00:00
3 4 16810.40 04:00:00
4 5 16777.70 05:00:00
In [395]:
Sep_max = df_Avg_Sep_sorted['AvgTotalPowerUsage'].idxmax()
Sep_max_df = df_Avg_Sep_sorted.loc[[Sep_max]]

Sep_max_df
Out[395]:
HourEnd AvgTotalPowerUsage Time Frame
14 15 28635.88 15:00:00
In [396]:
line_Sep_sorted = df_result_Avg_Sep.sort_values(by = 'HourEnd', ascending = True)
line_Sep_sorted.set_index('HourEnd', inplace = True)

line_plot_Sep = df_result_Avg_Sep.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("September Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

October Total and Average

In [397]:
Oct_NewRiver = df[(df['Month'] == 10)]

Oct_NewRiver_Total = Oct_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# Oct_NewRiver_Total

df_result_Oct = Oct_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Oct_sorted = df_result_Oct.sort_values(by ='TotalPowerUsage', ascending = False)

df_Oct_sorted
Out[397]:
HourEnd TotalPowerUsage
12 13 4827029
13 14 4823857
14 15 4810720
11 12 4793565
15 16 4772443
19 20 4766087
16 17 4723308
10 11 4698497
17 18 4646941
18 19 4646055
20 21 4633197
9 10 4499015
21 22 4356992
8 9 4259317
22 23 4029804
7 8 3983510
23 24 3709803
6 7 3590751
0 1 3433807
1 2 3237594
5 6 3214255
2 3 3113159
4 5 3073695
3 4 3051473
In [398]:
Oct_NewRiver_Avg = Oct_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Oct = Oct_NewRiver_Avg.reset_index(name = 'AvgTotalPowerUsage')
df_Avg_Oct_sorted = df_result_Avg_Oct.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_Oct_sorted
Out[398]:
HourEnd AvgTotalPowerUsage
12 13 25951.77
13 14 25934.72
14 15 25864.09
11 12 25771.85
15 16 25658.30
19 20 25624.12
16 17 25394.13
10 11 25260.74
17 18 24983.55
18 19 24978.79
20 21 24909.66
9 10 24188.25
21 22 23424.69
8 9 22899.55
22 23 21665.61
7 8 21416.72
23 24 19945.18
6 7 19305.11
0 1 18461.33
1 2 17406.42
5 6 17280.94
2 3 16737.41
4 5 16525.24
3 4 16405.77
In [399]:
df_Avg_Oct_sorted['Time Frame'] = df_Avg_Oct_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Oct_sorted['Time Frame'] = pd.to_datetime(df_Avg_Oct_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Oct_sorted
Out[399]:
HourEnd AvgTotalPowerUsage Time Frame
12 13 25951.77 13:00:00
13 14 25934.72 14:00:00
14 15 25864.09 15:00:00
11 12 25771.85 12:00:00
15 16 25658.30 16:00:00
19 20 25624.12 20:00:00
16 17 25394.13 17:00:00
10 11 25260.74 11:00:00
17 18 24983.55 18:00:00
18 19 24978.79 19:00:00
20 21 24909.66 21:00:00
9 10 24188.25 10:00:00
21 22 23424.69 22:00:00
8 9 22899.55 09:00:00
22 23 21665.61 23:00:00
7 8 21416.72 08:00:00
23 24 19945.18 00:00:00
6 7 19305.11 07:00:00
0 1 18461.33 01:00:00
1 2 17406.42 02:00:00
5 6 17280.94 06:00:00
2 3 16737.41 03:00:00
4 5 16525.24 05:00:00
3 4 16405.77 04:00:00
In [400]:
Oct_max = df_Avg_Oct_sorted['AvgTotalPowerUsage'].idxmax()
Oct_max_df = df_Avg_Oct_sorted.loc[[Oct_max]]

Oct_max_df
Out[400]:
HourEnd AvgTotalPowerUsage Time Frame
12 13 25951.77 13:00:00
In [401]:
line_Oct_sorted = df_result_Avg_Oct.sort_values(by = 'HourEnd', ascending = True)
line_Oct_sorted.set_index('HourEnd', inplace = True)

line_plot_Oct = df_result_Avg_Oct.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("October Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

November Total and Average

In [402]:
Nov_NewRiver = df[(df['Month'] == 11)]

Nov_NewRiver_Total = Nov_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# Nov_NewRiver_Total

df_result_Nov = Nov_NewRiver_Total.reset_index(name ='TotalPowerUsage')
df_Nov_sorted = df_result_Nov.sort_values(by = 'TotalPowerUsage', ascending = False)

df_Nov_sorted
Out[402]:
HourEnd TotalPowerUsage
18 19 4912292
10 11 4872718
19 20 4838577
11 12 4833155
9 10 4818854
17 18 4811326
12 13 4769385
20 21 4712320
8 9 4699467
13 14 4695477
14 15 4631004
16 17 4610766
15 16 4590451
21 22 4520246
7 8 4453251
22 23 4271181
6 7 4148493
23 24 4014141
0 1 3911201
5 6 3776155
1 2 3653773
4 5 3602723
2 3 3571408
3 4 3544014
In [403]:
Nov_NewRiver_Avg = Nov_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Nov = Nov_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Nov_sorted = df_result_Avg_Nov.sort_values(by = 'AvgTotalPowerUsage', ascending = False)

df_Avg_Nov_sorted
Out[403]:
HourEnd AvgTotalPowerUsage
18 19 27290.51
10 11 27070.66
19 20 26880.98
11 12 26850.86
9 10 26771.41
17 18 26729.59
12 13 26496.58
20 21 26179.56
8 9 26108.15
13 14 26085.98
14 15 25727.80
16 17 25615.37
15 16 25502.51
21 22 25112.48
7 8 24740.28
22 23 23728.78
6 7 23047.18
23 24 22300.78
0 1 21027.96
5 6 20978.64
1 2 20298.74
4 5 20015.13
2 3 19841.16
3 4 19688.97
In [404]:
df_Avg_Nov_sorted['Time Frame'] = df_Avg_Nov_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Nov_sorted['Time Frame'] = pd.to_datetime(df_Avg_Nov_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Nov_sorted
Out[404]:
HourEnd AvgTotalPowerUsage Time Frame
18 19 27290.51 19:00:00
10 11 27070.66 11:00:00
19 20 26880.98 20:00:00
11 12 26850.86 12:00:00
9 10 26771.41 10:00:00
17 18 26729.59 18:00:00
12 13 26496.58 13:00:00
20 21 26179.56 21:00:00
8 9 26108.15 09:00:00
13 14 26085.98 14:00:00
14 15 25727.80 15:00:00
16 17 25615.37 17:00:00
15 16 25502.51 16:00:00
21 22 25112.48 22:00:00
7 8 24740.28 08:00:00
22 23 23728.78 23:00:00
6 7 23047.18 07:00:00
23 24 22300.78 00:00:00
0 1 21027.96 01:00:00
5 6 20978.64 06:00:00
1 2 20298.74 02:00:00
4 5 20015.13 05:00:00
2 3 19841.16 03:00:00
3 4 19688.97 04:00:00
In [405]:
Nov_max = df_Avg_Nov_sorted['AvgTotalPowerUsage'].idxmax()
Nov_max_df = df_Avg_Nov_sorted.loc[[Nov_max]]

Nov_max_df
Out[405]:
HourEnd AvgTotalPowerUsage Time Frame
18 19 27290.51 19:00:00
In [406]:
line_Nov_sorted = df_result_Avg_Nov.sort_values(by = 'HourEnd', ascending = True)
line_Nov_sorted.set_index('HourEnd', inplace = True)

line_plot_Nov = df_result_Avg_Nov.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("November Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

December Total and Average

In [407]:
Dec_NewRiver = df[(df['Month'] == 12)]

Dec_NewRiver_Total = Dec_NewRiver.groupby('HourEnd')['NewRiver'].sum()

# Dec_NewRiver_Total

df_result_Dec = Dec_NewRiver_Total.reset_index(name='TotalPowerUsage')
df_Dec_sorted = df_result_Dec.sort_values(by='TotalPowerUsage', ascending = False)

df_Dec_sorted
Out[407]:
HourEnd TotalPowerUsage
10 11 5242411
9 10 5223392
18 19 5190862
11 12 5176594
8 9 5137713
19 20 5131540
17 18 5114881
12 13 5075601
20 21 5036919
13 14 4989454
7 8 4960518
14 15 4909678
16 17 4882546
21 22 4880097
15 16 4853640
6 7 4665050
22 23 4658048
23 24 4442213
5 6 4315304
0 1 4257858
1 2 4151315
4 5 4150074
2 3 4095161
3 4 4088404
In [408]:
Dec_NewRiver_Avg = Dec_NewRiver.groupby('HourEnd')['NewRiver'].mean().round(2)

df_result_Avg_Dec = Dec_NewRiver_Avg.reset_index(name ='AvgTotalPowerUsage')
df_Avg_Dec_sorted = df_result_Avg_Dec.sort_values(by ='AvgTotalPowerUsage', ascending = False)

df_Avg_Dec_sorted
Out[408]:
HourEnd AvgTotalPowerUsage
10 11 28185.01
9 10 28082.75
18 19 27907.86
11 12 27831.15
8 9 27622.11
19 20 27588.92
17 18 27499.36
12 13 27288.18
20 21 27080.21
13 14 26825.02
7 8 26669.45
14 15 26396.12
16 17 26250.25
21 22 26237.08
15 16 26094.84
6 7 25080.91
22 23 25043.27
23 24 23882.87
5 6 23200.56
0 1 22891.71
1 2 22318.90
4 5 22312.23
2 3 22016.99
3 4 21980.67
In [409]:
df_Avg_Dec_sorted['Time Frame'] = df_Avg_Dec_sorted['HourEnd'].astype(str).replace('24', '00') + ':00'

# use dt.time accessor to get rid of the 1901-01-01
df_Avg_Dec_sorted['Time Frame'] = pd.to_datetime(df_Avg_Dec_sorted['Time Frame'], format = '%H:%M').dt.time

df_Avg_Dec_sorted
Out[409]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 28185.01 11:00:00
9 10 28082.75 10:00:00
18 19 27907.86 19:00:00
11 12 27831.15 12:00:00
8 9 27622.11 09:00:00
19 20 27588.92 20:00:00
17 18 27499.36 18:00:00
12 13 27288.18 13:00:00
20 21 27080.21 21:00:00
13 14 26825.02 14:00:00
7 8 26669.45 08:00:00
14 15 26396.12 15:00:00
16 17 26250.25 17:00:00
21 22 26237.08 22:00:00
15 16 26094.84 16:00:00
6 7 25080.91 07:00:00
22 23 25043.27 23:00:00
23 24 23882.87 00:00:00
5 6 23200.56 06:00:00
0 1 22891.71 01:00:00
1 2 22318.90 02:00:00
4 5 22312.23 05:00:00
2 3 22016.99 03:00:00
3 4 21980.67 04:00:00
In [410]:
Dec_max = df_Avg_Dec_sorted['AvgTotalPowerUsage'].idxmax()
Dec_max_df = df_Avg_Dec_sorted.loc[[Dec_max]]

Dec_max_df
Out[410]:
HourEnd AvgTotalPowerUsage Time Frame
10 11 28185.01 11:00:00
In [411]:
line_Dec_sorted = df_result_Avg_Dec.sort_values(by = 'HourEnd', ascending = True)
line_Dec_sorted.set_index('HourEnd', inplace = True)

line_plot_Dec = df_result_Avg_Dec.plot.line(x = 'HourEnd', y = 'AvgTotalPowerUsage')

plt.title("December Avg. Power Curve")
plt.xticks(range(0,25,4))
plt.xlabel("End of the hour (Military Time)")
plt.grid(True)
plt.show()

Correlation matrix for a statistical comparison of power usage between months

In [412]:
# DataFrame to use for the power usage patterns of different months
Months_comparison = pd.DataFrame({
    'January': df_Avg_Jan_sorted['AvgTotalPowerUsage'],
    'February': df_Avg_Feb_sorted['AvgTotalPowerUsage'],
    'March': df_Avg_March_sorted['AvgTotalPowerUsage'],
    'April': df_Avg_April_sorted['AvgTotalPowerUsage'],
    'May': df_Avg_May_sorted['AvgTotalPowerUsage'],
    'June': df_Avg_June_sorted['AvgTotalPowerUsage'],
    'July': df_Avg_July_sorted['AvgTotalPowerUsage'],
    'August': df_Avg_August_sorted['AvgTotalPowerUsage'],
    'September': df_Avg_Sep_sorted['AvgTotalPowerUsage'],
    'October': df_Avg_Oct_sorted['AvgTotalPowerUsage'],
    'November': df_Avg_Nov_sorted['AvgTotalPowerUsage'],
    'December': df_Avg_Dec_sorted['AvgTotalPowerUsage']
})

correlation_matrix = Months_comparison.corr()

correlation_matrix
Out[412]:
January February March April May June July August September October November December
January 1.000000 0.995385 0.973206 0.934845 0.866108 0.838219 0.825292 0.832789 0.843728 0.913792 0.986630 0.997897
February 0.995385 1.000000 0.983460 0.960998 0.902131 0.876297 0.864381 0.871848 0.881830 0.942957 0.991596 0.992169
March 0.973206 0.983460 1.000000 0.968246 0.903988 0.873539 0.859544 0.862919 0.867933 0.930582 0.964714 0.970005
April 0.934845 0.960998 0.968246 1.000000 0.978875 0.962321 0.954468 0.958678 0.961230 0.986439 0.965883 0.937263
May 0.866108 0.902131 0.903988 0.978875 1.000000 0.997358 0.994905 0.994552 0.991866 0.988450 0.926692 0.874771
June 0.838219 0.876297 0.873539 0.962321 0.997358 1.000000 0.999425 0.997793 0.993667 0.981104 0.908416 0.849230
July 0.825292 0.864381 0.859544 0.954468 0.994905 0.999425 1.000000 0.998385 0.994079 0.977371 0.899741 0.837295
August 0.832789 0.871848 0.862919 0.958678 0.994552 0.997793 0.998385 1.000000 0.998469 0.982696 0.906434 0.843269
September 0.843728 0.881830 0.867933 0.961230 0.991866 0.993667 0.994079 0.998469 1.000000 0.987353 0.914976 0.852549
October 0.913792 0.942957 0.930582 0.986439 0.988450 0.981104 0.977371 0.982696 0.987353 1.000000 0.961754 0.918340
November 0.986630 0.991596 0.964714 0.965883 0.926692 0.908416 0.899741 0.906434 0.914976 0.961754 1.000000 0.990057
December 0.997897 0.992169 0.970005 0.937263 0.874771 0.849230 0.837295 0.843269 0.852549 0.918340 0.990057 1.000000
  • This is useful because, when forecasting, I want to train the models on similar months so that the forecast will be more accurate. Training the prediction model on data from months that exhibit similar hourly patterns for the time frame of interest should give better results; for example, including a summer month will not improve the prediction for a winter month like January or February (see the sketch below for pulling each month's closest matches).
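A quick sketch for pulling the closest matches for a given target month straight from this matrix (January is used as the example here):

In [ ]:
# Sketch: the three months most correlated with January - candidates for its training data
correlation_matrix['January'].drop('January').nlargest(3)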
In [413]:
cluster_map = sns.clustermap(correlation_matrix, method = 'average', cmap = 'coolwarm', figsize = (11, 9))
cluster_map.fig.suptitle('Hierarchical Clustering of Monthly Correlation')
plt.show()
In [414]:
import numpy as np

# List of month names
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

corr_threshold = 0.97

mask = np.triu(np.ones_like(correlation_matrix), k = 1)

indices = np.where((correlation_matrix * mask) >= corr_threshold)

groupings = [(months[month1], months[month2]) for month1, month2 in zip(indices[0], indices[1])]

print("Groupings based on correlation above", corr_threshold)
print(groupings)
Groupings based on correlation above 0.97
[('January', 'February'), ('January', 'March'), ('January', 'November'), ('January', 'December'), ('February', 'March'), ('February', 'November'), ('February', 'December'), ('March', 'December'), ('April', 'May'), ('April', 'October'), ('May', 'June'), ('May', 'July'), ('May', 'August'), ('May', 'September'), ('May', 'October'), ('June', 'July'), ('June', 'August'), ('June', 'September'), ('June', 'October'), ('July', 'August'), ('July', 'September'), ('July', 'October'), ('August', 'September'), ('August', 'October'), ('September', 'October'), ('November', 'December')]
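To turn these pairwise links into actual groups (months connected directly or indirectly above the threshold), a small union-find pass over the pairs is one option (sketch, reusing the months and groupings from the cell above):

In [ ]:
# Sketch: merge the correlated pairs into connected groups with a simple union-find
parent = {m: m for m in months}

def find(m):
    while parent[m] != m:
        m = parent[m]
    return m

for m1, m2 in groupings:
    parent[find(m1)] = find(m2)

month_groups = {}
for m in months:
    month_groups.setdefault(find(m), []).append(m)
print(list(month_groups.values()))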

Further visualization to see which months can be grouped together

In [415]:
winter_months_to_plot = ['January', 'February', 'March', 'April', 'November', 'December']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_April, df_result_Avg_Nov,df_result_Avg_Dec]

plt.figure(figsize = (9, 6))

for month, df_month in zip(winter_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Winter Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
In [416]:
winter_months_to_plot1 = ['January', 'February', 'March', 'November', 'December']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_Nov, df_result_Avg_Dec]

plt.figure(figsize = (9, 6))

for month, df_month in zip(winter_months_to_plot1, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Cold Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
We can see that April is an 'outlier' here (compare the two plots above)
In [417]:
summer_months_to_plot = ['May', 'June', 'July', 'August', 'September']
dataframes = [df_result_Avg_May, df_result_Avg_June, df_result_Avg_July, df_result_Avg_August ,df_result_Avg_Sep]

plt.figure(figsize = (9, 6))

for month, df_month in zip(summer_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Warm Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
In [418]:
# May -> June -> July -> Aug (Increases)
# Aug -> Sep (Starts to decrease)

summer_months_to_plot = ['May', 'June', 'July', 'August', 'September', 'October']
dataframes = [df_result_Avg_May, df_result_Avg_June, df_result_Avg_July, df_result_Avg_August ,df_result_Avg_Sep, df_result_Avg_Oct]

plt.figure(figsize = (9, 6))

for month, df_month in zip(summer_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Summer Months (Including October)')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
In [419]:
# Months with a second spike

spike_months_to_plot = ['January', 'February', 'March', 'April', 'November', 'December', 'October']
dataframes = [df_result_Avg_Jan, df_result_Avg_Feb, df_result_Avg_March, df_result_Avg_April, df_result_Avg_Nov,df_result_Avg_Dec, df_result_Avg_Oct]

plt.figure(figsize = (9, 6))

for month, df_month in zip(spike_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Second Spiked Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()
In [420]:
# Months with a second spike

spike_months_to_plot = ['April', 'October']
dataframes = [df_result_Avg_April, df_result_Avg_Oct]

plt.figure(figsize = (9, 6))

for month, df_month in zip(spike_months_to_plot, dataframes):
    line_month_sorted = df_month.sort_values(by = 'HourEnd', ascending=True)
    line_month_sorted.set_index('HourEnd', inplace = True)
    plt.plot(line_month_sorted.index, line_month_sorted['AvgTotalPowerUsage'], label = f'{month} Avg. Power Curve')

plt.xlabel('End of the hour (Military Time)')
plt.ylabel('Average Power Usage')
plt.title('Power Curves for Transition Months')
plt.legend()
plt.xticks(range(0, 25, 4))
plt.grid(True)
plt.show()

K-Means Clustering

In [421]:
# Create a DataFrame for K-means clustering
months_comparison = pd.concat([
    df_Avg_Jan_sorted['AvgTotalPowerUsage'],
    df_Avg_Feb_sorted['AvgTotalPowerUsage'],
    df_Avg_March_sorted['AvgTotalPowerUsage'],
    df_Avg_April_sorted['AvgTotalPowerUsage'],
    df_Avg_May_sorted['AvgTotalPowerUsage'],
    df_Avg_June_sorted['AvgTotalPowerUsage'],
    df_Avg_July_sorted['AvgTotalPowerUsage'],
    df_Avg_August_sorted['AvgTotalPowerUsage'],
    df_Avg_Sep_sorted['AvgTotalPowerUsage'],
    df_Avg_Oct_sorted['AvgTotalPowerUsage'],
    df_Avg_Nov_sorted['AvgTotalPowerUsage'],
    df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)

X = months_comparison.transpose()

num_clusters = 12
kmeans = KMeans(n_clusters = num_clusters, random_state = 42)
clusters = kmeans.fit_predict(X)

clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})

plt.figure(figsize = (9, 6))
for i in range(num_clusters):
    cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]
    plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label = f'Cluster {i + 1}')

plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()
In [422]:
months_comparison = pd.concat([
    df_Avg_Jan_sorted['AvgTotalPowerUsage'],
    df_Avg_Feb_sorted['AvgTotalPowerUsage'],
    df_Avg_March_sorted['AvgTotalPowerUsage'],
    df_Avg_April_sorted['AvgTotalPowerUsage'],
    df_Avg_May_sorted['AvgTotalPowerUsage'],
    df_Avg_June_sorted['AvgTotalPowerUsage'],
    df_Avg_July_sorted['AvgTotalPowerUsage'],
    df_Avg_August_sorted['AvgTotalPowerUsage'],
    df_Avg_Sep_sorted['AvgTotalPowerUsage'],
    df_Avg_Oct_sorted['AvgTotalPowerUsage'],
    df_Avg_Nov_sorted['AvgTotalPowerUsage'],
    df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)

X = months_comparison.transpose()

num_clusters = 12

kmeans = KMeans(n_clusters = num_clusters, random_state = 42)
clusters = kmeans.fit_predict(X)

clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})

# NOTE: this mapping assumes cluster label i corresponds to month i; KMeans assigns
# cluster labels arbitrarily, so the legend names are not guaranteed to be correct
month_labels = {0: 'January', 1: 'February', 2: 'March', 3: 'April', 4: 'May', 5: 'June',
                6: 'July', 7: 'August', 8: 'September', 9: 'October', 10: 'November', 11: 'December'}

plt.figure(figsize = (10, 6))
for i in range(num_clusters):
    cluster_data = X[clusters == i]
    plt.scatter(cluster_data.index, cluster_data.transpose().values[0], label = f'Cluster {i + 1}: {month_labels[i]}')

plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()
In [423]:
# Create a DataFrame for K-means clustering
months_comparison = pd.concat([
    df_Avg_Jan_sorted['AvgTotalPowerUsage'],
    df_Avg_Feb_sorted['AvgTotalPowerUsage'],
    df_Avg_March_sorted['AvgTotalPowerUsage'],
    df_Avg_April_sorted['AvgTotalPowerUsage'],
    df_Avg_May_sorted['AvgTotalPowerUsage'],
    df_Avg_June_sorted['AvgTotalPowerUsage'],
    df_Avg_July_sorted['AvgTotalPowerUsage'],
    df_Avg_August_sorted['AvgTotalPowerUsage'],
    df_Avg_Sep_sorted['AvgTotalPowerUsage'],
    df_Avg_Oct_sorted['AvgTotalPowerUsage'],
    df_Avg_Nov_sorted['AvgTotalPowerUsage'],
    df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)

X = months_comparison.transpose()

num_clusters = 12

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(X)

# create a DataFrame for K-means clustering with the cluster labels
clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})

plt.figure(figsize = (10, 6))
for i in range(num_clusters):
    cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]

    # Calculating average power usage for the cluster
    avg_power = cluster_data['AvgTotalPowerUsage'].mean()

    plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label = f'Cluster {i + 1} (Avg Power: {avg_power:.2f})')

plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize=13)
plt.legend()
plt.show()
In [424]:
### create a DataFrame for K-means clustering
months_comparison = pd.concat([
    df_Avg_Jan_sorted['AvgTotalPowerUsage'],
    df_Avg_Feb_sorted['AvgTotalPowerUsage'],
    df_Avg_March_sorted['AvgTotalPowerUsage'],
    df_Avg_April_sorted['AvgTotalPowerUsage'],
    df_Avg_May_sorted['AvgTotalPowerUsage'],
    df_Avg_June_sorted['AvgTotalPowerUsage'],
    df_Avg_July_sorted['AvgTotalPowerUsage'],
    df_Avg_August_sorted['AvgTotalPowerUsage'],
    df_Avg_Sep_sorted['AvgTotalPowerUsage'],
    df_Avg_Oct_sorted['AvgTotalPowerUsage'],
    df_Avg_Nov_sorted['AvgTotalPowerUsage'],
    df_Avg_Dec_sorted['AvgTotalPowerUsage']
], axis = 1)

# Transpose the DataFrame for K-means clustering
X = months_comparison.transpose()

num_clusters = 12

kmeans = KMeans(n_clusters = num_clusters, random_state=42)
clusters = kmeans.fit_predict(X)

clustered_months_comparison = pd.DataFrame({'AvgTotalPowerUsage': X.values.flatten(), 'Cluster': np.repeat(clusters, len(X.columns))})

plt.figure(figsize = (10, 6))
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

for i in range(num_clusters):
    cluster_data = clustered_months_comparison[clustered_months_comparison['Cluster'] == i]

    plt.scatter(cluster_data.index, cluster_data['AvgTotalPowerUsage'], label=f'Cluster {i + 1} ({month_names[i]})')

plt.xlabel('Month', fontsize = 12)
plt.ylabel('AvgTotalPowerUsage', fontsize = 12)
plt.title('KMeans Clustering of Monthly Power Usage', fontsize = 13)
plt.legend()
plt.show()

Peak Hour Analysis¶

In [425]:
newriver = df[['Year', 'Weekday', 'Month', 'Day', 'HourEnd', 'NewRiver']]
In [426]:
newriver.head()
Out[426]:
Year Weekday Month Day HourEnd NewRiver
0 2015 4 1 1 1 27333
1 2015 4 1 1 2 27048
2 2015 4 1 1 3 27008
3 2015 4 1 1 4 26821
4 2015 4 1 1 5 26924
In [427]:
monthly_hourly_avg_demand_newriver = newriver.groupby(['Month', 'HourEnd'])['NewRiver'].mean().reset_index()
In [428]:
peak_hours = monthly_hourly_avg_demand_newriver.loc[monthly_hourly_avg_demand_newriver.groupby('Month')['NewRiver'].idxmax()]
In [429]:
peak_hours
Out[429]:
Month HourEnd NewRiver
10 1 11 32479.908602
34 2 11 31456.017647
58 3 11 27381.397849
83 4 12 24641.338889
109 5 14 24393.107527
134 6 15 26531.711111
158 7 15 28384.462366
182 8 15 29333.215054
206 9 15 28635.877778
228 10 13 25951.768817
258 11 19 27290.511111
274 12 11 28185.005376

Random Forest Regression Model¶

In [430]:
X = newriver[['Year', 'Weekday', 'Month', 'Day', 'HourEnd']]
y = newriver['NewRiver']

# Splitting data into training and testing sets (80/20 split)
train_data = newriver.iloc[:42088]
test_data = newriver.iloc[42088:]

X_train, y_train = train_data[['Year', 'Month', 'Day', 'HourEnd', 'Weekday']], train_data['NewRiver']
X_test, y_test = test_data[['Year', 'Month', 'Day', 'HourEnd', 'Weekday']], test_data['NewRiver']

# Making sure the calculations are correct for the training and testing set
X_train.shape, X_test.shape
Out[430]:
((42088, 5), (10520, 5))

There are 42,088 observations in the training set and 10,520 observations in the test set (80% training and 20% testing)¶

Train the Model¶

In [431]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

rf_regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)

rf_regressor.fit(X_train, y_train)  # training the model

# Predicting on the test set
y_pred = rf_regressor.predict(X_test)

# Calculating performance metrics here
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

mae, rmse
Out[431]:
(3228.395100285171, 4173.730170727415)
In [432]:
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 42)

rf_model.fit(X_train, y_train)

y_pred_test = rf_model.predict(X_test)

X_test_with_predictions = X_test.copy()
X_test_with_predictions['PredictedDemand'] = y_pred_test

monthly_hourly_demand = X_test_with_predictions.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index()

# this will find the maximum predicted demand for each month to identify the peak hour
peak_hours_per_month = monthly_hourly_demand.loc[monthly_hourly_demand.groupby('Month')['PredictedDemand'].idxmax()]

rf_df = peak_hours_per_month[['Month', 'HourEnd', 'PredictedDemand']]

rf_df
Out[432]:
Month HourEnd PredictedDemand
10 1 11 32993.610000
34 2 11 29406.464138
58 3 11 29205.801613
84 4 13 25170.978667
110 5 15 25441.357742
134 6 15 25900.754667
158 7 15 28210.920000
182 8 15 28868.415484
207 9 16 30382.872667
227 10 12 26412.976429
259 11 20 27795.040167
274 12 11 29177.830161

A little background on Random Forest Regression and how it works¶

  • Creates a 'forest' of decision trees, with each tree built on different samples and subsets of the data
  • The model randomly selects observations and features (Year, Weekday, Month, Day, HourEnd) when building each of these trees
  • Some trees use data from certain months or weekdays, and others might use different combinations
  • The model takes the average of the outputs from all the trees (see the sketch after this list)
    • For each hour, it considers the prediction of each tree and averages them to come up with the final prediction
  • After training on the historical data, the model uses patterns learned from the features (e.g., if higher usage happens at a specific time) to estimate the expected usage
  • By grouping the predicted demands by 'Month' and 'HourEnd' and then averaging them, the peak demand hour for each month is identified
    • Select the hour with the highest predicted demand for each month
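
As a quick illustration of the averaging step, the forest's output can be reproduced by averaging the individual trees' predictions. This is a minimal sketch, assuming the fitted rf_regressor and X_test from the cells above are still in scope:

# Each tree in the forest produces its own prediction; for regression, the forest's
# output is simply the mean of those per-tree predictions.
tree_preds = np.stack([tree.predict(X_test.values) for tree in rf_regressor.estimators_])
manual_avg = tree_preds.mean(axis = 0)

# Should print True (up to floating-point error)
print(np.allclose(manual_avg, rf_regressor.predict(X_test)))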

When reading the graphs, keep in mind that HourEnd marks the end of the hour, so the usage falls just before the number shown on the graph.¶

For example, HourEnd 11 for December covers 10 AM - 11 AM, and this matches the power curve.¶

In [433]:
monthly_peak_dfs = [Jan_max_df, Feb_max_df, March_max_df, April_max_df, May_max_df, June_max_df, July_max_df, August_max_df, Sep_max_df, Oct_max_df, Nov_max_df, Dec_max_df]

monthly_peaks_df = pd.concat(monthly_peak_dfs).reset_index(drop = True)

final_df = pd.merge(rf_df, monthly_peaks_df, left_on = 'Month', right_index = True)

final_df = final_df.rename(columns = {
    'HourEnd_x' : 'HourEnd(RandomForest)',
    'PredictedDemand' : 'PredictedDemand(RandomForest)',
    'HourEnd_y' : 'HourEnd(Actual)',
})

final_df = final_df.drop(columns = 'Time Frame')

final_df
Out[433]:
Month HourEnd(RandomForest) PredictedDemand(RandomForest) HourEnd(Actual) AvgTotalPowerUsage
10 1 11 32993.610000 11 31456.02
34 2 11 29406.464138 11 27381.40
58 3 11 29205.801613 12 24641.34
84 4 13 25170.978667 14 24393.11
110 5 15 25441.357742 15 26531.71
134 6 15 25900.754667 15 28384.46
158 7 15 28210.920000 15 29333.22
182 8 15 28868.415484 15 28635.88
207 9 16 30382.872667 13 25951.77
227 10 12 26412.976429 19 27290.51
259 11 20 27795.040167 11 28185.01
In [434]:
Jan_max_df['Month'] = 1
Feb_max_df['Month'] = 2
March_max_df['Month'] = 3
April_max_df['Month'] = 4
May_max_df['Month'] = 5
June_max_df['Month'] = 6
July_max_df['Month'] = 7
August_max_df['Month'] = 8
Sep_max_df['Month'] = 9
Oct_max_df['Month'] = 10
Nov_max_df['Month'] = 11
Dec_max_df['Month'] = 12

monthly_peak_dfs = [Jan_max_df, Feb_max_df, March_max_df, April_max_df, May_max_df, June_max_df, July_max_df, August_max_df, Sep_max_df, Oct_max_df, Nov_max_df, Dec_max_df]

monthly_peaks_df = pd.concat(monthly_peak_dfs).reset_index(drop = True)

final_df = pd.merge(rf_df, monthly_peaks_df, left_on = 'Month', right_on = 'Month')

final_df = final_df.rename(columns = {
    'HourEnd_x' : 'HourEnd(RandomForest)',
    'PredictedDemand' : 'PredictedDemand(RandomForest)',
    'HourEnd_y' : 'HourEnd(Actual)',
})

final_df = final_df.drop(columns = 'Time Frame')

final_df
Out[434]:
Month HourEnd(RandomForest) PredictedDemand(RandomForest) HourEnd(Actual) AvgTotalPowerUsage
0 1 11 32993.610000 11 32479.91
1 2 11 29406.464138 11 31456.02
2 3 11 29205.801613 11 27381.40
3 4 13 25170.978667 12 24641.34
4 5 15 25441.357742 14 24393.11
5 6 15 25900.754667 15 26531.71
6 7 15 28210.920000 15 28384.46
7 8 15 28868.415484 15 29333.22
8 9 16 30382.872667 15 28635.88
9 10 12 26412.976429 13 25951.77
10 11 20 27795.040167 19 27290.51
11 12 11 29177.830161 11 28185.01

Bar chart showing output from Random Forest Regression and Power Curves¶

In [435]:
pivot_df = final_df.pivot_table(index='Month', values=['PredictedDemand(RandomForest)', 'AvgTotalPowerUsage'])

# Rename by column name so the labels cannot be swapped if the column order changes
pivot_df = pivot_df.rename(columns = {'PredictedDemand(RandomForest)': 'Predicted Demand (Random Forest Regression)', 'AvgTotalPowerUsage': 'Actual Average Power Usage (Power Curves)'})

visual = pivot_df.plot(kind = 'bar', figsize = (14, 7))
visual.set_xticklabels(['11    11\nJan', '11    11 \nFeb', '11    11 \nMar', '13    12 \nApr', '15    14 \nMay',
                    '15    15 \nJun', '15    15 \nJul', '15    15 \nAug', '16    15 \nSep',
                    '12    13 \nOct', '20    19 \nNov', '11    11 \nDec'], rotation = 360)


plt.xlabel('\nMonth and Hour End')
plt.ylabel('Power Usage (kWh)')
plt.title('Comparison of Predicted Demand vs Actual Average Power Usage')
plt.legend(title = 'Type of Model')
plt.show()

Similarity Statistics for the output above¶

In [436]:
avg_peak_usage = final_df['AvgTotalPowerUsage'].values
predicted_peak_usage = final_df['PredictedDemand(RandomForest)'].values
In [437]:
from sklearn.metrics import r2_score

r_squared = r2_score(avg_peak_usage, predicted_peak_usage)

print("R-squared value:", r_squared)
R-squared value: 0.7822025378274747
  • 78.2% of the variance in the actual average peak hour demand from the power curves can be explained by the random forest regressor's predictions
In [438]:
plt.scatter(avg_peak_usage, predicted_peak_usage)
plt.title('Actual vs Predicted Peak Hour Demand')
plt.xlabel('Actual Peak Hour Demand')
plt.ylabel('Predicted Peak Hour Demand')
plt.plot([min(avg_peak_usage), max(avg_peak_usage)], [min(avg_peak_usage), max(avg_peak_usage)], color = 'red')
plt.show()

Blue data points:¶

  • The x-coordinate of each blue point is the actual peak hour demand obtained from the average power curve.
  • The y-coordinate of each blue point is the predicted peak hour demand from the random forest model.

Red Line (the line of perfect fit):¶

  • Represents where the data points would lie if the model's predictions were 'perfect'
  • i.e. for any point on this line, the predicted value is equal to the actual value (y = x) --> predicted demand = actual demand

Supplementary Notes¶

  • If a blue point lies exactly on the red line, the model's prediction of peak hour demand for that month is exactly the same as the actual peak hour demand
  • If a blue point is above the red line, the model overestimated the peak hour demand for that month (the prediction was higher than the actual demand); if a blue point is below the red line, the model underestimated it (the prediction was lower than the actual demand).

MAE and RMSE for Random Forest Regressor Output¶

In [439]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [440]:
# use avg_peak_usage and predicted_peak_usage from above

mae = mean_absolute_error(avg_peak_usage, predicted_peak_usage)
print("Mean Absolute Error (MAE):", mae)

mse = mean_squared_error(avg_peak_usage, predicted_peak_usage)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
Mean Absolute Error (MAE): 911.6994296859826
Root Mean Squared Error (RMSE): 1092.4570900789852

Hypothesis Testing¶

  • Ho: Actual average peak demand from power curves = Predicted peak demand from random forest regression model
  • Ha: Actual average peak demand from power curves ≠ Predicted peak demand from random forest regression model
In [441]:
from scipy import stats

t_stat, p_value = stats.ttest_rel(avg_peak_usage, predicted_peak_usage)

print("t-statistic:", t_stat, "p-value:", p_value)
t-statistic: -1.1523913939997472 p-value: 0.2735894453508184
In [442]:
alpha = 0.05
if p_value < alpha:
    print("We reject the null hypothesis (Ho), suggesting a significant difference between the actual and predicted peak demand.")
else:
    print("We fail to reject the null hypothesis (Ho), suggesting no significant difference between the actual and predicted peak demand.")
We fail to reject the null hypothesis (Ho), suggesting no significant difference between the actual and predicted peak demand.

More Detailed (XGBoost) (DON'T USE!!!)¶

In [443]:
#pip install xgboost
In [444]:
import xgboost as xgb
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\2785909336.py in <module>
----> 1 import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'
In [445]:
xgb_regressor = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 5, random_state = 42)
xgb_regressor.fit(X_train, y_train)

y_pred_xgb = xgb_regressor.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print(f"MAE: {mae_xgb}, RMSE: {rmse_xgb}")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\2220971069.py in <module>
----> 1 xgb_regressor = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 5, random_state = 42)
      2 xgb_regressor.fit(X_train, y_train)
      3 
      4 y_pred_xgb = xgb_regressor.predict(X_test)
      5 

NameError: name 'xgb' is not defined
In [446]:
X_test['PredictedDemand'] = y_pred_xgb
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\3308208585.py in <module>
----> 1 X_test['PredictedDemand'] = y_pred_xgb

NameError: name 'y_pred_xgb' is not defined
In [447]:
monthly_hourly_demand_xgb = X_test.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index()

# Finding the hour with the highest average predicted demand for each month
peak_hours_per_month_xgb = monthly_hourly_demand_xgb.loc[monthly_hourly_demand_xgb.groupby('Month')['PredictedDemand'].idxmax()]

peak_hours_per_month_xgb[['Month', 'HourEnd', 'PredictedDemand']]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\1349816840.py in <module>
----> 1 monthly_hourly_demand_xgb = X_test.groupby(['Month', 'HourEnd'])['PredictedDemand'].mean().reset_index()
      2 
      3 # Finding th hour with the highest average predicted demand for each month
      4 peak_hours_per_month_xgb = monthly_hourly_demand_xgb.loc[monthly_hourly_demand_xgb.groupby('Month')['PredictedDemand'].idxmax()]
      5 

~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in __getitem__(self, key)
   1336                 stacklevel=find_stack_level(),
   1337             )
-> 1338         return super().__getitem__(key)
   1339 
   1340     def _gotitem(self, key, ndim: int, subset=None):

~\anaconda3\lib\site-packages\pandas\core\base.py in __getitem__(self, key)
    248         else:
    249             if key not in self.obj:
--> 250                 raise KeyError(f"Column not found: {key}")
    251             subset = self.obj[key]
    252             ndim = subset.ndim

KeyError: 'Column not found: PredictedDemand'

Average Temperature Data Section¶

In [448]:
temp = pd.read_csv("AvgBooneTemp.csv")
In [449]:
temp.head()
Out[449]:
Date Month Day Year AverageAirTemp(F)
0 1/1/2015 1 1 2015 32.0
1 1/2/2015 1 2 2015 40.6
2 1/3/2015 1 3 2015 42.3
3 1/4/2015 1 4 2015 49.2
4 1/5/2015 1 5 2015 28.5
In [450]:
temp_dtype = temp.dtypes
temp_dtype
Out[450]:
Date                  object
Month                  int64
Day                    int64
Year                   int64
AverageAirTemp(F)    float64
dtype: object
In [451]:
new_df = pd.merge(df, temp, on = ['Year', 'Month', 'Day'])
In [452]:
new_df.head()
Out[452]:
UTC Eastern Year OrdDay OrdHr Weekday Month Day HourEnd DST Concord Greenwood NewRiver KingsMountain Winterville Total Peak Date AverageAirTemp(F)
0 2015-01-01 06:00:00+00:00 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 0 97098 33667 27333 14393 8060 180550 0 1/1/2015 32.0
1 2015-01-01 07:00:00+00:00 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 0 96620 33223 27048 14306 8307 179504 0 1/1/2015 32.0
2 2015-01-01 08:00:00+00:00 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 0 96084 32849 27008 14270 8174 178385 0 1/1/2015 32.0
3 2015-01-01 09:00:00+00:00 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 0 95829 33053 26821 14285 8333 178320 0 1/1/2015 32.0
4 2015-01-01 10:00:00+00:00 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 0 98489 33686 26924 14450 8396 181946 0 1/1/2015 32.0
In [453]:
#new_df = new_df.drop(columns = ['UTC', 'DST', 'Date_x', 'Date_y'])

new_df = new_df.drop(columns = ['UTC', 'DST'])
In [454]:
new_df.head()
Out[454]:
Eastern Year OrdDay OrdHr Weekday Month Day HourEnd Concord Greenwood NewRiver KingsMountain Winterville Total Peak Date AverageAirTemp(F)
0 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 97098 33667 27333 14393 8060 180550 0 1/1/2015 32.0
1 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 96620 33223 27048 14306 8307 179504 0 1/1/2015 32.0
2 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 96084 32849 27008 14270 8174 178385 0 1/1/2015 32.0
3 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 95829 33053 26821 14285 8333 178320 0 1/1/2015 32.0
4 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 98489 33686 26924 14450 8396 181946 0 1/1/2015 32.0

Based on random forest regressor output, for each peak demand, see what the temperature is for that hour. See if there is a pattern.¶

  • This will uncover whether there is a common temperature range for the months that is expected to result in more electricity use (we can relate back to the correlation matrix and see which temperatures correlate with more electricity use); a minimal lookup sketch follows below
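
A minimal sketch of the lookup described above, assuming rf_df (the Random Forest peak-hour table) and temp (the Boone temperature file) from earlier cells are in scope; since the temperature data is daily, each month's predicted peak hour is paired with that month's average daily temperature rather than an hourly reading:

# Average daily temperature per calendar month
monthly_temp = temp.groupby('Month')['AverageAirTemp(F)'].mean().reset_index()

# Attach each month's average temperature to its predicted peak hour
peak_hours_with_temp = rf_df.merge(monthly_temp, on = 'Month')
peak_hours_with_temp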
In [455]:
len(new_df)
Out[455]:
52608

NOTE: Use Dataframe from random forest regressor model¶

In [456]:
January = new_df[(new_df['Month'] == 1)]
February = new_df[(new_df['Month'] == 2)]
March = new_df[(new_df['Month'] == 3)]
April = new_df[(new_df['Month'] == 4)]
May = new_df[(new_df['Month'] == 5)]
June = new_df[(new_df['Month'] == 6)]
July = new_df[(new_df['Month'] == 7)]
August = new_df[(new_df['Month'] == 8)]
September = new_df[(new_df['Month'] == 9)]
October = new_df[(new_df['Month'] == 10)]
November = new_df[(new_df['Month'] == 11)]
December = new_df[(new_df['Month'] == 12)]
In [457]:
January['AverageAirTemp(F)'] = January['AverageAirTemp(F)'].astype(float)
February['AverageAirTemp(F)'] = February['AverageAirTemp(F)'].astype(float)
March['AverageAirTemp(F)'] = March['AverageAirTemp(F)'].astype(float)
April['AverageAirTemp(F)'] = April['AverageAirTemp(F)'].astype(float)
May['AverageAirTemp(F)'] = May['AverageAirTemp(F)'].astype(float)
June['AverageAirTemp(F)'] = June['AverageAirTemp(F)'].astype(float)
July['AverageAirTemp(F)'] = July['AverageAirTemp(F)'].astype(float)
August['AverageAirTemp(F)'] = August['AverageAirTemp(F)'].astype(float)
September['AverageAirTemp(F)'] = September['AverageAirTemp(F)'].astype(float)
October['AverageAirTemp(F)'] = October['AverageAirTemp(F)'].astype(float)
November['AverageAirTemp(F)'] = November['AverageAirTemp(F)'].astype(float)
December['AverageAirTemp(F)'] = December['AverageAirTemp(F)'].astype(float)
In [458]:
Jan_Avg = January['AverageAirTemp(F)'].mean()
Feb_Avg = February['AverageAirTemp(F)'].mean()
March_Avg = March['AverageAirTemp(F)'].mean()
April_Avg = April['AverageAirTemp(F)'].mean()
May_Avg = May['AverageAirTemp(F)'].mean()
June_Avg = June['AverageAirTemp(F)'].mean()
July_Avg = July['AverageAirTemp(F)'].mean()
Aug_Avg = August['AverageAirTemp(F)'].mean()
Sep_Avg = September['AverageAirTemp(F)'].mean()
Oct_Avg = October['AverageAirTemp(F)'].mean()
Nov_Avg = November['AverageAirTemp(F)'].mean()
Dec_Avg = December['AverageAirTemp(F)'].mean()
In [459]:
temp_data = {
    'Month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
    'Average Temperature': [Jan_Avg, Feb_Avg, March_Avg, April_Avg, May_Avg, June_Avg, July_Avg, Aug_Avg, Sep_Avg, Oct_Avg, Nov_Avg, Dec_Avg]
}
In [460]:
avg_monthly_temp = pd.DataFrame(temp_data)
avg_monthly_temp
Out[460]:
Month Average Temperature
0 January 33.338172
1 February 37.728824
2 March 44.034141
3 April 52.006667
4 May 60.470430
5 June 66.946111
6 July 70.131720
7 August 67.834409
8 September 63.590556
9 October 54.049462
10 November 43.353560
11 December 38.182796
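
For reference, the twelve per-month means above can also be computed in a single groupby; a minimal sketch, assuming new_df from the merge earlier in this section (the only difference is that the Month column comes out as the numbers 1-12 instead of month names):

# Equivalent one-liner: mean AverageAirTemp(F) per calendar month
avg_monthly_temp_alt = (
    new_df.groupby('Month')['AverageAirTemp(F)']
          .mean()
          .reset_index(name = 'Average Temperature')
)
avg_monthly_temp_alt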
In [461]:
final_df = pd.DataFrame(final_df)
final_df
Out[461]:
Month HourEnd(RandomForest) PredictedDemand(RandomForest) HourEnd(Actual) AvgTotalPowerUsage
0 1 11 32993.610000 11 32479.91
1 2 11 29406.464138 11 31456.02
2 3 11 29205.801613 11 27381.40
3 4 13 25170.978667 12 24641.34
4 5 15 25441.357742 14 24393.11
5 6 15 25900.754667 15 26531.71
6 7 15 28210.920000 15 28384.46
7 8 15 28868.415484 15 29333.22
8 9 16 30382.872667 15 28635.88
9 10 12 26412.976429 13 25951.77
10 11 20 27795.040167 19 27290.51
11 12 11 29177.830161 11 28185.01

Combined Average Temperature with findings from Random Forest and Power Curves¶

In [462]:
temp_forest_df = pd.concat([final_df, avg_monthly_temp], axis = 1)
temp_forest_df = temp_forest_df[['Month', 'HourEnd(RandomForest)', 'PredictedDemand(RandomForest)', 'HourEnd(Actual)', 'AvgTotalPowerUsage', 'Average Temperature']]
temp_forest_df = temp_forest_df.iloc[:, [0, 2, 3, 4, 5, 6]]
temp_forest_df
Out[462]:
Month HourEnd(RandomForest) PredictedDemand(RandomForest) HourEnd(Actual) AvgTotalPowerUsage Average Temperature
0 1 11 32993.610000 11 32479.91 33.338172
1 2 11 29406.464138 11 31456.02 37.728824
2 3 11 29205.801613 11 27381.40 44.034141
3 4 13 25170.978667 12 24641.34 52.006667
4 5 15 25441.357742 14 24393.11 60.470430
5 6 15 25900.754667 15 26531.71 66.946111
6 7 15 28210.920000 15 28384.46 70.131720
7 8 15 28868.415484 15 29333.22 67.834409
8 9 16 30382.872667 15 28635.88 63.590556
9 10 12 26412.976429 13 25951.77 54.049462
10 11 20 27795.040167 19 27290.51 43.353560
11 12 11 29177.830161 11 28185.01 38.182796

Discovering the temperature windows/intervals that contribute to peak hour electricity usage¶

In [463]:
# in the excel sheet, 2.6 is the lowest temp and 77 is the highest temp
# the bins below (on the monthly averages) increment by 5 degrees for the most part


temp_bins = [min(temp_forest_df['Average Temperature']), 40, 45, 50, 55, 60, 65, max(temp_forest_df['Average Temperature'])]

temp_bin_labels = ['<40°F', '40-45°F', '45-50°F', '50-55°F', '55-60°F', '60-65°F', '>65°F']

temp_forest_df['Temperature Interval'] = pd.cut(temp_forest_df['Average Temperature'], bins = temp_bins, labels = temp_bin_labels, include_lowest = True)

peak_usage_by_temp_interval = temp_forest_df.groupby('Temperature Interval')[['AvgTotalPowerUsage', 'PredictedDemand(RandomForest)']].mean().reset_index()

peak_usage_by_temp_interval
Out[463]:
Temperature Interval AvgTotalPowerUsage PredictedDemand(RandomForest)
0 <40°F 30706.980 30525.968100
1 40-45°F 27335.955 28500.420890
2 45-50°F NaN NaN
3 50-55°F 25296.555 25791.977548
4 55-60°F NaN NaN
5 60-65°F 26514.495 27912.115204
6 >65°F 28083.130 27660.030050
In [464]:
temp_bins2 = [min(new_df['AverageAirTemp(F)']), 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, max(new_df['AverageAirTemp(F)'])]

temp_bin_labels2 = ['<5°F', '5-10°F', '10-15°F', '15-20°F', '20-25°F', '25-30°F', '30-35°F', '35-40°F', '40-45°F', '45-50°F', '50-55°F', '55-60°F', '60-65°F', '>65°F']

new_df['Temperature Interval'] = pd.cut(new_df['AverageAirTemp(F)'], bins = temp_bins2, labels = temp_bin_labels2, include_lowest = True)

peak_usage_by_temp_interval2 = new_df.groupby('Temperature Interval')['NewRiver'].mean().reset_index()

peak_usage_by_temp_interval2
Out[464]:
Temperature Interval NewRiver
0 <5°F 48143.458333
1 5-10°F 40638.104167
2 10-15°F 38916.022727
3 15-20°F 37433.886667
4 20-25°F 33989.489583
5 25-30°F 31665.983043
6 30-35°F 28597.559711
7 35-40°F 25964.040308
8 40-45°F 23664.123180
9 45-50°F 22257.352887
10 50-55°F 21275.019806
11 55-60°F 21143.176656
12 60-65°F 21664.816699
13 >65°F 23531.379942
In [465]:
plt.figure(figsize = (13, 8))
sns.barplot(x = 'Temperature Interval', y = 'NewRiver', data = peak_usage_by_temp_interval2)
plt.title('Average Power Usage by Temperature Category')
plt.xlabel('Temperature Category')
plt.ylabel('Average Power Usage')
plt.grid(True)

plt.show()
In [466]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
peak_usage_by_temp_interval2['NormalizedPowerUsage'] = scaler.fit_transform(peak_usage_by_temp_interval2[['NewRiver']])

kmeans = KMeans(n_clusters = 5, random_state = 0)

peak_usage_by_temp_interval2['Cluster'] = kmeans.fit_predict(peak_usage_by_temp_interval2[['NormalizedPowerUsage']])

sns.scatterplot(data = peak_usage_by_temp_interval2, x = 'Temperature Interval', y = 'NormalizedPowerUsage', hue = 'Cluster', palette = 'viridis')


plt.xticks(rotation = 45) # turns x-axis labels

plt.title('Average Power Usage by Temperature Interval (Clustered)')
plt.xlabel('Temperature Interval')
plt.ylabel('Normalized Average Power Usage')
plt.show()

Explanation of Bar Plot and K-Means Clustering¶

Bar Plot¶

  • The coldest intervals show the highest power usage. This is likely due to heat demand

Cluster Plot¶

  • There is a clear cluster (cluster 0) of colder temperature intervals with increased electricity usage for heating
  • The middle temperature intervals (20-25°F to 45-50°F) show more variability in power usage, which could be due to transitional temperatures where heating is still being used, but less consistently
  • Past 50°F, power usage decreases
  • Past 65°F, power usage starts to increase again, which could be due to people now using air conditioning
In [467]:
# Uses Random Forest Peak Demand Predictions
sns.lineplot(data=temp_forest_df, x = 'Average Temperature', y = 'PredictedDemand(RandomForest)', color = 'purple', marker = 'o')
plt.title("Average Total Power Usage (Random Forest Predictions) vs. Average Temperature\n\n(Each Data Point Corresponds to Each Month's Predictions)\n")
plt.xlabel('Average Temperature (°F)')
plt.ylabel('Average Total Power Usage')
plt.legend(labels = ['Predicted Demand'])
plt.grid(True)
plt.show()

print("----------------------------------------------------------------------------------------")

# Using Power Curve Peaks
sns.lineplot(data=temp_forest_df, x = 'Average Temperature', y = 'AvgTotalPowerUsage', color = 'green', marker = 'o')
plt.title("Average Total Power Usage (Power Curves) vs. Average Temperature\n\n(Each Data Point Corresponds to Each Month's Predictions)\n")
plt.xlabel('Average Temperature (°F)')
plt.ylabel('Average Total Power Usage')
plt.legend(labels = ['Actual Usage'])
plt.grid(True)
plt.show()
----------------------------------------------------------------------------------------
In [468]:
correlation_forest = temp_forest_df['Average Temperature'].corr(temp_forest_df['PredictedDemand(RandomForest)'])
correlation_curves = temp_forest_df['Average Temperature'].corr(temp_forest_df['AvgTotalPowerUsage'])
print(f'Correlation coefficient (Avg. Temp vs. Random Forest Regression Predictions): {correlation_forest}')
print("")
print(f'Correlation coefficient (Avg. Temp vs. Power Curve Predictions): {correlation_curves}')
Correlation coefficient (Avg. Temp vs. Random Forest Regression Predictions): -0.4582138897627224

Correlation coefficient (Avg. Temp vs. Power Curve Predictions): -0.41090289030943566

Correlation Coefficient Meaning¶

  • The correlation coefficients above illustrate that there is a moderate negative linear relationship between Average Temperature and Average Total Power Usage for both approaches to prediction
  • The -0.458 correlation coefficient between Average Temperature and the Random Forest Regression Predictions indicates that as the Average Temperature increases, the predicted electricity demand (from the Random Forest Regression Predictions) tends to decrease moderately
  • The -0.411 correlation coefficient between Average Temperature and the Power Curve Predictions indicates that as the Average Temperature increases, the predicted electricity demand (from the Power Curves) also tends to decrease moderately

Assumptions¶

  • Higher temperatures may not necessarily lead to higher energy consumption

    • A lot of apartment complexes in Boone, NC do not have air conditioning, so this may also be indicative of lower power consumption when the temperature increases
  • The negative correlation also suggests that we might expect lower electricity usage on warmer days

Constructing Bar Chart to see how average daily temperature affects average daily power consumption¶

In [469]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

daily_power_usage = df.groupby('Date')['NewRiver'].mean().reset_index()
daily_power_usage.columns = ['Date', 'AverageDailyPowerUsage']

daily_power_usage['Date'] = daily_power_usage['Date'].dt.strftime('%m/%d/%y')
daily_power_usage
Out[469]:
Date AverageDailyPowerUsage
0 01/01/15 26971.583333
1 01/02/15 24511.125000
2 01/03/15 21908.208333
3 01/04/15 20010.958333
4 01/05/15 28633.000000
... ... ...
2187 12/27/20 27481.583333
2188 12/28/20 26964.791667
2189 12/29/20 25295.166667
2190 12/30/20 26725.916667
2191 12/31/20 21952.875000

2192 rows × 2 columns

In [470]:
daily_avg_power_df = pd.concat([daily_power_usage, temp], axis = 1)
daily_avg_power_df = daily_avg_power_df[['Date', 'AverageDailyPowerUsage', 'AverageAirTemp(F)']]
daily_avg_power_df.head()

daily_avg_power_df = daily_avg_power_df.iloc[:, [0, 2, 3]]

daily_avg_power_df
Out[470]:
Date AverageDailyPowerUsage AverageAirTemp(F)
0 01/01/15 26971.583333 32.0
1 01/02/15 24511.125000 40.6
2 01/03/15 21908.208333 42.3
3 01/04/15 20010.958333 49.2
4 01/05/15 28633.000000 28.5
... ... ... ...
2187 12/27/20 27481.583333 31.0
2188 12/28/20 26964.791667 36.2
2189 12/29/20 25295.166667 37.7
2190 12/30/20 26725.916667 36.1
2191 12/31/20 21952.875000 49.3

2192 rows × 3 columns

In [471]:
len(daily_avg_power_df)
Out[471]:
2192
In [472]:
bins = [2.6, 32, 50, 65, 77]
labels = ['Very Cold\n (< 32°F)', 'Cold\n (32°F - 50°F)', 'Moderately Warm\n (50°F - 65°F)', 'Warm\n (> 65°F)']
daily_avg_power_df['TempCategory'] = pd.cut(daily_avg_power_df['AverageAirTemp(F)'], bins = bins, labels = labels, include_lowest = True)


plt.figure(figsize = (10, 6))
sns.barplot(x = 'TempCategory', y = 'AverageDailyPowerUsage', data = daily_avg_power_df)
plt.title('Average Daily Power Usage by Temperature Category')
plt.xlabel('Temperature Category')
plt.ylabel('Average Daily Power Usage')
plt.grid(True)

plt.show()

Linear Regression of daily average temperature and daily average power usage¶

In [473]:
slope, intercept, r_value, p_value, std_err = linregress(daily_avg_power_df['AverageAirTemp(F)'], daily_avg_power_df['AverageDailyPowerUsage'])
print("Slope:", slope, "\nIntercept:", intercept, "\nR-squared:", r_value**2, "\np-value:", p_value)
Slope: -166.63618365376936 
Intercept: 32848.935907272906 
R-squared: 0.3678369444070465 
p-value: 2.2718415607098826e-220

Linear Regression output meaning¶

  • Slope: for a one-degree increase in temperature, the average daily power usage decreases by approximately 166.64 units. This indicates that as it gets warmer, the power usage decreases (a worked prediction example follows below)
  • Intercept: the predicted average daily power usage when the temperature is 0°F is 32,848.94 kWh
  • R-squared: the average daily temperature explains about 36.78% of the variance in the average daily power usage
  • p-value: the extremely small p-value (2.27e-220) indicates that the relationship between temperature and power usage is statistically significant
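
As a quick sanity check on the fitted line, the slope and intercept from the cell above can be plugged in to estimate usage at a given temperature (the 50°F value below is just an arbitrary illustration):

# Predicted average daily power usage at an example temperature of 50°F
temp_f = 50
predicted_usage = intercept + slope * temp_f
print(f"Predicted average daily power usage at {temp_f}°F: {predicted_usage:.0f} kWh")
# roughly 32848.94 - 166.64 * 50, i.e. about 24,517 kWh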

Heatmap¶

In [474]:
bins = [2.6, 32, 50, 65, 77]
temp_labels = ['Very Cold\n (< 32°F)', 'Cold\n (32°F - 50°F)', 'Moderately Warm\n (50°F - 65°F)', 'Warm\n (> 65°F)']
new_df['TempCategory'] = pd.cut(new_df['AverageAirTemp(F)'], bins = bins, labels = temp_labels, include_lowest = True)

pivot_table = new_df.pivot_table(
    values = 'NewRiver',
    index = 'HourEnd',
    columns = 'TempCategory',
    aggfunc = 'mean'
)

plt.figure(figsize = (12, 9))
ax = sns.heatmap(pivot_table, annot = True, fmt = ".1f", cmap = "coolwarm")
plt.title('Heatmap of Average Electricity Usage by Hour and Temperature Category')
plt.xlabel('\nTemperature Category')
plt.ylabel('Hour of Day')
color_bar = ax.collections[0].colorbar
color_bar.set_label('Average Power Usage\n\n (Blue: Least Power Usage)\n (Red: Most Power Usage)')
plt.show()

Overall Meaning¶

  • This heatmap visualizes the average electricity usage across different hours of the day and categorized temperature ranges (Very Cold, Cold, Moderately Warm, Warm)

    • Groups the hours based on the temperature ranges that they fall into
  • Note: Each of the cells represents the average usage for a given hour across all of the days that fall within their respective associated temperature category

  • The different colors represent the magnitude of the power usage

    • Darker red indicates higher power usage
    • Darker blue indicates lower power usage

Reading Example (HourEnd 9 - Very Cold)¶

  • The average electricity usage for the hour ending at 9 AM during very cold temperatures is approximately 35490.8 kWh.
  • The corresponding color intensity in the heatmap suggests that this is on the higher end of power usage compared to other hours or temperature categories.
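
The HourEnd 9 / Very Cold cell described above can be reproduced directly from the data; a minimal sketch, assuming new_df and its TempCategory column from the heatmap cell are in scope:

# Mean NewRiver usage for the hour ending at 9 AM on days categorized as Very Cold
mask = (new_df['HourEnd'] == 9) & (new_df['TempCategory'] == 'Very Cold\n (< 32°F)')
new_df.loc[mask, 'NewRiver'].mean()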

Findings¶

  • Power usage is highest on very cold days due to heating
  • Power usage is somewhat high on warm days due to cooling
    • The power usage is still not as high as on very cold days because a lot of apartment complexes do not have air conditioning, and people may resort to opening windows since Boone does not get unbearably hot

Specific Findings¶

  • Power usage is at its highest from HourEnd 9 to 12 on Very Cold days
  • Power usage is at its lowest from HourEnd 3 to 6 on Moderately Warm days

Correlation between average temperature and electricity usage¶

In [475]:
correlation = new_df['NewRiver'].corr(new_df['AverageAirTemp(F)'])

correlation
Out[475]:
-0.4504009638977822
In [476]:
corr = daily_avg_power_df['AverageDailyPowerUsage'].corr(daily_avg_power_df['AverageAirTemp(F)'])

corr
Out[476]:
-0.60649562604115
In [477]:
# This looks chaotic so don't use this

daily_avg_line = daily_avg_power_df.plot.line(x = 'AverageAirTemp(F)', y = 'AverageDailyPowerUsage')
In [478]:
sns.lmplot(x = 'AverageAirTemp(F)', y = 'AverageDailyPowerUsage', data = daily_avg_power_df, aspect = 1.5, line_kws = {"color": "red"})

plt.title('Correlation between Average Temperature and Average Power Usage')
plt.xlabel('Average Air Temperature (°F)')
plt.ylabel('Average Daily Power Usage')

# show the correlation coefficient that was calculated earlier
plt.text(85, 20000, 'Correlation:' + str(round(corr,3)), fontsize = 12, ha = 'left', va = 'center')

plt.show()

Overall Meaning¶

  • As average air temperature increases, average daily power usage decreases
  • The red trend line shows this negative relationship

Correlation Coefficient (-0.606)¶

  • This correlation suggests a moderate to strong negative correlation between temperature and power usage

Alex's Work¶

Concord¶

In [479]:
concord = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Concord.csv")

demand = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Demand_All.csv")

concord.head()

constant_columns = [col for col in concord.columns if concord[col].nunique() == 1]
print(f"Constant columns: {constant_columns}")
Constant columns: ['WT01', 'WT03', 'WT04', 'WT05', 'WT06', 'WT11']
In [480]:
df.head()
Out[480]:
UTC Eastern Year OrdDay OrdHr Weekday Month Day HourEnd DST Concord Greenwood NewRiver KingsMountain Winterville Total Peak Date
0 2015-01-01 06:00:00+00:00 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 0 97098 33667 27333 14393 8060 180550 0 2015-01-01
1 2015-01-01 07:00:00+00:00 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 0 96620 33223 27048 14306 8307 179504 0 2015-01-01
2 2015-01-01 08:00:00+00:00 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 0 96084 32849 27008 14270 8174 178385 0 2015-01-01
3 2015-01-01 09:00:00+00:00 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 0 95829 33053 26821 14285 8333 178320 0 2015-01-01
4 2015-01-01 10:00:00+00:00 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 0 98489 33686 26924 14450 8396 181946 0 2015-01-01
In [481]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']]).dt.strftime('%m-%d-%Y')
df['Date'] = pd.to_datetime(df['Date'])
daily_energy_usage_concord = df.groupby(df['Date'].dt.date)['Concord'].sum()

daily_energy_usage_concord_df = daily_energy_usage_concord.reset_index()

daily_energy_usage_concord_df.drop(columns='Date', inplace=True)

daily_energy_usage_concord_series = daily_energy_usage_concord_df['Concord']

print(daily_energy_usage_concord_series.head())
0    2333478
1    2260705
2    2049778
3    1944530
4    2384060
Name: Concord, dtype: int64
In [482]:
df.head()
Out[482]:
UTC Eastern Year OrdDay OrdHr Weekday Month Day HourEnd DST Concord Greenwood NewRiver KingsMountain Winterville Total Peak Date
0 2015-01-01 06:00:00+00:00 2015-01-01 01:00:00-05:00 2015 1 1 4 1 1 1 0 97098 33667 27333 14393 8060 180550 0 2015-01-01
1 2015-01-01 07:00:00+00:00 2015-01-01 02:00:00-05:00 2015 1 2 4 1 1 2 0 96620 33223 27048 14306 8307 179504 0 2015-01-01
2 2015-01-01 08:00:00+00:00 2015-01-01 03:00:00-05:00 2015 1 3 4 1 1 3 0 96084 32849 27008 14270 8174 178385 0 2015-01-01
3 2015-01-01 09:00:00+00:00 2015-01-01 04:00:00-05:00 2015 1 4 4 1 1 4 0 95829 33053 26821 14285 8333 178320 0 2015-01-01
4 2015-01-01 10:00:00+00:00 2015-01-01 05:00:00-05:00 2015 1 5 4 1 1 5 0 98489 33686 26924 14450 8396 181946 0 2015-01-01
In [483]:
concord['DATE'] = pd.to_datetime(concord['DATE'])
df['HourEnd'] = pd.to_datetime(df['HourEnd'])

concord['Month'] = concord['DATE'].dt.month
concord['Day'] = concord['DATE'].dt.day
concord['Year'] = concord['DATE'].dt.year
concord['Concord'] = daily_energy_usage_concord_series

print(concord.head())
       STATION            NAME       DATE  AWND  DAPR  MDPR  PRCP  SNOW  SNWD  \
0  USC00311975  CONCORD, NC US 2015-01-01  0.00   NaN   NaN  0.00   0.0   0.0   
1  USC00311975  CONCORD, NC US 2015-01-02  2.91   NaN   NaN  0.00   0.0   0.0   
2  USC00311975  CONCORD, NC US 2015-01-03  2.91   NaN   NaN  0.18   0.0   0.0   
3  USC00311975  CONCORD, NC US 2015-01-04  2.91   NaN   NaN  0.17   0.0   0.0   
4  USC00311975  CONCORD, NC US 2015-01-05  2.91   NaN   NaN  0.22   0.0   0.0   

   TMAX  ...  WT01  WT03  WT04  WT05  WT06  WT11  Month  Day  Year    Concord  
0  47.0  ...   NaN   NaN   NaN   NaN   NaN   NaN      1    1  2015  2333478.0  
1  54.0  ...   NaN   NaN   NaN   NaN   NaN   NaN      1    2  2015  2260705.0  
2  52.0  ...   1.0   NaN   NaN   NaN   NaN   NaN      1    3  2015  2049778.0  
3  51.0  ...   1.0   NaN   NaN   NaN   NaN   NaN      1    4  2015  1944530.0  
4  65.0  ...   1.0   NaN   NaN   NaN   NaN   NaN      1    5  2015  2384060.0  

[5 rows x 22 columns]
In [484]:
weather_data_types = concord.dtypes
weather_data_types
Out[484]:
STATION            object
NAME               object
DATE       datetime64[ns]
AWND              float64
DAPR              float64
MDPR              float64
PRCP              float64
SNOW              float64
SNWD              float64
TMAX              float64
TMIN              float64
TOBS              float64
WT01              float64
WT03              float64
WT04              float64
WT05              float64
WT06              float64
WT11              float64
Month               int64
Day                 int64
Year                int64
Concord           float64
dtype: object
In [485]:
concord = concord.drop(columns=['STATION', 'NAME', 'DATE','WT01', 'WT03', 'WT04', 'WT05', 'WT06', 'WT11'])

concord.head()
Out[485]:
AWND DAPR MDPR PRCP SNOW SNWD TMAX TMIN TOBS Month Day Year Concord
0 0.00 NaN NaN 0.00 0.0 0.0 47.0 24.0 25.0 1 1 2015 2333478.0
1 2.91 NaN NaN 0.00 0.0 0.0 54.0 25.0 41.0 1 2 2015 2260705.0
2 2.91 NaN NaN 0.18 0.0 0.0 52.0 41.0 46.0 1 3 2015 2049778.0
3 2.91 NaN NaN 0.17 0.0 0.0 51.0 46.0 48.0 1 4 2015 1944530.0
4 2.91 NaN NaN 0.22 0.0 0.0 65.0 38.0 39.0 1 5 2015 2384060.0
In [486]:
# Fill missing values with the mean
columns_to_fill = ['DAPR', 'MDPR', 'AWND', 'PRCP']
for col in columns_to_fill:
    concord[col] = concord[col].fillna(concord[col].mean())
In [487]:
# Check for any remaining missing values
missing_values = concord.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after filling: \n", missing_columns)
Missing values after filling: 
 SNOW       1546
SNWD       2779
TMAX       2782
TMIN       2782
TOBS       2782
Concord    2783
dtype: int64
In [488]:
columns_to_check = ['AWND', 'DAPR', 'MDPR', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS']
for col in columns_to_check:
    concord[col] = concord[col].fillna(concord[col].mean())

# Check again for any remaining missing values
missing_values = concord.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after second filling: \n", missing_columns)
Missing values after second filling: 
 Concord    2783
dtype: int64
In [489]:
# Replace infinite values with NaN
concord.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill any resulting NaNs with column means
for col in concord.columns:
    concord[col] = concord[col].fillna(concord[col].mean())
    
# Check again for any remaining missing or infinite values
print("Final missing values check: \n", concord.isnull().sum())
print("Final infinite values check: \n", np.isinf(concord).sum())
Final missing values check: 
 AWND       0
DAPR       0
MDPR       0
PRCP       0
SNOW       0
SNWD       0
TMAX       0
TMIN       0
TOBS       0
Month      0
Day        0
Year       0
Concord    0
dtype: int64
Final infinite values check: 
 AWND       0
DAPR       0
MDPR       0
PRCP       0
SNOW       0
SNWD       0
TMAX       0
TMIN       0
TOBS       0
Month      0
Day        0
Year       0
Concord    0
dtype: int64
In [490]:
print(concord)
          AWND      DAPR   MDPR  PRCP      SNOW      SNWD       TMAX  \
0     0.000000  2.909091  0.739  0.00  0.000000  0.000000  47.000000   
1     2.910000  2.909091  0.739  0.00  0.000000  0.000000  54.000000   
2     2.910000  2.909091  0.739  0.18  0.000000  0.000000  52.000000   
3     2.910000  2.909091  0.739  0.17  0.000000  0.000000  51.000000   
4     2.910000  2.909091  0.739  0.22  0.000000  0.000000  65.000000   
...        ...       ...    ...   ...       ...       ...        ...   
4970  2.395222  2.909091  0.739  0.00  0.000000  0.025956  73.952576   
4971  2.395222  2.909091  0.739  0.00  0.000000  0.025956  73.952576   
4972  2.395222  2.909091  0.739  0.00  0.000000  0.025956  73.952576   
4973  2.395222  2.909091  0.739  0.03  0.008516  0.025956  73.952576   
4974  2.395222  2.909091  0.739  0.06  0.008516  0.025956  73.952576   

           TMIN       TOBS  Month  Day  Year       Concord  
0     24.000000  25.000000      1    1  2015  2.333478e+06  
1     25.000000  41.000000      1    2  2015  2.260705e+06  
2     41.000000  46.000000      1    3  2015  2.049778e+06  
3     46.000000  48.000000      1    4  2015  1.944530e+06  
4     38.000000  39.000000      1    5  2015  2.384060e+06  
...         ...        ...    ...  ...   ...           ...  
4970  50.076151  54.113543     12   28  2020  2.549247e+06  
4971  50.076151  54.113543     12   29  2020  2.549247e+06  
4972  50.076151  54.113543     12   30  2020  2.549247e+06  
4973  50.076151  54.113543     12   31  2020  2.549247e+06  
4974  50.076151  54.113543      1    1  2021  2.549247e+06  

[4975 rows x 13 columns]
In [491]:
concord_2 = pd.concat([concord], axis = 1)

#Create the correlation matrix
correlation_matrix = concord_2.corr()

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix for Concord')
plt.show()

Multiple Regression Model¶

In [492]:
import statsmodels.api as sm

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]

# Define the independent variables (all columns except 'Concord')
X = df_summer.drop(['Concord'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Concord   R-squared:                       0.636
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     309.5
Date:                Fri, 16 Aug 2024   Prob (F-statistic):               0.00
Time:                        10:29:44   Log-Likelihood:                -28951.
No. Observations:                2136   AIC:                         5.793e+04
Df Residuals:                    2123   BIC:                         5.800e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5.224e+06   5.02e+06     -1.041      0.298   -1.51e+07    4.62e+06
AWND       -1.146e+04   1990.437     -5.758      0.000   -1.54e+04   -7556.837
DAPR        2323.4023   5.43e+04      0.043      0.966   -1.04e+05    1.09e+05
MDPR       -1510.8804    7.8e+04     -0.019      0.985   -1.55e+05    1.52e+05
PRCP       -1.735e+04   9959.945     -1.742      0.082   -3.69e+04    2180.169
SNOW        6.235e+05   1.34e+06      0.464      0.642   -2.01e+06    3.26e+06
SNWD         1.54e+07   6.77e+05     22.754      0.000    1.41e+07    1.67e+07
TMAX        1.813e+04   1308.710     13.851      0.000    1.56e+04    2.07e+04
TMIN        1799.2123   3416.860      0.527      0.599   -4901.531    8499.956
TOBS        2.798e+04   3652.518      7.660      0.000    2.08e+04    3.51e+04
Month       7862.8203   2962.735      2.654      0.008    2052.654    1.37e+04
Day          568.1247    459.370      1.237      0.216    -332.738    1468.987
Year        2173.1386   2483.950      0.875      0.382   -2698.091    7044.368
==============================================================================
Omnibus:                      236.286   Durbin-Watson:                   0.778
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              936.030
Skew:                          -0.485   Prob(JB):                    5.54e-204
Kurtosis:                       6.094   Cond. No.                     2.51e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.51e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

The regression model explains approximately 63.6% of the variance in peak energy demand (Concord), with a statistically significant F-statistic indicating that the model as a whole is effective. Key predictors include AWND (average wind speed), SNWD (snow depth), and TMAX (maximum temperature), which are statistically significant and demonstrate meaningful relationships with peak energy demand. Specifically, higher wind speeds are associated with lower peak demand, while greater snow depth and maximum temperatures are linked to higher peak demand. While precipitation shows a marginal impact, it is not statistically significant, and other variables like DAPR, MDPR, and TMIN may not contribute meaningfully to the model. Multicollinearity and autocorrelation issues suggest that the model might be refined by addressing these factors and exploring additional predictors.
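
One way to quantify the multicollinearity flagged in the notes above is with variance inflation factors; a minimal sketch, assuming X (the summer predictors with the constant added) from the cell above is still in scope; the VIF reported for the constant term itself can be ignored:

from statsmodels.stats.outliers_influence import variance_inflation_factor

X_float = X.astype(float)

# VIF for each predictor; values well above ~10 suggest strong multicollinearity
vif = pd.DataFrame({
    'Variable': X_float.columns,
    'VIF': [variance_inflation_factor(X_float.values, i) for i in range(X_float.shape[1])]
})
print(vif)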

In [493]:
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]

selected_columns = ['AWND', 'SNWD', 'TMAX', 'TOBS', 'Month']

X = df_summer[selected_columns]

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Concord   R-squared:                       0.635
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     742.2
Date:                Fri, 16 Aug 2024   Prob (F-statistic):               0.00
Time:                        10:29:45   Log-Likelihood:                -28954.
No. Observations:                2136   AIC:                         5.792e+04
Df Residuals:                    2130   BIC:                         5.795e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -8.481e+05   7.89e+04    -10.752      0.000      -1e+06   -6.93e+05
AWND       -1.127e+04   1977.182     -5.702      0.000   -1.52e+04   -7396.010
SNWD        1.554e+07    6.2e+05     25.058      0.000    1.43e+07    1.68e+07
TMAX        1.848e+04   1277.373     14.469      0.000     1.6e+04     2.1e+04
TOBS        2.953e+04   1459.892     20.225      0.000    2.67e+04    3.24e+04
Month       7932.5183   2910.959      2.725      0.006    2223.900    1.36e+04
==============================================================================
Omnibus:                      231.247   Durbin-Watson:                   0.776
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              930.748
Skew:                          -0.467   Prob(JB):                    7.77e-203
Kurtosis:                       6.096   Cond. No.                     1.56e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.56e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

The updated regression model for predicting peak energy demand during summer months (May, June, July, August, September) accounts for approximately 63.5% of the variance in Concord and demonstrates overall effectiveness with a statistically significant F-statistic. Key predictors include AWND (Average Wind Speed) with a coefficient of -11,270.00, indicating that higher wind speeds reduce peak energy demand, and SNWD (Snow Depth) with a coefficient of 15,540,000.00, showing that increased snow depth raises peak demand. TMAX (Maximum Temperature) and TOBS (Temperature at Observation) have coefficients of 18,480.00 and 29,530.00, respectively, suggesting that higher temperatures lead to increased peak energy demand. The Month variable also has a significant positive impact with a coefficient of 7,932.52. However, the model may suffer from multicollinearity, as suggested by the large condition number (15,600), and positive autocorrelation in the residuals, indicated by a Durbin-Watson statistic of 0.776, which suggests potential patterns not captured by the model. Addressing these issues could improve the model's accuracy and reliability.

In [494]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = concord_2[(concord_2['Month'] == 11) | (concord_2['Month'] <= 3)]

# Define the independent variables (all columns except 'Concord')
X = df_winter.drop(['Concord'], axis=1)

# Define the dependent variable
y = df_winter['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Concord   R-squared:                       0.153
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     23.60
Date:                Fri, 16 Aug 2024   Prob (F-statistic):           6.50e-49
Time:                        10:29:45   Log-Likelihood:                -21618.
No. Observations:                1581   AIC:                         4.326e+04
Df Residuals:                    1568   BIC:                         4.333e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3.899e+07   6.73e+06     -5.795      0.000   -5.22e+07   -2.58e+07
AWND       -2830.7719   2634.494     -1.075      0.283   -7998.274    2336.730
DAPR        1.597e+05   1.71e+05      0.935      0.350   -1.75e+05    4.95e+05
MDPR       -3.168e+05   1.87e+05     -1.697      0.090   -6.83e+05    4.94e+04
PRCP       -1.175e+04   1.58e+04     -0.744      0.457   -4.28e+04    1.92e+04
SNOW       -3.136e+04   3.91e+04     -0.802      0.422   -1.08e+05    4.53e+04
SNWD        1.314e+05   2.84e+04      4.625      0.000    7.57e+04    1.87e+05
TMAX       -2235.1571   1027.604     -2.175      0.030   -4250.779    -219.535
TMIN        8643.5301   1642.407      5.263      0.000    5421.985    1.19e+04
TOBS       -1.217e+04   1546.167     -7.872      0.000   -1.52e+04   -9138.921
Month      -5082.8420   1329.769     -3.822      0.000   -7691.155   -2474.529
Day        -1099.6557    612.037     -1.797      0.073   -2300.153     100.842
Year        2.063e+04   3318.292      6.218      0.000    1.41e+04    2.71e+04
==============================================================================
Omnibus:                      136.788   Durbin-Watson:                   0.410
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              335.644
Skew:                          -0.498   Prob(JB):                     1.31e-73
Kurtosis:                       5.026   Cond. No.                     2.56e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.56e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

The regression model for Concord demand during the winter months (November through March) explains approximately 15.3% of the variance in Concord. The model is statistically significant overall, as indicated by the F-statistic of 23.60 and its associated p-value. Significant predictors include SNWD (Snow Depth), with a coefficient of 131,400.00, indicating that greater snow depth is associated with higher demand, and TMIN (Minimum Temperature), whose positive coefficient of 8,643.53 implies that, within this winter subset, days with higher minimum temperatures carry higher demand once the other variables are held fixed; TOBS, by contrast, has a significant negative coefficient, as does Month (-5,082.84). Several predictors, including AWND (Average Wind Speed), DAPR (days in the multiday precipitation total), and PRCP (Precipitation), are not statistically significant. The large condition number (2,560,000) points to multicollinearity, and the Durbin-Watson statistic of 0.410 indicates positive autocorrelation in the residuals. Addressing the multicollinearity and the autocorrelation, or exploring alternative predictors and modeling techniques, should improve the model.
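
The low Durbin-Watson statistic mainly distorts the standard errors rather than the coefficient estimates. A minimal, hedged sketch of one mitigation is to refit the same winter model with HAC (Newey-West) standard errors; it reuses the X and y already built in the cell above, and the choice of maxlags=24 is an illustrative assumption, not a tuned value.

# Sketch: same winter OLS, but with heteroskedasticity-and-autocorrelation-
# consistent (HAC / Newey-West) standard errors. maxlags=24 is illustrative.
model_hac = sm.OLS(y, X.astype(float)).fit(cov_type='HAC', cov_kwds={'maxlags': 24})
print(model_hac.summary())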

In [495]:
# Filter the DataFrame to include only the Spring months (April)
df_spring = concord_2[concord_2['Month'] == 4]

# Define the independent variables (all columns except 'Concord')
X = df_spring.drop(['Concord'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_spring['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Concord   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.487
Method:                 Least Squares   F-statistic:                     41.43
Date:                Fri, 16 Aug 2024   Prob (F-statistic):           4.90e-51
Time:                        10:29:46   Log-Likelihood:                -5196.2
No. Observations:                 384   AIC:                         1.041e+04
Df Residuals:                     374   BIC:                         1.045e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
AWND        -1.15e+04   3915.995     -2.936      0.004   -1.92e+04   -3795.637
DAPR        1.366e+06   1.49e+06      0.918      0.359   -1.56e+06    4.29e+06
MDPR        3.471e+05   3.78e+05      0.918      0.359   -3.96e+05    1.09e+06
PRCP       -1.813e+04   2.45e+04     -0.740      0.459   -6.63e+04       3e+04
SNOW         1.12e+06   3.27e+06      0.342      0.732   -5.31e+06    7.55e+06
SNWD        1.276e+07   9.69e+05     13.174      0.000    1.09e+07    1.47e+07
TMAX        1877.1225   2145.348      0.875      0.382   -2341.333    6095.578
TMIN       -1755.3361   3321.921     -0.528      0.598   -8287.319    4776.647
TOBS        7872.6129   3230.893      2.437      0.015    1519.620    1.42e+04
Month       1.879e+06   2.05e+06      0.918      0.359   -2.14e+06     5.9e+06
Day         1297.6199   1123.012      1.155      0.249    -910.590    3505.829
Year       -4953.7791   6338.065     -0.782      0.435   -1.74e+04    7508.930
==============================================================================
Omnibus:                       17.211   Durbin-Watson:                   0.464
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               39.270
Skew:                           0.151   Prob(JB):                     2.97e-09
Kurtosis:                       4.537   Cond. No.                     8.30e+17
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.28e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [496]:
# Filter the DataFrame to include only the Fall months (October)
df_fall = concord_2[concord_2['Month'] == 10]

# Define the independent variables (all columns except 'Concord')
X = df_fall.drop(['Concord'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_fall['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Concord   R-squared:                       0.493
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                     44.64
Date:                Fri, 16 Aug 2024   Prob (F-statistic):           1.14e-55
Time:                        10:29:46   Log-Likelihood:                -5680.4
No. Observations:                 423   AIC:                         1.138e+04
Df Residuals:                     413   BIC:                         1.142e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
AWND       -8995.2467   4243.708     -2.120      0.035   -1.73e+04    -653.286
DAPR       -3.108e+05   2.67e+05     -1.166      0.244   -8.35e+05    2.13e+05
MDPR       -7.896e+04   6.77e+04     -1.166      0.244   -2.12e+05    5.41e+04
PRCP       -2.378e+04   1.88e+04     -1.262      0.208   -6.08e+04    1.33e+04
SNOW        1.883e+05   2.77e+06      0.068      0.946   -5.26e+06    5.64e+06
SNWD        1.036e+07   7.85e+05     13.190      0.000    8.82e+06    1.19e+07
TMAX        2194.7395   2059.582      1.066      0.287   -1853.831    6243.310
TMIN         1.61e+04   3808.618      4.226      0.000    8608.523    2.36e+04
TOBS       -1449.4191   3497.456     -0.414      0.679   -8324.455    5425.616
Month      -1.068e+06   9.16e+05     -1.166      0.244   -2.87e+06    7.32e+05
Day          487.0522    972.886      0.501      0.617   -1425.373    2399.478
Year        6469.2604   4952.167      1.306      0.192   -3265.335    1.62e+04
==============================================================================
Omnibus:                      102.146   Durbin-Watson:                   0.621
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              427.283
Skew:                           0.997   Prob(JB):                     1.65e-93
Kurtosis:                       7.502   Cond. No.                     5.03e+18
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.82e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Taken together, the spring (April) and fall (October) models explain roughly half of the variance in Concord (R-squared of 0.499 and 0.493, respectively). In April, AWND, SNWD, and TOBS are statistically significant; in October, AWND, SNWD, and TMIN are, while the remaining weather and calendar variables are not. The extremely large condition numbers (8.30e+17 and 5.03e+18) and the "smallest eigenvalue" warnings indicate a nearly singular design matrix, which is expected in a single-month subset where Month is constant and the snow columns are almost always zero, so the very large SNWD coefficients in particular should be treated with caution. The low Durbin-Watson statistics (0.464 and 0.621) also point to positive autocorrelation in the residuals. Removing the redundant columns (see the sketch below) or adding better predictors would likely improve both models.
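
A simple guard against the singular-design warnings in these single-month subsets is to drop zero-variance columns (Month is constant, and SNOW/SNWD barely vary) before adding the constant. The following is only a sketch, assuming df_fall from the cell above is still in scope; the same pattern would apply to df_spring.

# Sketch: drop constant (zero-variance) columns such as Month before fitting,
# so the design matrix is no longer singular. Assumes df_fall is in scope.
X_fall = df_fall.drop(['Concord'], axis=1)
X_fall = X_fall.replace([np.inf, -np.inf], np.nan).dropna()
X_fall = X_fall.loc[:, X_fall.std() > 0]        # remove zero-variance predictors
X_fall = sm.add_constant(X_fall)

y_fall = df_fall.loc[X_fall.index, 'Concord']
model_fall = sm.OLS(y_fall, X_fall.astype(float)).fit()
print(model_fall.summary())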

Logistic Regression¶

In [497]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]

# Create a binary dependent variable (0 for not peak, 1 for peak)
# Define a threshold for what is considered a peak
threshold = df_summer['Concord'].quantile(0.9)  # For example, the top 10% of values are considered peak
df_summer = df_summer.copy()  # To avoid SettingWithCopyWarning
df_summer['Peak'] = (df_summer['Concord'] > threshold).astype(int)

# Define the independent variables (all columns except 'Concord' and 'Peak')
X_summer = df_summer.drop(['Concord', 'Peak'], axis=1)

# Handle missing values by replacing infinities and NaNs
X_summer = X_summer.replace([np.inf, -np.inf], np.nan).dropna()

# Standardize the independent variables
scaler = StandardScaler()
X_scaled_summer = scaler.fit_transform(X_summer)
X_scaled_summer = pd.DataFrame(X_scaled_summer, columns=X_summer.columns)  # Keep column names

# Check for multicollinearity using VIF
def calculate_vif(df):
    vif = pd.DataFrame()
    vif["Variable"] = df.columns
    vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif

vif_df = calculate_vif(X_scaled_summer)
print(vif_df)

# Drop variables with high VIF (threshold of 5 or 10 is commonly used)
high_vif_columns = vif_df[vif_df['VIF'] > 5]['Variable']
X_scaled_summer = X_scaled_summer.drop(high_vif_columns, axis=1)

# Add a constant to the independent variables (required for the regression model)
X_scaled_summer = sm.add_constant(X_scaled_summer)

# Define the dependent variable
y_summer = df_summer.loc[X_summer.index, 'Peak']

# Ensure indices are aligned by reindexing y_summer
y_summer = y_summer.reset_index(drop=True)
X_scaled_summer = X_scaled_summer.reset_index(drop=True)

# Fit the logistic regression model
logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100)  # Increase maxiter

# Get the summary of the regression
print(logit_model_summer.summary())
   Variable        VIF
0      AWND   1.066291
1      DAPR   1.117928
2      MDPR   1.117425
3      PRCP   1.120202
4      SNOW   1.732618
5      SNWD   4.626884
6      TMAX   7.708757
7      TMIN  53.308442
8      TOBS  52.243191
9     Month   1.053460
10      Day   1.004712
11     Year   1.085283
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.224672
         Iterations: 100
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                 2136
Model:                          Logit   Df Residuals:                     2126
Method:                           MLE   Df Model:                            9
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.3098
Time:                        10:29:47   Log-Likelihood:                -479.90
converged:                      False   LL-Null:                       -695.26
Covariance Type:            nonrobust   LLR p-value:                 3.794e-87
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -15.5869   1.01e+04     -0.002      0.999   -1.97e+04    1.97e+04
AWND          -0.3236      0.070     -4.623      0.000      -0.461      -0.186
DAPR          -0.2686   4833.379  -5.56e-05      1.000   -9473.517    9472.980
MDPR          -0.0727   5317.784  -1.37e-05      1.000   -1.04e+04    1.04e+04
PRCP          -0.1144      0.093     -1.226      0.220      -0.297       0.069
SNOW           0.8046   1.78e+04   4.51e-05      1.000    -3.5e+04     3.5e+04
SNWD         -12.9022   1.76e+04     -0.001      0.999   -3.46e+04    3.46e+04
Month          0.2047      0.081      2.536      0.011       0.047       0.363
Day            0.0884      0.079      1.117      0.264      -0.067       0.243
Year           0.0262      0.079      0.332      0.740      -0.129       0.181
==============================================================================

Possibly complete quasi-separation: A fraction 0.57 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

The provided code and results detail a logistic regression analysis on summer energy usage data, focusing on identifying peak demand days. A multicollinearity check revealed that variables like TMIN and TOBS have high VIF values, indicating redundancy. The logistic model, with a dependent variable indicating peak days and weather-related independent variables, showed a pseudo R-squared of 0.3098, suggesting moderate model fit. The model also indicated that higher wind speeds (AWND) correlate with a lower likelihood of peak demand days, while the month variable showed seasonal variation in peak likelihood. However, convergence issues and a warning about quasi-separation highlight potential problems, such as perfect prediction of some outcomes and difficulty estimating coefficients. The findings suggest a need for further model refinement, possibly by addressing multicollinearity, exploring different modeling techniques, or using more data to improve reliability and interpretability.
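
Given the convergence warning and the quasi-separation note, one common fallback is a penalized (L1-regularized) fit, which keeps coefficients finite even when some predictors, such as the rarely nonzero snow variables, nearly separate the classes. The sketch below reuses y_summer and X_scaled_summer from the cell above; the penalty weight alpha=1.0 is an illustrative assumption, not a tuned value.

# Sketch: L1-regularized logistic regression as a hedge against the
# quasi-separation seen above. alpha=1.0 is an assumed penalty weight.
logit_l1 = sm.Logit(y_summer, X_scaled_summer).fit_regularized(method='l1', alpha=1.0)
print(logit_l1.params)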

Logistic Regression with Interaction Term¶

In [498]:
from sklearn.preprocessing import StandardScaler

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = concord_2['Concord'].quantile(0.9)
concord_2['Peak'] = (concord_2['Concord'] > threshold).astype(int)

# Define the independent variables (all columns except 'Concord')
X = concord_2.drop(['Concord'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Reset index of X_scaled to ensure alignment with y
X_scaled.reset_index(drop=True, inplace=True)

# Define the dependent variable 'Peak', ensuring it aligns with the cleaned X
y = concord_2.loc[X.index, 'Peak']

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
---------------------------------------------------------------------------
PerfectSeparationError                    Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\3343644798.py in <module>
     26 
     27 # Fit the logistic regression model
---> 28 logit_model = sm.Logit(y, X_scaled).fit()
     29 
     30 # Get the summary of the regression

~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)
   1981     def fit(self, start_params=None, method='newton', maxiter=35,
   1982             full_output=1, disp=1, callback=None, **kwargs):
-> 1983         bnryfit = super().fit(start_params=start_params,
   1984                               method=method,
   1985                               maxiter=maxiter,

~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)
    228             pass  # TODO: make a function factory to have multiple call-backs
    229 
--> 230         mlefit = super().fit(start_params=start_params,
    231                              method=method,
    232                              maxiter=maxiter,

~\anaconda3\lib\site-packages\statsmodels\base\model.py in fit(self, start_params, method, maxiter, full_output, disp, fargs, callback, retall, skip_hessian, **kwargs)
    561 
    562         optimizer = Optimizer()
--> 563         xopt, retvals, optim_settings = optimizer._fit(f, score, start_params,
    564                                                        fargs, kwargs,
    565                                                        hessian=hess,

~\anaconda3\lib\site-packages\statsmodels\base\optimizer.py in _fit(self, objective, gradient, start_params, fargs, kwargs, hessian, method, maxiter, full_output, disp, callback, retall)
    239 
    240         func = fit_funcs[method]
--> 241         xopt, retvals = func(objective, gradient, start_params, fargs, kwargs,
    242                              disp=disp, maxiter=maxiter, callback=callback,
    243                              retall=retall, full_output=full_output,

~\anaconda3\lib\site-packages\statsmodels\base\optimizer.py in _fit_newton(f, score, start_params, fargs, kwargs, disp, maxiter, callback, retall, full_output, hess, ridge_factor)
    441             history.append(newparams)
    442         if callback is not None:
--> 443             callback(newparams)
    444         iterations += 1
    445     fval = f(newparams, *fargs)  # this is the negative likelihood

~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in _check_perfect_pred(self, params, *args)
    212                 np.allclose(fittedvalues - endog, 0)):
    213             msg = "Perfect separation detected, results not available"
--> 214             raise PerfectSeparationError(msg)
    215 
    216     @Appender(base.LikelihoodModel.fit.__doc__)

PerfectSeparationError: Perfect separation detected, results not available
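
The traceback above is expected given how X was built: concord_2 now contains the Peak flag that was just derived from Concord, and because only Concord is dropped, Peak remains among the predictors and predicts itself perfectly. A minimal sketch of the adjustment, which the seasonal models below effectively apply, is to exclude Peak (and Concord) from the design matrix; multicollinearity among the temperature variables may still require the VIF-based pruning used earlier.

# Sketch: exclude the target-derived 'Peak' column (and 'Concord') from the
# predictors so the classes are no longer separated by the target itself.
X = concord_2.drop(['Concord', 'Peak'], axis=1)
X = X.replace([np.inf, -np.inf], np.nan).dropna()

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X),
                        columns=X.columns, index=X.index)
X_scaled = sm.add_constant(X_scaled)

y = concord_2.loc[X.index, 'Peak']
# Convergence issues may remain; maxiter=100 mirrors the summer cell above.
logit_model = sm.Logit(y, X_scaled).fit(maxiter=100)
print(logit_model.summary())
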
In [499]:
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_summer['Concord'].quantile(0.9)
df_summer['Peak'] = (df_summer['Concord'] > threshold).astype(int)

# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_summer.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_summer.loc[X.index, 'Peak']

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.171261
         Iterations 11
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                 2136
Model:                          Logit   Df Residuals:                     2130
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.4738
Time:                        10:29:52   Log-Likelihood:                -365.81
converged:                       True   LL-Null:                       -695.26
Covariance Type:            nonrobust   LLR p-value:                3.803e-140
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.6339      0.428    -13.164      0.000      -6.473      -4.795
PRCP          -0.9820      3.287     -0.299      0.765      -7.425       5.461
TMIN           3.3166      0.292     11.365      0.000       2.745       3.889
Day            0.0724      0.092      0.785      0.432      -0.108       0.253
Year          -0.0379      0.091     -0.417      0.677      -0.216       0.140
PRCP_TMIN      0.7004      2.729      0.257      0.797      -4.649       6.050
==============================================================================

The logistic regression model developed to predict peak energy demand days during the summer months (May to September) reveals several insights. The model uses various weather-related and temporal variables, including an interaction term between precipitation (PRCP) and minimum temperature (TMIN). Key metrics include a Pseudo R-squared value of 0.4738, indicating that approximately 47.38% of the variance in peak demand days is explained by the model, and a very low LLR p-value (3.803e-140), suggesting the predictors collectively have a significant impact. The coefficient for TMIN is statistically significant and positive (3.3166), demonstrating that higher minimum temperatures are strongly associated with increased peak energy demand, likely due to greater cooling needs. In contrast, PRCP, the Day variable, Year, and the PRCP_TMIN interaction term do not show statistically significant effects on peak demand. The high Pseudo R-squared value reflects the model’s reasonable explanatory power, though the insignificance of some predictors warrants cautious interpretation.
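
Because the predictors were standardized, the TMIN coefficient is a change in log-odds per one standard deviation of minimum temperature; exponentiating turns it into an odds ratio. A short sketch, reusing the fitted logit_model from the cell above:

# Sketch: convert the standardized TMIN coefficient into an odds ratio
# (change in the odds of a peak day per one standard deviation of TMIN).
odds_ratios = np.exp(logit_model.params)
print(odds_ratios['TMIN'])   # roughly exp(3.3166) ≈ 27.6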

In [500]:
# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = concord_2[(concord_2['Month'] >= 11) | (concord_2['Month'] <= 3)].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_winter['Concord'].quantile(0.9)
df_winter['Peak'] = (df_winter['Concord'] > threshold).astype(int)

# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_winter.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_winter.loc[X.index, 'Peak']

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.129889
         Iterations 11
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                 2032
Model:                          Logit   Df Residuals:                     2026
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.6015
Time:                        10:29:53   Log-Likelihood:                -263.93
converged:                       True   LL-Null:                       -662.32
Covariance Type:            nonrobust   LLR p-value:                5.753e-170
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.6464      0.443    -12.750      0.000      -6.514      -4.778
PRCP           3.3993      1.780      1.910      0.056      -0.089       6.888
TMIN          -3.0349      0.240    -12.644      0.000      -3.505      -2.564
Day           -0.0974      0.112     -0.872      0.383      -0.316       0.121
Year           0.1525      0.110      1.391      0.164      -0.062       0.367
PRCP_TMIN     -4.5966      2.750     -1.671      0.095      -9.987       0.794
==============================================================================

The logistic regression model for winter months (November to March) shows a good fit with a Pseudo R-squared value of 0.6015, indicating that about 60.15% of the variance in peak energy demand days is explained by the model. The very low LLR p-value (5.753e-170) confirms the model's overall statistical significance. The intercept is -5.6464, implying low base odds for a peak day when all predictors are zero. Precipitation (PRCP) has a coefficient of 3.3993, nearing significance (p-value = 0.056), suggesting it might increase the likelihood of peak days, though not definitively. Minimum temperature (TMIN) shows a significant negative coefficient of -3.0349, indicating that higher minimum temperatures are linked to a lower probability of peak days, which may be influenced by heating needs. Day and year variables are not statistically significant (p-values of 0.383 and 0.164, respectively), suggesting minimal impact on peak days. The interaction term PRCP_TMIN has a coefficient of -4.5966 and a p-value of 0.095, indicating a potential but not statistically significant interaction effect. Overall, while TMIN is a strong predictor, the impacts of precipitation and its interaction with temperature merit further exploration.
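
Pseudo R-squared alone does not show how well the winter model separates peak from non-peak days, so a simple in-sample check is sketched below: convert the fitted probabilities from logit_model to 0/1 calls and tabulate them against y. The 0.5 cutoff is an assumption; for a rare-event target a lower threshold may be more appropriate.

# Sketch: in-sample confusion matrix for the winter logit at a 0.5 cutoff.
from sklearn.metrics import confusion_matrix

pred_prob = logit_model.predict(X_scaled)          # fitted probabilities
pred_class = (pred_prob >= 0.5).astype(int)
print(confusion_matrix(y, pred_class))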

In [501]:
# Filter the DataFrame to include only the Spring months (April)
df_spring = concord_2[concord_2['Month'] == 4].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_spring['Concord'].quantile(0.9)
df_spring['Peak'] = (df_spring['Concord'] > threshold).astype(int)

# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_spring.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_spring.loc[X.index, 'Peak'].reset_index(drop=True)  # Reset index to align with X_scaled

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.080102
         Iterations 18
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  384
Model:                          Logit   Df Residuals:                      378
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.3838
Time:                        10:29:53   Log-Likelihood:                -30.759
converged:                       True   LL-Null:                       -49.921
Covariance Type:            nonrobust   LLR p-value:                 3.248e-07
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -16.8560     22.241     -0.758      0.449     -60.447      26.735
PRCP        -119.2681    338.140     -0.353      0.724    -782.011     543.475
TMIN           1.3869      0.405      3.426      0.001       0.594       2.180
Day            0.7940      0.436      1.821      0.069      -0.061       1.649
Year          -0.2844      0.426     -0.668      0.504      -1.119       0.551
PRCP_TMIN     87.2442    284.298      0.307      0.759    -469.969     644.458
==============================================================================

Possibly complete quasi-separation: A fraction 0.26 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

The logistic regression model for April, focusing on predicting peak energy demand days, shows a Pseudo R-squared of 0.3838, suggesting that approximately 38.38% of the variance in peak days is explained by the model. The very low LLR p-value (3.248e-07) indicates that the model is statistically significant. However, the coefficient for precipitation (PRCP) is -119.2681 with a high p-value of 0.724, suggesting that precipitation does not significantly impact the likelihood of peak days. The coefficient for minimum temperature (TMIN) is 1.3869 and is statistically significant (p-value = 0.001), indicating that higher minimum temperatures are associated with an increased probability of peak energy demand. The interaction term (PRCP_TMIN) and the variables for day and year do not show significant effects, with high p-values suggesting limited influence. The model also notes a potential issue of quasi-separation, where a portion of the observations can be perfectly predicted, which may lead to some parameters being poorly identified or unreliable.

In [502]:
# Filter the DataFrame to include only the Fall months (October)
df_fall = concord_2[concord_2['Month'] == 10].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_fall['Concord'].quantile(0.9)
df_fall['Peak'] = (df_fall['Concord'] > threshold).astype(int)

# Define the independent variables (excluding 'Concord', 'Peak', and non-significant variables)
X = df_fall.drop(['Concord', 'Peak', 'AWND', 'DAPR', 'MDPR', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_fall.loc[X.index, 'Peak'].reset_index(drop=True)  # Reset index to align with X_scaled

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.054253
         Iterations 10
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  423
Model:                          Logit   Df Residuals:                      417
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.5795
Time:                        10:29:54   Log-Likelihood:                -22.949
converged:                       True   LL-Null:                       -54.578
Covariance Type:            nonrobust   LLR p-value:                 2.575e-12
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.4787      1.115     -5.808      0.000      -8.665      -4.293
PRCP           3.5870      3.060      1.172      0.241      -2.411       9.585
TMIN           2.7221      0.610      4.466      0.000       1.527       3.917
Day            0.9079      0.559      1.624      0.104      -0.188       2.004
Year          -0.1159      0.535     -0.217      0.828      -1.164       0.932
PRCP_TMIN     -3.2922      2.631     -1.251      0.211      -8.449       1.864
==============================================================================

Possibly complete quasi-separation: A fraction 0.12 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

The logistic regression model for October shows a Pseudo R-squared of 0.5795, indicating that about 57.95% of the variance in peak energy demand days is explained by the model. The very low LLR p-value (2.575e-12) confirms the model's statistical significance. The coefficient for minimum temperature (TMIN) is 2.7221 and is highly significant (p-value = 0.000), suggesting that higher minimum temperatures increase the likelihood of peak days. Precipitation (PRCP) has a positive but not significant coefficient (p-value = 0.241), indicating it may not strongly influence peak days. The interaction term (PRCP_TMIN) and the variables for day and year are not significant. The model also indicates a possible issue of quasi-separation, where a portion of observations can be perfectly predicted, which might affect the reliability of some parameter estimates.

Decision Tree¶

In [503]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

y_pred = dt_reg.predict(X_test)

print("Mean squared error: ", mean_squared_error(y_test, y_pred))
print("R-squared: ", r2_score(y_test, y_pred))
Mean squared error:  0.023529411764705882
R-squared:  0.4753086419753084

The Decision Tree Regressor here is fit to the October feature matrix and the binary Peak indicator carried over from the previous cell, so it is effectively predicting a 0/1 target. It achieved an R-squared of about 0.48 on the test split, explaining roughly half of the variance in the held-out Peak values, and the Mean Squared Error of about 0.024 is small only because the target is binary. Unconstrained decision trees are prone to overfitting the training data, so these single-split figures should be read cautiously; a depth limit and cross-validation, as sketched below, give a more reliable picture of how well the model generalizes.
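
A hedged sketch of that more cautious setup follows: cap the tree depth and use cross-validation rather than a single split. Here max_depth=4 is an illustrative assumption, and X and y are the October feature matrix and Peak flag carried over from the cells above.

# Sketch: a depth-limited tree evaluated with 5-fold cross-validation,
# as a hedge against the overfitting a fully grown tree invites.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

dt_small = DecisionTreeRegressor(max_depth=4, random_state=42)
cv_r2 = cross_val_score(dt_small, X, y, cv=5, scoring='r2')
print("Cross-validated R-squared:", cv_r2.mean())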

Support Vector Machine¶

In [504]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
Accuracy: 0.9529411764705882
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98        81
           1       0.00      0.00      0.00         4

    accuracy                           0.95        85
   macro avg       0.48      0.50      0.49        85
weighted avg       0.91      0.95      0.93        85

The Support Vector Machine (SVM) with a linear kernel achieved an accuracy of about 95.3%, but this figure is driven almost entirely by the majority class. The classification report shows high precision, recall, and F1-score for non-peak days (class 0), while for peak days (class 1) all three are 0.00: the model never predicts a peak day, so all four peak days in the test set are misclassified. In other words, the headline accuracy mostly reflects the class imbalance, and the model as trained is not useful for flagging peak days.
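
Because peak days make up only about 10% of the sample, the linear SVM can reach high accuracy while never predicting the minority class. A minimal sketch of one standard adjustment, class weighting, follows; it reuses the existing train/test split, and switching on class_weight='balanced' is the only change.

# Sketch: same linear SVM, but with class weights inversely proportional to
# class frequencies so the rare peak days carry more weight during training.
svm_balanced = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_balanced.fit(X_train, y_train)
print(classification_report(y_test, svm_balanced.predict(X_test)))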

In [505]:
# Reattach a Month column to the test rows. Caution: this aligns concord_2 and
# X_test purely by index label, and the two frames are indexed differently, so
# some rows pick up a month from the wrong record (the printout below is not
# limited to October); df_fall.loc[X_test.index, 'Month'] would be a safer lookup.
X_test['Month'] = concord_2['Month']

# Extract predictions for non-peak days (class 0)
non_peak_indices = (y_pred == 0)
non_peak_predictions = X_test[non_peak_indices]

# Select only the 'Month', 'Day', and 'Year' columns
non_peak_month_day_year = non_peak_predictions[['Month', 'Day', 'Year']]

non_peak_month_day_year.to_csv('non_peak_month_day_year.csv', index=False)

# Print the selected columns
print(non_peak_month_day_year)
     Month  Day  Year
145      5   22  2019
280     10   28  2015
175      6   21  2020
410      2   17  2020
419      2   28  2020
..     ...  ...   ...
57       2   27  2016
414      2   22  2020
24       1   25  2015
17       1   18  2015
66       3    5  2017

[85 rows x 3 columns]

Fuzzy Regression¶

In [506]:
import skfuzzy as fuzz
from sklearn.linear_model import LinearRegression

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = concord_2[(concord_2['Month'] >= 5) & (concord_2['Month'] <= 9)]

# Define the independent variables (all columns except 'Concord')
X = df_summer.drop(['Concord'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Concord']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Convert to numpy arrays
X = X.values
y = y.values

# Define fuzzy membership functions for input variables
# Here, we'll just create simple membership functions for demonstration purposes
def fuzzy_membership_function(data, low, high):
    mf = np.zeros_like(data)
    mf[data <= low] = 0
    mf[(data > low) & (data < high)] = (data[(data > low) & (data < high)] - low) / (high - low)
    mf[data >= high] = 1
    return mf

# Create membership functions for each feature
fuzzy_X = np.zeros_like(X)
for i in range(X.shape[1]):
    fuzzy_X[:, i] = fuzzy_membership_function(X[:, i], np.min(X[:, i]), np.max(X[:, i]))

# Create a fuzzy regression model (this is a simplified example)
# We will use the average membership function values to represent fuzzy coefficients
model = LinearRegression()
model.fit(fuzzy_X, y)

# Get the regression coefficients (these are the fuzzy coefficients)
fuzzy_coefficients = model.coef_

# Get feature names including the constant
feature_names = ['const'] + list(df_summer.drop(['Concord'], axis=1).columns)

# Print the fuzzy regression results with feature names
print("Fuzzy Regression Coefficients:")
for i, coef in enumerate(fuzzy_coefficients):
    print(f"{feature_names[i]}: {coef}")

# Predict using the fuzzy model
y_pred = model.predict(fuzzy_X)

# Evaluate the model (e.g., R-squared)
r_squared = model.score(fuzzy_X, y)
print(f"Fuzzy Regression R-squared: {r_squared}")
Fuzzy Regression Coefficients:
const: 0.0
AWND: -200603.20628685967
DAPR: 4445.679156035619
MDPR: -1819.8977977349448
PRCP: -33886.89784881105
SNOW: 2753.209961148256
SNWD: 340811.91338047315
TMAX: 410466.37260179984
TMIN: -27842.505119693953
TOBS: 671160.5196448775
Month: 18450.76185600422
Day: 15674.619534556303
Year: 4701.789425121685
Peak: 521252.6299285891
Fuzzy Regression R-squared: 0.8477779046630803

The fuzzy regression model transforms each input feature into a simple linear membership value between 0 and 1 and then fits an ordinary linear regression on those memberships. The coefficients indicate the direction and relative strength of each feature's influence; TOBS, Peak, TMAX, and SNWD carry the largest positive weights, while AWND has the largest negative weight. The in-sample R-squared of about 0.85 looks strong, but it should be read cautiously: the design matrix still includes the Peak indicator, which was itself derived from Concord, so part of the apparent fit comes from target leakage rather than from the weather variables.
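
A more conservative check is sketched below: refit the fuzzy-membership regression without the Peak column and score it on a held-out split. This is only a sketch, reusing fuzzy_X, y, and feature_names from the cell above; the 80/20 split and random_state=42 are illustrative choices.

# Sketch: drop the target-derived 'Peak' column and score on a held-out split
# to get a less optimistic read on the fuzzy-membership regression.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

keep = [i for i, name in enumerate(feature_names) if name != 'Peak']
X_tr, X_te, y_tr, y_te = train_test_split(fuzzy_X[:, keep], y,
                                          test_size=0.2, random_state=42)

fuzzy_no_peak = LinearRegression().fit(X_tr, y_tr)
print("Held-out R-squared without Peak:", fuzzy_no_peak.score(X_te, y_te))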

Boone Weather¶

In [507]:
# Load the data
boone = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\Boone.csv")

boone.head()

constant_columns = [col for col in boone.columns if boone[col].nunique() == 1]
print(f"Constant columns: {constant_columns}")
Constant columns: ['STATION', 'DAPR', 'MDPR']
In [508]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']]).dt.strftime('%m-%d-%Y')
df['Date'] = pd.to_datetime(df['Date'])
daily_energy_usage_boone = df.groupby(df['Date'].dt.date)['NewRiver'].sum()

# Reset the index to remove the date index and convert it to a DataFrame
daily_energy_usage_boone_df = daily_energy_usage_boone.reset_index()

# Drop the 'Date' column to remove it from the DataFrame
daily_energy_usage_boone_df.drop(columns='Date', inplace=True)

# Convert the DataFrame back to a Series if needed
daily_energy_usage_boone_series = daily_energy_usage_boone_df['NewRiver']

# Print the result
print(daily_energy_usage_boone_series.head())
0    647318
1    588267
2    525797
3    480263
4    687192
Name: NewRiver, dtype: int64
In [509]:
boone_wind = pd.read_csv(r"C:\Users\alexp\Documents\School\Summer 2024\export.csv")
In [510]:
boone_wind.rename(columns={'wspd': 'AWND'}, inplace=True)
boone_wind['AWND'].fillna(boone_wind['AWND'].mean(), inplace=True)
In [511]:
boone_wind.head()
Out[511]:
date wdir AWND
0 1/1/15 279.0 23.1
1 1/2/15 NaN 16.1
2 1/3/15 NaN 4.4
3 1/4/15 NaN 15.9
4 1/5/15 290.0 33.7
In [512]:
boone['AWND'] = boone_wind['AWND'].values
In [513]:
boone.head()
Out[513]:
STATION DATE DAPR MDPR PRCP SNOW SNWD TMAX TMIN TOBS AWND
0 USC00310982 1/1/2015 NaN NaN 0.00 0.0 0.0 29.0 22.0 23.0 23.1
1 USC00310982 1/2/2015 NaN NaN 0.00 0.0 0.0 40.0 23.0 35.0 16.1
2 USC00310982 1/3/2015 NaN NaN 0.02 0.0 0.0 46.0 35.0 37.0 4.4
3 USC00310982 1/4/2015 NaN NaN 0.48 0.0 0.0 54.0 37.0 51.0 15.9
4 USC00310982 1/5/2015 NaN NaN 0.13 0.0 0.0 58.0 28.0 28.0 33.7
In [514]:
# Convert the DATE column in boone to datetime format
boone['DATE'] = pd.to_datetime(boone['DATE'])
df['HourEnd'] = pd.to_datetime(df['HourEnd'])

# Extract the month, day, and year from the DATE column into new columns
boone['Month'] = boone['DATE'].dt.month
boone['Day'] = boone['DATE'].dt.day
boone['Year'] = boone['DATE'].dt.year

# Attach the daily New River totals computed earlier; this column is renamed to 'Boone' below
boone['Concord'] = daily_energy_usage_boone_series
# Display the result
print(boone.head())
       STATION       DATE  DAPR  MDPR  PRCP  SNOW  SNWD  TMAX  TMIN  TOBS  \
0  USC00310982 2015-01-01   NaN   NaN  0.00   0.0   0.0  29.0  22.0  23.0   
1  USC00310982 2015-01-02   NaN   NaN  0.00   0.0   0.0  40.0  23.0  35.0   
2  USC00310982 2015-01-03   NaN   NaN  0.02   0.0   0.0  46.0  35.0  37.0   
3  USC00310982 2015-01-04   NaN   NaN  0.48   0.0   0.0  54.0  37.0  51.0   
4  USC00310982 2015-01-05   NaN   NaN  0.13   0.0   0.0  58.0  28.0  28.0   

   AWND  Month  Day  Year   Concord  
0  23.1      1    1  2015  647318.0  
1  16.1      1    2  2015  588267.0  
2   4.4      1    3  2015  525797.0  
3  15.9      1    4  2015  480263.0  
4  33.7      1    5  2015  687192.0  
In [515]:
weather_data_types2 = boone.dtypes
weather_data_types2
Out[515]:
STATION            object
DATE       datetime64[ns]
DAPR              float64
MDPR              float64
PRCP              float64
SNOW              float64
SNWD              float64
TMAX              float64
TMIN              float64
TOBS              float64
AWND              float64
Month               int64
Day                 int64
Year                int64
Concord           float64
dtype: object
In [516]:
# Drop unnecessary columns

boone = boone.drop(columns=['STATION', 'DATE', 'DAPR', 'MDPR'])

boone.head()
Out[516]:
PRCP SNOW SNWD TMAX TMIN TOBS AWND Month Day Year Concord
0 0.00 0.0 0.0 29.0 22.0 23.0 23.1 1 1 2015 647318.0
1 0.00 0.0 0.0 40.0 23.0 35.0 16.1 1 2 2015 588267.0
2 0.02 0.0 0.0 46.0 35.0 37.0 4.4 1 3 2015 525797.0
3 0.48 0.0 0.0 54.0 37.0 51.0 15.9 1 4 2015 480263.0
4 0.13 0.0 0.0 58.0 28.0 28.0 33.7 1 5 2015 687192.0
In [517]:
boone.rename(columns={'Concord': 'Boone'}, inplace=True)
boone.head()
Out[517]:
PRCP SNOW SNWD TMAX TMIN TOBS AWND Month Day Year Boone
0 0.00 0.0 0.0 29.0 22.0 23.0 23.1 1 1 2015 647318.0
1 0.00 0.0 0.0 40.0 23.0 35.0 16.1 1 2 2015 588267.0
2 0.02 0.0 0.0 46.0 35.0 37.0 4.4 1 3 2015 525797.0
3 0.48 0.0 0.0 54.0 37.0 51.0 15.9 1 4 2015 480263.0
4 0.13 0.0 0.0 58.0 28.0 28.0 33.7 1 5 2015 687192.0
In [518]:
# Forward-fill missing values
columns_to_fill = ['AWND', 'PRCP']
for col in columns_to_fill:
    boone[col] = boone[col].fillna(boone[col].ffill())
In [519]:
# Check for any remaining missing values
missing_values = boone.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after filling: \n", missing_columns)
Missing values after filling: 
 SNOW     5
SNWD     3
TMAX     7
TMIN     3
TOBS     3
Boone    1
dtype: int64
In [520]:
columns_to_check = ['AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS']
for col in columns_to_check:
    boone[col] = boone[col].fillna(boone[col].ffill())

# Check again for any remaining missing values
missing_values = boone.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("Missing values after second filling: \n", missing_columns)
Missing values after second filling: 
 Boone    1
dtype: int64
In [521]:
# Replace infinite values with NaN
boone.replace([np.inf, -np.inf], np.nan, inplace=True)

# Forward-fill any resulting NaNs
for col in boone.columns:
    boone[col] = boone[col].fillna(boone[col].ffill())
    
# Check again for any remaining missing or infinite values
print("Final missing values check: \n", boone.isnull().sum())
print("Final infinite values check: \n", np.isinf(boone).sum())
Final missing values check: 
 PRCP     0
SNOW     0
SNWD     0
TMAX     0
TMIN     0
TOBS     0
AWND     0
Month    0
Day      0
Year     0
Boone    0
dtype: int64
Final infinite values check: 
 PRCP     0
SNOW     0
SNWD     0
TMAX     0
TMIN     0
TOBS     0
AWND     0
Month    0
Day      0
Year     0
Boone    0
dtype: int64
In [522]:
boone.head()
Out[522]:
PRCP SNOW SNWD TMAX TMIN TOBS AWND Month Day Year Boone
0 0.00 0.0 0.0 29.0 22.0 23.0 23.1 1 1 2015 647318.0
1 0.00 0.0 0.0 40.0 23.0 35.0 16.1 1 2 2015 588267.0
2 0.02 0.0 0.0 46.0 35.0 37.0 4.4 1 3 2015 525797.0
3 0.48 0.0 0.0 54.0 37.0 51.0 15.9 1 4 2015 480263.0
4 0.13 0.0 0.0 58.0 28.0 28.0 33.7 1 5 2015 687192.0
In [523]:
# Ensure that all necessary columns are numeric
numeric_cols = boone.select_dtypes(include=[np.number]).columns
boone_numeric = boone[numeric_cols]

# Generate the correlation matrix
correlation_matrix = boone_numeric.corr()

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix for Boone')
plt.show()

Multiple Regression¶

In [524]:
import statsmodels.api as sm

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]

# Define the independent variables (all columns except 'Boone')
X = df_summer.drop(['Boone'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Boone   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     120.4
Date:                Fri, 16 Aug 2024   Prob (F-statistic):          5.64e-137
Time:                        10:30:03   Log-Likelihood:                -10968.
No. Observations:                 918   AIC:                         2.195e+04
Df Residuals:                     909   BIC:                         2.200e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.351e+07   1.57e+06      8.625      0.000    1.04e+07    1.66e+07
PRCP       -4482.0261   2875.912     -1.558      0.119   -1.01e+04    1162.172
SNOW        4.819e-06   5.59e-07      8.625      0.000    3.72e-06    5.92e-06
SNWD       -9.496e-06    1.1e-06     -8.625      0.000   -1.17e-05   -7.34e-06
TMAX        2062.4131    250.518      8.233      0.000    1570.752    2554.074
TMIN         443.6483    398.175      1.114      0.265    -337.801    1225.098
TOBS        1311.0697    398.801      3.288      0.001     528.392    2093.747
AWND        -356.8297    299.770     -1.190      0.234    -945.152     231.493
Month        1.54e+04    958.508     16.062      0.000    1.35e+04    1.73e+04
Day          322.9225    141.404      2.284      0.023      45.406     600.439
Year       -6605.7206    775.231     -8.521      0.000   -8127.172   -5084.270
==============================================================================
Omnibus:                        7.778   Durbin-Watson:                   0.835
Prob(Omnibus):                  0.020   Jarque-Bera (JB):                5.824
Skew:                          -0.076   Prob(JB):                       0.0544
Kurtosis:                       2.641   Cond. No.                     3.46e+19
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.13e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

The OLS results for the Boone summer months (May through September) explain about 51.4% of the variance in daily Boone demand. The clearly significant predictors are TMAX, TOBS, Month, Day, and Year: warmer days and later summer months are associated with higher demand, while the negative Year coefficient suggests a downward trend over the sample period. PRCP, TMIN, and AWND are not statistically significant. SNOW and SNWD appear "significant", but their coefficients are on the order of 1e-5, effectively zero, because both columns are almost entirely zero in summer; together with the enormous condition number and the "smallest eigenvalue" warning, this points to a nearly singular design matrix rather than to real snow effects. The Durbin-Watson statistic of 0.835 also indicates positive autocorrelation in the residuals, which should be addressed before trusting the standard errors.
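
One quick way to test whether the snow terms add anything is to refit without them and compare fit statistics with the full model above. This is only a sketch, assuming df_summer (the Boone summer subset) from the cell above is still in scope.

# Sketch: refit the Boone summer model without the (near-constant) snow
# columns and compare adjusted R-squared and AIC with the full model.
X_ns = df_summer.drop(['Boone', 'SNOW', 'SNWD'], axis=1)
X_ns = sm.add_constant(X_ns.replace([np.inf, -np.inf], np.nan).dropna())
y_ns = df_summer.loc[X_ns.index, 'Boone']

model_ns = sm.OLS(y_ns, X_ns.astype(float)).fit()
print("Adj. R-squared without snow terms:", model_ns.rsquared_adj)
print("AIC without snow terms:", model_ns.aic)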

In [525]:
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]

# Define the independent variables (all columns except 'Concord')
selected_columns = ['SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month', 'Day','Year']

X = df_summer[selected_columns]

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Boone   R-squared:                       0.512
Model:                            OLS   Adj. R-squared:                  0.509
Method:                 Least Squares   F-statistic:                     191.2
Date:                Fri, 16 Aug 2024   Prob (F-statistic):          2.76e-139
Time:                        10:30:03   Log-Likelihood:                -10970.
No. Observations:                 918   AIC:                         2.195e+04
Df Residuals:                     912   BIC:                         2.198e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.281e+07   1.47e+06      8.728      0.000    9.93e+06    1.57e+07
SNOW        1.348e-06   1.54e-07      8.728      0.000    1.05e-06    1.65e-06
SNWD        1.198e-05   1.37e-06      8.728      0.000    9.29e-06    1.47e-05
TMAX        2297.3983    222.851     10.309      0.000    1860.038    2734.759
TOBS        1506.1871    211.559      7.119      0.000    1090.987    1921.387
Month       1.592e+04    903.972     17.607      0.000    1.41e+04    1.77e+04
Day          340.0351    140.972      2.412      0.016      63.367     616.703
Year       -6264.6847    727.116     -8.616      0.000   -7691.699   -4837.670
==============================================================================
Omnibus:                        7.377   Durbin-Watson:                   0.846
Prob(Omnibus):                  0.025   Jarque-Bera (JB):                5.724
Skew:                          -0.088   Prob(JB):                       0.0572
Kurtosis:                       2.656   Cond. No.                     3.50e+20
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.06e-32. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

The OLS regression results indicate that several variables significantly impact the dependent variable, 'Boone', during the summer months (May to September). The model explains about 51.2% of the variance in 'Boone', as indicated by the R-squared value. All of the included predictors, SNOW, SNWD, TMAX, TOBS, Month, Day, and Year, have p-values below 0.05. TMAX, TOBS, Month, and Day show positive relationships with 'Boone', Year shows a negative one, and the SNOW and SNWD coefficients are effectively zero, likely because there is little or no snow in these months. The 'Month' variable's high t-value indicates a strong effect on 'Boone' across the summer season. However, the model may suffer from multicollinearity, as suggested by the "smallest eigenvalue" warning, so some coefficients may be unreliable. The Durbin-Watson statistic of 0.846 suggests positive autocorrelation in the residuals, which may affect the model's validity. Overall, the analysis highlights important factors influencing summer energy demand, though further work is needed to address the multicollinearity and autocorrelation issues.
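Because the Durbin-Watson statistic points to autocorrelated residuals, one inexpensive check is to refit the same specification with heteroskedasticity-and-autocorrelation-consistent (Newey-West/HAC) standard errors: the coefficients are unchanged, but the t-statistics and p-values become robust to serial correlation. A minimal sketch, assuming X and y are the cleaned summer matrices from the cell above (the lag length of 7 is an illustrative choice for daily data, not a tuned value):

# Sketch: same summer OLS, but with Newey-West (HAC) standard errors so the
# inference is robust to the autocorrelation flagged by Durbin-Watson.
# maxlags=7 (one week of daily lags) is an illustrative, untuned choice.
hac_model = sm.OLS(y, X.astype(float)).fit(cov_type='HAC', cov_kwds={'maxlags': 7})
print(hac_model.summary())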

In [526]:
import statsmodels.api as sm

# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = boone[(boone['Month'] >= 11) | (boone['Month'] <= 3)]

# Define the independent variables (including the 'Month' column)
X = df_winter.drop(columns=['Boone'])  # Drop only the target variable

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_winter['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Boone   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     254.7
Date:                Fri, 16 Aug 2024   Prob (F-statistic):          1.68e-227
Time:                        10:30:04   Log-Likelihood:                -8922.4
No. Observations:                 723   AIC:                         1.787e+04
Df Residuals:                     712   BIC:                         1.792e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.769e+06   2.52e+06      1.100      0.272   -2.17e+06    7.71e+06
PRCP        1.293e+04   5275.915      2.450      0.015    2569.826    2.33e+04
SNOW        9469.5482   4717.023      2.008      0.045     208.610    1.87e+04
SNWD        4836.8529   3269.879      1.479      0.140   -1582.905    1.13e+04
TMAX       -3459.7686    288.720    -11.983      0.000   -4026.612   -2892.925
TMIN        -897.1889    386.872     -2.319      0.021   -1656.736    -137.642
TOBS       -3713.1463    331.429    -11.203      0.000   -4363.841   -3062.451
AWND        2726.3103    212.594     12.824      0.000    2308.924    3143.696
Month      -2273.1481    560.802     -4.053      0.000   -3374.172   -1172.125
Day          104.8280    240.338      0.436      0.663    -367.028     576.684
Year        -914.6874   1247.299     -0.733      0.464   -3363.511    1534.136
==============================================================================
Omnibus:                        1.755   Durbin-Watson:                   0.925
Prob(Omnibus):                  0.416   Jarque-Bera (JB):                1.584
Skew:                           0.096   Prob(JB):                        0.453
Kurtosis:                       3.125   Cond. No.                     2.45e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.45e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

The OLS regression results indicate that several variables significantly impact the dependent variable, 'Boone', during the winter months (November to March). The model explains about 78.2% of the variance in 'Boone', as indicated by the R-squared value. Key predictors include PRCP, SNOW, TMAX, TMIN, TOBS, AWND, and Month, all of which have p-values below 0.05. The positive coefficients for 'PRCP', 'SNOW', and especially 'AWND' suggest that wetter, snowier, and windier days are associated with higher demand, while 'TMAX', 'TMIN', and 'TOBS' exhibit negative relationships, consistent with heating-driven load. 'SNWD', 'Day', and 'Year' do not show significant effects. The 'Month' variable's negative coefficient and significant t-value indicate a notable seasonal effect on 'Boone' during winter. However, the model may suffer from multicollinearity, as suggested by the large condition number (2.45e+06), so some coefficients may be unreliable. The Durbin-Watson statistic of 0.925 suggests positive autocorrelation in the residuals, which may affect the model's validity. Overall, the analysis highlights important factors influencing winter energy demand, though further investigation is needed to address the multicollinearity and autocorrelation issues.
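Since TMAX, TMIN, and TOBS all measure temperature, a quick correlation check helps confirm whether they are the source of the large condition number; if they are nearly collinear, dropping one of them (or averaging them) is a simple way to stabilize the winter coefficients. A minimal sketch, assuming df_winter is the filtered frame from the cell above:

# Sketch: pairwise correlations among the temperature columns, to see whether
# overlapping temperature measures are driving the multicollinearity warning.
print(df_winter[['TMAX', 'TMIN', 'TOBS']].corr().round(2))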

In [527]:
# Filter the DataFrame to include only the spring month used here (April)
df_spring = boone[boone['Month'] == 4]

# Define the independent variables (all columns except 'Boone')
X = df_spring.drop(['Boone'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_spring['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Boone   R-squared:                       0.304
Model:                            OLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     9.356
Date:                Fri, 16 Aug 2024   Prob (F-statistic):           1.13e-10
Time:                        10:30:04   Log-Likelihood:                -2197.1
No. Observations:                 180   AIC:                             4412.
Df Residuals:                     171   BIC:                             4441.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
PRCP         117.2219   5507.700      0.021      0.983   -1.08e+04     1.1e+04
SNOW        1.199e+05   2.56e+05      0.468      0.640   -3.86e+05    6.25e+05
SNWD        2.562e-07    3.8e-08      6.743      0.000    1.81e-07    3.31e-07
TMAX       -1293.5655    524.277     -2.467      0.015   -2328.454    -258.677
TMIN        -544.8750    725.404     -0.751      0.454   -1976.774     887.024
TOBS        -263.3940    623.870     -0.422      0.673   -1494.871     968.083
AWND        1236.0710    462.797      2.671      0.008     322.540    2149.602
Month       7.736e+06   1.11e+06      6.962      0.000    5.54e+06    9.93e+06
Day         -232.7723    451.294     -0.516      0.607   -1123.596     658.051
Year       -1.503e+04   2201.869     -6.827      0.000   -1.94e+04   -1.07e+04
==============================================================================
Omnibus:                        1.804   Durbin-Watson:                   0.482
Prob(Omnibus):                  0.406   Jarque-Bera (JB):                1.482
Skew:                           0.038   Prob(JB):                        0.477
Kurtosis:                       2.562   Cond. No.                     2.26e+18
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.43e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

The OLS regression results for the spring month (April) indicate that the model explains about 30.4% of the variance in 'Boone' (R-squared). Significant predictors at the 5% level are TMAX, AWND, Month, and Year (SNWD also clears the threshold, but its coefficient is effectively zero). AWND has a positive coefficient, while TMAX and Year are negative. PRCP, SNOW, TMIN, TOBS, and Day do not show statistically significant effects. Because 'Month' is constant in the April-only subset, it is perfectly collinear with the intercept, which is the likely reason the constant is missing from the table, the 'Month' coefficient is inflated, and the "smallest eigenvalue" warning appears. The Durbin-Watson statistic of 0.482 suggests strong positive autocorrelation in the residuals, which may affect the reliability of the model. Overall, while the model identifies some factors influencing April energy demand, further investigation is needed to address the multicollinearity and autocorrelation concerns.
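One low-risk cleanup suggested by that warning is to drop any column that is constant within the April subset (at minimum 'Month') before refitting, since such columns are perfectly collinear with the intercept. A minimal sketch, assuming df_spring is the April subset built above:

# Sketch: identify columns that take a single value in the April subset
# (Month is always 4 here) and drop them before refitting the OLS model.
# Assumes df_spring comes from the cell above.
X_spring = df_spring.drop(['Boone'], axis=1)
constant_cols = X_spring.columns[X_spring.nunique() <= 1].tolist()
print("Constant columns:", constant_cols)
X_spring = sm.add_constant(X_spring.drop(columns=constant_cols))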

In [528]:
# Filter the DataFrame to include only the fall month used here (October)
df_fall = boone[boone['Month'] == 10]

# Define the independent variables (all columns except 'Boone')
X = df_fall.drop(['Boone'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_fall['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X.astype(float)).fit()

# Get the summary of the regression
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Boone   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     1.245
Date:                Fri, 16 Aug 2024   Prob (F-statistic):              0.281
Time:                        10:30:05   Log-Likelihood:                -2246.4
No. Observations:                 186   AIC:                             4509.
Df Residuals:                     178   BIC:                             4535.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
PRCP       -1244.1868   4117.373     -0.302      0.763   -9369.331    6880.957
SNOW        1.573e-08   7.91e-09      1.987      0.048    1.12e-10    3.13e-08
SNWD       -9.533e-10   4.75e-10     -2.005      0.046   -1.89e-09   -1.51e-11
TMAX         -54.3268    491.407     -0.111      0.912   -1024.061     915.407
TMIN        1287.5616    689.871      1.866      0.064     -73.817    2648.940
TOBS        -583.6086    635.413     -0.918      0.360   -1837.520     670.303
AWND        -260.4582    554.430     -0.470      0.639   -1354.561     833.644
Month       7.903e+05   3.95e+05      2.002      0.047    1.11e+04    1.57e+06
Day          757.4302    425.083      1.782      0.076     -81.421    1596.282
Year       -3667.9863   1958.013     -1.873      0.063   -7531.891     195.919
==============================================================================
Omnibus:                        5.438   Durbin-Watson:                   0.649
Prob(Omnibus):                  0.066   Jarque-Bera (JB):                5.125
Skew:                           0.334   Prob(JB):                       0.0771
Kurtosis:                       3.464   Cond. No.                     4.17e+19
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.37e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

The OLS regression analysis for the fall month of October shows that the model explains only 4.7% of the variance in 'Boone' (R-squared), and the overall F-test (p = 0.281) is not significant, indicating a weak fit. SNOW, SNWD, and Month technically clear the 5% threshold, but the SNOW and SNWD coefficients are effectively zero, and the inflated Month coefficient reflects the fact that 'Month' is constant (always 10) in this subset and therefore collinear with the intercept, so none of these should be over-interpreted. The remaining variables, PRCP, TMAX, TMIN, TOBS, AWND, Day, and Year, are not statistically significant at the 5% level. The Durbin-Watson statistic of 0.649 indicates positive autocorrelation in the residuals, which can affect the model's reliability, and the notes again flag potential multicollinearity. Overall, the model has little explanatory power for October, and further investigation is needed to address the autocorrelation and multicollinearity concerns.

Logistic Regression¶

In [529]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]

# Create a binary dependent variable (0 for not peak, 1 for peak)
# Define a threshold for what is considered a peak
threshold = df_summer['Boone'].quantile(0.9)  # For example, the top 10% of values are considered peak
df_summer = df_summer.copy()  # To avoid SettingWithCopyWarning
df_summer['Peak'] = (df_summer['Boone'] > threshold).astype(int)

# Define the independent variables (all columns except 'Boone' and 'Peak')
X_summer = df_summer.drop(['Boone', 'Peak'], axis=1)

# Handle missing values by replacing infinities and NaNs
X_summer = X_summer.replace([np.inf, -np.inf], np.nan).dropna()

# Standardize the independent variables
scaler = StandardScaler()
X_scaled_summer = scaler.fit_transform(X_summer)
X_scaled_summer = pd.DataFrame(X_scaled_summer, columns=X_summer.columns)  # Keep column names

# Check for multicollinearity using VIF
def calculate_vif(df):
    vif = pd.DataFrame()
    vif["Variable"] = df.columns
    vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif

vif_df = calculate_vif(X_scaled_summer)
print(vif_df)

# Drop variables with high VIF (threshold of 5 or 10 is commonly used)
high_vif_columns = vif_df[vif_df['VIF'] > 5]['Variable']
X_scaled_summer = X_scaled_summer.drop(high_vif_columns, axis=1)

# Add a constant to the independent variables (required for the regression model)
X_scaled_summer = sm.add_constant(X_scaled_summer)

# Define the dependent variable
y_summer = df_summer.loc[X_summer.index, 'Peak']

# Ensure indices are aligned by reindexing y_summer
y_summer = y_summer.reset_index(drop=True)
X_scaled_summer = X_scaled_summer.reset_index(drop=True)

# Fit the logistic regression model
logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100)  # Increase maxiter

# Get the summary of the regression
print(logit_model_summer.summary())
  Variable       VIF
0     PRCP  1.252234
1     SNOW       NaN
2     SNWD       NaN
3     TMAX  1.946952
4     TMIN  5.335008
5     TOBS  5.493790
6     AWND  1.331844
7    Month  1.191884
8      Day  1.015852
9     Year  1.140756
Optimization terminated successfully.
         Current function value: 0.230163
         Iterations 9
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6624\751608222.py in <module>
     46 
     47 # Fit the logistic regression model
---> 48 logit_model_summer = sm.Logit(y_summer, X_scaled_summer).fit(maxiter=100)  # Increase maxiter
     49 
     50 # Get the summary of the regression

~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)
   1981     def fit(self, start_params=None, method='newton', maxiter=35,
   1982             full_output=1, disp=1, callback=None, **kwargs):
-> 1983         bnryfit = super().fit(start_params=start_params,
   1984                               method=method,
   1985                               maxiter=maxiter,

~\anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py in fit(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)
    228             pass  # TODO: make a function factory to have multiple call-backs
    229 
--> 230         mlefit = super().fit(start_params=start_params,
    231                              method=method,
    232                              maxiter=maxiter,

~\anaconda3\lib\site-packages\statsmodels\base\model.py in fit(self, start_params, method, maxiter, full_output, disp, fargs, callback, retall, skip_hessian, **kwargs)
    577             Hinv = cov_params_func(self, xopt, retvals)
    578         elif method == 'newton' and full_output:
--> 579             Hinv = np.linalg.inv(-retvals['Hessian']) / nobs
    580         elif not skip_hessian:
    581             H = -1 * self.hessian(xopt)

~\anaconda3\lib\site-packages\numpy\core\overrides.py in inv(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in inv(a)
    536     signature = 'D->D' if isComplexType(t) else 'd->d'
    537     extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 538     ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
    539     return wrap(ainv.astype(result_t, copy=False))
    540 

~\anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_singular(err, flag)
     87 
     88 def _raise_linalgerror_singular(err, flag):
---> 89     raise LinAlgError("Singular matrix")
     90 
     91 def _raise_linalgerror_nonposdef(err, flag):

LinAlgError: Singular matrix
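One plausible reading of the failure above: the VIF table shows NaN for SNOW and SNWD, which usually means those columns are (near-)constant in the summer subset and become all zeros after standardization; because NaN does not satisfy "VIF > 5", they survive the filter and leave the design matrix singular when the Newton step inverts the Hessian. A sketch of one possible guard, assuming the variables from the failing cell are still in scope (whether this alone resolves the singularity depends on the remaining columns):

# Sketch: drop columns whose VIF came back NaN (zero-variance after scaling)
# in addition to those above the threshold, then retry the logit fit.
nan_vif_columns = vif_df.loc[vif_df['VIF'].isna(), 'Variable']
X_fixed = X_scaled_summer.drop(columns=[c for c in nan_vif_columns if c in X_scaled_summer.columns])
logit_model_summer = sm.Logit(y_summer, X_fixed).fit(maxiter=100)
print(logit_model_summer.summary())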

Logistic Regression with Interaction Term¶

In [530]:
# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_summer['Boone'].quantile(0.9)
df_summer['Peak'] = (df_summer['Boone'] > threshold).astype(int)

# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_summer.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_summer.loc[X.index, 'Peak']

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.298703
         Iterations 9
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  918
Model:                          Logit   Df Residuals:                      912
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                 0.08250
Time:                        10:30:06   Log-Likelihood:                -274.21
converged:                       True   LL-Null:                       -298.87
Covariance Type:            nonrobust   LLR p-value:                 1.916e-09
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.5593      0.164    -15.603      0.000      -2.881      -2.238
PRCP          -3.7158      3.738     -0.994      0.320     -11.041       3.610
TMIN           0.8122      0.169      4.813      0.000       0.481       1.143
Day            0.1012      0.114      0.886      0.376      -0.123       0.325
Year          -0.2596      0.114     -2.276      0.023      -0.483      -0.036
PRCP_TMIN      2.9652      3.500      0.847      0.397      -3.895       9.826
==============================================================================

The logistic regression model examining factors influencing peak energy demand during the summer months reveals that the model explains about 8.2% of the variance in the dependent variable (Pseudo R-squared). The coefficient for TMIN is statistically significant with a positive value, suggesting that higher minimum temperatures are associated with an increased likelihood of peak energy demand. The coefficient for Year is also significant but negative, indicating a decreasing trend in peak demand over time. However, the coefficients for PRCP, Day, and the interaction term PRCP_TMIN are not statistically significant, implying that they do not have a meaningful impact on predicting peak demand in this context. The model's overall performance is limited, as indicated by the relatively low Pseudo R-squared value and the log-likelihood value, but it identifies TMIN and Year as notable predictors.
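Because the predictors were standardized before fitting, exponentiating the coefficients gives odds ratios per one-standard-deviation change, which makes the TMIN and Year effects easier to read. A minimal sketch, assuming logit_model is the summer fit from the cell above:

# Sketch: odds ratios for the standardized summer logit. For example,
# exp(0.81) ≈ 2.3 means a one-standard-deviation warmer minimum temperature
# roughly doubles the odds of a peak day. Assumes logit_model from above.
odds_ratios = np.exp(logit_model.params)
print(odds_ratios.round(3))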

In [531]:
# Filter the DataFrame to include only the winter months (November, December, January, February, March)
df_winter = boone[(boone['Month'] >= 11) | (boone['Month'] <= 3)].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_winter['Boone'].quantile(0.9)
df_winter['Peak'] = (df_winter['Boone'] > threshold).astype(int)

# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_winter.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_winter.loc[X.index, 'Peak']

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.166987
         Iterations 10
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  909
Model:                          Logit   Df Residuals:                      903
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.4867
Time:                        10:30:07   Log-Likelihood:                -151.79
converged:                       True   LL-Null:                       -295.72
Covariance Type:            nonrobust   LLR p-value:                 4.079e-60
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.3965      0.345    -12.753      0.000      -5.072      -3.721
PRCP           1.0822      0.962      1.124      0.261      -0.804       2.969
TMIN          -2.9627      0.290    -10.205      0.000      -3.532      -2.394
Day            0.2230      0.153      1.460      0.144      -0.076       0.522
Year          -0.2338      0.157     -1.493      0.136      -0.541       0.073
PRCP_TMIN     -0.9971      1.465     -0.681      0.496      -3.869       1.875
==============================================================================

The logistic regression model analyzing peak energy demand during the winter months (November through March) indicates that the model explains approximately 48.7% of the variance in peak demand (Pseudo R-squared). The coefficient for TMIN is significant and negative, suggesting that lower minimum temperatures are associated with a higher likelihood of peak energy demand. Conversely, the coefficient for PRCP is not statistically significant, indicating that precipitation does not significantly impact peak demand during winter. The coefficient for Day is also not significant, implying that the day of the month does not substantially affect peak energy demand. The coefficient for Year shows a negative trend but is not significant at the 5% level, indicating a possible decreasing trend in peak demand over the years. The interaction term PRCP_TMIN is not significant, suggesting that the combined effect of precipitation and minimum temperature does not play a major role in predicting peak demand. Overall, the model performs well in explaining peak demand variability, with significant results highlighting the importance of minimum temperatures.
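A quick in-sample check of how well the winter model separates peak from non-peak days is the built-in prediction table at a 0.5 probability cutoff. A minimal sketch, assuming logit_model is the winter fit from the cell above:

# Sketch: 2x2 in-sample prediction table for the winter logit
# (rows = actual non-peak/peak, columns = predicted non-peak/peak).
print(logit_model.pred_table(threshold=0.5))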

In [532]:
# Filter the DataFrame to include only the Spring months (April)
df_spring = boone[boone['Month'] == 4].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_spring['Boone'].quantile(0.9)
df_spring['Peak'] = (df_spring['Boone'] > threshold).astype(int)

# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_spring.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_spring.loc[X.index, 'Peak'].reset_index(drop=True)  # Reset index to align with X_scaled

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.225534
         Iterations 9
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  180
Model:                          Logit   Df Residuals:                      174
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                  0.3062
Time:                        10:30:07   Log-Likelihood:                -40.596
converged:                       True   LL-Null:                       -58.515
Covariance Type:            nonrobust   LLR p-value:                 1.024e-06
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.5651      0.595     -5.991      0.000      -4.731      -2.399
PRCP           3.4737      3.780      0.919      0.358      -3.936      10.883
TMIN          -1.5910      0.469     -3.396      0.001      -2.509      -0.673
Day           -0.7589      0.385     -1.970      0.049      -1.514      -0.004
Year           0.3057      0.321      0.954      0.340      -0.323       0.934
PRCP_TMIN     -3.4599      4.487     -0.771      0.441     -12.255       5.335
==============================================================================

The logistic regression model for predicting peak energy demand during the spring month of April shows a Pseudo R-squared of 0.306, indicating that approximately 30.6% of the variability in peak demand is explained by the model. The coefficient for TMIN is significant and negative, suggesting that lower minimum temperatures increase the likelihood of peak demand. Conversely, the PRCP coefficient is not significant, meaning precipitation does not have a notable effect on peak demand during spring. The Day coefficient is significant and negative, indicating that as the month progresses, the probability of peak demand decreases. The Year coefficient is not significant, suggesting no substantial trend in peak demand over the years. The interaction term PRCP_TMIN is also not significant, showing that the combined effect of precipitation and minimum temperature does not significantly impact peak demand. Overall, the model identifies minimum temperature and the progression of the month as key factors in predicting peak energy demand in spring.

In [533]:
# Filter the DataFrame to include only the Fall months (October)
df_fall = boone[boone['Month'] == 10].reset_index(drop=True)

# Create a binary dependent variable (0 for not peak, 1 for peak)
threshold = df_fall['Boone'].quantile(0.9)
df_fall['Peak'] = (df_fall['Boone'] > threshold).astype(int)

# Define the independent variables (excluding 'Boone', 'Peak', and non-significant variables)
X = df_fall.drop(['Boone', 'Peak', 'AWND', 'SNOW', 'SNWD', 'TMAX', 'TOBS', 'Month'], axis=1)

# Handle missing values by replacing infinities and NaNs
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Create an interaction term between PRCP and TMIN
X['PRCP_TMIN'] = X['PRCP'] * X['TMIN']

# Standardize the independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add a constant to the independent variables (required for the regression model)
X_scaled = sm.add_constant(X_scaled)

# Define the dependent variable, ensuring it aligns with the cleaned X
y = df_fall.loc[X.index, 'Peak'].reset_index(drop=True)  # Reset index to align with X_scaled

# Fit the logistic regression model
logit_model = sm.Logit(y, X_scaled).fit()

# Get the summary of the regression
print(logit_model.summary())
Optimization terminated successfully.
         Current function value: 0.299715
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Peak   No. Observations:                  186
Model:                          Logit   Df Residuals:                      180
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 Aug 2024   Pseudo R-squ.:                 0.09118
Time:                        10:30:08   Log-Likelihood:                -55.747
converged:                       True   LL-Null:                       -61.340
Covariance Type:            nonrobust   LLR p-value:                   0.04783
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4574      0.305     -8.064      0.000      -3.055      -1.860
PRCP           0.9342      1.397      0.669      0.504      -1.805       3.673
TMIN           0.6754      0.305      2.212      0.027       0.077       1.274
Day           -0.3421      0.297     -1.150      0.250      -0.925       0.241
Year           0.0763      0.260      0.293      0.769      -0.433       0.586
PRCP_TMIN     -0.8971      1.390     -0.646      0.519      -3.621       1.827
==============================================================================

The logistic regression model for predicting peak energy demand in October has a Pseudo R-squared of 0.091, indicating that the model explains about 9.1% of the variability in peak demand. The TMIN coefficient is positive and statistically significant, suggesting that warmer October nights (higher minimum temperatures) are associated with a higher likelihood of peak energy demand, possibly reflecting late-season cooling load. Conversely, PRCP is not significant, meaning precipitation does not significantly affect peak demand in October. The Day and Year coefficients are also not significant, implying that the progression of the month and year do not have a substantial impact on peak demand. The interaction term PRCP_TMIN is not significant, indicating that the combined effect of precipitation and minimum temperature does not notably influence peak demand. Overall, the model highlights minimum temperature as a key factor in predicting peak energy demand in October, while other variables and their interactions have limited impact.
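Average marginal effects re-express the October logit on the probability scale, which can be easier to communicate than log-odds. A minimal sketch, assuming logit_model is the fall fit from the cell above:

# Sketch: average marginal effects, i.e. the average change in peak probability
# per one-standard-deviation change in each (standardized) predictor.
margeff = logit_model.get_margeff(at='overall')
print(margeff.summary())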

Decision Tree¶

In [534]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# X and y carry over from the October logistic-regression cell above,
# so the target here is the binary Peak flag rather than a demand level
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

y_pred = dt_reg.predict(X_test)

print("Mean squared error: ", mean_squared_error(y_test, y_pred))
print("R-squared: ", r2_score(y_test, y_pred))
Mean squared error:  0.13157894736842105
R-squared:  -0.3970588235294119

The Decision Tree Regressor produces a mean squared error of about 0.13 and an R-squared of roughly -0.40 on the test set. Because the target carried over from the previous cells is the binary Peak indicator rather than a continuous demand value, the MSE sits on a 0-1 scale, and the negative R-squared means the tree predicts worse than simply predicting the mean peak rate. In short, a regression tree applied to this binary target adds no explanatory power here, and a classification approach (or a tree fit to the continuous 'Boone' demand) would be a more appropriate next step.
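A minimal sketch of the classification alternative on the same split, treating Peak as a class label; the depth cap is an illustrative, untuned choice, and the resulting accuracy should be judged against the roughly 90/10 class balance:

# Sketch: a classification tree on the same binary Peak target, instead of the
# regression tree above. max_depth=3 is an illustrative cap, not a tuned value.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, dt_clf.predict(X_test)))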

Support Vector Machine¶

In [535]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
Accuracy: 0.8947368421052632
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94        34
           1       0.00      0.00      0.00         4

    accuracy                           0.89        38
   macro avg       0.45      0.50      0.47        38
weighted avg       0.80      0.89      0.85        38

The Support Vector Machine (SVM) model with a linear kernel achieved an accuracy of approximately 89.5% on the test set. This high accuracy suggests that the model generally performs well in classifying peak and non-peak periods. However, the classification report reveals a significant issue: the model has a precision, recall, and F1-score of 0.00 for the positive class (peak periods). This indicates that while the model accurately predicts non-peak periods, it fails to identify peak periods, leading to poor performance in detecting the minority class. The macro average scores reflect this imbalance, showing low values across precision, recall, and F1-score. In summary, despite high overall accuracy, the SVM model struggles with the minority class, suggesting a need for further tuning or alternative methods to address class imbalance.
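One common way to address the class imbalance noted above is to reweight the classes so that missed peak days are penalized more heavily; class_weight='balanced' is the standard scikit-learn option for this. A sketch on the same train/test split, intended as a starting point rather than a fix:

# Sketch: linear SVM with balanced class weights so the rare peak class
# contributes proportionally more to the loss. Same split as above.
svm_balanced = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_balanced.fit(X_train, y_train)
print(classification_report(y_test, svm_balanced.predict(X_test)))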

In [536]:
X_test['Month'] = boone['Month']

# Extract predictions for non-peak days (class 0)
non_peak_indices = (y_pred == 0)
non_peak_predictions = X_test[non_peak_indices]

# Select only the 'Month', 'Day', and 'Year' columns
non_peak_month_day_year = non_peak_predictions[['Month', 'Day', 'Year']]

non_peak_month_day_year.to_csv('non_peak_month_day_year.csv', index=False)

# Print the selected columns
print(non_peak_month_day_year)
     Month  Day  Year
113      4   21  2018
164      6   10  2020
169      6   15  2020
101      4    9  2018
100      4    8  2018
15       1   16  2015
177      6   23  2020
35       2    5  2016
119      4   27  2018
152      6   29  2019
24       1   25  2015
76       3   15  2017
156      6    2  2020
118      4   26  2018
68       3    7  2017
16       1   17  2015
122      5   30  2018
30       1   31  2015
136      5   13  2019
19       1   20  2015
75       3   14  2017
115      4   23  2018
126      5    3  2019
147      5   24  2019
69       3    8  2017
9        1   10  2015
128      5    5  2019
137      5   14  2019
18       1   19  2015
165      6   11  2020
176      6   22  2020
171      6   17  2020
42       2   12  2016
111      4   19  2018
45       2   15  2016
67       3    6  2017
161      6    7  2020
29       1   30  2015

Fuzzy Regression¶

In [537]:
import skfuzzy as fuzz
from sklearn.linear_model import LinearRegression

# Filter the DataFrame to include only the summer months (May, June, July, August, September)
df_summer = boone[(boone['Month'] >= 5) & (boone['Month'] <= 9)]

# Define the independent variables (all columns except 'Boone')
X = df_summer.drop(['Boone'], axis=1)

# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Define the dependent variable
y = df_summer['Boone']

# Handle missing values by dropping rows with any NaN or infinite values
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align y with the cleaned X

# Convert to numpy arrays
X = X.values
y = y.values

# Define fuzzy membership functions for input variables
# Here, we'll just create simple membership functions for demonstration purposes
def fuzzy_membership_function(data, low, high):
    mf = np.zeros_like(data)
    mf[data <= low] = 0
    mf[(data > low) & (data < high)] = (data[(data > low) & (data < high)] - low) / (high - low)
    mf[data >= high] = 1
    return mf

# Create membership functions for each feature
fuzzy_X = np.zeros_like(X)
for i in range(X.shape[1]):
    fuzzy_X[:, i] = fuzzy_membership_function(X[:, i], np.min(X[:, i]), np.max(X[:, i]))

# Create a fuzzy regression model (this is a simplified example)
# We will use the average membership function values to represent fuzzy coefficients
model = LinearRegression()
model.fit(fuzzy_X, y)

# Get the regression coefficients (these are the fuzzy coefficients)
fuzzy_coefficients = model.coef_

# Get feature names including the constant
feature_names = ['const'] + list(df_summer.drop(['Boone'], axis=1).columns)

# Print the fuzzy regression results with feature names
print("Fuzzy Regression Coefficients:")
for i, coef in enumerate(fuzzy_coefficients):
    print(f"{feature_names[i]}: {coef}")

# Predict using the fuzzy model
y_pred = model.predict(fuzzy_X)

# Evaluate the model (e.g., R-squared)
r_squared = model.score(fuzzy_X, y)
print(f"Fuzzy Regression R-squared: {r_squared}")
Fuzzy Regression Coefficients:
const: 0.0
PRCP: -22410.130630034837
SNOW: 0.0
SNWD: 1.4551915228366852e-11
TMAX: 84558.93637056842
TMIN: 17745.930818057222
TOBS: 60309.208029531
AWND: -8599.595588401195
Month: 61580.59487394907
Day: 9687.676065131684
Year: -33028.60314244498
Fuzzy Regression R-squared: 0.5143981770532247

The fuzzy regression analysis of the summer data achieved an R-squared of approximately 0.51, indicating that about 51% of the variance in 'Boone' is explained by the transformed features, essentially the same fit as the ordinary OLS models above. The coefficients are estimated on the membership-function (min-max scaled) versions of the inputs, so they are not directly comparable to the raw-unit OLS coefficients. TMAX, Month, and TOBS carry the largest positive weights and Year the largest negative one, PRCP and AWND have moderate negative weights, and the constant, SNOW, and SNWD contribute essentially nothing. The fuzzy approach offers a way to handle uncertainty by mapping each variable onto a [0, 1] membership scale, though with the min/max endpoints used here it reduces to a linear regression on min-max scaled features, so the resulting coefficients are no more interpretable than those of the standard model. Overall, the model shows moderate explanatory power, with the temperature and calendar variables doing most of the predictive work.
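Because low and high are set to each column's minimum and maximum, the ramp membership function above reduces, for any non-constant column, to ordinary min-max scaling, which is why the fuzzy fit's R-squared lands so close to the plain OLS fits. A one-function sketch of that equivalence:

# Sketch: for a non-constant column, the ramp membership used above is just
# min-max scaling of the column to the [0, 1] interval.
def minmax_membership(col):
    return np.clip((col - col.min()) / (col.max() - col.min()), 0, 1)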