import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Display floats with six decimal places for consistent notebook output.
pd.set_option('display.float_format', '{:.6f}'.format)
# Load the raw datasets (paths are local to the author's machine).
temp = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Avg Temp.csv")
bf = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Bird Flu.csv")
# NOTE(review): read_html on an .xls file — presumably the file is an HTML
# table saved with an .xls extension; confirm. read_html returns a LIST of
# tables (the first is selected further below).
corn = pd.read_html(r"C:\Users\alexp\Desktop\Data\Corn.xls", skiprows = 0)
cpi = pd.read_excel(r"C:\Users\alexp\Desktop\Data\CPI.xlsx", skiprows = 9)
diesel = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Diesel.csv")
gas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Gasoline.csv")
income = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Income.csv")
inflation = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Inflation.csv")
ngas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Natural Gas.csv")
pop = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Population.csv")
propane = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Propane.csv")
soybean = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Soybean.csv", skiprows = 15)
unemploy = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Unemployment.csv")
# NOTE(review): this path points at Desktop, not the Data folder — confirm intended.
egg_price = pd.read_csv(r"C:\Users\alexp\Desktop\Egg Price.csv")
# Preview the annual average-temperature data.
temp.head()
year | avg temp | |
---|---|---|
0 | 1980 | 52.800000 |
1 | 1981 | 54.300000 |
2 | 1982 | 51.000000 |
3 | 1983 | 53.500000 |
4 | 1984 | 51.000000 |
# Expand the annual average-temperature table to monthly frequency by
# repeating each year's value across all 12 months.
# Fix: the original re-filtered the whole DataFrame (temp[temp['year'] == year])
# inside the month loop — 12 O(n) scans per year. Iterating rows reads each
# year's value exactly once.
monthly_data = []
for _, row in temp.iterrows():
    year = int(row['year'])
    avg_temp = row['avg temp']  # hoisted: one lookup per year, not per month
    for month in range(1, 13):  # January (1) through December (12)
        monthly_data.append({'date': f"{year}-{month:02d}-01", 'temp': avg_temp})
# Replace the original annual temp DataFrame with the monthly version.
temp = pd.DataFrame(monthly_data)
# Convert the 'date' column to datetime format.
temp['date'] = pd.to_datetime(temp['date'])
temp.head()
date | temp | |
---|---|---|
0 | 1980-01-01 | 52.800000 |
1 | 1980-02-01 | 52.800000 |
2 | 1980-03-01 | 52.800000 |
3 | 1980-04-01 | 52.800000 |
4 | 1980-05-01 | 52.800000 |
# Preview the annual bird-flu outbreak data.
bf.head()
Year | Outbreak | |
---|---|---|
0 | 1980 | 0 |
1 | 1981 | 0 |
2 | 1982 | 0 |
3 | 1983 | 2 |
4 | 1984 | 2 |
# Outbreak severity encoding: 0 → "None", 1 → "Moderate", 2 → "High"
# Expand the annual outbreak table to monthly frequency, repeating each
# year's severity for all 12 months.
# Fix: the original looked up bf[bf['Year'] == year]['Outbreak'].values[0]
# inside the month loop — 12 full-frame scans per year. Row iteration reads
# the value once per year.
monthly_bf_data = []
for _, row in bf.iterrows():
    year = int(row['Year'])
    outbreak_value = row['Outbreak']  # hoisted out of the month loop
    for month in range(1, 13):  # January (1) through December (12)
        monthly_bf_data.append({'date': f"{year}-{month:02d}-01", 'Outbreak': outbreak_value})
# Replace the original annual bf DataFrame with the monthly version.
bf = pd.DataFrame(monthly_bf_data)
# Convert the 'date' column to datetime format.
bf['date'] = pd.to_datetime(bf['date'])
bf.head()
date | Outbreak | |
---|---|---|
0 | 1980-01-01 | 0 |
1 | 1980-02-01 | 0 |
2 | 1980-03-01 | 0 |
3 | 1980-04-01 | 0 |
4 | 1980-05-01 | 0 |
# read_html returned a list of tables; the first one holds the corn prices.
corn = corn[0]
# Build a month-start date from 'Year' plus the abbreviated-month 'Frequency'.
corn['Date'] = pd.to_datetime(
    corn['Year'].astype(str) + '-' + corn['Frequency'] + '-01', format='%Y-%b-%d'
)
# Keep only the date and price, with readable column names.
corn = corn[['Date', 'Amount']].rename(columns={"Date": "date", "Amount": "corn_price"})
# Placeholder rows for Feb and Mar 2025, to be filled by interpolation.
placeholder = pd.DataFrame({
    'date': [pd.Timestamp('2025-02-01'), pd.Timestamp('2025-03-01')],
    'corn_price': [np.nan, np.nan],
})
corn = (
    pd.concat([corn, placeholder], ignore_index=True)
    .sort_values('date')
    .reset_index(drop=True)
)
# Linear interpolation; trailing NaNs are carried forward from the last value.
corn['corn_price'] = corn['corn_price'].interpolate(method='linear')
# Show the updated tail of the series.
corn.tail()
date | corn_price | |
---|---|---|
538 | 2024-11-01 | 4.070000 |
539 | 2024-12-01 | 4.230000 |
540 | 2025-01-01 | 4.290000 |
541 | 2025-02-01 | 4.290000 |
542 | 2025-03-01 | 4.290000 |
# Wide layout (one column per month) → long format.
cpi = cpi.melt(id_vars='Year', var_name='Month', value_name='cpi_value')
# Assemble a month-start date from the year and abbreviated month name.
cpi['Date'] = pd.to_datetime(cpi['Year'].astype(str) + '-' + cpi['Month'] + '-01', format='%Y-%b-%d')
# Keep only what is needed, sorted chronologically, with a standard column name.
cpi = cpi[['Date', 'cpi_value']].sort_values('Date').reset_index(drop=True)
cpi.rename(columns = {"Date": "date"}, inplace = True)
# Fill only the March-2025 observation (if present) via linear interpolation.
target_idx = cpi[cpi['date'] == '2025-03-01'].index
if len(target_idx) > 0:
    cpi.loc[target_idx, 'cpi_value'] = cpi['cpi_value'].interpolate(method='linear').loc[target_idx]
cpi.tail()
date | cpi_value | |
---|---|---|
547 | 2025-08-01 | NaN |
548 | 2025-09-01 | NaN |
549 | 2025-10-01 | NaN |
550 | 2025-11-01 | NaN |
551 | 2025-12-01 | NaN |
# Rename the FRED series code to a readable name.
diesel = diesel.rename(columns={"observation_date": "date", "WPU057303": "diesel_ppi"})
# Keep observations from 1980 onward (ISO-format strings compare correctly).
diesel = diesel.loc[diesel['date'] >= '1980-01-01']
diesel.head()
date | diesel_ppi | |
---|---|---|
83 | 1980-01-01 | 74.600000 |
84 | 1980-02-01 | 80.100000 |
85 | 1980-03-01 | 84.500000 |
86 | 1980-04-01 | 86.500000 |
87 | 1980-05-01 | 87.300000 |
# Rename the FRED series code to a readable name.
gas = gas.rename(columns={"observation_date": "date", "WPS0571": "gas_ppi"})
gas.head()
date | gas_ppi | |
---|---|---|
0 | 1973-03-01 | 17.500000 |
1 | 1973-04-01 | 17.800000 |
2 | 1973-05-01 | 18.000000 |
3 | 1973-06-01 | 18.000000 |
4 | 1973-07-01 | 17.800000 |
# Rename the FRED series code and parse the date column.
income = income.rename(columns={"observation_date": "date", "A229RX0": "rdpi_per_capita"})
income['date'] = pd.to_datetime(income['date'], errors='coerce')
# Placeholder row for March 2025; its value is filled by interpolation below.
march_row = pd.DataFrame({
    'date': [pd.Timestamp('2025-03-01')],
    'rdpi_per_capita': [np.nan],
})
income = (
    pd.concat([income, march_row], ignore_index=True)
    .sort_values('date')
    .reset_index(drop=True)
)
# Linear interpolation; a trailing NaN is carried forward from the last value.
income['rdpi_per_capita'] = income['rdpi_per_capita'].interpolate(method='linear')
# Verify the new row was added and filled.
income.tail()
date | rdpi_per_capita | |
---|---|---|
790 | 2024-11-01 | 51565.000000 |
791 | 2024-12-01 | 51581.000000 |
792 | 2025-01-01 | 51724.000000 |
793 | 2025-02-01 | 51981.000000 |
794 | 2025-03-01 | 51981.000000 |
# Rename the FRED series code to a readable name.
inflation = inflation.rename(columns={"observation_date": "date", "FEDFUNDS": "fed_funds_rate"})
inflation.head()
date | fed_funds_rate | |
---|---|---|
0 | 1954-07-01 | 0.800000 |
1 | 1954-08-01 | 1.220000 |
2 | 1954-09-01 | 1.070000 |
3 | 1954-10-01 | 0.850000 |
4 | 1954-11-01 | 0.830000 |
# Rename the FRED series code to a readable name.
ngas = ngas.rename(columns={"observation_date": "date", "WPU0531": "natural_gas_ppi"})
ngas.head()
date | natural_gas_ppi | |
---|---|---|
0 | 1967-01-01 | 7.400000 |
1 | 1967-02-01 | 7.400000 |
2 | 1967-03-01 | 7.400000 |
3 | 1967-04-01 | 7.500000 |
4 | 1967-05-01 | 7.500000 |
# Inspect the raw population column names.
pop.columns
Index(['Year', 'Population', 'Growth Rate'], dtype='object')
# Preview the oldest rows of the population data (sorted newest-first).
pop.tail()
Year | Population | Growth Rate | |
---|---|---|---|
41 | 1984 | 232,766,280 | 1.03% |
42 | 1983 | 230,389,964 | 1.05% |
43 | 1982 | 228,001,418 | 1.04% |
44 | 1981 | 225,654,008 | 1.13% |
45 | 1980 | 223,140,018 | 1.21% |
# Strip the '%' suffix from the growth-rate strings.
pop['Growth Rate'] = pop['Growth Rate'].replace({'%': ''}, regex=True)
# Convert to a decimal fraction; unparseable entries become NaN.
pop['Growth Rate'] = pd.to_numeric(pop['Growth Rate'], errors='coerce') / 100
# Inspect the converted series for NaNs.
print(pop['Growth Rate'])
# Expand annual growth rates to monthly rows, splitting the annual rate
# evenly across the 12 months.
monthly_data = []
for _, annual in pop.iterrows():
    for month in range(1, 13):
        monthly_data.append({
            'Year': annual['Year'],
            'Month': month,
            'Growth Rate': annual['Growth Rate'] / 12,
        })
pop = pd.DataFrame(monthly_data)
# Month-start timestamp for each (Year, Month) pair.
pop['Date'] = pd.to_datetime(pop[['Year', 'Month']].assign(DAY=1))
pop.head()
0 0.005200 1 0.005300 2 0.005000 3 0.003800 4 0.003100 5 0.004900 6 0.006600 7 0.007100 8 0.007900 9 0.008000 10 0.008000 11 0.008300 12 0.008600 13 0.008800 14 0.008700 15 0.008700 16 0.009200 17 0.009700 18 0.010000 19 0.009800 20 0.009800 21 0.009700 22 0.009600 23 0.010100 24 0.010900 25 0.011500 26 0.012100 27 0.012600 28 0.012700 29 0.012500 30 0.012900 31 0.013500 32 0.014100 33 0.014400 34 0.014000 35 0.012800 36 0.011000 37 0.010200 38 0.009900 39 0.010100 40 0.010200 41 0.010300 42 0.010500 43 0.010400 44 0.011300 45 0.012100 Name: Growth Rate, dtype: float64
Year | Month | Growth Rate | Date | |
---|---|---|---|---|
0 | 2025 | 1 | 0.000433 | 2025-01-01 |
1 | 2025 | 2 | 0.000433 | 2025-02-01 |
2 | 2025 | 3 | 0.000433 | 2025-03-01 |
3 | 2025 | 4 | 0.000433 | 2025-04-01 |
4 | 2025 | 5 | 0.000433 | 2025-05-01 |
# 'Date' now encodes the year and month, so the separate columns are redundant.
pop = pop.drop(columns=['Year', 'Month'])
pop.head()
Growth Rate | Date | |
---|---|---|
0 | 0.000433 | 2025-01-01 |
1 | 0.000433 | 2025-02-01 |
2 | 0.000433 | 2025-03-01 |
3 | 0.000433 | 2025-04-01 |
4 | 0.000433 | 2025-05-01 |
# Standardize the date column name for the upcoming merge.
pop = pop.rename(columns={"Date": "date"})
pop.head()
Growth Rate | date | |
---|---|---|
0 | 0.000433 | 2025-01-01 |
1 | 0.000433 | 2025-02-01 |
2 | 0.000433 | 2025-03-01 |
3 | 0.000433 | 2025-04-01 |
4 | 0.000433 | 2025-05-01 |
# Rename the FRED series code, fill interior gaps, then trim to the sample window.
propane = propane.rename(columns={"observation_date": "date", "WPU05320104": "propane_ppi"})
propane['propane_ppi'] = propane['propane_ppi'].interpolate(method='linear')
propane = propane.loc[propane['date'] >= '1980-01-01']
propane.head()
date | propane_ppi | |
---|---|---|
31 | 1980-01-01 | 95.100000 |
32 | 1980-02-01 | 99.600000 |
33 | 1980-03-01 | 101.400000 |
34 | 1980-04-01 | 100.800000 |
35 | 1980-05-01 | 101.800000 |
# The CSV header carries a leading space: " value".
soybean = soybean.rename(columns={" value": "soybean_price"})
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-09 | 2.436300 |
3 | 1968-12-10 | 2.437500 |
4 | 1968-12-11 | 2.446300 |
# Make the daily soybean series gapless: reindex onto a complete calendar-day
# range and linearly interpolate the missing prices.
soybean['date'] = pd.to_datetime(soybean['date'])
# Complete daily range from the first to the last observed date.
full_range = pd.date_range(start=soybean['date'].min(), end=soybean['date'].max(), freq='D')
# Guarantee 1980-01-01 is covered. Fix: union() keeps the index sorted,
# whereas the original full_range.insert(0, ...) would have PREPENDED the
# date out of chronological order had the guard ever fired.
full_range = full_range.union([pd.Timestamp('1980-01-01')])
# Reindex so every day in the range has a row (missing days get NaN).
soybean = soybean.set_index('date').reindex(full_range)
# Fill the gaps with linear interpolation.
soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')
# Restore 'date' as a regular column with the original column names.
soybean = soybean.reset_index()
soybean.columns = ['date', 'soybean_price']
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-07 | 2.443767 |
3 | 1968-12-08 | 2.440033 |
4 | 1968-12-09 | 2.436300 |
# NOTE(review): after the reindex/interpolation above, the index already
# covers every calendar day in this span, so this second reindex is
# effectively a no-op kept as a safety net — confirm before removing.
# Set 'date' as the index
soybean = soybean.set_index('date')
# Reindex to include all dates in the full range, missing dates will have NaN prices
full_date_range = pd.date_range(start=soybean.index.min(), end=soybean.index.max(), freq='D')
soybean = soybean.reindex(full_date_range)
# Reset index and rename it back to 'date'
soybean = soybean.reset_index().rename(columns={'index': 'date'})
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-07 | 2.443767 |
3 | 1968-12-08 | 2.440033 |
4 | 1968-12-09 | 2.436300 |
# Column was already renamed earlier, so this rename is a harmless no-op.
soybean.rename(columns={' value': 'soybean_price'}, inplace = True)
soybean['date'] = pd.to_datetime(soybean['date'])
# Restrict the sample to 1980 onward.
soybean = soybean[soybean['date'].dt.year >= 1980]
soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')
# Back-fill a leading NaN from the next observation. Fix: the original
# chained assignment soybean['soybean_price'].iloc[0] = ... writes through
# an intermediate object and is unreliable (silently dropped under pandas
# copy-on-write); positional assignment on the frame is the supported form.
if pd.isna(soybean['soybean_price'].iloc[0]):
    soybean.iloc[0, soybean.columns.get_loc('soybean_price')] = soybean['soybean_price'].iloc[1]
soybean.head()
date | soybean_price | |
---|---|---|
4044 | 1980-01-01 | 6.617750 |
4045 | 1980-01-02 | 6.635000 |
4046 | 1980-01-03 | 6.700000 |
4047 | 1980-01-04 | 6.655000 |
4048 | 1980-01-05 | 6.595000 |
# Verify the frame now has exactly ['date', 'soybean_price'].
soybean.columns
Index(['date', 'soybean_price'], dtype='object')
# Make sure the date column is datetime before extracting components.
soybean['date'] = pd.to_datetime(soybean['date'])
# Collapse the daily series into monthly means.
soybean['year'] = soybean['date'].dt.year
soybean['month'] = soybean['date'].dt.month
soybean = (
    soybean.groupby(['year', 'month'])['soybean_price']
    .mean()
    .reset_index()
)
# Rebuild a month-start timestamp for each (year, month) group.
soybean['month_year'] = pd.to_datetime(soybean[['year', 'month']].assign(day=1))
soybean.head()
year | month | soybean_price | month_year | |
---|---|---|---|---|
0 | 1980 | 1 | 6.605250 | 1980-01-01 |
1 | 1980 | 2 | 6.588698 | 1980-02-01 |
2 | 1980 | 3 | 6.310048 | 1980-03-01 |
3 | 1980 | 4 | 5.954567 | 1980-04-01 |
4 | 1980 | 5 | 6.250417 | 1980-05-01 |
# Keep only the monthly price and its month-start date.
soybean = (
    soybean.drop(['year', 'month'], axis=1)
    .rename(columns={'month_year': 'date'})
)
soybean.head()
soybean_price | date | |
---|---|---|
0 | 6.605250 | 1980-01-01 |
1 | 6.588698 | 1980-02-01 |
2 | 6.310048 | 1980-03-01 |
3 | 5.954567 | 1980-04-01 |
4 | 6.250417 | 1980-05-01 |
# Parse dates, restrict to the 1980+ sample, and standardize the column name.
unemploy['observation_date'] = pd.to_datetime(unemploy['observation_date'])
unemploy = unemploy.loc[unemploy['observation_date'].dt.year >= 1980]
unemploy = unemploy.rename(columns={"observation_date": "date"})
# Preview the filtered data.
unemploy.head()
date | UNRATE | |
---|---|---|
384 | 1980-01-01 | 6.300000 |
385 | 1980-02-01 | 6.300000 |
386 | 1980-03-01 | 6.300000 |
387 | 1980-04-01 | 6.900000 |
388 | 1980-05-01 | 7.500000 |
# Rename the BLS series code for the average egg price.
egg_price = egg_price.rename(columns={"observation_date": "date", "APU0000708111": "egg_price"})
egg_price.head()
date | egg_price | |
---|---|---|
0 | 1980-01-01 | 0.879000 |
1 | 1980-02-01 | 0.774000 |
2 | 1980-03-01 | 0.812000 |
3 | 1980-04-01 | 0.797000 |
4 | 1980-05-01 | 0.737000 |
# Confirm there are no missing values in the egg-price series.
egg_price.isnull().sum()
date 0 egg_price 0 dtype: int64
# Verify the renamed column labels.
egg_price.columns
Index(['date', 'egg_price'], dtype='object')
# Normalize every date column to datetime64[ns] so the merge keys align,
# then outer-join all series on 'date', starting from the egg-price frame.
frames = [egg_price, unemploy, soybean, propane, temp, bf, corn,
          cpi, diesel, gas, income, inflation, ngas, pop]
for frame in frames:
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
merged_data = egg_price
for frame in frames[1:]:
    merged_data = pd.merge(merged_data, frame, on='date', how='outer')
# Display the merged dataframe.
merged_data.head()
date | egg_price | UNRATE | soybean_price | propane_ppi | temp | Outbreak | corn_price | cpi_value | diesel_ppi | gas_ppi | rdpi_per_capita | fed_funds_rate | natural_gas_ppi | Growth Rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1980-01-01 | 0.879000 | 6.300000 | 6.605250 | 95.100000 | 52.800000 | 0.000000 | 2.450000 | 0.879000 | 74.600000 | 79.500000 | 23036.000000 | 13.820000 | 55.100000 | 0.001008 |
1 | 1980-02-01 | 0.774000 | 6.300000 | 6.588698 | 99.600000 | 52.800000 | 0.000000 | 2.390000 | 0.774000 | 80.100000 | 84.900000 | 22894.000000 | 14.130000 | 58.300000 | 0.001008 |
2 | 1980-03-01 | 0.812000 | 6.300000 | 6.310048 | 101.400000 | 52.800000 | 0.000000 | 2.400000 | 0.812000 | 84.500000 | 90.500000 | 22731.000000 | 17.190000 | 58.200000 | 0.001008 |
3 | 1980-04-01 | 0.797000 | 6.900000 | 5.954567 | 100.800000 | 52.800000 | 0.000000 | 2.360000 | 0.797000 | 86.500000 | 96.200000 | 22687.000000 | 17.610000 | 59.700000 | 0.001008 |
4 | 1980-05-01 | 0.737000 | 7.500000 | 6.250417 | 101.800000 | 52.800000 | 0.000000 | 2.420000 | 0.737000 | 87.300000 | 96.300000 | 22580.000000 | 10.980000 | 61.000000 | 0.001008 |
# Keep only months where the target variable (egg price) is observed.
merged_data = merged_data.dropna(subset=['egg_price'])
# Report any columns that still contain missing values.
missing_values = merged_data.isnull().sum()
print(missing_values[missing_values > 0])
Series([], dtype: int64)
# Spot-check which dates (if any) lack values for the key series.
missing_egg_dates = merged_data.loc[merged_data['egg_price'].isnull(), 'date']
print(missing_egg_dates)
missing_income = merged_data.loc[merged_data['rdpi_per_capita'].isnull(), 'date']
print(missing_income)
missing_unrate = merged_data.loc[merged_data['UNRATE'].isnull(), 'date']
print(missing_unrate)
# Baseline OLS: regress egg price on every candidate predictor.
y = merged_data['egg_price']
X = sm.add_constant(merged_data.drop(columns=['date', 'egg_price']))
# Clean both sides together: replace inf with NaN, then drop incomplete rows.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
# Fit and report the regression.
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.196e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:32 Log-Likelihood: 1589.4 No. Observations: 543 AIC: -3151. Df Residuals: 529 BIC: -3091. Df Model: 13 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0003 0.030 0.012 0.991 -0.059 0.060 UNRATE -8.17e-06 0.000 -0.018 0.986 -0.001 0.001 soybean_price -0.0008 0.001 -1.228 0.220 -0.002 0.000 propane_ppi 3.507e-05 1.62e-05 2.162 0.031 3.2e-06 6.69e-05 temp 6.603e-05 0.001 0.131 0.896 -0.001 0.001 Outbreak -0.0004 0.001 -0.296 0.767 -0.003 0.002 corn_price 0.0010 0.001 0.749 0.454 -0.002 0.004 cpi_value 1.0134 0.001 709.571 0.000 1.011 1.016 diesel_ppi -3.029e-05 2.35e-05 -1.288 0.198 -7.65e-05 1.59e-05 gas_ppi -5.099e-05 3.87e-05 -1.317 0.189 -0.000 2.51e-05 rdpi_per_capita -1.846e-07 2.35e-07 -0.784 0.433 -6.47e-07 2.78e-07 fed_funds_rate 0.0001 0.000 0.398 0.691 -0.000 0.001 natural_gas_ppi -1.08e-05 1.35e-05 -0.798 0.425 -3.74e-05 1.58e-05 Growth Rate -5.8753 6.100 -0.963 0.336 -17.859 6.108 ============================================================================== Omnibus: 1202.076 Durbin-Watson: 1.210 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3269004.906 Skew: 17.576 Prob(JB): 0.00 Kurtosis: 381.485 Cond. No. 4.03e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 4.03e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Variance-inflation-factor diagnostics for the full predictor set.
features = ['UNRATE', 'soybean_price', 'propane_ppi', 'temp', 'Outbreak',
            'corn_price', 'cpi_value', 'diesel_ppi', 'gas_ppi',
            'rdpi_per_capita', 'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']
X = add_constant(merged_data[features])
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif_data)
Variable VIF 0 const 2863.185734 1 UNRATE 2.208576 2 soybean_price 11.607555 3 propane_ppi 9.639871 4 temp 2.086108 5 Outbreak 2.592928 6 corn_price 11.508110 7 cpi_value 3.135601 8 diesel_ppi 23.830582 9 gas_ppi 32.298061 10 rdpi_per_capita 12.317555 11 fed_funds_rate 3.880146 12 natural_gas_ppi 3.490783 13 Growth Rate 5.788445
# Second OLS pass: drop the two highest-VIF fuel indexes (gas, diesel).
y = merged_data['egg_price']
X = sm.add_constant(merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi']))
# Clean both sides together before fitting.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.390e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:32 Log-Likelihood: 1583.9 No. Observations: 543 AIC: -3144. Df Residuals: 531 BIC: -3092. Df Model: 11 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0073 0.030 0.244 0.807 -0.052 0.066 UNRATE 0.0002 0.000 0.438 0.661 -0.001 0.001 soybean_price -0.0009 0.001 -1.520 0.129 -0.002 0.000 propane_ppi 7.386e-06 1.21e-05 0.609 0.543 -1.65e-05 3.12e-05 temp -5.213e-05 0.000 -0.105 0.917 -0.001 0.001 Outbreak 0.0003 0.001 0.317 0.751 -0.002 0.003 corn_price -0.0010 0.001 -0.772 0.440 -0.003 0.001 cpi_value 1.0121 0.001 733.189 0.000 1.009 1.015 rdpi_per_capita -2.94e-07 2.35e-07 -1.251 0.212 -7.56e-07 1.68e-07 fed_funds_rate -8.104e-05 0.000 -0.292 0.770 -0.001 0.000 natural_gas_ppi -1.247e-05 1.31e-05 -0.952 0.342 -3.82e-05 1.33e-05 Growth Rate 0.3498 5.847 0.060 0.952 -11.137 11.836 ============================================================================== Omnibus: 1219.694 Durbin-Watson: 1.188 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3542783.390 Skew: 18.172 Prob(JB): 0.00 Kurtosis: 397.038 Cond. No. 3.83e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 3.83e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Visualize pairwise correlations among the predictors.
# NOTE(review): the datetime 'date' column is still present here and is
# presumably excluded by .corr() in the pandas version used — confirm.
independent_vars = merged_data.drop(columns=['egg_price'])
corr_matrix = independent_vars.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Independent Variables')
plt.show()
# Third OLS pass: also drop the remaining collinear inputs
# (soybean, propane, income) identified by the VIF/correlation checks.
y = merged_data['egg_price']
X = sm.add_constant(
    merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi',
                              'soybean_price', 'propane_ppi', 'rdpi_per_capita'])
)
# Clean both sides together before fitting.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.909e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1582.0 No. Observations: 543 AIC: -3146. Df Residuals: 534 BIC: -3107. Df Model: 8 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const -0.0025 0.029 -0.088 0.930 -0.059 0.054 UNRATE 0.0005 0.000 1.287 0.199 -0.000 0.001 temp -0.0002 0.000 -0.491 0.624 -0.001 0.001 Outbreak 0.0003 0.001 0.269 0.788 -0.002 0.002 corn_price -0.0025 0.001 -3.989 0.000 -0.004 -0.001 cpi_value 1.0114 0.001 787.491 0.000 1.009 1.014 fed_funds_rate 0.0002 0.000 1.143 0.253 -0.000 0.001 natural_gas_ppi -7.455e-06 8.31e-06 -0.898 0.370 -2.38e-05 8.86e-06 Growth Rate 6.4289 4.471 1.438 0.151 -2.355 15.213 ============================================================================== Omnibus: 1225.887 Durbin-Watson: 1.180 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3641899.526 Skew: 18.386 Prob(JB): 0.00 Kurtosis: 402.519 Cond. No. 1.27e+06 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.27e+06. This might indicate that there are strong multicollinearity or other numerical problems.
# Re-check VIFs for the reduced predictor set.
predictors = ['UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value',
              'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']
X = sm.add_constant(merged_data[predictors])
vif = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif)
Variable VIF 0 const 2552.008626 1 UNRATE 1.382154 2 temp 1.916131 3 Outbreak 1.834292 4 corn_price 2.312824 5 cpi_value 2.490735 6 fed_funds_rate 1.574224 7 natural_gas_ppi 1.290526 8 Growth Rate 3.054922
# Interaction terms as (new column, factor A, factor B) — built in one loop
# so the name list below stays in sync with the columns created.
interactions = [
    ('corn_outbreak', 'corn_price', 'Outbreak'),
    ('soy_outbreak', 'soybean_price', 'Outbreak'),
    ('corn_soy', 'corn_price', 'soybean_price'),
    ('cpi_rdpi', 'cpi_value', 'rdpi_per_capita'),
    ('cpi_fed', 'cpi_value', 'fed_funds_rate'),
    ('rdpi_fed', 'rdpi_per_capita', 'fed_funds_rate'),
    ('unrate_rdpi', 'UNRATE', 'rdpi_per_capita'),
    ('diesel_corn', 'diesel_ppi', 'corn_price'),
    ('propane_soy', 'propane_ppi', 'soybean_price'),
    ('gas_temp', 'gas_ppi', 'temp'),
    ('temp_outbreak', 'temp', 'Outbreak'),
    ('temp_corn', 'temp', 'corn_price'),
    ('temp_natural_gas', 'temp', 'natural_gas_ppi'),
    ('growth_fed', 'Growth Rate', 'fed_funds_rate'),
    ('growth_cpi', 'Growth Rate', 'cpi_value'),
    ('growth_corn', 'Growth Rate', 'corn_price'),
]
for name, left, right in interactions:
    merged_data[name] = merged_data[left] * merged_data[right]
# Main effects followed by every interaction term, in creation order.
predictors = [
    'UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value', 'fed_funds_rate',
    'natural_gas_ppi', 'Growth Rate', 'soybean_price', 'propane_ppi', 'diesel_ppi',
    'gas_ppi', 'rdpi_per_capita',
] + [name for name, _, _ in interactions]
y = merged_data['egg_price']
X = sm.add_constant(merged_data[predictors])
# Fit and report the interaction-augmented OLS model.
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 5.794e+04 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1618.8 No. Observations: 543 AIC: -3178. Df Residuals: 513 BIC: -3049. Df Model: 29 Covariance Type: nonrobust ==================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------ const 0.3017 0.105 2.874 0.004 0.095 0.508 UNRATE -0.0014 0.003 -0.557 0.578 -0.006 0.004 temp -0.0029 0.002 -1.700 0.090 -0.006 0.000 Outbreak 0.0440 0.041 1.063 0.288 -0.037 0.125 corn_price -0.0483 0.041 -1.165 0.245 -0.130 0.033 cpi_value 0.8811 0.035 24.993 0.000 0.812 0.950 fed_funds_rate -0.0036 0.004 -0.887 0.376 -0.012 0.004 natural_gas_ppi -0.0009 0.000 -1.767 0.078 -0.002 9.65e-05 Growth Rate -41.6700 31.034 -1.343 0.180 -102.638 19.298 soybean_price -0.0001 0.001 -0.097 0.922 -0.003 0.002 propane_ppi 4.289e-05 4.18e-05 1.027 0.305 -3.91e-05 0.000 diesel_ppi -1.116e-06 6.61e-05 -0.017 0.987 -0.000 0.000 gas_ppi 0.0010 0.001 1.281 0.201 -0.001 0.002 rdpi_per_capita -2.651e-06 8.17e-07 -3.246 0.001 -4.26e-06 -1.05e-06 corn_outbreak -0.0024 0.002 -1.012 0.312 -0.007 0.002 soy_outbreak -3.148e-05 0.001 -0.036 0.972 -0.002 0.002 corn_soy 8.998e-05 0.000 0.287 0.774 -0.001 0.001 cpi_rdpi 2.336e-06 5.23e-07 4.466 0.000 1.31e-06 3.36e-06 cpi_fed 0.0012 0.001 1.328 0.185 -0.001 0.003 rdpi_fed -6.819e-08 6.55e-08 -1.040 0.299 -1.97e-07 6.06e-08 unrate_rdpi 4.093e-09 5.73e-08 0.071 0.943 -1.08e-07 1.17e-07 diesel_corn -1.542e-06 1.03e-05 -0.149 0.882 -2.19e-05 1.88e-05 propane_soy -1.613e-06 3.12e-06 -0.516 0.606 -7.75e-06 4.52e-06 gas_temp -1.867e-05 1.37e-05 -1.360 0.175 -4.56e-05 8.31e-06 temp_outbreak -0.0007 0.001 -0.830 0.407 -0.002 0.001 temp_corn 
0.0009 0.001 1.227 0.220 -0.001 0.002 temp_natural_gas 1.635e-05 9.1e-06 1.797 0.073 -1.52e-06 3.42e-05 growth_fed 4.0072 2.876 1.393 0.164 -1.642 9.657 growth_cpi 32.5163 20.088 1.619 0.106 -6.948 71.981 growth_corn -1.5425 7.478 -0.206 0.837 -16.233 13.148 ============================================================================== Omnibus: 1098.871 Durbin-Watson: 1.369 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2165590.659 Skew: 14.336 Prob(JB): 0.00 Kurtosis: 311.050 Cond. No. 1.64e+10 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.64e+10. This might indicate that there are strong multicollinearity or other numerical problems.
# Variance inflation factors for every column of the full design matrix.
# The intercept's VIF is expected to be huge and can be ignored; the point
# is to flag collinear predictors among the real features.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])],
})
print(vif_data)
feature VIF 0 const 37544.064509 1 UNRATE 70.704086 2 temp 26.467847 3 Outbreak 3221.050777 4 corn_price 11470.599064 5 cpi_value 2064.639688 6 fed_funds_rate 875.694055 7 natural_gas_ppi 4917.387954 8 Growth Rate 161.915984 9 soybean_price 56.905525 10 propane_ppi 68.990165 11 diesel_ppi 203.455282 12 gas_ppi 12846.196126 13 rdpi_per_capita 160.089083 14 corn_outbreak 234.162282 15 soy_outbreak 197.008944 16 corn_soy 231.413475 17 cpi_rdpi 1480.992322 18 cpi_fed 53.411866 19 rdpi_fed 149.877657 20 unrate_rdpi 59.589198 21 diesel_corn 221.607819 22 propane_soy 75.572238 23 gas_temp 13357.918509 24 temp_outbreak 3741.915213 25 temp_corn 12358.043858 26 temp_natural_gas 4858.189758 27 growth_fed 402.504304 28 growth_cpi 124.304248 29 growth_corn 128.244498
# Standardize the predictors (intercept column excluded) so the Lasso
# penalty treats every feature on a comparable scale.
# NOTE(review): the scaler is fit on the full sample before cross-validation,
# which leaks fold information — acceptable for exploratory selection, but
# worth confirming if this pipeline is ever used for out-of-sample claims.
scaler = StandardScaler()
predictors = X.drop(columns=['const'])
X_scaled = scaler.fit_transform(predictors)

# Cross-validated Lasso picks the penalty strength; fixed seed for
# reproducible fold splits.
lasso = LassoCV(cv=5, random_state=0).fit(X_scaled, y)

# Map the coefficients back to their feature names and show the survivors.
coef = pd.Series(lasso.coef_, index=X.columns[1:])
nonzero = coef[coef != 0]
print(nonzero.sort_values())
diesel_ppi -0.001713 soybean_price -0.000166 rdpi_per_capita -0.000043 UNRATE 0.000002 growth_cpi 0.001369 cpi_fed 0.001431 cpi_rdpi 0.008845 cpi_value 0.693240 dtype: float64
# Features retained by Lasso (nonzero coefficients). Deriving the list from
# `coef` keeps this cell in sync with the Lasso fit above — the previous
# hard-coded list could silently drift if the Lasso selection changed.
selected_features = coef[coef != 0].index.tolist()

# Subset the original (unscaled) X and re-add the intercept column so the
# OLS coefficients are interpretable on the raw feature scales.
X_lasso = sm.add_constant(X[selected_features])

# Refit OLS on the Lasso-selected features only.
model_lasso = sm.OLS(y, X_lasso).fit()

# Summary of the post-selection regression (note: p-values here do not
# account for the selection step).
print(model_lasso.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 2.019e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1597.2 No. Observations: 543 AIC: -3176. Df Residuals: 534 BIC: -3138. Df Model: 8 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0446 0.013 3.499 0.001 0.020 0.070 diesel_ppi -1.664e-05 1.23e-05 -1.358 0.175 -4.07e-05 7.43e-06 soybean_price -0.0003 0.000 -0.677 0.499 -0.001 0.001 rdpi_per_capita -1.26e-06 2.97e-07 -4.239 0.000 -1.84e-06 -6.76e-07 UNRATE 0.0002 0.000 0.590 0.556 -0.001 0.001 growth_cpi 10.6088 4.529 2.342 0.020 1.712 19.506 cpi_fed -0.0003 0.000 -1.589 0.113 -0.001 8e-05 cpi_rdpi 1.175e-06 2.68e-07 4.379 0.000 6.48e-07 1.7e-06 cpi_value 0.9509 0.015 64.341 0.000 0.922 0.980 ============================================================================== Omnibus: 1174.548 Durbin-Watson: 1.249 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2925313.857 Skew: 16.664 Prob(JB): 0.00 Kurtosis: 361.029 Cond. No. 6.22e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 6.22e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# VIF check on the Lasso-selected design matrix (constant included).
records = []
for col_idx, col_name in enumerate(X_lasso.columns):
    records.append({
        "feature": col_name,
        "VIF": variance_inflation_factor(X_lasso.values, col_idx),
    })
vif_data = pd.DataFrame(records)
print(vif_data)
feature VIF 0 const 532.584877 1 diesel_ppi 6.726621 2 soybean_price 4.836786 3 rdpi_per_capita 20.403635 4 UNRATE 1.574427 5 growth_cpi 6.073492 6 cpi_fed 3.036188 7 cpi_rdpi 374.649905 8 cpi_value 348.758535
# Horizontal bar chart of |coefficient| for the features Lasso kept,
# smallest at the bottom so the dominant driver stands out on top.
importance = coef[coef != 0].abs().sort_values()
ax = importance.plot(kind='barh', figsize=(8, 6))
ax.set_title("Feature Importance from Lasso Regression")
ax.set_xlabel("Absolute Coefficient Value")
plt.tight_layout()
plt.show()
# Manually reduced predictor set: keep the intercept plus the terms that
# were significant in the Lasso-selected model.
reduced_cols = ['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_rdpi', 'cpi_value']
X_reduced = X[reduced_cols]

# Refit OLS on the reduced design matrix and show the results.
model_reduced = sm.OLS(y, X_reduced).fit()
print(model_reduced.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 3.997e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:34 Log-Likelihood: 1592.4 No. Observations: 543 AIC: -3175. Df Residuals: 538 BIC: -3153. Df Model: 4 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0436 0.007 6.079 0.000 0.030 0.058 rdpi_per_capita -1.264e-06 1.64e-07 -7.703 0.000 -1.59e-06 -9.41e-07 growth_cpi 15.5789 3.847 4.049 0.000 8.022 23.136 cpi_rdpi 1.274e-06 2.04e-07 6.254 0.000 8.74e-07 1.67e-06 cpi_value 0.9404 0.011 83.276 0.000 0.918 0.963 ============================================================================== Omnibus: 1190.346 Durbin-Watson: 1.226 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3132994.257 Skew: 17.179 Prob(JB): 0.00 Kurtosis: 373.533 Cond. No. 5.26e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.26e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# VIFs for the reduced model — check whether collinearity persists
# after the manual cut-down.
vif_pairs = [(col_name, variance_inflation_factor(X_reduced.values, col_idx))
             for col_idx, col_name in enumerate(X_reduced.columns)]
vif_data = pd.DataFrame(vif_pairs, columns=["feature", "VIF"])
print(vif_data)
feature VIF 0 const 166.546852 1 rdpi_per_capita 6.147673 2 growth_cpi 4.338013 3 cpi_rdpi 213.766659 4 cpi_value 201.584149
# cpi_rdpi and cpi_value carry high VIFs together — quantify how strongly
# the interaction term tracks the raw CPI series.
corr_coef, p_value = pearsonr(X['cpi_rdpi'], X['cpi_value'])
print("Pearson correlation coefficient: " + format(corr_coef, ".4f"))
print("P-value: " + format(p_value, ".4e"))
Pearson correlation coefficient: 0.9883 P-value: 0.0000e+00
# Drop cpi_rdpi (r ≈ 0.99 with cpi_value, confirmed above) and refit
# with the remaining three predictors plus the intercept.
dropped_cols = ['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_value']
X_dropped = X[dropped_cols]

model_dropped = sm.OLS(y, X_dropped).fit()
print(model_dropped.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 4.977e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:34 Log-Likelihood: 1573.4 No. Observations: 543 AIC: -3139. Df Residuals: 539 BIC: -3122. Df Model: 3 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0052 0.004 1.361 0.174 -0.002 0.013 rdpi_per_capita -4.75e-07 1.09e-07 -4.374 0.000 -6.88e-07 -2.62e-07 growth_cpi -1.7079 2.769 -0.617 0.538 -7.147 3.731 cpi_value 1.0103 0.002 603.847 0.000 1.007 1.014 ============================================================================== Omnibus: 1253.952 Durbin-Watson: 1.136 Prob(Omnibus): 0.000 Jarque-Bera (JB): 4133395.965 Skew: 19.378 Prob(JB): 0.00 Kurtosis: 428.664 Cond. No. 1.79e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.79e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Recompute VIFs after dropping cpi_rdpi — every real predictor should
# now sit at a modest level.
vif_values = [variance_inflation_factor(X_dropped.values, col_idx)
              for col_idx in range(X_dropped.shape[1])]
vif_data = pd.DataFrame({"feature": X_dropped.columns, "VIF": vif_values})
print(vif_data)
feature VIF 0 const 44.697063 1 rdpi_per_capita 2.515052 2 growth_cpi 2.098828 3 cpi_value 4.132810
# Final parsimonious model: growth_cpi became insignificant once cpi_rdpi
# was removed, leaving only income per capita and CPI (plus the intercept).
final_cols = ['const', 'rdpi_per_capita', 'cpi_value']
X_final = X[final_cols]

model_final = sm.OLS(y, X_final).fit()
print(model_final.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 7.474e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:35 Log-Likelihood: 1573.2 No. Observations: 543 AIC: -3140. Df Residuals: 540 BIC: -3127. Df Model: 2 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0036 0.003 1.304 0.193 -0.002 0.009 rdpi_per_capita -4.502e-07 1.01e-07 -4.464 0.000 -6.48e-07 -2.52e-07 cpi_value 1.0096 0.001 832.728 0.000 1.007 1.012 ============================================================================== Omnibus: 1254.686 Durbin-Watson: 1.135 Prob(Omnibus): 0.000 Jarque-Bera (JB): 4144896.104 Skew: 19.405 Prob(JB): 0.00 Kurtosis: 429.255 Cond. No. 1.80e+05 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.8e+05. This might indicate that there are strong multicollinearity or other numerical problems.