In [1308]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [1309]:
pd.set_option('display.float_format', '{:.6f}'.format)
In [1310]:
temp = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Avg Temp.csv")
bf = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Bird Flu.csv")
corn = pd.read_html(r"C:\Users\alexp\Desktop\Data\Corn.xls", skiprows = 0)
cpi = pd.read_excel(r"C:\Users\alexp\Desktop\Data\CPI.xlsx", skiprows = 9)
diesel = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Diesel.csv")
gas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Gasoline.csv")
income = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Income.csv")
inflation = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Inflation.csv")
ngas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Natural Gas.csv")
pop = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Population.csv")
propane = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Propane.csv")
soybean = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Soybean.csv", skiprows = 15)
unemploy = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Unemployment.csv")
egg_price = pd.read_csv(r"C:\Users\alexp\Desktop\Egg Price.csv")
In [1311]:
temp.head()
Out[1311]:
year avg temp
0 1980 52.800000
1 1981 54.300000
2 1982 51.000000
3 1983 53.500000
4 1984 51.000000
In [1312]:
# Create a list to hold all the rows
monthly_data = []

# Iterate over the years in the temp DataFrame
for year in temp['year']:
    for month in range(1, 13):  # For each month from January (1) to December (12)
        # Add the date and the year's average temperature for the current month
        monthly_data.append({'date': f"{year}-{month:02d}-01",
                             'temp': temp[temp['year'] == year]['avg temp'].values[0]})

# Replace the original temp DataFrame with the new monthly data
temp = pd.DataFrame(monthly_data)

# Convert the 'date' column to datetime format
temp['date'] = pd.to_datetime(temp['date'])

temp.head()
Out[1312]:
date temp
0 1980-01-01 52.800000
1 1980-02-01 52.800000
2 1980-03-01 52.800000
3 1980-04-01 52.800000
4 1980-05-01 52.800000
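A vectorized sketch of the same yearly-to-monthly expansion, assuming `temp_yearly` (a hypothetical name) holds the original yearly frame before the overwrite above; it needs pandas >= 1.2 for the cross join.

# Cross-join each year with the 12 months, then assemble the date column
# (`temp_yearly` is assumed to hold the original 'year' and 'avg temp' columns)
months = pd.DataFrame({'month': range(1, 13)})
temp_alt = temp_yearly.merge(months, how='cross')
temp_alt['date'] = pd.to_datetime(temp_alt[['year', 'month']].assign(day=1))
temp_alt = temp_alt[['date', 'avg temp']].rename(columns={'avg temp': 'temp'})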
In [1313]:
bf.head()
Out[1313]:
Year Outbreak
0 1980 0
1 1981 0
2 1982 0
3 1983 2
4 1984 2
In [1314]:
# Outbreak severity codes:
#   0 → "None"
#   1 → "Moderate"
#   2 → "High"

# Create a list to hold the rows for the new dataframe
monthly_bf_data = []

# Iterate over the years in the original DataFrame (the 'Year' column)
for year in bf['Year']:
    for month in range(1, 13):  # For each month from January (1) to December (12)
        # Get the 'Outbreak' value for the current year
        outbreak_value = bf[bf['Year'] == year]['Outbreak'].values[0]
        
        # Add the date and outbreak value to the list
        monthly_bf_data.append({'date': f"{year}-{month:02d}-01", 'Outbreak': outbreak_value})

# Replace the original bf DataFrame with the new monthly data
bf = pd.DataFrame(monthly_bf_data)

# Convert the 'date' column to datetime format
bf['date'] = pd.to_datetime(bf['date'])

bf.head()
Out[1314]:
date Outbreak
0 1980-01-01 0
1 1980-02-01 0
2 1980-03-01 0
3 1980-04-01 0
4 1980-05-01 0
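If the coded severity levels are needed as text labels (e.g., for reporting), a simple map can translate them; this sketch keeps the result in a separate Series so the merges below are unchanged.

# Map the numeric codes to the labels documented above (illustrative only)
outbreak_labels = bf['Outbreak'].map({0: 'None', 1: 'Moderate', 2: 'High'})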
In [1315]:
corn = corn[0]

# Combine 'Year' with the month abbreviation in 'Frequency' into a 'Date' column, then keep the relevant columns
corn['Date'] = pd.to_datetime(corn['Year'].astype(str) + '-' + corn['Frequency'] + '-01', format='%Y-%b-%d')
corn = corn[['Date', 'Amount']]

# Rename columns for clarity
corn.rename(columns={"Date": "date", "Amount": "corn_price"}, inplace=True)

# Create new rows for Feb and Mar 2025 with NaN values for 'corn_price'
new_rows = pd.DataFrame({
    'date': [pd.Timestamp('2025-02-01'), pd.Timestamp('2025-03-01')],
    'corn_price': [np.nan, np.nan]
})

# Append the new rows to the DataFrame and sort by date
corn = pd.concat([corn, new_rows], ignore_index=True).sort_values('date').reset_index(drop=True)

# Perform linear interpolation once to fill missing 'corn_price' values
corn['corn_price'] = corn['corn_price'].interpolate(method='linear')

# Display the updated DataFrame
corn.tail()
Out[1315]:
date corn_price
538 2024-11-01 4.070000
539 2024-12-01 4.230000
540 2025-01-01 4.290000
541 2025-02-01 4.290000
542 2025-03-01 4.290000
In [1316]:
# Reshape the CPI data to long format
cpi = cpi.melt(id_vars='Year', var_name='Month', value_name='cpi_value')

# Create a proper date column
cpi['Date'] = pd.to_datetime(cpi['Year'].astype(str) + '-' + cpi['Month'] + '-01', format='%Y-%b-%d')

# Keep only the necessary columns and sort by date
cpi = cpi[['Date', 'cpi_value']].sort_values('Date').reset_index(drop=True)
cpi.rename(columns = {"Date": "date"}, inplace = True)

# Check for March 2025 and interpolate just that value
march_2025_index = cpi[cpi['date'] == '2025-03-01'].index

if not march_2025_index.empty:
    # Perform linear interpolation just for March 2025
    cpi.loc[march_2025_index, 'cpi_value'] = cpi['cpi_value'].interpolate(method='linear').loc[march_2025_index]

cpi.tail()
Out[1316]:
date cpi_value
547 2025-08-01 NaN
548 2025-09-01 NaN
549 2025-10-01 NaN
550 2025-11-01 NaN
551 2025-12-01 NaN
In [1317]:
diesel.rename(columns = {"observation_date": "date", "WPU057303": "diesel_ppi"}, inplace = True)
diesel = diesel[diesel['date'] >= '1980-01-01']

diesel.head()
Out[1317]:
date diesel_ppi
83 1980-01-01 74.600000
84 1980-02-01 80.100000
85 1980-03-01 84.500000
86 1980-04-01 86.500000
87 1980-05-01 87.300000
In [1318]:
gas.rename(columns = {"observation_date": "date", "WPS0571": "gas_ppi"}, inplace = True)
gas.head()
Out[1318]:
date gas_ppi
0 1973-03-01 17.500000
1 1973-04-01 17.800000
2 1973-05-01 18.000000
3 1973-06-01 18.000000
4 1973-07-01 17.800000
In [1319]:
income.rename(columns = {"observation_date" : "date", "A229RX0": "rdpi_per_capita"}, inplace = True)

income['date'] = pd.to_datetime(income['date'], errors='coerce')

# Add a row for March 2025 with NaN value for 'rdpi_per_capita'
new_row = pd.DataFrame({
    'date': [pd.Timestamp('2025-03-01')],
    'rdpi_per_capita': [np.nan]  # Placeholder; filled by the interpolation below
})

# Append the new row to the income DataFrame
income = pd.concat([income, new_row], ignore_index=True)

# Sort by date and reset the index
income.sort_values('date', inplace=True)
income.reset_index(drop=True, inplace=True)

# Interpolate missing values in 'rdpi_per_capita' (a trailing NaN takes the last observed value)
income['rdpi_per_capita'] = income['rdpi_per_capita'].interpolate(method='linear')

# Display the last few rows to verify the new row was added
income.tail()
Out[1319]:
date rdpi_per_capita
790 2024-11-01 51565.000000
791 2024-12-01 51581.000000
792 2025-01-01 51724.000000
793 2025-02-01 51981.000000
794 2025-03-01 51981.000000
In [1320]:
inflation.rename(columns = {"observation_date": "date", "FEDFUNDS": "fed_funds_rate"}, inplace = True)
inflation.head()
Out[1320]:
date fed_funds_rate
0 1954-07-01 0.800000
1 1954-08-01 1.220000
2 1954-09-01 1.070000
3 1954-10-01 0.850000
4 1954-11-01 0.830000
In [1321]:
ngas.rename(columns = {"observation_date" : "date", "WPU0531": "natural_gas_ppi"}, inplace = True)
ngas.head()
Out[1321]:
date natural_gas_ppi
0 1967-01-01 7.400000
1 1967-02-01 7.400000
2 1967-03-01 7.400000
3 1967-04-01 7.500000
4 1967-05-01 7.500000
In [1322]:
pop.columns
Out[1322]:
Index(['Year', 'Population', 'Growth Rate'], dtype='object')
In [1323]:
pop.tail()
Out[1323]:
Year Population Growth Rate
41 1984 232,766,280 1.03%
42 1983 230,389,964 1.05%
43 1982 228,001,418 1.04%
44 1981 225,654,008 1.13%
45 1980 223,140,018 1.21%
In [1324]:
# Strip the '%' sign from the 'Growth Rate' strings
pop['Growth Rate'] = pop['Growth Rate'].replace({'%': ''}, regex=True)

# Convert to numeric and express as a decimal fraction
pop['Growth Rate'] = pd.to_numeric(pop['Growth Rate'], errors='coerce') / 100

# Inspect the converted values (any failed conversions would appear as NaN)
print(pop['Growth Rate'])

# Create an empty list to store the monthly data
monthly_data = []

# Iterate over each row to generate monthly data
for _, row in pop.iterrows():
    year = row['Year']
    growth_rate = row['Growth Rate']
    
    for month in range(1, 13):  # Loop for each month in the year
        monthly_data.append({
            'Year': year,
            'Month': month,
            'Growth Rate': growth_rate / 12  # Simple annual/12 approximation of the monthly rate
        })

# Create the DataFrame from the monthly data
pop = pd.DataFrame(monthly_data)

# Create a Date column (yyyy-mm-dd format)
pop['Date'] = pd.to_datetime(pop[['Year', 'Month']].assign(DAY=1))

# Now the `pop` DataFrame has monthly data with the 'Date' column
pop.head()
0    0.005200
1    0.005300
2    0.005000
3    0.003800
4    0.003100
5    0.004900
6    0.006600
7    0.007100
8    0.007900
9    0.008000
10   0.008000
11   0.008300
12   0.008600
13   0.008800
14   0.008700
15   0.008700
16   0.009200
17   0.009700
18   0.010000
19   0.009800
20   0.009800
21   0.009700
22   0.009600
23   0.010100
24   0.010900
25   0.011500
26   0.012100
27   0.012600
28   0.012700
29   0.012500
30   0.012900
31   0.013500
32   0.014100
33   0.014400
34   0.014000
35   0.012800
36   0.011000
37   0.010200
38   0.009900
39   0.010100
40   0.010200
41   0.010300
42   0.010500
43   0.010400
44   0.011300
45   0.012100
Name: Growth Rate, dtype: float64
Out[1324]:
Year Month Growth Rate Date
0 2025 1 0.000433 2025-01-01
1 2025 2 0.000433 2025-02-01
2 2025 3 0.000433 2025-03-01
3 2025 4 0.000433 2025-04-01
4 2025 5 0.000433 2025-05-01
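Dividing the annual rate by 12 is a linear approximation. If compounding matters, a geometric monthly rate could be used instead; a minimal sketch, using the 1980 rate from the table above as an example value:

# Geometric (compounding) monthly rate implied by an annual growth rate
annual_rate = 0.0121                                   # 1.21% for 1980, as an example
monthly_geometric = (1 + annual_rate) ** (1 / 12) - 1  # roughly 0.0010 per month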
In [1325]:
# Remove the 'Year' and 'Month' columns from the 'pop' DataFrame
pop = pop.drop(columns=['Year', 'Month'])

# Display the updated 'pop' DataFrame
pop.head()
Out[1325]:
Growth Rate Date
0 0.000433 2025-01-01
1 0.000433 2025-02-01
2 0.000433 2025-03-01
3 0.000433 2025-04-01
4 0.000433 2025-05-01
In [1326]:
pop.rename(columns = {"Date": "date"}, inplace = True)

pop.head()
Out[1326]:
Growth Rate date
0 0.000433 2025-01-01
1 0.000433 2025-02-01
2 0.000433 2025-03-01
3 0.000433 2025-04-01
4 0.000433 2025-05-01
In [1327]:
propane.rename(columns = {"observation_date": "date", "WPU05320104": "propane_ppi"}, inplace = True)

propane['propane_ppi'] = propane['propane_ppi'].interpolate(method='linear')

propane = propane[propane['date'] >= '1980-01-01']

propane.head()
Out[1327]:
date propane_ppi
31 1980-01-01 95.100000
32 1980-02-01 99.600000
33 1980-03-01 101.400000
34 1980-04-01 100.800000
35 1980-05-01 101.800000
In [1328]:
soybean.rename(columns = {" value": "soybean_price"}, inplace = True)
soybean.head()
Out[1328]:
date soybean_price
0 1968-12-05 2.437500
1 1968-12-06 2.447500
2 1968-12-09 2.436300
3 1968-12-10 2.437500
4 1968-12-11 2.446300
In [1329]:
# Ensure the 'date' column is in datetime format
soybean['date'] = pd.to_datetime(soybean['date'])

# Create a complete date range from the min to max date
full_range = pd.date_range(start=soybean['date'].min(), end=soybean['date'].max(), freq='D')

# Manually add the missing date '1980-01-01' to the full range if it's not already present
if pd.Timestamp('1980-01-01') not in full_range:
    full_range = full_range.insert(0, pd.Timestamp('1980-01-01'))

# Reindex the DataFrame with the complete date range, ensuring the missing date is added
soybean = soybean.set_index('date').reindex(full_range)

# Interpolate missing values using linear interpolation
soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')

# Reset index to have 'date' as a column again
soybean = soybean.reset_index()

# Rename the columns to match the original
soybean.columns = ['date', 'soybean_price']

soybean.head()
Out[1329]:
date soybean_price
0 1968-12-05 2.437500
1 1968-12-06 2.447500
2 1968-12-07 2.443767
3 1968-12-08 2.440033
4 1968-12-09 2.436300
In [1330]:
# Set 'date' as the index
soybean = soybean.set_index('date')

# Reindex to include all dates in the full range, missing dates will have NaN prices
full_date_range = pd.date_range(start=soybean.index.min(), end=soybean.index.max(), freq='D')
soybean = soybean.reindex(full_date_range)

# Reset index and rename it back to 'date'
soybean = soybean.reset_index().rename(columns={'index': 'date'})

soybean.head()
Out[1330]:
date soybean_price
0 1968-12-05 2.437500
1 1968-12-06 2.447500
2 1968-12-07 2.443767
3 1968-12-08 2.440033
4 1968-12-09 2.436300
In [1331]:
soybean.rename(columns={' value': 'soybean_price'}, inplace = True)

soybean['date'] = pd.to_datetime(soybean['date'])

soybean = soybean[soybean['date'].dt.year >= 1980]

soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')

# Back-fill the first value if interpolation left it empty (use .loc to avoid chained assignment)
if pd.isna(soybean['soybean_price'].iloc[0]):
    soybean.loc[soybean.index[0], 'soybean_price'] = soybean['soybean_price'].iloc[1]

soybean.head()
Out[1331]:
date soybean_price
4044 1980-01-01 6.617750
4045 1980-01-02 6.635000
4046 1980-01-03 6.700000
4047 1980-01-04 6.655000
4048 1980-01-05 6.595000
In [1332]:
soybean.columns
Out[1332]:
Index(['date', 'soybean_price'], dtype='object')
In [1333]:
# Ensure the date is in datetime format
soybean['date'] = pd.to_datetime(soybean['date'])

# Create a new column for the year and month
soybean['year'] = soybean['date'].dt.year
soybean['month'] = soybean['date'].dt.month

# Group by year and month, then calculate the mean of soybean_price for each group
soybean = soybean.groupby(['year', 'month'])['soybean_price'].mean().reset_index()

# Optionally, create a datetime column for the month-year combination
soybean['month_year'] = pd.to_datetime(soybean[['year', 'month']].assign(day=1))

# Display the result
soybean.head()
Out[1333]:
year month soybean_price month_year
0 1980 1 6.605250 1980-01-01
1 1980 2 6.588698 1980-02-01
2 1980 3 6.310048 1980-03-01
3 1980 4 5.954567 1980-04-01
4 1980 5 6.250417 1980-05-01
In [1334]:
# Drop the 'year' and 'month' columns
soybean.drop(['year', 'month'], axis=1, inplace=True)

# Rename 'month_year' column to 'date'
soybean.rename(columns={'month_year': 'date'}, inplace=True)

soybean.head()
Out[1334]:
soybean_price date
0 6.605250 1980-01-01
1 6.588698 1980-02-01
2 6.310048 1980-03-01
3 5.954567 1980-04-01
4 6.250417 1980-05-01
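The daily-to-monthly aggregation above can also be written as a single resample, sketched here under the assumption that `soybean_daily` (a hypothetical name) holds the daily frame used above, before the groupby, with a datetime 'date' column.

# Resample to month-start frequency and take the monthly mean price
soybean_monthly = (soybean_daily.set_index('date')['soybean_price']
                                .resample('MS').mean()
                                .reset_index())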
In [1335]:
unemploy['observation_date'] = pd.to_datetime(unemploy['observation_date'])

# Keep only observations from 1980 onward
unemploy = unemploy[unemploy['observation_date'].dt.year >= 1980]

unemploy.rename(columns = {"observation_date": "date"}, inplace = True)

# Preview the filtered data
unemploy.head()
Out[1335]:
date UNRATE
384 1980-01-01 6.300000
385 1980-02-01 6.300000
386 1980-03-01 6.300000
387 1980-04-01 6.900000
388 1980-05-01 7.500000
In [1336]:
egg_price.rename(columns = {"observation_date": "date", "APU0000708111": "egg_price"}, inplace = True)

egg_price.head()
Out[1336]:
date egg_price
0 1980-01-01 0.879000
1 1980-02-01 0.774000
2 1980-03-01 0.812000
3 1980-04-01 0.797000
4 1980-05-01 0.737000
In [1337]:
egg_price.isnull().sum()
Out[1337]:
date         0
egg_price    0
dtype: int64
In [1338]:
egg_price.columns
Out[1338]:
Index(['date', 'egg_price'], dtype='object')
In [1339]:
# Ensure all date columns are of type datetime64[ns]
egg_price['date'] = pd.to_datetime(egg_price['date'], errors = 'coerce')
unemploy['date'] = pd.to_datetime(unemploy['date'], errors = 'coerce')
soybean['date'] = pd.to_datetime(soybean['date'], errors = 'coerce')
propane['date'] = pd.to_datetime(propane['date'], errors = 'coerce')
temp['date'] = pd.to_datetime(temp['date'], errors = 'coerce')
bf['date'] = pd.to_datetime(bf['date'], errors = 'coerce')
corn['date'] = pd.to_datetime(corn['date'], errors = 'coerce')
cpi['date'] = pd.to_datetime(cpi['date'], errors = 'coerce')
diesel['date'] = pd.to_datetime(diesel['date'], errors = 'coerce')
gas['date'] = pd.to_datetime(gas['date'], errors = 'coerce')
income['date'] = pd.to_datetime(income['date'], errors = 'coerce')
inflation['date'] = pd.to_datetime(inflation['date'], errors = 'coerce')
ngas['date'] = pd.to_datetime(ngas['date'], errors = 'coerce')
pop['date'] = pd.to_datetime(pop['date'], errors = 'coerce')

# Now proceed with the merge
merged_data = pd.merge(egg_price, unemploy, on='date', how='outer')
merged_data = pd.merge(merged_data, soybean, on='date', how='outer')
merged_data = pd.merge(merged_data, propane, on='date', how='outer')
merged_data = pd.merge(merged_data, temp, on='date', how='outer')
merged_data = pd.merge(merged_data, bf, on='date', how='outer')
merged_data = pd.merge(merged_data, corn, on='date', how='outer')
merged_data = pd.merge(merged_data, cpi, on='date', how='outer')
merged_data = pd.merge(merged_data, diesel, on='date', how='outer')
merged_data = pd.merge(merged_data, gas, on='date', how='outer')
merged_data = pd.merge(merged_data, income, on='date', how='outer')
merged_data = pd.merge(merged_data, inflation, on='date', how='outer')
merged_data = pd.merge(merged_data, ngas, on='date', how='outer')
merged_data = pd.merge(merged_data, pop, on='date', how='outer')

# Display the merged dataframe
merged_data.head()
Out[1339]:
date egg_price UNRATE soybean_price propane_ppi temp Outbreak corn_price cpi_value diesel_ppi gas_ppi rdpi_per_capita fed_funds_rate natural_gas_ppi Growth Rate
0 1980-01-01 0.879000 6.300000 6.605250 95.100000 52.800000 0.000000 2.450000 0.879000 74.600000 79.500000 23036.000000 13.820000 55.100000 0.001008
1 1980-02-01 0.774000 6.300000 6.588698 99.600000 52.800000 0.000000 2.390000 0.774000 80.100000 84.900000 22894.000000 14.130000 58.300000 0.001008
2 1980-03-01 0.812000 6.300000 6.310048 101.400000 52.800000 0.000000 2.400000 0.812000 84.500000 90.500000 22731.000000 17.190000 58.200000 0.001008
3 1980-04-01 0.797000 6.900000 5.954567 100.800000 52.800000 0.000000 2.360000 0.797000 86.500000 96.200000 22687.000000 17.610000 59.700000 0.001008
4 1980-05-01 0.737000 7.500000 6.250417 101.800000 52.800000 0.000000 2.420000 0.737000 87.300000 96.300000 22580.000000 10.980000 61.000000 0.001008
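The chain of pairwise merges could be collapsed with functools.reduce; a sketch assuming the same per-series frames defined above, with the result stored under a new name so nothing above changes.

from functools import reduce

frames = [egg_price, unemploy, soybean, propane, temp, bf, corn, cpi,
          diesel, gas, income, inflation, ngas, pop]
# Outer-merge every frame on 'date', left to right
merged_alt = reduce(lambda left, right: pd.merge(left, right, on='date', how='outer'), frames)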
In [1340]:
merged_data = merged_data.dropna(subset=['egg_price'])
In [1341]:
# Check for missing values in the dataframe
missing_values = merged_data.isnull().sum()

# Display the columns with missing values
print(missing_values[missing_values > 0])
Series([], dtype: int64)
In [1342]:
missing_egg_dates = merged_data[merged_data['egg_price'].isnull()]['date']
print(missing_egg_dates)
Series([], Name: date, dtype: datetime64[ns])
In [1343]:
missing_income = merged_data[merged_data['rdpi_per_capita'].isnull()]['date']
print(missing_income)
Series([], Name: date, dtype: datetime64[ns])
In [1344]:
missing_unrate = merged_data[merged_data['UNRATE'].isnull()]['date']
print(missing_unrate)
Series([], Name: date, dtype: datetime64[ns])
In [1345]:
# Define dependent and independent variables
X = merged_data.drop(columns=['date', 'egg_price'])
y = merged_data['egg_price']

# Add intercept
X = sm.add_constant(X)

# Combine X and y for easier cleaning
combined = pd.concat([X, y], axis=1)

# Replace inf/-inf with NaN and drop rows with missing values
combined = combined.replace([np.inf, -np.inf], np.nan).dropna()

# Separate cleaned X and y
X_clean = combined.drop(columns=['egg_price'])
y_clean = combined['egg_price']

# Fit the model
model = sm.OLS(y_clean, X_clean).fit()

# Show regression results
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.196e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:32   Log-Likelihood:                 1589.4
No. Observations:                 543   AIC:                            -3151.
Df Residuals:                     529   BIC:                            -3091.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0003      0.030      0.012      0.991      -0.059       0.060
UNRATE           -8.17e-06      0.000     -0.018      0.986      -0.001       0.001
soybean_price      -0.0008      0.001     -1.228      0.220      -0.002       0.000
propane_ppi      3.507e-05   1.62e-05      2.162      0.031     3.2e-06    6.69e-05
temp             6.603e-05      0.001      0.131      0.896      -0.001       0.001
Outbreak           -0.0004      0.001     -0.296      0.767      -0.003       0.002
corn_price          0.0010      0.001      0.749      0.454      -0.002       0.004
cpi_value           1.0134      0.001    709.571      0.000       1.011       1.016
diesel_ppi      -3.029e-05   2.35e-05     -1.288      0.198   -7.65e-05    1.59e-05
gas_ppi         -5.099e-05   3.87e-05     -1.317      0.189      -0.000    2.51e-05
rdpi_per_capita -1.846e-07   2.35e-07     -0.784      0.433   -6.47e-07    2.78e-07
fed_funds_rate      0.0001      0.000      0.398      0.691      -0.000       0.001
natural_gas_ppi  -1.08e-05   1.35e-05     -0.798      0.425   -3.74e-05    1.58e-05
Growth Rate        -5.8753      6.100     -0.963      0.336     -17.859       6.108
==============================================================================
Omnibus:                     1202.076   Durbin-Watson:                   1.210
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          3269004.906
Skew:                          17.576   Prob(JB):                         0.00
Kurtosis:                     381.485   Cond. No.                     4.03e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.03e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
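The Durbin-Watson statistic (about 1.2) suggests positive serial correlation, so the nonrobust standard errors above are likely optimistic. One option is to refit with HAC (Newey-West) errors; the 12-month lag length here is an assumption, not a rule.

# Refit the same specification with heteroskedasticity- and autocorrelation-consistent errors
model_hac = sm.OLS(y_clean, X_clean).fit(cov_type='HAC', cov_kwds={'maxlags': 12})
print(model_hac.summary())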
In [1346]:
# Add constant to the model for the intercept
X = add_constant(merged_data[['UNRATE', 'soybean_price', 'propane_ppi', 'temp', 'Outbreak', 
                               'corn_price', 'cpi_value', 'diesel_ppi', 'gas_ppi', 
                               'rdpi_per_capita', 'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']])

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the VIFs
print(vif_data)
           Variable         VIF
0             const 2863.185734
1            UNRATE    2.208576
2     soybean_price   11.607555
3       propane_ppi    9.639871
4              temp    2.086108
5          Outbreak    2.592928
6        corn_price   11.508110
7         cpi_value    3.135601
8        diesel_ppi   23.830582
9           gas_ppi   32.298061
10  rdpi_per_capita   12.317555
11   fed_funds_rate    3.880146
12  natural_gas_ppi    3.490783
13      Growth Rate    5.788445
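The VIF computation recurs several times below; a small helper (the name is illustrative) avoids repeating the list comprehension each time.

def compute_vif(design_matrix):
    """Return a DataFrame with the VIF of each column in an exog matrix."""
    return pd.DataFrame({
        "Variable": design_matrix.columns,
        "VIF": [variance_inflation_factor(design_matrix.values, i)
                for i in range(design_matrix.shape[1])]
    })

# Usage: print(compute_vif(X))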
In [1347]:
# Define dependent and independent variables
X = merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi'])
y = merged_data['egg_price']

# Add intercept
X = sm.add_constant(X)

# Combine X and y for easier cleaning
combined = pd.concat([X, y], axis=1)

# Replace inf/-inf with NaN and drop rows with missing values
combined = combined.replace([np.inf, -np.inf], np.nan).dropna()

# Separate cleaned X and y
X_clean = combined.drop(columns=['egg_price'])
y_clean = combined['egg_price']

# Fit the model
model = sm.OLS(y_clean, X_clean).fit()

# Show regression results
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.390e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:32   Log-Likelihood:                 1583.9
No. Observations:                 543   AIC:                            -3144.
Df Residuals:                     531   BIC:                            -3092.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0073      0.030      0.244      0.807      -0.052       0.066
UNRATE              0.0002      0.000      0.438      0.661      -0.001       0.001
soybean_price      -0.0009      0.001     -1.520      0.129      -0.002       0.000
propane_ppi      7.386e-06   1.21e-05      0.609      0.543   -1.65e-05    3.12e-05
temp            -5.213e-05      0.000     -0.105      0.917      -0.001       0.001
Outbreak            0.0003      0.001      0.317      0.751      -0.002       0.003
corn_price         -0.0010      0.001     -0.772      0.440      -0.003       0.001
cpi_value           1.0121      0.001    733.189      0.000       1.009       1.015
rdpi_per_capita  -2.94e-07   2.35e-07     -1.251      0.212   -7.56e-07    1.68e-07
fed_funds_rate  -8.104e-05      0.000     -0.292      0.770      -0.001       0.000
natural_gas_ppi -1.247e-05   1.31e-05     -0.952      0.342   -3.82e-05    1.33e-05
Growth Rate         0.3498      5.847      0.060      0.952     -11.137      11.836
==============================================================================
Omnibus:                     1219.694   Durbin-Watson:                   1.188
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          3542783.390
Skew:                          18.172   Prob(JB):                         0.00
Kurtosis:                     397.038   Cond. No.                     3.83e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.83e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1348]:
# Select only the independent variables (excluding 'egg_price')
independent_vars = merged_data.drop(columns=['egg_price'])

# Compute correlation matrix
corr_matrix = independent_vars.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Independent Variables')
plt.show()
In [1349]:
# Define dependent and independent variables
X = merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi', 'soybean_price', 'propane_ppi', 'rdpi_per_capita'])
y = merged_data['egg_price']

# Add intercept
X = sm.add_constant(X)

# Combine X and y for easier cleaning
combined = pd.concat([X, y], axis=1)

# Replace inf/-inf with NaN and drop rows with missing values
combined = combined.replace([np.inf, -np.inf], np.nan).dropna()

# Separate cleaned X and y
X_clean = combined.drop(columns=['egg_price'])
y_clean = combined['egg_price']

# Fit the model
model = sm.OLS(y_clean, X_clean).fit()

# Show regression results
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.909e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:33   Log-Likelihood:                 1582.0
No. Observations:                 543   AIC:                            -3146.
Df Residuals:                     534   BIC:                            -3107.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.0025      0.029     -0.088      0.930      -0.059       0.054
UNRATE              0.0005      0.000      1.287      0.199      -0.000       0.001
temp               -0.0002      0.000     -0.491      0.624      -0.001       0.001
Outbreak            0.0003      0.001      0.269      0.788      -0.002       0.002
corn_price         -0.0025      0.001     -3.989      0.000      -0.004      -0.001
cpi_value           1.0114      0.001    787.491      0.000       1.009       1.014
fed_funds_rate      0.0002      0.000      1.143      0.253      -0.000       0.001
natural_gas_ppi -7.455e-06   8.31e-06     -0.898      0.370   -2.38e-05    8.86e-06
Growth Rate         6.4289      4.471      1.438      0.151      -2.355      15.213
==============================================================================
Omnibus:                     1225.887   Durbin-Watson:                   1.180
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          3641899.526
Skew:                          18.386   Prob(JB):                         0.00
Kurtosis:                     402.519   Cond. No.                     1.27e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.27e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1350]:
# List of predictors used in the reduced regression above
predictors = ['UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value',
              'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']

# Create X matrix with constant
X = sm.add_constant(merged_data[predictors])

# Calculate VIF
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)
          Variable         VIF
0            const 2552.008626
1           UNRATE    1.382154
2             temp    1.916131
3         Outbreak    1.834292
4       corn_price    2.312824
5        cpi_value    2.490735
6   fed_funds_rate    1.574224
7  natural_gas_ppi    1.290526
8      Growth Rate    3.054922
In [1351]:
# Create interaction terms in merged_data
merged_data['corn_outbreak'] = merged_data['corn_price'] * merged_data['Outbreak']
merged_data['soy_outbreak'] = merged_data['soybean_price'] * merged_data['Outbreak']
merged_data['corn_soy'] = merged_data['corn_price'] * merged_data['soybean_price']

merged_data['cpi_rdpi'] = merged_data['cpi_value'] * merged_data['rdpi_per_capita']
merged_data['cpi_fed'] = merged_data['cpi_value'] * merged_data['fed_funds_rate']
merged_data['rdpi_fed'] = merged_data['rdpi_per_capita'] * merged_data['fed_funds_rate']
merged_data['unrate_rdpi'] = merged_data['UNRATE'] * merged_data['rdpi_per_capita']

merged_data['diesel_corn'] = merged_data['diesel_ppi'] * merged_data['corn_price']
merged_data['propane_soy'] = merged_data['propane_ppi'] * merged_data['soybean_price']
merged_data['gas_temp'] = merged_data['gas_ppi'] * merged_data['temp']

merged_data['temp_outbreak'] = merged_data['temp'] * merged_data['Outbreak']
merged_data['temp_corn'] = merged_data['temp'] * merged_data['corn_price']
merged_data['temp_natural_gas'] = merged_data['temp'] * merged_data['natural_gas_ppi']

merged_data['growth_fed'] = merged_data['Growth Rate'] * merged_data['fed_funds_rate']
merged_data['growth_cpi'] = merged_data['Growth Rate'] * merged_data['cpi_value']
merged_data['growth_corn'] = merged_data['Growth Rate'] * merged_data['corn_price']
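The same sixteen interaction terms could be generated from a list of column pairs, which is easier to audit and extend; this sketch builds them in a separate frame with illustrative "_x_" names rather than the hand-written names above.

interaction_pairs = [
    ('corn_price', 'Outbreak'), ('soybean_price', 'Outbreak'),
    ('corn_price', 'soybean_price'), ('cpi_value', 'rdpi_per_capita'),
    ('cpi_value', 'fed_funds_rate'), ('rdpi_per_capita', 'fed_funds_rate'),
    ('UNRATE', 'rdpi_per_capita'), ('diesel_ppi', 'corn_price'),
    ('propane_ppi', 'soybean_price'), ('gas_ppi', 'temp'),
    ('temp', 'Outbreak'), ('temp', 'corn_price'), ('temp', 'natural_gas_ppi'),
    ('Growth Rate', 'fed_funds_rate'), ('Growth Rate', 'cpi_value'),
    ('Growth Rate', 'corn_price'),
]
# Build each product column; names like 'corn_price_x_Outbreak' are illustrative
interactions = pd.DataFrame({
    f"{a}_x_{b}": merged_data[a] * merged_data[b] for a, b in interaction_pairs
})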
In [1352]:
# Define predictors including interaction terms
predictors = [
    'UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value', 'fed_funds_rate',
    'natural_gas_ppi', 'Growth Rate', 'soybean_price', 'propane_ppi', 'diesel_ppi',
    'gas_ppi', 'rdpi_per_capita',

    # Interaction terms
    'corn_outbreak', 'soy_outbreak', 'corn_soy',
    'cpi_rdpi', 'cpi_fed', 'rdpi_fed', 'unrate_rdpi',
    'diesel_corn', 'propane_soy', 'gas_temp',
    'temp_outbreak', 'temp_corn', 'temp_natural_gas',
    'growth_fed', 'growth_cpi', 'growth_corn'
]

X = merged_data[predictors]
y = merged_data['egg_price']

# Add constant for intercept
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Print regression results
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 5.794e+04
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:33   Log-Likelihood:                 1618.8
No. Observations:                 543   AIC:                            -3178.
Df Residuals:                     513   BIC:                            -3049.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.3017      0.105      2.874      0.004       0.095       0.508
UNRATE              -0.0014      0.003     -0.557      0.578      -0.006       0.004
temp                -0.0029      0.002     -1.700      0.090      -0.006       0.000
Outbreak             0.0440      0.041      1.063      0.288      -0.037       0.125
corn_price          -0.0483      0.041     -1.165      0.245      -0.130       0.033
cpi_value            0.8811      0.035     24.993      0.000       0.812       0.950
fed_funds_rate      -0.0036      0.004     -0.887      0.376      -0.012       0.004
natural_gas_ppi     -0.0009      0.000     -1.767      0.078      -0.002    9.65e-05
Growth Rate        -41.6700     31.034     -1.343      0.180    -102.638      19.298
soybean_price       -0.0001      0.001     -0.097      0.922      -0.003       0.002
propane_ppi       4.289e-05   4.18e-05      1.027      0.305   -3.91e-05       0.000
diesel_ppi       -1.116e-06   6.61e-05     -0.017      0.987      -0.000       0.000
gas_ppi              0.0010      0.001      1.281      0.201      -0.001       0.002
rdpi_per_capita  -2.651e-06   8.17e-07     -3.246      0.001   -4.26e-06   -1.05e-06
corn_outbreak       -0.0024      0.002     -1.012      0.312      -0.007       0.002
soy_outbreak     -3.148e-05      0.001     -0.036      0.972      -0.002       0.002
corn_soy          8.998e-05      0.000      0.287      0.774      -0.001       0.001
cpi_rdpi          2.336e-06   5.23e-07      4.466      0.000    1.31e-06    3.36e-06
cpi_fed              0.0012      0.001      1.328      0.185      -0.001       0.003
rdpi_fed         -6.819e-08   6.55e-08     -1.040      0.299   -1.97e-07    6.06e-08
unrate_rdpi       4.093e-09   5.73e-08      0.071      0.943   -1.08e-07    1.17e-07
diesel_corn      -1.542e-06   1.03e-05     -0.149      0.882   -2.19e-05    1.88e-05
propane_soy      -1.613e-06   3.12e-06     -0.516      0.606   -7.75e-06    4.52e-06
gas_temp         -1.867e-05   1.37e-05     -1.360      0.175   -4.56e-05    8.31e-06
temp_outbreak       -0.0007      0.001     -0.830      0.407      -0.002       0.001
temp_corn            0.0009      0.001      1.227      0.220      -0.001       0.002
temp_natural_gas  1.635e-05    9.1e-06      1.797      0.073   -1.52e-06    3.42e-05
growth_fed           4.0072      2.876      1.393      0.164      -1.642       9.657
growth_cpi          32.5163     20.088      1.619      0.106      -6.948      71.981
growth_corn         -1.5425      7.478     -0.206      0.837     -16.233      13.148
==============================================================================
Omnibus:                     1098.871   Durbin-Watson:                   1.369
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2165590.659
Skew:                          14.336   Prob(JB):                         0.00
Kurtosis:                     311.050   Cond. No.                     1.64e+10
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+10. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1353]:
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)
             feature          VIF
0              const 37544.064509
1             UNRATE    70.704086
2               temp    26.467847
3           Outbreak  3221.050777
4         corn_price 11470.599064
5          cpi_value  2064.639688
6     fed_funds_rate   875.694055
7    natural_gas_ppi  4917.387954
8        Growth Rate   161.915984
9      soybean_price    56.905525
10       propane_ppi    68.990165
11        diesel_ppi   203.455282
12           gas_ppi 12846.196126
13   rdpi_per_capita   160.089083
14     corn_outbreak   234.162282
15      soy_outbreak   197.008944
16          corn_soy   231.413475
17          cpi_rdpi  1480.992322
18           cpi_fed    53.411866
19          rdpi_fed   149.877657
20       unrate_rdpi    59.589198
21       diesel_corn   221.607819
22       propane_soy    75.572238
23          gas_temp 13357.918509
24     temp_outbreak  3741.915213
25         temp_corn 12358.043858
26  temp_natural_gas  4858.189758
27        growth_fed   402.504304
28        growth_cpi   124.304248
29       growth_corn   128.244498
In [1354]:
# Scale data for Lasso
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.drop(columns=['const']))

lasso = LassoCV(cv=5, random_state=0).fit(X_scaled, y)
coef = pd.Series(lasso.coef_, index=X.columns[1:])
print(coef[coef != 0].sort_values())
diesel_ppi        -0.001713
soybean_price     -0.000166
rdpi_per_capita   -0.000043
UNRATE             0.000002
growth_cpi         0.001369
cpi_fed            0.001431
cpi_rdpi           0.008845
cpi_value          0.693240
dtype: float64
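A quick check on what cross-validation actually selected can help interpret the surviving coefficients; `alpha_` is the chosen penalty and `mse_path_` holds the per-fold CV errors for each candidate alpha.

# Inspect the penalty chosen by LassoCV and its cross-validated error
print("Selected alpha:", lasso.alpha_)
print("Mean CV MSE at selected alpha:",
      lasso.mse_path_[lasso.alphas_ == lasso.alpha_].mean())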
In [1355]:
# Selected features from Lasso
selected_features = ['diesel_ppi', 'soybean_price', 'rdpi_per_capita', 'UNRATE',
                     'growth_cpi', 'cpi_fed', 'cpi_rdpi', 'cpi_value']

# Subset the original X (with constant)
X_lasso = X[selected_features]
X_lasso = sm.add_constant(X_lasso)

# Fit OLS model
model_lasso = sm.OLS(y, X_lasso).fit()

# Summary of the regression
print(model_lasso.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.019e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:33   Log-Likelihood:                 1597.2
No. Observations:                 543   AIC:                            -3176.
Df Residuals:                     534   BIC:                            -3138.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0446      0.013      3.499      0.001       0.020       0.070
diesel_ppi      -1.664e-05   1.23e-05     -1.358      0.175   -4.07e-05    7.43e-06
soybean_price      -0.0003      0.000     -0.677      0.499      -0.001       0.001
rdpi_per_capita  -1.26e-06   2.97e-07     -4.239      0.000   -1.84e-06   -6.76e-07
UNRATE              0.0002      0.000      0.590      0.556      -0.001       0.001
growth_cpi         10.6088      4.529      2.342      0.020       1.712      19.506
cpi_fed            -0.0003      0.000     -1.589      0.113      -0.001       8e-05
cpi_rdpi         1.175e-06   2.68e-07      4.379      0.000    6.48e-07     1.7e-06
cpi_value           0.9509      0.015     64.341      0.000       0.922       0.980
==============================================================================
Omnibus:                     1174.548   Durbin-Watson:                   1.249
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2925313.857
Skew:                          16.664   Prob(JB):                         0.00
Kurtosis:                     361.029   Cond. No.                     6.22e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.22e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1356]:
# VIF DataFrame
vif_data = pd.DataFrame()
vif_data["feature"] = X_lasso.columns
vif_data["VIF"] = [variance_inflation_factor(X_lasso.values, i)
                   for i in range(X_lasso.shape[1])]

print(vif_data)
           feature        VIF
0            const 532.584877
1       diesel_ppi   6.726621
2    soybean_price   4.836786
3  rdpi_per_capita  20.403635
4           UNRATE   1.574427
5       growth_cpi   6.073492
6          cpi_fed   3.036188
7         cpi_rdpi 374.649905
8        cpi_value 348.758535
In [1357]:
coef[coef != 0].abs().sort_values().plot(kind='barh', figsize=(8, 6))
plt.title("Feature Importance from Lasso Regression")
plt.xlabel("Absolute Coefficient Value")
plt.tight_layout()
plt.show()
In [1358]:
# Define reduced set of features
X_reduced = X[['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_rdpi', 'cpi_value']]

# Refit OLS model
model_reduced = sm.OLS(y, X_reduced).fit()

# Display summary
print(model_reduced.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.997e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:34   Log-Likelihood:                 1592.4
No. Observations:                 543   AIC:                            -3175.
Df Residuals:                     538   BIC:                            -3153.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0436      0.007      6.079      0.000       0.030       0.058
rdpi_per_capita -1.264e-06   1.64e-07     -7.703      0.000   -1.59e-06   -9.41e-07
growth_cpi         15.5789      3.847      4.049      0.000       8.022      23.136
cpi_rdpi         1.274e-06   2.04e-07      6.254      0.000    8.74e-07    1.67e-06
cpi_value           0.9404      0.011     83.276      0.000       0.918       0.963
==============================================================================
Omnibus:                     1190.346   Durbin-Watson:                   1.226
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          3132994.257
Skew:                          17.179   Prob(JB):                         0.00
Kurtosis:                     373.533   Cond. No.                     5.26e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.26e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1359]:
# Compute VIF for each feature in the reduced model
vif_data = pd.DataFrame()
vif_data["feature"] = X_reduced.columns
vif_data["VIF"] = [variance_inflation_factor(X_reduced.values, i) for i in range(X_reduced.shape[1])]

print(vif_data)
           feature        VIF
0            const 166.546852
1  rdpi_per_capita   6.147673
2       growth_cpi   4.338013
3         cpi_rdpi 213.766659
4        cpi_value 201.584149
In [1360]:
# Calculate correlation and p-value
corr_coef, p_value = pearsonr(X['cpi_rdpi'], X['cpi_value'])

print(f"Pearson correlation coefficient: {corr_coef:.4f}")
print(f"P-value: {p_value:.4e}")
Pearson correlation coefficient: 0.9883
P-value: 0.0000e+00
In [1361]:
# Define new reduced set of features
X_dropped = X[['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_value']]

# Refit OLS model
model_dropped = sm.OLS(y, X_dropped).fit()

# Display summary
print(model_dropped.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.977e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:34   Log-Likelihood:                 1573.4
No. Observations:                 543   AIC:                            -3139.
Df Residuals:                     539   BIC:                            -3122.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0052      0.004      1.361      0.174      -0.002       0.013
rdpi_per_capita  -4.75e-07   1.09e-07     -4.374      0.000   -6.88e-07   -2.62e-07
growth_cpi         -1.7079      2.769     -0.617      0.538      -7.147       3.731
cpi_value           1.0103      0.002    603.847      0.000       1.007       1.014
==============================================================================
Omnibus:                     1253.952   Durbin-Watson:                   1.136
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4133395.965
Skew:                          19.378   Prob(JB):                         0.00
Kurtosis:                     428.664   Cond. No.                     1.79e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.79e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [1362]:
# Calculate VIFs

vif_data = pd.DataFrame()
vif_data["feature"] = X_dropped.columns
vif_data["VIF"] = [variance_inflation_factor(X_dropped.values, i) for i in range(X_dropped.shape[1])]

print(vif_data)
           feature       VIF
0            const 44.697063
1  rdpi_per_capita  2.515052
2       growth_cpi  2.098828
3        cpi_value  4.132810
In [1363]:
# Define final set of predictors (dropping growth_cpi)
X_final = X[['const', 'rdpi_per_capita', 'cpi_value']]

# Refit OLS model
model_final = sm.OLS(y, X_final).fit()

# Show summary
print(model_final.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              egg_price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.474e+05
Date:                Thu, 24 Apr 2025   Prob (F-statistic):               0.00
Time:                        12:54:35   Log-Likelihood:                 1573.2
No. Observations:                 543   AIC:                            -3140.
Df Residuals:                     540   BIC:                            -3127.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0036      0.003      1.304      0.193      -0.002       0.009
rdpi_per_capita -4.502e-07   1.01e-07     -4.464      0.000   -6.48e-07   -2.52e-07
cpi_value           1.0096      0.001    832.728      0.000       1.007       1.012
==============================================================================
Omnibus:                     1254.686   Durbin-Watson:                   1.135
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4144896.104
Skew:                          19.405   Prob(JB):                         0.00
Kurtosis:                     429.255   Cond. No.                     1.80e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.8e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
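A minimal residual check for the final model, sketched with the fitted `model_final` and the dates in `merged_data`; the low Durbin-Watson value above already hints at the serial correlation such a plot would make visible.

# Plot residuals over time to eyeball autocorrelation and outliers
resid = model_final.resid
plt.figure(figsize=(10, 4))
plt.plot(merged_data['date'], resid)
plt.axhline(0, color='black', linewidth=0.8)
plt.title('Residuals of the Final Model Over Time')
plt.xlabel('date')
plt.ylabel('residual')
plt.tight_layout()
plt.show()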