import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Display floats with six decimal places for consistent notebook output.
pd.set_option('display.float_format', '{:.6f}'.format)
# Load the raw datasets (paths are local to the author's machine).
temp = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Avg Temp.csv")
bf = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Bird Flu.csv")
# NOTE(review): read_html on an .xls file — presumably the file is an HTML
# table saved with an .xls extension; confirm. read_html returns a LIST of
# tables (the first is selected further below).
corn = pd.read_html(r"C:\Users\alexp\Desktop\Data\Corn.xls", skiprows = 0)
cpi = pd.read_excel(r"C:\Users\alexp\Desktop\Data\CPI.xlsx", skiprows = 9)
diesel = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Diesel.csv")
gas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Gasoline.csv")
income = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Income.csv")
inflation = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Inflation.csv")
ngas = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Natural Gas.csv")
pop = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Population.csv")
propane = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Propane.csv")
soybean = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Soybean.csv", skiprows = 15)
unemploy = pd.read_csv(r"C:\Users\alexp\Desktop\Data\Unemployment.csv")
# NOTE(review): this path points at Desktop, not the Data folder — confirm intended.
egg_price = pd.read_csv(r"C:\Users\alexp\Desktop\Egg Price.csv")
# Preview the annual average-temperature data.
temp.head()
year | avg temp | |
---|---|---|
0 | 1980 | 52.800000 |
1 | 1981 | 54.300000 |
2 | 1982 | 51.000000 |
3 | 1983 | 53.500000 |
4 | 1984 | 51.000000 |
# Expand the annual average-temperature table to monthly frequency by
# repeating each year's value across all 12 months.
# Fix: the original re-filtered the whole DataFrame (temp[temp['year'] == year])
# inside the month loop — 12 O(n) scans per year. Iterating rows reads each
# year's value exactly once.
monthly_data = []
for _, row in temp.iterrows():
    year = int(row['year'])
    avg_temp = row['avg temp']  # hoisted: one lookup per year, not per month
    for month in range(1, 13):  # January (1) through December (12)
        monthly_data.append({'date': f"{year}-{month:02d}-01", 'temp': avg_temp})
# Replace the original annual temp DataFrame with the monthly version.
temp = pd.DataFrame(monthly_data)
# Convert the 'date' column to datetime format.
temp['date'] = pd.to_datetime(temp['date'])
temp.head()
date | temp | |
---|---|---|
0 | 1980-01-01 | 52.800000 |
1 | 1980-02-01 | 52.800000 |
2 | 1980-03-01 | 52.800000 |
3 | 1980-04-01 | 52.800000 |
4 | 1980-05-01 | 52.800000 |
# Preview the annual bird-flu outbreak data.
bf.head()
Year | Outbreak | |
---|---|---|
0 | 1980 | 0 |
1 | 1981 | 0 |
2 | 1982 | 0 |
3 | 1983 | 2 |
4 | 1984 | 2 |
# Outbreak severity encoding: 0 → "None", 1 → "Moderate", 2 → "High"
# Expand the annual outbreak table to monthly frequency, repeating each
# year's severity for all 12 months.
# Fix: the original looked up bf[bf['Year'] == year]['Outbreak'].values[0]
# inside the month loop — 12 full-frame scans per year. Row iteration reads
# the value once per year.
monthly_bf_data = []
for _, row in bf.iterrows():
    year = int(row['Year'])
    outbreak_value = row['Outbreak']  # hoisted out of the month loop
    for month in range(1, 13):  # January (1) through December (12)
        monthly_bf_data.append({'date': f"{year}-{month:02d}-01", 'Outbreak': outbreak_value})
# Replace the original annual bf DataFrame with the monthly version.
bf = pd.DataFrame(monthly_bf_data)
# Convert the 'date' column to datetime format.
bf['date'] = pd.to_datetime(bf['date'])
bf.head()
date | Outbreak | |
---|---|---|
0 | 1980-01-01 | 0 |
1 | 1980-02-01 | 0 |
2 | 1980-03-01 | 0 |
3 | 1980-04-01 | 0 |
4 | 1980-05-01 | 0 |
# read_html returned a list of tables; the first one holds the corn prices.
corn = corn[0]
# Build a month-start date from 'Year' plus the abbreviated-month 'Frequency'.
corn['Date'] = pd.to_datetime(
    corn['Year'].astype(str) + '-' + corn['Frequency'] + '-01', format='%Y-%b-%d'
)
# Keep only the date and price, with readable column names.
corn = corn[['Date', 'Amount']].rename(columns={"Date": "date", "Amount": "corn_price"})
# Placeholder rows for Feb and Mar 2025, to be filled by interpolation.
placeholder = pd.DataFrame({
    'date': [pd.Timestamp('2025-02-01'), pd.Timestamp('2025-03-01')],
    'corn_price': [np.nan, np.nan],
})
corn = (
    pd.concat([corn, placeholder], ignore_index=True)
    .sort_values('date')
    .reset_index(drop=True)
)
# Linear interpolation; trailing NaNs are carried forward from the last value.
corn['corn_price'] = corn['corn_price'].interpolate(method='linear')
# Show the updated tail of the series.
corn.tail()
date | corn_price | |
---|---|---|
538 | 2024-11-01 | 4.070000 |
539 | 2024-12-01 | 4.230000 |
540 | 2025-01-01 | 4.290000 |
541 | 2025-02-01 | 4.290000 |
542 | 2025-03-01 | 4.290000 |
# Wide layout (one column per month) → long format.
cpi = cpi.melt(id_vars='Year', var_name='Month', value_name='cpi_value')
# Assemble a month-start date from the year and abbreviated month name.
cpi['Date'] = pd.to_datetime(cpi['Year'].astype(str) + '-' + cpi['Month'] + '-01', format='%Y-%b-%d')
# Keep only what is needed, sorted chronologically, with a standard column name.
cpi = cpi[['Date', 'cpi_value']].sort_values('Date').reset_index(drop=True)
cpi.rename(columns = {"Date": "date"}, inplace = True)
# Fill only the March-2025 observation (if present) via linear interpolation.
target_idx = cpi[cpi['date'] == '2025-03-01'].index
if len(target_idx) > 0:
    cpi.loc[target_idx, 'cpi_value'] = cpi['cpi_value'].interpolate(method='linear').loc[target_idx]
cpi.tail()
date | cpi_value | |
---|---|---|
547 | 2025-08-01 | NaN |
548 | 2025-09-01 | NaN |
549 | 2025-10-01 | NaN |
550 | 2025-11-01 | NaN |
551 | 2025-12-01 | NaN |
# Rename the FRED series code to a readable name.
diesel = diesel.rename(columns={"observation_date": "date", "WPU057303": "diesel_ppi"})
# Keep observations from 1980 onward (ISO-format strings compare correctly).
diesel = diesel.loc[diesel['date'] >= '1980-01-01']
diesel.head()
date | diesel_ppi | |
---|---|---|
83 | 1980-01-01 | 74.600000 |
84 | 1980-02-01 | 80.100000 |
85 | 1980-03-01 | 84.500000 |
86 | 1980-04-01 | 86.500000 |
87 | 1980-05-01 | 87.300000 |
# Rename the FRED series code to a readable name.
gas = gas.rename(columns={"observation_date": "date", "WPS0571": "gas_ppi"})
gas.head()
date | gas_ppi | |
---|---|---|
0 | 1973-03-01 | 17.500000 |
1 | 1973-04-01 | 17.800000 |
2 | 1973-05-01 | 18.000000 |
3 | 1973-06-01 | 18.000000 |
4 | 1973-07-01 | 17.800000 |
# Rename the FRED series code and parse the date column.
income = income.rename(columns={"observation_date": "date", "A229RX0": "rdpi_per_capita"})
income['date'] = pd.to_datetime(income['date'], errors='coerce')
# Placeholder row for March 2025; its value is filled by interpolation below.
march_row = pd.DataFrame({
    'date': [pd.Timestamp('2025-03-01')],
    'rdpi_per_capita': [np.nan],
})
income = (
    pd.concat([income, march_row], ignore_index=True)
    .sort_values('date')
    .reset_index(drop=True)
)
# Linear interpolation; a trailing NaN is carried forward from the last value.
income['rdpi_per_capita'] = income['rdpi_per_capita'].interpolate(method='linear')
# Verify the new row was added and filled.
income.tail()
date | rdpi_per_capita | |
---|---|---|
790 | 2024-11-01 | 51565.000000 |
791 | 2024-12-01 | 51581.000000 |
792 | 2025-01-01 | 51724.000000 |
793 | 2025-02-01 | 51981.000000 |
794 | 2025-03-01 | 51981.000000 |
# Rename the FRED series code to a readable name.
inflation = inflation.rename(columns={"observation_date": "date", "FEDFUNDS": "fed_funds_rate"})
inflation.head()
date | fed_funds_rate | |
---|---|---|
0 | 1954-07-01 | 0.800000 |
1 | 1954-08-01 | 1.220000 |
2 | 1954-09-01 | 1.070000 |
3 | 1954-10-01 | 0.850000 |
4 | 1954-11-01 | 0.830000 |
# Rename the FRED series code to a readable name.
ngas = ngas.rename(columns={"observation_date": "date", "WPU0531": "natural_gas_ppi"})
ngas.head()
date | natural_gas_ppi | |
---|---|---|
0 | 1967-01-01 | 7.400000 |
1 | 1967-02-01 | 7.400000 |
2 | 1967-03-01 | 7.400000 |
3 | 1967-04-01 | 7.500000 |
4 | 1967-05-01 | 7.500000 |
# Inspect the raw population column names.
pop.columns
Index(['Year', 'Population', 'Growth Rate'], dtype='object')
# Preview the oldest rows of the population data (sorted newest-first).
pop.tail()
Year | Population | Growth Rate | |
---|---|---|---|
41 | 1984 | 232,766,280 | 1.03% |
42 | 1983 | 230,389,964 | 1.05% |
43 | 1982 | 228,001,418 | 1.04% |
44 | 1981 | 225,654,008 | 1.13% |
45 | 1980 | 223,140,018 | 1.21% |
# Strip the '%' suffix from the growth-rate strings.
pop['Growth Rate'] = pop['Growth Rate'].replace({'%': ''}, regex=True)
# Convert to a decimal fraction; unparseable entries become NaN.
pop['Growth Rate'] = pd.to_numeric(pop['Growth Rate'], errors='coerce') / 100
# Inspect the converted series for NaNs.
print(pop['Growth Rate'])
# Expand annual growth rates to monthly rows, splitting the annual rate
# evenly across the 12 months.
monthly_data = []
for _, annual in pop.iterrows():
    for month in range(1, 13):
        monthly_data.append({
            'Year': annual['Year'],
            'Month': month,
            'Growth Rate': annual['Growth Rate'] / 12,
        })
pop = pd.DataFrame(monthly_data)
# Month-start timestamp for each (Year, Month) pair.
pop['Date'] = pd.to_datetime(pop[['Year', 'Month']].assign(DAY=1))
pop.head()
0 0.005200 1 0.005300 2 0.005000 3 0.003800 4 0.003100 5 0.004900 6 0.006600 7 0.007100 8 0.007900 9 0.008000 10 0.008000 11 0.008300 12 0.008600 13 0.008800 14 0.008700 15 0.008700 16 0.009200 17 0.009700 18 0.010000 19 0.009800 20 0.009800 21 0.009700 22 0.009600 23 0.010100 24 0.010900 25 0.011500 26 0.012100 27 0.012600 28 0.012700 29 0.012500 30 0.012900 31 0.013500 32 0.014100 33 0.014400 34 0.014000 35 0.012800 36 0.011000 37 0.010200 38 0.009900 39 0.010100 40 0.010200 41 0.010300 42 0.010500 43 0.010400 44 0.011300 45 0.012100 Name: Growth Rate, dtype: float64
Year | Month | Growth Rate | Date | |
---|---|---|---|---|
0 | 2025 | 1 | 0.000433 | 2025-01-01 |
1 | 2025 | 2 | 0.000433 | 2025-02-01 |
2 | 2025 | 3 | 0.000433 | 2025-03-01 |
3 | 2025 | 4 | 0.000433 | 2025-04-01 |
4 | 2025 | 5 | 0.000433 | 2025-05-01 |
# 'Date' now encodes the year and month, so the separate columns are redundant.
pop = pop.drop(columns=['Year', 'Month'])
pop.head()
Growth Rate | Date | |
---|---|---|
0 | 0.000433 | 2025-01-01 |
1 | 0.000433 | 2025-02-01 |
2 | 0.000433 | 2025-03-01 |
3 | 0.000433 | 2025-04-01 |
4 | 0.000433 | 2025-05-01 |
# Standardize the date column name for the upcoming merge.
pop = pop.rename(columns={"Date": "date"})
pop.head()
Growth Rate | date | |
---|---|---|
0 | 0.000433 | 2025-01-01 |
1 | 0.000433 | 2025-02-01 |
2 | 0.000433 | 2025-03-01 |
3 | 0.000433 | 2025-04-01 |
4 | 0.000433 | 2025-05-01 |
# Rename the FRED series code, fill interior gaps, then trim to the sample window.
propane = propane.rename(columns={"observation_date": "date", "WPU05320104": "propane_ppi"})
propane['propane_ppi'] = propane['propane_ppi'].interpolate(method='linear')
propane = propane.loc[propane['date'] >= '1980-01-01']
propane.head()
date | propane_ppi | |
---|---|---|
31 | 1980-01-01 | 95.100000 |
32 | 1980-02-01 | 99.600000 |
33 | 1980-03-01 | 101.400000 |
34 | 1980-04-01 | 100.800000 |
35 | 1980-05-01 | 101.800000 |
# The CSV header carries a leading space: " value".
soybean = soybean.rename(columns={" value": "soybean_price"})
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-09 | 2.436300 |
3 | 1968-12-10 | 2.437500 |
4 | 1968-12-11 | 2.446300 |
# Make the daily soybean series gapless: reindex onto a complete calendar-day
# range and linearly interpolate the missing prices.
soybean['date'] = pd.to_datetime(soybean['date'])
# Complete daily range from the first to the last observed date.
full_range = pd.date_range(start=soybean['date'].min(), end=soybean['date'].max(), freq='D')
# Guarantee 1980-01-01 is covered. Fix: union() keeps the index sorted,
# whereas the original full_range.insert(0, ...) would have PREPENDED the
# date out of chronological order had the guard ever fired.
full_range = full_range.union([pd.Timestamp('1980-01-01')])
# Reindex so every day in the range has a row (missing days get NaN).
soybean = soybean.set_index('date').reindex(full_range)
# Fill the gaps with linear interpolation.
soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')
# Restore 'date' as a regular column with the original column names.
soybean = soybean.reset_index()
soybean.columns = ['date', 'soybean_price']
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-07 | 2.443767 |
3 | 1968-12-08 | 2.440033 |
4 | 1968-12-09 | 2.436300 |
# NOTE(review): after the reindex/interpolation above, the index already
# covers every calendar day in this span, so this second reindex is
# effectively a no-op kept as a safety net — confirm before removing.
# Set 'date' as the index
soybean = soybean.set_index('date')
# Reindex to include all dates in the full range, missing dates will have NaN prices
full_date_range = pd.date_range(start=soybean.index.min(), end=soybean.index.max(), freq='D')
soybean = soybean.reindex(full_date_range)
# Reset index and rename it back to 'date'
soybean = soybean.reset_index().rename(columns={'index': 'date'})
soybean.head()
date | soybean_price | |
---|---|---|
0 | 1968-12-05 | 2.437500 |
1 | 1968-12-06 | 2.447500 |
2 | 1968-12-07 | 2.443767 |
3 | 1968-12-08 | 2.440033 |
4 | 1968-12-09 | 2.436300 |
# Column was already renamed earlier, so this rename is a harmless no-op.
soybean.rename(columns={' value': 'soybean_price'}, inplace = True)
soybean['date'] = pd.to_datetime(soybean['date'])
# Restrict the sample to 1980 onward.
soybean = soybean[soybean['date'].dt.year >= 1980]
soybean['soybean_price'] = soybean['soybean_price'].interpolate(method='linear')
# Back-fill a leading NaN from the next observation. Fix: the original
# chained assignment soybean['soybean_price'].iloc[0] = ... writes through
# an intermediate object and is unreliable (silently dropped under pandas
# copy-on-write); positional assignment on the frame is the supported form.
if pd.isna(soybean['soybean_price'].iloc[0]):
    soybean.iloc[0, soybean.columns.get_loc('soybean_price')] = soybean['soybean_price'].iloc[1]
soybean.head()
date | soybean_price | |
---|---|---|
4044 | 1980-01-01 | 6.617750 |
4045 | 1980-01-02 | 6.635000 |
4046 | 1980-01-03 | 6.700000 |
4047 | 1980-01-04 | 6.655000 |
4048 | 1980-01-05 | 6.595000 |
# Verify the frame now has exactly ['date', 'soybean_price'].
soybean.columns
Index(['date', 'soybean_price'], dtype='object')
# Make sure the date column is datetime before extracting components.
soybean['date'] = pd.to_datetime(soybean['date'])
# Collapse the daily series into monthly means.
soybean['year'] = soybean['date'].dt.year
soybean['month'] = soybean['date'].dt.month
soybean = (
    soybean.groupby(['year', 'month'])['soybean_price']
    .mean()
    .reset_index()
)
# Rebuild a month-start timestamp for each (year, month) group.
soybean['month_year'] = pd.to_datetime(soybean[['year', 'month']].assign(day=1))
soybean.head()
year | month | soybean_price | month_year | |
---|---|---|---|---|
0 | 1980 | 1 | 6.605250 | 1980-01-01 |
1 | 1980 | 2 | 6.588698 | 1980-02-01 |
2 | 1980 | 3 | 6.310048 | 1980-03-01 |
3 | 1980 | 4 | 5.954567 | 1980-04-01 |
4 | 1980 | 5 | 6.250417 | 1980-05-01 |
# Keep only the monthly price and its month-start date.
soybean = (
    soybean.drop(['year', 'month'], axis=1)
    .rename(columns={'month_year': 'date'})
)
soybean.head()
soybean_price | date | |
---|---|---|
0 | 6.605250 | 1980-01-01 |
1 | 6.588698 | 1980-02-01 |
2 | 6.310048 | 1980-03-01 |
3 | 5.954567 | 1980-04-01 |
4 | 6.250417 | 1980-05-01 |
# Parse dates, restrict to the 1980+ sample, and standardize the column name.
unemploy['observation_date'] = pd.to_datetime(unemploy['observation_date'])
unemploy = unemploy.loc[unemploy['observation_date'].dt.year >= 1980]
unemploy = unemploy.rename(columns={"observation_date": "date"})
# Preview the filtered data.
unemploy.head()
date | UNRATE | |
---|---|---|
384 | 1980-01-01 | 6.300000 |
385 | 1980-02-01 | 6.300000 |
386 | 1980-03-01 | 6.300000 |
387 | 1980-04-01 | 6.900000 |
388 | 1980-05-01 | 7.500000 |
# Rename the BLS series code for the average egg price.
egg_price = egg_price.rename(columns={"observation_date": "date", "APU0000708111": "egg_price"})
egg_price.head()
date | egg_price | |
---|---|---|
0 | 1980-01-01 | 0.879000 |
1 | 1980-02-01 | 0.774000 |
2 | 1980-03-01 | 0.812000 |
3 | 1980-04-01 | 0.797000 |
4 | 1980-05-01 | 0.737000 |
# Confirm there are no missing values in the egg-price series.
egg_price.isnull().sum()
date 0 egg_price 0 dtype: int64
# Verify the renamed column labels.
egg_price.columns
Index(['date', 'egg_price'], dtype='object')
# Normalize every date column to datetime64[ns] so the merge keys align,
# then outer-join all series on 'date', starting from the egg-price frame.
frames = [egg_price, unemploy, soybean, propane, temp, bf, corn,
          cpi, diesel, gas, income, inflation, ngas, pop]
for frame in frames:
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
merged_data = egg_price
for frame in frames[1:]:
    merged_data = pd.merge(merged_data, frame, on='date', how='outer')
# Display the merged dataframe.
merged_data.head()
date | egg_price | UNRATE | soybean_price | propane_ppi | temp | Outbreak | corn_price | cpi_value | diesel_ppi | gas_ppi | rdpi_per_capita | fed_funds_rate | natural_gas_ppi | Growth Rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1980-01-01 | 0.879000 | 6.300000 | 6.605250 | 95.100000 | 52.800000 | 0.000000 | 2.450000 | 0.879000 | 74.600000 | 79.500000 | 23036.000000 | 13.820000 | 55.100000 | 0.001008 |
1 | 1980-02-01 | 0.774000 | 6.300000 | 6.588698 | 99.600000 | 52.800000 | 0.000000 | 2.390000 | 0.774000 | 80.100000 | 84.900000 | 22894.000000 | 14.130000 | 58.300000 | 0.001008 |
2 | 1980-03-01 | 0.812000 | 6.300000 | 6.310048 | 101.400000 | 52.800000 | 0.000000 | 2.400000 | 0.812000 | 84.500000 | 90.500000 | 22731.000000 | 17.190000 | 58.200000 | 0.001008 |
3 | 1980-04-01 | 0.797000 | 6.900000 | 5.954567 | 100.800000 | 52.800000 | 0.000000 | 2.360000 | 0.797000 | 86.500000 | 96.200000 | 22687.000000 | 17.610000 | 59.700000 | 0.001008 |
4 | 1980-05-01 | 0.737000 | 7.500000 | 6.250417 | 101.800000 | 52.800000 | 0.000000 | 2.420000 | 0.737000 | 87.300000 | 96.300000 | 22580.000000 | 10.980000 | 61.000000 | 0.001008 |
# Keep only months where the target variable (egg price) is observed.
merged_data = merged_data.dropna(subset=['egg_price'])
# Report any columns that still contain missing values.
missing_values = merged_data.isnull().sum()
print(missing_values[missing_values > 0])
Series([], dtype: int64)
# Spot-check which dates (if any) lack values for the key series.
missing_egg_dates = merged_data.loc[merged_data['egg_price'].isnull(), 'date']
print(missing_egg_dates)
missing_income = merged_data.loc[merged_data['rdpi_per_capita'].isnull(), 'date']
print(missing_income)
missing_unrate = merged_data.loc[merged_data['UNRATE'].isnull(), 'date']
print(missing_unrate)
# Baseline OLS: regress egg price on every candidate predictor.
y = merged_data['egg_price']
X = sm.add_constant(merged_data.drop(columns=['date', 'egg_price']))
# Clean both sides together: replace inf with NaN, then drop incomplete rows.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
# Fit and report the regression.
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.196e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:32 Log-Likelihood: 1589.4 No. Observations: 543 AIC: -3151. Df Residuals: 529 BIC: -3091. Df Model: 13 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0003 0.030 0.012 0.991 -0.059 0.060 UNRATE -8.17e-06 0.000 -0.018 0.986 -0.001 0.001 soybean_price -0.0008 0.001 -1.228 0.220 -0.002 0.000 propane_ppi 3.507e-05 1.62e-05 2.162 0.031 3.2e-06 6.69e-05 temp 6.603e-05 0.001 0.131 0.896 -0.001 0.001 Outbreak -0.0004 0.001 -0.296 0.767 -0.003 0.002 corn_price 0.0010 0.001 0.749 0.454 -0.002 0.004 cpi_value 1.0134 0.001 709.571 0.000 1.011 1.016 diesel_ppi -3.029e-05 2.35e-05 -1.288 0.198 -7.65e-05 1.59e-05 gas_ppi -5.099e-05 3.87e-05 -1.317 0.189 -0.000 2.51e-05 rdpi_per_capita -1.846e-07 2.35e-07 -0.784 0.433 -6.47e-07 2.78e-07 fed_funds_rate 0.0001 0.000 0.398 0.691 -0.000 0.001 natural_gas_ppi -1.08e-05 1.35e-05 -0.798 0.425 -3.74e-05 1.58e-05 Growth Rate -5.8753 6.100 -0.963 0.336 -17.859 6.108 ============================================================================== Omnibus: 1202.076 Durbin-Watson: 1.210 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3269004.906 Skew: 17.576 Prob(JB): 0.00 Kurtosis: 381.485 Cond. No. 4.03e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 4.03e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Variance-inflation-factor diagnostics for the full predictor set.
features = ['UNRATE', 'soybean_price', 'propane_ppi', 'temp', 'Outbreak',
            'corn_price', 'cpi_value', 'diesel_ppi', 'gas_ppi',
            'rdpi_per_capita', 'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']
X = add_constant(merged_data[features])
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif_data)
Variable VIF 0 const 2863.185734 1 UNRATE 2.208576 2 soybean_price 11.607555 3 propane_ppi 9.639871 4 temp 2.086108 5 Outbreak 2.592928 6 corn_price 11.508110 7 cpi_value 3.135601 8 diesel_ppi 23.830582 9 gas_ppi 32.298061 10 rdpi_per_capita 12.317555 11 fed_funds_rate 3.880146 12 natural_gas_ppi 3.490783 13 Growth Rate 5.788445
# Second OLS pass: drop the two highest-VIF fuel indexes (gas, diesel).
y = merged_data['egg_price']
X = sm.add_constant(merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi']))
# Clean both sides together before fitting.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.390e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:32 Log-Likelihood: 1583.9 No. Observations: 543 AIC: -3144. Df Residuals: 531 BIC: -3092. Df Model: 11 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0073 0.030 0.244 0.807 -0.052 0.066 UNRATE 0.0002 0.000 0.438 0.661 -0.001 0.001 soybean_price -0.0009 0.001 -1.520 0.129 -0.002 0.000 propane_ppi 7.386e-06 1.21e-05 0.609 0.543 -1.65e-05 3.12e-05 temp -5.213e-05 0.000 -0.105 0.917 -0.001 0.001 Outbreak 0.0003 0.001 0.317 0.751 -0.002 0.003 corn_price -0.0010 0.001 -0.772 0.440 -0.003 0.001 cpi_value 1.0121 0.001 733.189 0.000 1.009 1.015 rdpi_per_capita -2.94e-07 2.35e-07 -1.251 0.212 -7.56e-07 1.68e-07 fed_funds_rate -8.104e-05 0.000 -0.292 0.770 -0.001 0.000 natural_gas_ppi -1.247e-05 1.31e-05 -0.952 0.342 -3.82e-05 1.33e-05 Growth Rate 0.3498 5.847 0.060 0.952 -11.137 11.836 ============================================================================== Omnibus: 1219.694 Durbin-Watson: 1.188 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3542783.390 Skew: 18.172 Prob(JB): 0.00 Kurtosis: 397.038 Cond. No. 3.83e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 3.83e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Visualize pairwise correlations among the predictors.
# NOTE(review): the datetime 'date' column is still present here and is
# presumably excluded by .corr() in the pandas version used — confirm.
independent_vars = merged_data.drop(columns=['egg_price'])
corr_matrix = independent_vars.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Independent Variables')
plt.show()
# Third OLS pass: also drop the remaining collinear inputs
# (soybean, propane, income) identified by the VIF/correlation checks.
y = merged_data['egg_price']
X = sm.add_constant(
    merged_data.drop(columns=['date', 'egg_price', 'gas_ppi', 'diesel_ppi',
                              'soybean_price', 'propane_ppi', 'rdpi_per_capita'])
)
# Clean both sides together before fitting.
combined = (
    pd.concat([X, y], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_clean = combined['egg_price']
X_clean = combined.drop(columns=['egg_price'])
model = sm.OLS(y_clean, X_clean).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 1.909e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1582.0 No. Observations: 543 AIC: -3146. Df Residuals: 534 BIC: -3107. Df Model: 8 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const -0.0025 0.029 -0.088 0.930 -0.059 0.054 UNRATE 0.0005 0.000 1.287 0.199 -0.000 0.001 temp -0.0002 0.000 -0.491 0.624 -0.001 0.001 Outbreak 0.0003 0.001 0.269 0.788 -0.002 0.002 corn_price -0.0025 0.001 -3.989 0.000 -0.004 -0.001 cpi_value 1.0114 0.001 787.491 0.000 1.009 1.014 fed_funds_rate 0.0002 0.000 1.143 0.253 -0.000 0.001 natural_gas_ppi -7.455e-06 8.31e-06 -0.898 0.370 -2.38e-05 8.86e-06 Growth Rate 6.4289 4.471 1.438 0.151 -2.355 15.213 ============================================================================== Omnibus: 1225.887 Durbin-Watson: 1.180 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3641899.526 Skew: 18.386 Prob(JB): 0.00 Kurtosis: 402.519 Cond. No. 1.27e+06 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.27e+06. This might indicate that there are strong multicollinearity or other numerical problems.
# Re-check VIFs for the reduced predictor set.
predictors = ['UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value',
              'fed_funds_rate', 'natural_gas_ppi', 'Growth Rate']
X = sm.add_constant(merged_data[predictors])
vif = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif)
Variable VIF 0 const 2552.008626 1 UNRATE 1.382154 2 temp 1.916131 3 Outbreak 1.834292 4 corn_price 2.312824 5 cpi_value 2.490735 6 fed_funds_rate 1.574224 7 natural_gas_ppi 1.290526 8 Growth Rate 3.054922
# Interaction terms as (new column, factor A, factor B) — built in one loop
# so the name list below stays in sync with the columns created.
interactions = [
    ('corn_outbreak', 'corn_price', 'Outbreak'),
    ('soy_outbreak', 'soybean_price', 'Outbreak'),
    ('corn_soy', 'corn_price', 'soybean_price'),
    ('cpi_rdpi', 'cpi_value', 'rdpi_per_capita'),
    ('cpi_fed', 'cpi_value', 'fed_funds_rate'),
    ('rdpi_fed', 'rdpi_per_capita', 'fed_funds_rate'),
    ('unrate_rdpi', 'UNRATE', 'rdpi_per_capita'),
    ('diesel_corn', 'diesel_ppi', 'corn_price'),
    ('propane_soy', 'propane_ppi', 'soybean_price'),
    ('gas_temp', 'gas_ppi', 'temp'),
    ('temp_outbreak', 'temp', 'Outbreak'),
    ('temp_corn', 'temp', 'corn_price'),
    ('temp_natural_gas', 'temp', 'natural_gas_ppi'),
    ('growth_fed', 'Growth Rate', 'fed_funds_rate'),
    ('growth_cpi', 'Growth Rate', 'cpi_value'),
    ('growth_corn', 'Growth Rate', 'corn_price'),
]
for name, left, right in interactions:
    merged_data[name] = merged_data[left] * merged_data[right]
# Main effects followed by every interaction term, in creation order.
predictors = [
    'UNRATE', 'temp', 'Outbreak', 'corn_price', 'cpi_value', 'fed_funds_rate',
    'natural_gas_ppi', 'Growth Rate', 'soybean_price', 'propane_ppi', 'diesel_ppi',
    'gas_ppi', 'rdpi_per_capita',
] + [name for name, _, _ in interactions]
y = merged_data['egg_price']
X = sm.add_constant(merged_data[predictors])
# Fit and report the interaction-augmented OLS model.
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 5.794e+04 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1618.8 No. Observations: 543 AIC: -3178. Df Residuals: 513 BIC: -3049. Df Model: 29 Covariance Type: nonrobust ==================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------ const 0.3017 0.105 2.874 0.004 0.095 0.508 UNRATE -0.0014 0.003 -0.557 0.578 -0.006 0.004 temp -0.0029 0.002 -1.700 0.090 -0.006 0.000 Outbreak 0.0440 0.041 1.063 0.288 -0.037 0.125 corn_price -0.0483 0.041 -1.165 0.245 -0.130 0.033 cpi_value 0.8811 0.035 24.993 0.000 0.812 0.950 fed_funds_rate -0.0036 0.004 -0.887 0.376 -0.012 0.004 natural_gas_ppi -0.0009 0.000 -1.767 0.078 -0.002 9.65e-05 Growth Rate -41.6700 31.034 -1.343 0.180 -102.638 19.298 soybean_price -0.0001 0.001 -0.097 0.922 -0.003 0.002 propane_ppi 4.289e-05 4.18e-05 1.027 0.305 -3.91e-05 0.000 diesel_ppi -1.116e-06 6.61e-05 -0.017 0.987 -0.000 0.000 gas_ppi 0.0010 0.001 1.281 0.201 -0.001 0.002 rdpi_per_capita -2.651e-06 8.17e-07 -3.246 0.001 -4.26e-06 -1.05e-06 corn_outbreak -0.0024 0.002 -1.012 0.312 -0.007 0.002 soy_outbreak -3.148e-05 0.001 -0.036 0.972 -0.002 0.002 corn_soy 8.998e-05 0.000 0.287 0.774 -0.001 0.001 cpi_rdpi 2.336e-06 5.23e-07 4.466 0.000 1.31e-06 3.36e-06 cpi_fed 0.0012 0.001 1.328 0.185 -0.001 0.003 rdpi_fed -6.819e-08 6.55e-08 -1.040 0.299 -1.97e-07 6.06e-08 unrate_rdpi 4.093e-09 5.73e-08 0.071 0.943 -1.08e-07 1.17e-07 diesel_corn -1.542e-06 1.03e-05 -0.149 0.882 -2.19e-05 1.88e-05 propane_soy -1.613e-06 3.12e-06 -0.516 0.606 -7.75e-06 4.52e-06 gas_temp -1.867e-05 1.37e-05 -1.360 0.175 -4.56e-05 8.31e-06 temp_outbreak -0.0007 0.001 -0.830 0.407 -0.002 0.001 temp_corn 
0.0009 0.001 1.227 0.220 -0.001 0.002 temp_natural_gas 1.635e-05 9.1e-06 1.797 0.073 -1.52e-06 3.42e-05 growth_fed 4.0072 2.876 1.393 0.164 -1.642 9.657 growth_cpi 32.5163 20.088 1.619 0.106 -6.948 71.981 growth_corn -1.5425 7.478 -0.206 0.837 -16.233 13.148 ============================================================================== Omnibus: 1098.871 Durbin-Watson: 1.369 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2165590.659 Skew: 14.336 Prob(JB): 0.00 Kurtosis: 311.050 Cond. No. 1.64e+10 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.64e+10. This might indicate that there are strong multicollinearity or other numerical problems.
# Variance inflation factors for every column of the full design matrix.
# The intercept's VIF is expected to be huge and can be ignored; the point
# is to flag collinear predictors among the real features.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])],
})
print(vif_data)
feature VIF 0 const 37544.064509 1 UNRATE 70.704086 2 temp 26.467847 3 Outbreak 3221.050777 4 corn_price 11470.599064 5 cpi_value 2064.639688 6 fed_funds_rate 875.694055 7 natural_gas_ppi 4917.387954 8 Growth Rate 161.915984 9 soybean_price 56.905525 10 propane_ppi 68.990165 11 diesel_ppi 203.455282 12 gas_ppi 12846.196126 13 rdpi_per_capita 160.089083 14 corn_outbreak 234.162282 15 soy_outbreak 197.008944 16 corn_soy 231.413475 17 cpi_rdpi 1480.992322 18 cpi_fed 53.411866 19 rdpi_fed 149.877657 20 unrate_rdpi 59.589198 21 diesel_corn 221.607819 22 propane_soy 75.572238 23 gas_temp 13357.918509 24 temp_outbreak 3741.915213 25 temp_corn 12358.043858 26 temp_natural_gas 4858.189758 27 growth_fed 402.504304 28 growth_cpi 124.304248 29 growth_corn 128.244498
# Standardize the predictors (intercept column excluded) so the Lasso
# penalty treats every feature on a comparable scale.
# NOTE(review): the scaler is fit on the full sample before cross-validation,
# which leaks fold information — acceptable for exploratory selection, but
# worth confirming if this pipeline is ever used for out-of-sample claims.
scaler = StandardScaler()
predictors = X.drop(columns=['const'])
X_scaled = scaler.fit_transform(predictors)

# Cross-validated Lasso picks the penalty strength; fixed seed for
# reproducible fold splits.
lasso = LassoCV(cv=5, random_state=0).fit(X_scaled, y)

# Map the coefficients back to their feature names and show the survivors.
coef = pd.Series(lasso.coef_, index=X.columns[1:])
nonzero = coef[coef != 0]
print(nonzero.sort_values())
diesel_ppi -0.001713 soybean_price -0.000166 rdpi_per_capita -0.000043 UNRATE 0.000002 growth_cpi 0.001369 cpi_fed 0.001431 cpi_rdpi 0.008845 cpi_value 0.693240 dtype: float64
# Features retained by Lasso (nonzero coefficients). Deriving the list from
# `coef` keeps this cell in sync with the Lasso fit above — the previous
# hard-coded list could silently drift if the Lasso selection changed.
selected_features = coef[coef != 0].index.tolist()

# Subset the original (unscaled) X and re-add the intercept column so the
# OLS coefficients are interpretable on the raw feature scales.
X_lasso = sm.add_constant(X[selected_features])

# Refit OLS on the Lasso-selected features only.
model_lasso = sm.OLS(y, X_lasso).fit()

# Summary of the post-selection regression (note: p-values here do not
# account for the selection step).
print(model_lasso.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 2.019e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:33 Log-Likelihood: 1597.2 No. Observations: 543 AIC: -3176. Df Residuals: 534 BIC: -3138. Df Model: 8 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0446 0.013 3.499 0.001 0.020 0.070 diesel_ppi -1.664e-05 1.23e-05 -1.358 0.175 -4.07e-05 7.43e-06 soybean_price -0.0003 0.000 -0.677 0.499 -0.001 0.001 rdpi_per_capita -1.26e-06 2.97e-07 -4.239 0.000 -1.84e-06 -6.76e-07 UNRATE 0.0002 0.000 0.590 0.556 -0.001 0.001 growth_cpi 10.6088 4.529 2.342 0.020 1.712 19.506 cpi_fed -0.0003 0.000 -1.589 0.113 -0.001 8e-05 cpi_rdpi 1.175e-06 2.68e-07 4.379 0.000 6.48e-07 1.7e-06 cpi_value 0.9509 0.015 64.341 0.000 0.922 0.980 ============================================================================== Omnibus: 1174.548 Durbin-Watson: 1.249 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2925313.857 Skew: 16.664 Prob(JB): 0.00 Kurtosis: 361.029 Cond. No. 6.22e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 6.22e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# VIF check on the Lasso-selected design matrix (constant included).
records = []
for col_idx, col_name in enumerate(X_lasso.columns):
    records.append({
        "feature": col_name,
        "VIF": variance_inflation_factor(X_lasso.values, col_idx),
    })
vif_data = pd.DataFrame(records)
print(vif_data)
feature VIF 0 const 532.584877 1 diesel_ppi 6.726621 2 soybean_price 4.836786 3 rdpi_per_capita 20.403635 4 UNRATE 1.574427 5 growth_cpi 6.073492 6 cpi_fed 3.036188 7 cpi_rdpi 374.649905 8 cpi_value 348.758535
# Horizontal bar chart of |coefficient| for the features Lasso kept,
# smallest at the bottom so the dominant driver stands out on top.
importance = coef[coef != 0].abs().sort_values()
ax = importance.plot(kind='barh', figsize=(8, 6))
ax.set_title("Feature Importance from Lasso Regression")
ax.set_xlabel("Absolute Coefficient Value")
plt.tight_layout()
plt.show()
# Manually reduced predictor set: keep the intercept plus the terms that
# were significant in the Lasso-selected model.
reduced_cols = ['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_rdpi', 'cpi_value']
X_reduced = X[reduced_cols]

# Refit OLS on the reduced design matrix and show the results.
model_reduced = sm.OLS(y, X_reduced).fit()
print(model_reduced.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 3.997e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:34 Log-Likelihood: 1592.4 No. Observations: 543 AIC: -3175. Df Residuals: 538 BIC: -3153. Df Model: 4 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0436 0.007 6.079 0.000 0.030 0.058 rdpi_per_capita -1.264e-06 1.64e-07 -7.703 0.000 -1.59e-06 -9.41e-07 growth_cpi 15.5789 3.847 4.049 0.000 8.022 23.136 cpi_rdpi 1.274e-06 2.04e-07 6.254 0.000 8.74e-07 1.67e-06 cpi_value 0.9404 0.011 83.276 0.000 0.918 0.963 ============================================================================== Omnibus: 1190.346 Durbin-Watson: 1.226 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3132994.257 Skew: 17.179 Prob(JB): 0.00 Kurtosis: 373.533 Cond. No. 5.26e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.26e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# VIFs for the reduced model — check whether collinearity persists
# after the manual cut-down.
vif_pairs = [(col_name, variance_inflation_factor(X_reduced.values, col_idx))
             for col_idx, col_name in enumerate(X_reduced.columns)]
vif_data = pd.DataFrame(vif_pairs, columns=["feature", "VIF"])
print(vif_data)
feature VIF 0 const 166.546852 1 rdpi_per_capita 6.147673 2 growth_cpi 4.338013 3 cpi_rdpi 213.766659 4 cpi_value 201.584149
# cpi_rdpi and cpi_value carry high VIFs together — quantify how strongly
# the interaction term tracks the raw CPI series.
corr_coef, p_value = pearsonr(X['cpi_rdpi'], X['cpi_value'])
print("Pearson correlation coefficient: " + format(corr_coef, ".4f"))
print("P-value: " + format(p_value, ".4e"))
Pearson correlation coefficient: 0.9883 P-value: 0.0000e+00
# Drop cpi_rdpi (r ≈ 0.99 with cpi_value, confirmed above) and refit
# with the remaining three predictors plus the intercept.
dropped_cols = ['const', 'rdpi_per_capita', 'growth_cpi', 'cpi_value']
X_dropped = X[dropped_cols]

model_dropped = sm.OLS(y, X_dropped).fit()
print(model_dropped.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 4.977e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:34 Log-Likelihood: 1573.4 No. Observations: 543 AIC: -3139. Df Residuals: 539 BIC: -3122. Df Model: 3 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0052 0.004 1.361 0.174 -0.002 0.013 rdpi_per_capita -4.75e-07 1.09e-07 -4.374 0.000 -6.88e-07 -2.62e-07 growth_cpi -1.7079 2.769 -0.617 0.538 -7.147 3.731 cpi_value 1.0103 0.002 603.847 0.000 1.007 1.014 ============================================================================== Omnibus: 1253.952 Durbin-Watson: 1.136 Prob(Omnibus): 0.000 Jarque-Bera (JB): 4133395.965 Skew: 19.378 Prob(JB): 0.00 Kurtosis: 428.664 Cond. No. 1.79e+08 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.79e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Recompute VIFs after dropping cpi_rdpi — every real predictor should
# now sit at a modest level.
vif_values = [variance_inflation_factor(X_dropped.values, col_idx)
              for col_idx in range(X_dropped.shape[1])]
vif_data = pd.DataFrame({"feature": X_dropped.columns, "VIF": vif_values})
print(vif_data)
feature VIF 0 const 44.697063 1 rdpi_per_capita 2.515052 2 growth_cpi 2.098828 3 cpi_value 4.132810
# Final parsimonious model: growth_cpi became insignificant once cpi_rdpi
# was removed, leaving only income per capita and CPI (plus the intercept).
final_cols = ['const', 'rdpi_per_capita', 'cpi_value']
X_final = X[final_cols]

model_final = sm.OLS(y, X_final).fit()
print(model_final.summary())
OLS Regression Results ============================================================================== Dep. Variable: egg_price R-squared: 1.000 Model: OLS Adj. R-squared: 1.000 Method: Least Squares F-statistic: 7.474e+05 Date: Thu, 24 Apr 2025 Prob (F-statistic): 0.00 Time: 12:54:35 Log-Likelihood: 1573.2 No. Observations: 543 AIC: -3140. Df Residuals: 540 BIC: -3127. Df Model: 2 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- const 0.0036 0.003 1.304 0.193 -0.002 0.009 rdpi_per_capita -4.502e-07 1.01e-07 -4.464 0.000 -6.48e-07 -2.52e-07 cpi_value 1.0096 0.001 832.728 0.000 1.007 1.012 ============================================================================== Omnibus: 1254.686 Durbin-Watson: 1.135 Prob(Omnibus): 0.000 Jarque-Bera (JB): 4144896.104 Skew: 19.405 Prob(JB): 0.00 Kurtosis: 429.255 Cond. No. 1.80e+05 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.8e+05. This might indicate that there are strong multicollinearity or other numerical problems.