Data 200¶

Applied Statistical Analysis (Week 1)¶

Multiple Linear Regression, Diagnostics and Remedial Measures - Aayush Regmi

Importing necessary Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy import stats

Loading Dataset¶

In [4]:
data = pd.read_csv('https://www.statlearning.com/s/Advertising.csv', index_col=0)
data.head()
Out[4]:
TV radio newspaper sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
4 151.5 41.3 58.5 18.5
5 180.8 10.8 58.4 12.9
In [5]:
data.corr()
Out[5]:
TV radio newspaper sales
TV 1.000000 0.054809 0.056648 0.782224
radio 0.054809 1.000000 0.354104 0.576223
newspaper 0.056648 0.354104 1.000000 0.228299
sales 0.782224 0.576223 0.228299 1.000000
In [3]:
plt.figure(figsize=(10,8))
sns.heatmap(data.corr(numeric_only=True), cmap="YlGnBu", annot=True)
Out[3]:
<Axes: >
[Figure: correlation heatmap of TV, radio, newspaper, and sales]

Define predictors and target¶

In [7]:
data.head(3)
Out[7]:
TV radio newspaper sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
3 17.2 45.9 69.3 9.3
In [10]:
X = data[['TV', 'radio', 'newspaper']]
# X = data[['TV', 'radio']]
y = data['sales']
In [5]:
fig, axes = plt.subplots(2,2,figsize=(14,8))
axes[0,0].scatter(X.TV,y,color='g')
axes[0,0].set_title("TV vs Sales")
axes[0,1].scatter(X.radio,y,color='purple')
axes[0,1].set_title("Radio vs Sales")
# axes[1,0] is intentionally left empty (only three predictors)
axes[1,1].scatter(X.newspaper,y)
axes[1,1].set_title("newspaper vs Sales")
Out[5]:
Text(0.5, 1.0, 'newspaper vs Sales')
[Figure: scatter plots of TV vs Sales, Radio vs Sales, and newspaper vs Sales]

Statistical Modeling Using Statsmodels¶

In [6]:
# Adding Intercept Term (c)
X_const = sm.add_constant(X)

# Fitting the model
ols_model = sm.OLS(y, X_const).fit()
print(ols_model.summary())

# Making Predictions on Data
y_pred = ols_model.predict(X_const)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Sat, 10 May 2025   Prob (F-statistic):           1.58e-96
Time:                        23:02:28   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.000       2.324       3.554
TV             0.0458      0.001     32.809      0.000       0.043       0.049
radio          0.1885      0.009     21.893      0.000       0.172       0.206
newspaper     -0.0010      0.006     -0.177      0.860      -0.013       0.011
==============================================================================
Omnibus:                       60.414   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              151.241
Skew:                          -1.327   Prob(JB):                     1.44e-33
Kurtosis:                       6.332   Cond. No.                         454.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
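
From the coefficient table, the fitted model is approximately sales ≈ 2.94 + 0.046*TV + 0.189*radio - 0.001*newspaper; newspaper's large p-value (0.860) suggests it adds little once TV and radio are in the model. As a small sketch (not run above), the same coefficients can be pulled straight from the fitted results object:

In [ ]:
# Sketch: extract the fitted coefficients from the statsmodels results object
coefs = ols_model.params  # pandas Series indexed by const, TV, radio, newspaper
print('Intercept:', round(coefs['const'], 4))
for name in ['TV', 'radio', 'newspaper']:
    print(f'{name} coefficient:', round(coefs[name], 4))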

1. Residual Analysis¶

Calculating Residuals¶

In [7]:
residuals = y - y_pred

Residual vs Fitted Graph¶

In [8]:
plt.figure(figsize=(8,6))
sns.residplot(x=y_pred, y=residuals, lowess=True, line_kws={'color': 'red'})
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted')
plt.show()
[Figure: Residuals vs Fitted plot with LOWESS smoother]

Q-Q Plot¶

In [9]:
sm.qqplot(residuals, line="s")  # "s" scales the reference line to the residuals' own mean and spread
plt.title("Q-Q Plot")
plt.show()
[Figure: Q-Q plot of residuals]

Histogram of residuals¶

In [10]:
plt.figure(figsize=(8,6))
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.show()
[Figure: histogram of residuals with KDE curve]

2. Multicollinearity Check¶

In [11]:
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data, "\n\nAll VIF values are below 5, so multicollinearity is not a concern")
     feature       VIF
0         TV  2.486772
1      radio  3.285462
2  newspaper  3.055245 

All VIF values are below 5, so multicollinearity is not a concern
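
Note that these VIFs were computed on the raw predictor matrix without an intercept column. A common convention is to include the constant in the design matrix and report VIFs for the predictors only; a hedged sketch of that variant (the cutoff of 5 is a rule of thumb, not a hard threshold):

In [ ]:
# Sketch: VIFs computed on the design matrix that includes the intercept
vif_const = pd.DataFrame()
vif_const['feature'] = X_const.columns
vif_const['VIF'] = [variance_inflation_factor(X_const.values, i)
                    for i in range(X_const.shape[1])]
print(vif_const[vif_const['feature'] != 'const'])  # drop the constant's own VIF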

3. Influential Observations¶

In [12]:
influence = ols_model.get_influence()
(c, p) = influence.cooks_distance
plt.figure(figsize=(8,6))
plt.stem(np.arange(len(c)), c, markerfmt=",")
plt.axhline(4 / len(c), color='red', linestyle='--', label='Threshold (4/n)')
plt.legend()  # display the threshold label
plt.title("Cook's Distance")
plt.xlabel("Observation Index")
plt.ylabel("Cook's Distance")
plt.show()
[Figure: Cook's distance for each observation with the 4/n threshold line]
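
To see which observations actually cross the 4/n rule of thumb, the distances can be filtered directly (a small sketch building on the influence object above):

In [ ]:
# Sketch: list observations whose Cook's distance exceeds 4/n
threshold = 4 / len(c)
influential = np.where(c > threshold)[0]
print(f'{len(influential)} observations exceed {threshold:.4f}:', influential)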

4. Homoscedasticity Check¶

In [13]:
# Breusch-Pagan test
bp_test = het_breuschpagan(residuals, X_const)
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print(dict(zip(labels, bp_test)))
print("\nP Value should be Greater than 0.05 for Homoscedasticity")
{'Lagrange multiplier statistic': np.float64(5.132872353285567), 'p-value': np.float64(0.16232215845412676), 'f-value': np.float64(1.7209042102915795), 'f p-value': np.float64(0.16399908905607216)}

A p-value above 0.05 means we fail to reject constant error variance (homoscedasticity)
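
White's test is another common check of the same null hypothesis of constant error variance; a hedged sketch using the same residuals and design matrix as above:

In [ ]:
# Sketch: White's test as an alternative heteroscedasticity check
from statsmodels.stats.diagnostic import het_white
white_test = het_white(residuals, X_const)
print(dict(zip(labels, white_test)))  # reuses the labels list from the Breusch-Pagan cell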

5. Normality Check¶

In [14]:
shapiro_test = stats.shapiro(residuals)
print('Shapiro-Wilk Test statistic:', shapiro_test[0], 'p-value:', shapiro_test[1])
Shapiro-Wilk Test statistic: 0.9176652036187539 p-value: 3.938571556266683e-09

6. Autocorrelation Check¶

In [15]:
dw_stat = durbin_watson(residuals)
print('Durbin-Watson statistic:', dw_stat)
Durbin-Watson statistic: 2.083648405294407

7. Goodness of Fit¶

In [16]:
print('R^2:', r2_score(y, y_pred))
print('Adjusted R^2:', 1 - (1 - r2_score(y, y_pred)) * (len(y) - 1) / (len(y) - X.shape[1] - 1))
print('RMSE:', np.sqrt(mean_squared_error(y, y_pred)))
R^2: 0.8972106381789522
Adjusted R^2: 0.8956373316204668
RMSE: 1.6685701407225697
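
As a cross-check, statsmodels stores the same fit statistics on the results object (a small sketch):

In [ ]:
# Sketch: the same goodness-of-fit numbers straight from the fitted OLS results
print('R^2 (statsmodels):', ols_model.rsquared)
print('Adjusted R^2 (statsmodels):', ols_model.rsquared_adj)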

Summary¶

In [17]:
print("\n--- Summary of Diagnostics ---")
print(f"Residuals are {'normally distributed' if shapiro_test[1]>0.05 else 'not normally distributed'}.")
print(f"Homoscedasticity {'present' if bp_test[1]>0.05 else 'violated'} (Breusch-Pagan p-value: {bp_test[1]:.4f}).")
print(f"Durbin-Watson statistic close to 2: {dw_stat:.4f} (No autocorrelation if close to 2).")
print(f"VIF values (should be < 5 ideally):\n{vif_data}")
--- Summary of Diagnostics ---
Residuals are not normally distributed.
Homoscedasticity present (Breusch-Pagan p-value: 0.1623).
Durbin-Watson statistic: 2.0836 (values near 2 indicate no autocorrelation).
VIF values (should be < 5 ideally):
     feature       VIF
0         TV  2.486772
1      radio  3.285462
2  newspaper  3.055245
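
Since the residuals fail the normality check, one remedial measure is to revisit the model specification. The sketch below refits the model with a TV:radio interaction via the formula API; the interaction term is an illustrative choice, not something the diagnostics above establish:

In [ ]:
# Sketch of a remedial step: add an interaction term and re-check residual normality
import statsmodels.formula.api as smf
interaction_model = smf.ols('sales ~ TV + radio + TV:radio', data=data).fit()
print('Adjusted R^2 with interaction:', interaction_model.rsquared_adj)
print('Shapiro-Wilk p-value on new residuals:',
      stats.shapiro(interaction_model.resid)[1])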

Polynomial Regression Examples¶

In [18]:
data = pd.DataFrame({
    'X':[0,20,40,60,80,100],
    'y':[0.002,0.0012,0.0060,0.0300,0.0900,0.2700]
})
data
Out[18]:
X y
0 0 0.0020
1 20 0.0012
2 40 0.0060
3 60 0.0300
4 80 0.0900
5 100 0.2700
In [19]:
X = data["X"].values.reshape(-1, 1)
y = data["y"].values.reshape(-1, 1)

Fitting a simple linear regression model¶

In [20]:
# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin = LinearRegression()
lin.fit(X, y)
Out[20]:
LinearRegression()
In [21]:
plt.scatter(X,y)
plt.plot(X,lin.predict(X))
Out[21]:
[<matplotlib.lines.Line2D at 0x799702525010>]
[Figure: straight-line fit over the X vs y scatter]
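
The curvature in the scatter is obvious; a quick R² for the straight-line fit quantifies how poorly it does (a sketch, output not shown here):

In [ ]:
# Sketch: quantify the straight-line fit on this small dataset
print('Linear fit R^2:', lin.score(X, y))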

Simple linear regression does not fit the data well, so let's try polynomial regression.¶

In [22]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X into polynomial features up to degree 5
poly_t = PolynomialFeatures(degree=5)
X_t = poly_t.fit_transform(X)

# Fit linear regression on the expanded features
lin2 = LinearRegression()
lin2.fit(X_t, y)
Out[22]:
LinearRegression()
In [23]:
plt.scatter(X,y)
plt.plot(X,lin2.predict(X_t),color='red')
Out[23]:
[<matplotlib.lines.Line2D at 0x7997025dfc90>]
[Figure: degree-5 polynomial fit (red) over the X vs y scatter]
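
A more idiomatic way to express the same fit is a scikit-learn Pipeline, which bundles the polynomial expansion and the regression into one estimator; a small sketch equivalent to the two-step version above. Note that with only six observations, a degree-5 polynomial has as many coefficients as data points and will essentially interpolate them, so this illustrates the mechanics rather than a model to trust.

In [ ]:
# Sketch: same degree-5 polynomial fit expressed as a Pipeline
from sklearn.pipeline import make_pipeline
poly_model = make_pipeline(PolynomialFeatures(degree=5), LinearRegression())
poly_model.fit(X, y)

plt.scatter(X, y)
plt.plot(X, poly_model.predict(X), color='red')
plt.title('Degree-5 polynomial fit via Pipeline')
plt.show()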
In [ ]: