Importing Necessary Libraries¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy import stats
Loading Dataset¶
In [ ]:
data = pd.read_csv('https://www.statlearning.com/s/Advertising.csv', index_col=0)
data.head()
Out[ ]:
|   | TV | radio | newspaper | sales |
|---|---|---|---|---|
| 1 | 230.1 | 37.8 | 69.2 | 22.1 |
| 2 | 44.5 | 39.3 | 45.1 | 10.4 |
| 3 | 17.2 | 45.9 | 69.3 | 9.3 |
| 4 | 151.5 | 41.3 | 58.5 | 18.5 |
| 5 | 180.8 | 10.8 | 58.4 | 12.9 |
Define predictors and target¶
In [ ]:
X = data[['TV', 'radio', 'newspaper']]
# X = data[['TV', 'radio']]
y = data['sales']
Statistical Modeling Using Statsmodels¶
In [23]:
# Add the intercept (constant) term
X_const = sm.add_constant(X)
# Fitting the model
ols_model = sm.OLS(y, X_const).fit()
print(ols_model.summary())
# Making Predictions on Data
y_pred = ols_model.predict(X_const)
                            OLS Regression Results
==============================================================================
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Sat, 26 Apr 2025   Prob (F-statistic):           1.58e-96
Time:                        15:12:59   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.000       2.324       3.554
TV             0.0458      0.001     32.809      0.000       0.043       0.049
radio          0.1885      0.009     21.893      0.000       0.172       0.206
newspaper     -0.0010      0.006     -0.177      0.860      -0.013       0.011
==============================================================================
Omnibus:                       60.414   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              151.241
Skew:                          -1.327   Prob(JB):                     1.44e-33
Kurtosis:                       6.332   Cond. No.                         454.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
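The printed summary can also be queried programmatically. A minimal sketch using standard attributes of the fitted statsmodels results object (`params`, `pvalues`, `conf_int`, `rsquared`):

In [ ]:
# Access fitted quantities directly instead of reading them off the printed table
print(ols_model.params)       # intercept and slope estimates
print(ols_model.pvalues)      # per-coefficient p-values
print(ols_model.conf_int())   # 95% confidence intervals
print('R-squared:', ols_model.rsquared, '| Adj. R-squared:', ols_model.rsquared_adj)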
1. Residual Analysis¶
Calculating Residuals¶
In [25]:
residuals = y - y_pred
Residual vs Fitted Graph¶
In [27]:
plt.figure(figsize=(8,6))
sns.residplot(x=y_pred, y=residuals, lowess=True, line_kws={'color': 'red'})
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted')
plt.show()
Q-Q Plot¶
In [ ]:
sm.qqplot(residuals, line='s')  # 's' fits the reference line to the residuals' scale (raw residuals are not standardized)
plt.title('Q-Q Plot')
plt.show()
Histogram of residuals¶
In [28]:
plt.figure(figsize=(8,6))
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.show()
2. Multicollinearity Check¶
In [31]:
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data, "\n\nAll VIF values are below 5, so multicollinearity is not a concern")
     feature       VIF
0         TV  2.486772
1      radio  3.285462
2  newspaper  3.055245

All VIF values are below 5, so multicollinearity is not a concern
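Note that the VIFs above are computed on `X` without an intercept column. A common alternative convention (shown here as an illustrative sketch, not a correction of the results above) is to include the constant in the design matrix and report VIFs for the predictors only:

In [ ]:
# Alternative: compute VIFs on the design matrix that includes the intercept,
# then drop the constant's own VIF (only the predictors are of interest)
X_vif = sm.add_constant(X)
vif_alt = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
).drop('const')
print(vif_alt)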
3. Influential Observations¶
In [32]:
influence = ols_model.get_influence()
(c, p) = influence.cooks_distance
plt.figure(figsize=(8,6))
plt.stem(np.arange(len(c)), c, markerfmt=",")
plt.title("Cook's Distance")
plt.xlabel("Observation Index")
plt.ylabel("Cook's Distance")
plt.show()
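A common rule of thumb (one convention among several) flags observations whose Cook's distance exceeds 4/n. A minimal sketch using the `c` array computed above:

In [ ]:
# Flag observations whose Cook's distance exceeds the 4/n rule of thumb
threshold = 4 / len(c)
influential_idx = np.where(c > threshold)[0]
print(f"Threshold (4/n): {threshold:.4f}")
print("Potentially influential observations (0-based index):", influential_idx)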
4. Homoscedasticity Check¶
In [38]:
# Breusch-Pagan test
bp_test = het_breuschpagan(residuals, X_const)
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print(dict(zip(labels, bp_test)))
print("\nP Value should be Greater than 0.05 for Homoscedasticity")
{'Lagrange multiplier statistic': 5.132872353285567, 'p-value': 0.16232215845412676, 'f-value': 1.7209042102915795, 'f p-value': 0.16399908905607216}

A p-value greater than 0.05 indicates homoscedasticity (constant error variance)
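As a cross-check, White's test picks up more general (including nonlinear) forms of heteroscedasticity. A minimal sketch using `het_white` from statsmodels on the same residuals and design matrix:

In [ ]:
from statsmodels.stats.diagnostic import het_white

# White's test: sensitive to more general heteroscedasticity patterns than Breusch-Pagan
white_test = het_white(residuals, X_const)
print(dict(zip(labels, white_test)))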
5. Normality Check¶
In [ ]:
shapiro_test = stats.shapiro(residuals)
print('Shapiro-Wilk Test statistic:', shapiro_test[0], 'p-value:', shapiro_test[1])
Shapiro-Wilk Test statistic: 0.9176652036187539 p-value: 3.938571556266683e-09
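The Jarque-Bera statistic already reported in the OLS summary points the same way; a minimal sketch computing it directly on the residuals with scipy's `stats.jarque_bera`:

In [ ]:
# Jarque-Bera: tests normality via the skewness and kurtosis of the residuals
jb_stat, jb_pvalue = stats.jarque_bera(residuals)
print('Jarque-Bera statistic:', jb_stat, 'p-value:', jb_pvalue)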
6. Autocorrelation Check¶
In [34]:
dw_stat = durbin_watson(residuals)
print('Durbin-Watson statistic:', dw_stat)
Durbin-Watson statistic: 2.083648405294407
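The Durbin-Watson statistic mainly targets lag-1 autocorrelation. A Ljung-Box test over several lags gives a broader check; a minimal sketch (the lag choice of 10 is arbitrary):

In [ ]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Ljung-Box test for residual autocorrelation up to lag 10
lb = acorr_ljungbox(residuals, lags=[10], return_df=True)
print(lb)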
7. Goodness of Fit¶
In [35]:
print('R^2:', r2_score(y, y_pred))
print('Adjusted R^2:', 1 - (1 - r2_score(y, y_pred)) * (len(y) - 1) / (len(y) - X.shape[1] - 1))
print('RMSE:', np.sqrt(mean_squared_error(y, y_pred)))
R^2: 0.8972106381789522
Adjusted R^2: 0.8956373316204668
RMSE: 1.6685701407225697
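The metrics above are in-sample. The `LinearRegression` imported earlier can be used for a quick out-of-sample check with a hold-out split; a minimal sketch (the 80/20 split and random seed are arbitrary choices):

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data to estimate out-of-sample error
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
print('Test R^2:', r2_score(y_test, y_test_pred))
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, y_test_pred)))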
Summary¶
In [36]:
print("\n--- Summary of Diagnostics ---")
print(f"Residuals are {'normally distributed' if shapiro_test[1]>0.05 else 'not normally distributed'}.")
print(f"Homoscedasticity {'present' if bp_test[1]>0.05 else 'violated'} (Breusch-Pagan p-value: {bp_test[1]:.4f}).")
print(f"Durbin-Watson statistic close to 2: {dw_stat:.4f} (No autocorrelation if close to 2).")
print(f"VIF values (should be < 5 ideally):\n{vif_data}")
--- Summary of Diagnostics ---
Residuals are not normally distributed.
Homoscedasticity present (Breusch-Pagan p-value: 0.1623).
Durbin-Watson statistic close to 2: 2.0836 (No autocorrelation if close to 2).
VIF values (should be < 5 ideally):
     feature       VIF
0         TV  2.486772
1      radio  3.285462
2  newspaper  3.055245