Exploring the Python library statsmodels

Published

July 4, 2023

plot_fit

import matplotlib.pyplot as plt
import numpy as np

import statsmodels.api as sm

# Load the state crime data and build the design matrix:
# poverty and hs_grad as regressors plus an explicit intercept column.
crime = sm.datasets.statecrime.load_pandas().data
X = crime[['poverty', 'hs_grad']].copy()
X['constant'] = 1

murder = crime['murder']
y = murder
results = sm.OLS(y, X).fit()

# Create a plot just for the variable 'Poverty'
# (index 0 = first column of the design matrix):
fig, ax = plt.subplots()
sm.graphics.plot_fit(results, 0, ax=ax)
ax.set_ylabel("Murder Rate")
ax.set_xlabel("Poverty Level")
ax.set_title("Linear Regression")

plt.show()

plot_regress_exog

import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fit murder rate on four state-level covariates, then show the four
# standard regressor diagnostics for 'poverty' on a single figure.
crime_data = sm.datasets.statecrime.load_pandas()
results = smf.ols(
    'murder ~ hs_grad + urban + poverty + single',
    data=crime_data.data,
).fit()

fig = plt.figure(figsize=(8, 6))
sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
plt.show()

plot_partregress

import matplotlib.pyplot as plt

import statsmodels.api as sm

# Partial-regression plot of 'murder' against 'hs_grad', controlling for
# the remaining covariates.  The imports are included here so the snippet
# is self-contained: previously it relied on ``sm`` and ``plt`` having
# been imported by an earlier example.
crime_data = sm.datasets.statecrime.load_pandas()
sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
                             exog_others=['urban', 'poverty', 'single'],
                             data=crime_data.data, obs_labels=False)
plt.show()

plot_partregress_grid

import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import plot_partregress_grid

# Grid of partial-regression plots, one panel per regressor.
crime_data = sm.datasets.statecrime.load_pandas()
results = smf.ols(
    'murder ~ hs_grad + urban + poverty + single',
    data=crime_data.data,
).fit()

fig = plt.figure(figsize=(8, 6))
plot_partregress_grid(results, fig=fig)
plt.show()

import statsmodels.api as sm
import pandas

# Guerry dataset (HistData): regress lottery wagers on literacy, wealth,
# and region, with region dummy-coded as a categorical via C().
df = sm.datasets.get_rdataset("Guerry", "HistData").data

fm = sm.formula.ols('Lottery ~ Literacy + Wealth + C(Region)', data=df).fit()
print(fm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Lottery   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     6.636
Date:                Wed, 09 Aug 2023   Prob (F-statistic):           1.07e-05
Time:                        13:39:12   Log-Likelihood:                -375.30
No. Observations:                  85   AIC:                             764.6
Df Residuals:                      78   BIC:                             781.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         38.6517      9.456      4.087      0.000      19.826      57.478
C(Region)[T.E]   -15.4278      9.727     -1.586      0.117     -34.793       3.938
C(Region)[T.N]   -10.0170      9.260     -1.082      0.283     -28.453       8.419
C(Region)[T.S]    -4.5483      7.279     -0.625      0.534     -19.039       9.943
C(Region)[T.W]   -10.0913      7.196     -1.402      0.165     -24.418       4.235
Literacy          -0.1858      0.210     -0.886      0.378      -0.603       0.232
Wealth             0.4515      0.103      4.390      0.000       0.247       0.656
==============================================================================
Omnibus:                        3.049   Durbin-Watson:                   1.785
Prob(Omnibus):                  0.218   Jarque-Bera (JB):                2.694
Skew:                          -0.340   Prob(JB):                        0.260
Kurtosis:                       2.454   Cond. No.                         371.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Influence plot (Cook's distance) for the Guerry OLS fit above.
fig = sm.graphics.influence_plot(fm, criterion="cooks")
# Use plt.show() as in the other examples: Figure.show() only works with
# an interactive GUI backend and does not block the script.
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

# Load the example diamonds dataset
diamonds = sns.load_dataset("diamonds")

# Scatter plot of price vs. carat: clarity drives the point color
# (ordered worst -> best) and depth drives the point size.
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(f, left=True, bottom=True)
clarity_ranking = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
sns.scatterplot(
    data=diamonds,
    x="carat",
    y="price",
    hue="clarity",
    hue_order=clarity_ranking,
    size="depth",
    sizes=(1, 8),
    palette="ch:r=-.2,d=.3_r",
    linewidth=0,
    ax=ax,
)