Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

This notebook illustrates how to do a simple linear regression using scikit-learn.

import numpy as np
import pandas as pd
import hvplot.pandas
from sklearn.linear_model import LinearRegression

Load the Iris dataset and convert the response variable to numeric (so that a linear regression can be performed).

# Integer code assigned to each species label so it can act as a numeric response.
species_code = {"Iris-setosa": 1, "Iris-versicolor": 2, "Iris-virginica": 3}


def species_to_code(label):
    """Return the integer code for a species label (KeyError if unknown)."""
    return species_code[label]


iris = pd.read_csv("../../home/media/data/iris.csv")
iris["y"] = iris["species"].apply(species_to_code)
iris.head()
Loading...
# The four measurement columns are the predictors; the numeric species code is the response.
feature_columns = [
    "sepal_length_cm",
    "sepal_width_cm",
    "petal_length_cm",
    "petal_width_cm",
]
x = iris[feature_columns]
y = iris["y"]

# Fit the linear model on the full data set, then show one coefficient per feature column.
reg1 = LinearRegression()
reg1.fit(x, y)
reg1.coef_
array([-0.10974146, -0.04424045, 0.22700138, 0.60989412])
# Constant (intercept) term of the fitted linear model.
reg1.intercept_
1.1920839948281388
# Rank of the feature matrix as reported by the fitted estimator; x has four feature columns.
reg1.rank_
4
# In-sample R^2: the model is scored on the same rows it was fitted on.
reg1.score(x, y)
0.9304223675331597
# In-sample predictions; these are continuous values, not the integer species codes.
reg1.predict(x)
array([0.91734173, 0.96141024, 0.95181031, 1.01260878, 0.92389183, 1.05680235, 1.03762592, 0.95544006, 1.02070502, 0.91869693, 0.89827134, 1.00008849, 0.91139498, 0.89816529, 0.7730022 , 0.95635941, 0.9660018 , 0.97833114, 0.96731454, 0.98775914, 0.95694375, 1.0531726 , 0.87698786, 1.17725847, 1.0681889 , 0.99583637, 1.10011902, 0.92906772, 0.91079163, 1.01991072, 1.01336062, 1.0335223 , 0.84153404, 0.84247683, 0.91869693, 0.89618773, 0.850745 , 0.91869693, 0.99358084, 0.94446591, 0.96660515, 1.07456442, 0.98473275, 1.2176738 , 1.13954911, 1.0333738 , 0.94946987, 0.98548459, 0.90924548, 0.93716396, 2.20308259, 2.2845166 , 2.32487047, 2.1876208 , 2.31393877, 2.25705298, 2.39745639, 1.90717243, 2.17656176, 2.24113634, 1.95929474, 2.28013501, 1.95420588, 2.31512204, 2.05930184, 2.17232866, 2.38115786, 1.97673409, 2.35070534, 2.02311961, 2.59045598, 2.0996557 , 2.41725961, 2.19756726, 2.13040963, 2.18772685, 2.2654272 , 2.49592176, 2.34168532, 1.85593145, 2.01581766, 1.93212811, 2.05331264, 2.54772365, 2.40310615, 2.38055451, 2.30141848, 2.19062819, 2.16837848, 2.17877271, 2.20415981, 2.28799785, 2.08043682, 1.90062233, 2.20435076, 2.11911506, 2.18452852, 2.15235793, 1.87368909, 2.16625243, 3.24146289, 2.75264018, 2.90028407, 2.74143264, 3.00441822, 3.00431431, 2.60207593, 2.79059214, 2.76063251, 3.15212358, 2.71469034, 2.73219558, 2.84240596, 2.81075169, 3.05316319, 2.954033 , 2.69236016, 3.04163735, 3.20111558, 2.48615432, 2.98996282, 2.78575356, 2.96389898, 2.59137976, 2.88550825, 2.72019374, 2.57522972, 2.60005592, 2.91785077, 2.56166273, 2.79963117, 2.82960982, 2.97884018, 2.44938775, 2.53269542, 3.00181829, 3.08524888, 2.69891026, 2.58832992, 2.80430763, 3.05462443, 2.85818604, 2.75264018, 3.04633725, 3.12946589, 2.90725851, 2.6839174 , 2.74623857, 2.98983334, 2.66740449])
# R^2 computed by hand as 1 - SS_res / SS_tot; it agrees with reg1.score(x, y) up to floating-point rounding.
1 - sum((reg1.predict(x) - y) ** 2) / sum((y - np.mean(y)) ** 2)
0.9304223675331595
# Same 1 - SS_res / SS_tot statistic after rounding each prediction to an integer code with np.round.
1 - sum((np.round(reg1.predict(x)) - y) ** 2) / sum((y - np.mean(y)) ** 2)
0.96

Train - Test

from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; random_state=0 pins the split so it is repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=0)
x_train, x_test, y_train, y_test = split_parts
x_train
Loading...
# Training responses selected by the split above.
y_train
85 2 30 1 101 3 94 2 64 2 .. 9 1 103 3 67 2 117 3 47 1 Name: y, Length: 90, dtype: int64
# Fit on the training portion only, then report R^2 on those same training rows.
reg = LinearRegression()
reg.fit(x_train, y_train)
reg.score(x_train, y_train)
0.9518623736306308
# R^2 on the held-out test rows, which were not used to fit reg.
reg.score(x_test, y_test)
0.8886093167492697

Cross Validation

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model; each entry of `scores` is the score on one held-out fold.
# NOTE(review): per the scikit-learn docs, an integer `cv` with a non-classifier estimator uses
# KFold without shuffling, so the folds are contiguous row ranges. The rows appear to be sorted
# by species (the in-sample predictions cluster near 1, then 2, then 3), so some folds would
# contain a single species; R^2 is degenerate when the held-out response is constant, which
# likely explains the 0.0 scores. Consider cv=KFold(n_splits=5, shuffle=True, random_state=0)
# and confirm before relying on these numbers.
lreg = LinearRegression()
scores = cross_val_score(lreg, x, y, cv=5)
scores
array([0. , 0.85215955, 0. , 0.76225759, 0. ])
# Average of the per-fold scores; NOTE(review): any 0.0 folds pull this mean down, so check the individual scores first.
scores.mean()
0.3228834275997373