Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!
This notebook illustrate how to do a simple linear regression using sk-learn.
import numpy as np
import pandas as pd
import hvplot.pandas
from sklearn.linear_model import LinearRegressionLoad the Iris dataset and convert the response variable to numeric (so that a linear regression can be performed).
iris = pd.read_csv("../../home/media/data/iris.csv")
iris["y"] = iris.species.apply(
lambda spe: {"Iris-virginica": 3, "Iris-setosa": 1, "Iris-versicolor": 2}[spe]
)
iris.head()Loading...
x = iris[["sepal_length_cm", "sepal_width_cm", "petal_length_cm", "petal_width_cm"]]
y = iris.yreg1 = LinearRegression().fit(x, y)reg1.coef_array([-0.10974146, -0.04424045, 0.22700138, 0.60989412])reg1.intercept_1.1920839948281388reg1.rank_4reg1.score(x, y)0.9304223675331597reg1.predict(x)array([0.91734173, 0.96141024, 0.95181031, 1.01260878, 0.92389183,
1.05680235, 1.03762592, 0.95544006, 1.02070502, 0.91869693,
0.89827134, 1.00008849, 0.91139498, 0.89816529, 0.7730022 ,
0.95635941, 0.9660018 , 0.97833114, 0.96731454, 0.98775914,
0.95694375, 1.0531726 , 0.87698786, 1.17725847, 1.0681889 ,
0.99583637, 1.10011902, 0.92906772, 0.91079163, 1.01991072,
1.01336062, 1.0335223 , 0.84153404, 0.84247683, 0.91869693,
0.89618773, 0.850745 , 0.91869693, 0.99358084, 0.94446591,
0.96660515, 1.07456442, 0.98473275, 1.2176738 , 1.13954911,
1.0333738 , 0.94946987, 0.98548459, 0.90924548, 0.93716396,
2.20308259, 2.2845166 , 2.32487047, 2.1876208 , 2.31393877,
2.25705298, 2.39745639, 1.90717243, 2.17656176, 2.24113634,
1.95929474, 2.28013501, 1.95420588, 2.31512204, 2.05930184,
2.17232866, 2.38115786, 1.97673409, 2.35070534, 2.02311961,
2.59045598, 2.0996557 , 2.41725961, 2.19756726, 2.13040963,
2.18772685, 2.2654272 , 2.49592176, 2.34168532, 1.85593145,
2.01581766, 1.93212811, 2.05331264, 2.54772365, 2.40310615,
2.38055451, 2.30141848, 2.19062819, 2.16837848, 2.17877271,
2.20415981, 2.28799785, 2.08043682, 1.90062233, 2.20435076,
2.11911506, 2.18452852, 2.15235793, 1.87368909, 2.16625243,
3.24146289, 2.75264018, 2.90028407, 2.74143264, 3.00441822,
3.00431431, 2.60207593, 2.79059214, 2.76063251, 3.15212358,
2.71469034, 2.73219558, 2.84240596, 2.81075169, 3.05316319,
2.954033 , 2.69236016, 3.04163735, 3.20111558, 2.48615432,
2.98996282, 2.78575356, 2.96389898, 2.59137976, 2.88550825,
2.72019374, 2.57522972, 2.60005592, 2.91785077, 2.56166273,
2.79963117, 2.82960982, 2.97884018, 2.44938775, 2.53269542,
3.00181829, 3.08524888, 2.69891026, 2.58832992, 2.80430763,
3.05462443, 2.85818604, 2.75264018, 3.04633725, 3.12946589,
2.90725851, 2.6839174 , 2.74623857, 2.98983334, 2.66740449])1 - sum((reg1.predict(x) - y) ** 2) / sum((y - np.mean(y)) ** 2)0.93042236753315951 - sum((np.round(reg1.predict(x)) - y) ** 2) / sum((y - np.mean(y)) ** 2)0.96Train - Test¶
from sklearn.model_selection import train_test_splitx_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)x_trainLoading...
y_train85 2
30 1
101 3
94 2
64 2
..
9 1
103 3
67 2
117 3
47 1
Name: y, Length: 90, dtype: int64reg = LinearRegression().fit(x_train, y_train)reg.score(x_train, y_train)0.9518623736306308reg.score(x_test, y_test)0.8886093167492697Cross Validation¶
from sklearn.model_selection import cross_val_score
lreg = LinearRegression()
scores = cross_val_score(lreg, x, y, cv=5)scoresarray([0. , 0.85215955, 0. , 0.76225759, 0. ])scores.mean()0.3228834275997373