# ©2023 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

from Utils import epsilon_bounded

# --- Diabetes data: fit a linear regression and bound its test MAE ---------

# Load the dataset once; the Bunch exposes the description, the feature
# matrix and the target vector.
dataset = load_diabetes()
print(dataset.DESCR)

X, Y = dataset.data, dataset.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

lr = LinearRegression()
lr.fit(X_train, Y_train)

# Predict once and reuse the result for the plots and the error estimate.
Y_pred = lr.predict(X_test)

# Start a fresh figure so this plot never lands on axes created elsewhere.
plt.figure()
plt.scatter(Y_pred, Y_test)   # predictions against the observed targets
plt.scatter(Y_pred, Y_pred)   # predictions against themselves: the diagonal

# Mean absolute error on the held-out test set.
MAE = np.mean(np.abs(Y_test - Y_pred))
print("MAE:", MAE)

# Smallest and largest target value; presumably taken from the range listed
# in the dataset description printed above -- confirm if the data changes.
b = 346
a = 25
span = b - a
print("span:", span)

# Half-width of the interval around the MAE from the project helper.
# NOTE(review): the arguments look like (sample size, bound on the absolute
# error, significance level); twice the target span is passed as the bound,
# presumably to allow for predictions outside [a, b] -- confirm against
# Utils.epsilon_bounded.
epsilon = epsilon_bounded(len(Y_test), span * 2, 0.05)
print("epsilon:", epsilon)
print("MAE interval:", [MAE - epsilon, MAE + epsilon])
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

# --- Wine quality data: fit a linear regression and bound its test MAE -----

# NOTE: these absolute paths are specific to the original author's machine;
# point them at local copies of the two wine-quality CSV files before running.
df_red = pd.read_csv('/Users/avelin/Downloads/winequality-red.csv', sep=';')
df_white = pd.read_csv('/Users/avelin/Downloads/winequality-white.csv', sep=';')

# Tag each row with the wine colour (1 = red, 0 = white) so it can be used
# as an extra feature once the two tables are stacked.
df_red['type'] = 1
print(df_red.head(5))
df_white['type'] = 0

# Every column except the target is a feature (this includes 'type').
target = 'quality'
feature_cols = [col for col in df_red.columns if col != target]
print(feature_cols)

X1 = df_red[feature_cols].to_numpy()
X2 = df_white[feature_cols].to_numpy()
Y1 = df_red[target].to_numpy()
Y2 = df_white[target].to_numpy()

# Stack red on top of white, keeping features and targets row-aligned.
X = np.concatenate([X1, X2], axis=0)
Y = np.concatenate([Y1, Y2], axis=0)
print(X.shape, Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

lr = LinearRegression()
lr.fit(X_train, Y_train)

# Predict once and reuse the result for the plots and the error estimates.
Y_pred = lr.predict(X_test)

# Start a fresh figure so this plot does not overlay any earlier one.
plt.figure()
plt.scatter(Y_pred, Y_test)   # predictions against the observed targets
plt.scatter(Y_pred, Y_pred)   # predictions against themselves: the diagonal

# Largest absolute error seen on the test set.
abs_errors = np.abs(Y_test - Y_pred)
print("max absolute error:", np.max(abs_errors))

# Mean absolute error on the held-out test set.
MAE = np.mean(abs_errors)
print("MAE:", MAE)
print("test set size:", len(Y_test))

# Half-width of the interval around the MAE from the project helper.
# NOTE(review): 5 is passed where the diabetes analysis passes its error
# bound; presumably it is an assumed bound on the absolute error, to be
# compared with the maximum printed above -- confirm against
# Utils.epsilon_bounded.
epsilon = epsilon_bounded(len(Y_test), 5, 0.05)
print("epsilon:", epsilon)
print("MAE interval:", [MAE - epsilon, MAE + epsilon])