Introduction to Data Science: A Comp-Math-Stat Approach

1MS041, 2021

©2021 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

15. Supervised learning continued

Topics

We have seen the probabilistic viewpoint of machine learning, going from the likelihood to the so-called loss function. Finding the MLE amounts to finding the minimum of the loss function.

Traditionally, however, machine learning started in principle with a single algorithm, the perceptron algorithm. The ideas come from computer science, and as such the focus and terminology are different, but let us stick to the terminology used in the book "Foundations of Data Science", Chapter 5.

Let us say that we are trying to devise a decision rule based on input data in $\mathbb{R}^d$; the decision could be binary or otherwise. The input could for instance encode the words used in an email, where we have some form of dictionary and each word is represented by a dimension. The simplest form of decision problem is a binary decision, as in the case of logistic regression (where the decision could be the most likely output). A commonly chosen example is email spam classification.

Goal: find a "simple" rule that performs well on training data

The perceptron algorithm

The perceptron algorithm tries to find a linear separator, i.e. a hyperplane in $\mathbb{R}^d$ that separates the two classes. The training data $S$ consists of pairs $(x_i,l_i)$, where $x_i$ represents our features and $l_i \in \{-1,+1\}$ our labels (or targets). The task is thus to find $w$ and $t$ such that

$$ \begin{aligned} w \cdot x_i > t \quad \text{for each $x_i$ labeled $+1$} \\ w \cdot x_i < t \quad \text{for each $x_i$ labeled $-1$} \end{aligned} $$

Adding a new coordinate to our space allows us to consider $\hat x_i = (x_i,1)$ and $\hat w = (w,t)$, which lets us rewrite the inequalities above as

$$ (\hat w \cdot \hat x_i) l_i > 0. $$

The algorithm

  1. $w = 0$
  2. while there exists $x_i$ with $(w \cdot x_i) l_i \leq 0$, update $w := w+x_il_i$
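
A minimal sketch of the algorithm in Python (the function name and the cap on the number of updates are our own choices, not from the book):

```python
import numpy as np

def perceptron(X, l, max_updates=10_000):
    """Minimal perceptron: X is (n, d), l is (n,) with labels in {-1, +1}.
    Appends the constant coordinate 1 so that w encodes (w, t)."""
    Xh = np.hstack([X, np.ones((X.shape[0], 1))])  # hat{x}_i = (x_i, 1)
    w = np.zeros(Xh.shape[1])                      # start with w = 0
    for _ in range(max_updates):
        margins = (Xh @ w) * l                     # (hat w . hat x_i) l_i
        bad = np.where(margins <= 0)[0]            # misclassified points
        if bad.size == 0:
            return w                               # separator found
        i = bad[0]
        w = w + Xh[i] * l[i]                       # update w := w + x_i l_i
    raise RuntimeError("no separator found within max_updates")
```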

Theorem: Perceptron for linearly separable data

If there exists $w^\ast$ such that $w^\ast \cdot x_i l_i \geq 1$ for all $i$, then the perceptron algorithm finds a $w$ satisfying $w \cdot x_i l_i > 0$ for all $i$ in at most $r^2|w^\ast|^2$ updates, where $r = \max_i |x_i|$.

So this theorem guarantees that if the two classes can be linearly separated, then the perceptron will find a separator in finite time.

Kernels

What about non-linearly separable data? Take for instance $$ X = (B_4 \setminus B_3) \cup B_1 $$ where $B_r$ denotes the ball of radius $r$ around the origin in $\mathbb{R}^2$, and let $c^\ast = B_1$. We cannot separate these sets using a linear classifier.

We can, however, separate the image of $X$ under a suitable mapping. Namely, in $\mathbb R^2$ we can use $$ \phi(x) = (x_1,x_2,x_1^2+x_2^2) \in \mathbb R^3. $$ The image is clearly linearly separable, as we can see in the following 3d plot.
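
The plot itself is not reproduced here, but a sketch that generates comparable data and the 3d view might look as follows (the exact sampling scheme is an assumption):

```python
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)

def sample_annulus(n, r_in, r_out):
    """Points with uniform angle and radius in [r_in, r_out]."""
    theta = rng.uniform(0, 2 * np.pi, n)
    r = rng.uniform(r_in, r_out, n)
    return np.c_[r * np.cos(theta), r * np.sin(theta)]

inner = sample_annulus(200, 0.0, 1.0)   # B_1, labeled +1
outer = sample_annulus(200, 3.0, 4.0)   # B_4 minus B_3, labeled -1

def phi(X):
    """phi(x) = (x_1, x_2, x_1^2 + x_2^2)."""
    return np.c_[X, (X ** 2).sum(axis=1)]

fig = plt.figure()
ax = fig.add_subplot(projection="3d")
ax.scatter(*phi(inner).T, label="B_1")
ax.scatter(*phi(outer).T, label="B_4 minus B_3")
ax.legend()
plt.show()
```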

Remember the extra dimension that we always add to simplify notation. The full $\phi$ in the above example is therefore $\hat \phi(x) = (x_1,x_2,x_1^2+x_2^2,1)$.

So if we transform $x \to \phi(x)$ for some good transformation $\phi$, then our perceptron will try to solve $$ w \cdot \phi(x_i)l_i > 0. $$ Furthermore, remember how we constructed $w$ in the perceptron algorithm: starting from $w=0$ and adding terms $x_i l_i$, which here become $\phi(x_i)l_i$. The weight therefore has the form $$ w = \sum_{i=1}^n c_i \phi(x_i) $$ for numbers $c_i$, and the perceptron algorithm becomes just the addition and subtraction of 1 to certain $c_i$'s.

Furthermore $$ w \cdot \phi(x_i) = \sum_{j=1}^n c_j \phi(x_j) \cdot \phi(x_i) = \sum_{j=1}^n c_j k_{ij} $$ where $k_{ij} = \phi(x_i) \cdot \phi(x_j)$.
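
This means the perceptron can be run entirely in terms of the Gram matrix. A minimal sketch of such a kernelized perceptron (our own illustration, not the book's code):

```python
import numpy as np

def kernel_perceptron(K, l, max_updates=10_000):
    """Perceptron in the coefficients c: K is the (n, n) Gram matrix
    with K[i, j] = phi(x_i) . phi(x_j), l has labels in {-1, +1}."""
    n = K.shape[0]
    c = np.zeros(n)
    for _ in range(max_updates):
        margins = (K @ c) * l          # (w . phi(x_i)) l_i = sum_j c_j k_ij l_i
        bad = np.where(margins <= 0)[0]
        if bad.size == 0:
            return c
        i = bad[0]
        c[i] += l[i]                   # w := w + phi(x_i) l_i  <=>  c_i += l_i
    raise RuntimeError("no separator found within max_updates")
```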

Is it easy to find such a mapping $\phi$? No, it is actually quite difficult. Furthermore, if the mapping $\phi$ is high dimensional we might need to do a lot of computation, which is not efficient. If we instead had a function $k(x,y)$ that could be written as $$ k(x,y) = \phi(x) \cdot \phi(y) $$ for some $\phi$, with $k$ easier to compute, then our life would be simpler. Also, given a function $k(x,y)$, how would we know whether it is a "kernel function"?

Lemma

If the matrix $k_{ij}$ is symmetric and positive semidefinite, then there is a mapping $\phi$ such that $k_{ij} = \phi(x_i)\cdot\phi(x_j)$.

Proof

  1. $k = Q \Lambda Q^T$ (eigendecomposition)
  2. $k$ is positive semidefinite, so all eigenvalues are $\geq 0$, and we can define $B = Q \Lambda^{1/2}$.
  3. $k = B B^T$
  4. Define $\phi(x_i) = B_{i\cdot}$, i.e. the $i$:th row of $B$; then $k_{ij} = \phi(x_i)\cdot \phi(x_j)$.
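
To see the construction concretely, here is a minimal numerical check following the proof, using the linear kernel on three made-up points:

```python
import numpy as np

# A small Gram matrix from the linear kernel, k_ij = x_i . x_j.
X = np.array([[1.0, 0.0], [0.0, 2.0], [1.0, 1.0]])
K = X @ X.T

# Follow the proof: eigendecompose, clip tiny negative eigenvalues
# caused by round-off, and set B = Q Lambda^{1/2}.
lam, Q = np.linalg.eigh(K)
B = Q @ np.diag(np.sqrt(np.clip(lam, 0.0, None)))

# The rows of B are the feature vectors phi(x_i): B B^T recovers K.
assert np.allclose(B @ B.T, K)
```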

We now have a way to identify when a matrix $k$ is a kernel matrix. There are some standard choices of kernel functions one could try, which produce positive semidefinite matrices whenever all points $x_i$ are distinct (sketched in code after the list):

  1. $k(x,y) = e^{-\gamma |x-y|^2}$, called the Radial Basis Function (RBF) kernel
  2. $k(x,y) = (\gamma x \cdot y + r)^d$, polynomial
  3. $k(x,y) = x \cdot y$, linear
  4. $k(x,y) = \tanh(\gamma x \cdot y + r)$, sigmoidal (not positive semidefinite for all choices of $\gamma$ and $r$)
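
A sketch of the first three as functions on data matrices, with a quick numerical check of positive semidefiniteness (the parameter defaults are arbitrary):

```python
import numpy as np
from scipy.spatial.distance import cdist

def rbf(X, Y, gamma=1.0):
    """k(x, y) = exp(-gamma |x - y|^2)."""
    return np.exp(-gamma * cdist(X, Y, "sqeuclidean"))

def polynomial(X, Y, gamma=1.0, r=1.0, d=3):
    """k(x, y) = (gamma x . y + r)^d."""
    return (gamma * X @ Y.T + r) ** d

def linear(X, Y):
    """k(x, y) = x . y."""
    return X @ Y.T

# Positive semidefiniteness check on a Gram matrix of distinct points.
X = np.random.default_rng(1).normal(size=(20, 2))
K = rbf(X, X)
assert np.linalg.eigvalsh(K).min() > -1e-10
```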

Definition

We call a function $k(x,y)$ a kernel function if there is a mapping $\phi$ such that $k(x,y) = \phi(x) \cdot \phi(y)$.

Theorem (properties)

Suppose $k_1,k_2$ are kernel functions. Then

  1. For any constant $c \geq 0$, $c k_1$ is a kernel function.
  2. For any scalar function $f$, $k(x,y) = f(x)f(y)k_1(x,y)$ is a kernel function.
  3. $k_1 + k_2$ is a kernel function.
  4. $k_1k_2$ is a kernel function.

Let me cheat a bit

As we have noted, the perceptron converges in finite time only if the set is linearly separable; otherwise it never terminates. The way to handle non-separable data is to introduce a "cost function" that penalizes misclassifications; the goal is then to minimize the total cost. The perceptron becomes the Support Vector Machine in this case, with the so-called "hinge loss" $$ \max(0,1-w \cdot x_i l_i). $$ This means that if $w \cdot x_i l_i \geq 1$ (which is the requirement in the perceptron theorem) we have 0 cost, but if we are closer to the plane $w \cdot x = 0$, or on the wrong side of it, there is a cost growing linearly in the margin violation $1 - w \cdot x_i l_i$.
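
As an illustration, a bare-bones subgradient descent on the average hinge loss might look as follows (a minimal sketch with no regularization term, which a real SVM would include):

```python
import numpy as np

def hinge_loss(w, X, l):
    """Average hinge loss max(0, 1 - (w . x_i) l_i) over the sample."""
    return np.maximum(0.0, 1.0 - (X @ w) * l).mean()

def svm_subgradient(X, l, lr=0.1, epochs=100):
    """Plain subgradient descent on the average hinge loss."""
    w = np.zeros(X.shape[1])
    for _ in range(epochs):
        margins = (X @ w) * l
        active = margins < 1.0                    # points with nonzero loss
        grad = -(X[active] * l[active][:, None]).sum(axis=0) / len(l)
        w -= lr * grad
    return w
```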

Consider the problem of differentiating between the following handwritten digits, where $c^\ast$ is the set of digits greater than or equal to 5.

This is really interesting, right? With just a linear classifier we can distinguish digits less than 5 from digits greater than or equal to 5 at up to 88% accuracy...

But, as we discussed, different kernels might improve things. A famous one is the Radial Basis Function kernel from the list above. Let's try it.
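
The original cells and figures are not reproduced here; a sketch along the same lines, using scikit-learn's small 8x8 digits dataset as a stand-in (so the exact accuracies will differ from those quoted here), could be:

```python
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

digits = load_digits()
X, y = digits.data, (digits.target >= 5).astype(int)  # c* = digits >= 5
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Compare a linear separator with the RBF kernel.
for kernel in ("linear", "rbf"):
    clf = SVC(kernel=kernel).fit(X_train, y_train)
    print(kernel, clf.score(X_test, y_test))
```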

The Radial Basis Function Kernel

This is borderline crazy: how can we distinguish between these digits at 98% accuracy? I will leave that for you to think about. But the fact is that it does work well.

The underlying problem

Consider a probability space $(\Omega,P,F)$, where $\Omega$ is the sample space, $P$ is the probability measure, and $F$ is the $\sigma$-algebra of events. In the theory of classifiers we often use the idea that $\Omega$ is split into two parts: one part corresponding to class $1$ and one part corresponding to class $-1$ (or $0$). Let us call the subset of $\Omega$ that is labeled as class $1$ the target concept.

The target concept $c^\ast \subset \Omega$ is the set of outcomes which are labeled $1$; all others are labeled $-1$ (or zero).

Often the sample space $\Omega \subset \mathbb{R}^d$, and as such each choice of weights $\hat w$ produces a split of $\Omega$ into two sets: the part of $\Omega$ on one side of the plane and the part on the other side. When we work with kernel functions we can have a non-linear separating boundary. The point is that we can abstract our idea of a model as being a set.

A model in our view, i.e. a choice of weights for instance, is the same as a set in $\Omega$; we denote this set by $h \subset \Omega$.

If $x \in h$ we say that $h$ predicts $x$ as being in $c^\ast$, but it can of course be wrong. The error we are interested in is the misclassification rate, with two kinds of mistakes:

False Positive (FP): $x \in h$ but $x \not \in c^\ast$; False Negative (FN): $x \not \in h$ but $x \in c^\ast$.

The set of misclassified samples can be written as the symmetric difference of $h$ and $c^\ast$, i.e.

$$ h \Delta c^\ast = (h \setminus c^\ast) \cup (c^\ast \setminus h) $$

The true error rate that we are interested in is the probability of a misclassification, i.e.

$$ P(h \Delta c^\ast) = P(\text{"h classifying x wrongly, i.e. FP or FN"}) $$

This is what we want, but remember that we build $h$ on a training set $S$. Perhaps we can find a small training error, which would be

$$ err_S(h) = \frac{1}{n} \sum_{i=1}^n \mathbf{1}_{h \Delta c^\ast}(x_i), \quad S = \{x_1,\ldots,x_n\}. $$

Understanding model fit and generalization

So far we have only considered quite simple models; we did however touch upon polynomial regression back in 12.ipynb. The point to make here is that we can make the ansatz for the regression function $r(x) = E(Y \mid X=x)$ arbitrarily complicated, think for instance of polynomials of arbitrarily high order.

Let us highlight the problem with a simple example of polynomial regression using scikit-learn.

Let us assume that $r(x) = -x^2$ and that $Y \mid X \sim N(r(x),0.1^2)$.

Ok, so our goal is to recover the function $r(x)=-x^2$, but we do not know beforehand which power it is, so we have to make an ansatz. Perhaps our ansatz is that it is a linear, second, or third order polynomial. Let's try all three and see what happens.
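
The original cell is not shown; a sketch that reproduces the experiment might look like this (sample sizes, the train-test split, and the random seed are our own choices):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, size=(20, 1))
y = -x[:, 0] ** 2 + rng.normal(0, 0.1, size=20)   # Y | X ~ N(-x^2, 0.1^2)

x_train, y_train = x[:10], y[:10]                 # train on half the data
x_test, y_test = x[10:], y[10:]                   # hold out the rest

grid = np.linspace(-1, 1, 200).reshape(-1, 1)
plt.scatter(x_train, y_train, label="train")
plt.scatter(x_test, y_test, color="red", label="test")
for degree in (1, 2, 3):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(x_train, y_train)
    plt.plot(grid, model.predict(grid), label=f"degree {degree}")
plt.legend()
plt.show()
```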

Since our dataset is not very large (we trained on only 10 points), the estimation problem will be noisy. But what we can tell from the picture above is that the green curve does best on the red points, which are the test data. This is called the problem of generalization. Let us formalize it as follows.

Formalizing the problem

  1. Our training set $S$ consists of points drawn IID from a distribution $D$
  2. Our objective is to predict well on new points that are also drawn IID from $D$.

With this statement of the problem it becomes clear that if we can simulate the training-test split, train a model on the training data and test on the testing data, then we can get a measure of the generalization performance of the model itself on the distribution $D$.

The issue of overfitting and what to do about it?

To analyze overfitting we have to dig into the ideas of learning theory. As such we need some definitions.

A hypothesis class $\mathcal H$ over $\Omega$ is a collection of subsets of $\Omega$, called hypotheses or concepts.

Example

In the case of a linear classifier we have $$\mathcal{H} = \{\{x \in \Omega: w \cdot x \geq 0\}: w \in \mathbb R^d\}.$$ That is, the set of all half-spaces in $\Omega \subset \mathbb R^d$ (with boundary through the origin, thanks to the augmented coordinate).

Uniform convergence

The goal of our algorithm can now be stated as the problem of finding an $h \in \mathcal H$ such that the true error is small, i.e. $P(h \Delta c^\ast)$ is small.

The idea for us today is to start with a finite $\mathcal H$, why?

Three points in general position can be split in every possible way, so for 3 points a linear classifier realizes all $2^3$ labelings.

For 4 points we can no longer realize every possible labeling; for instance, a labeling of four points in convex position where the two diagonally opposite pairs get opposite labels cannot be produced. Therefore fewer than $2^4$ labelings are realizable. In fact there is a lemma, the Sauer–Shelah lemma, which states (for our linear classifiers in $\mathbb R^2$) that the number $|\mathcal H_n|$ of realizable labelings of $n$ points satisfies $$ |\mathcal H_n| \leq n^3 + 1 $$

So, with this motivation, let us continue with a finite $\mathcal H$.

We start with Theorem 5.4 from the book, rephrased in our language.

Theorem [Finite hypothesis class]

Let $(\Omega,P,F)$ be a probability space, and let $\mathcal{H}$ be a finite hypothesis class. Fix $\epsilon > 0$ and let $S = \{X_1,\ldots,X_n\}$ be IID sampled from $P$. Define $\mathcal{H}_\epsilon = \{h \in \mathcal{H} : err(h) \geq \epsilon\}$. Then $$P(\{\exists h \in \mathcal{H}_\epsilon : err_S(h) = 0\}) \leq |\mathcal{H}|(1-\epsilon)^n$$ or equivalently $$P(\{err_S(h) > 0, \forall h \in \mathcal{H}_\epsilon\}) \geq 1-|\mathcal{H}|(1-\epsilon)^n.$$ If $$n \geq \frac{1}{\epsilon}(\ln |\mathcal H| + \ln(1/\delta)) $$ then $$P(\{err_S(h) > 0, \forall h \in \mathcal{H}_\epsilon\}) \geq 1-\delta.$$

In words: for a hypothesis $h$ with large true error, if the training set is large enough, there is a high chance that the training set will contain some misclassified items.

Proof

The idea is as follows. If we know that $err(h) = P(h \Delta c^\ast) \geq \epsilon$, then define the random variable $Y(X) = \mathbf{1}_{h \Delta c^\ast}(X)$ for $X \sim P$. This r.v. $Y$ is either 1 or 0, and we know that $$ P(Y = 1) = P(X \in h\Delta c^\ast) = P(h \Delta c^\ast) \geq \epsilon, $$ thus $Y \sim Bernoulli(\theta)$ where $\theta \geq \epsilon$. We can do the same for the entire sample $S$ and define a $Y_i$ for each $X_i$; since the $X_i$ are IID, the $Y_i$ are IID, and thus $$ P(err_S(h) = 0) = P(\{Y_i = 0: \forall i = 1,\ldots,n\}) = P(Y_1 = 0)^n \leq (1-\epsilon)^n. $$ We can now do the following union bound: $$ P(\cup_{h \in \mathcal{H}_\epsilon} \{err_S(h) = 0\}) \leq \sum_{h \in \mathcal{H}_\epsilon}P(err_S(h) = 0) \leq |\mathcal{H}_\epsilon|(1-\epsilon)^n \leq |\mathcal{H}|(1-\epsilon)^n. $$ This proves the first statement; the second follows by taking the complement of the first. The last statement follows from plugging in the bound for $n$ and doing some simple estimation (see the book).
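
As a sanity check of the single-hypothesis step $P(err_S(h) = 0) \leq (1-\epsilon)^n$, here is a quick simulation (the parameter values are arbitrary, and we take $\theta = \epsilon$ exactly):

```python
import numpy as np

rng = np.random.default_rng(0)
eps, n, trials = 0.1, 50, 100_000

# Y[t, i] = 1 when h misclassifies X_i in trial t; theta = err(h) = eps.
Y = rng.random((trials, n)) < eps
frac_zero_training_error = (Y.sum(axis=1) == 0).mean()
print(frac_zero_training_error, (1 - eps) ** n)   # simulated vs bound
```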

Suppose we had a way of finding, for each $S$, an $h_S$ such that $err_S(h_S) = 0$. Think of the perceptron, for instance.

Then we can say that $$P(err(h_S) > \epsilon) \leq P(\{\exists h \in \mathcal{H}_\epsilon : err_S(h) = 0\}).$$ Note that we have inequality because the event $\{err(h_S) > \epsilon\} \subset \{\exists h \in \mathcal{H}_\epsilon : err_S(h) = 0\}$. The conclusion is the following.

Theorem [PAC Learnability of Empirical Risk Minimization]

Let $(\Omega,P,F)$ be a probability space, and let $\mathcal{H}$ be a finite hypothesis class. Fix $\epsilon > 0$ and let $S = \{X_1,\ldots,X_n\}$ be IID sampled from $P$. Further assume that we have a way to construct $h_S \in \mathcal{H}$ such that $err_S(h_S) = 0$. Then $$P(err(h_S) > \epsilon) \leq |\mathcal{H}|(1-\epsilon)^n.$$ Again, if $$n \geq \frac{1}{\epsilon}(\ln |\mathcal H| + \ln(1/\delta)) $$ then $$P(err(h_S) \leq \epsilon) \geq 1-\delta.$$

That is, with high probability the method which minimizes the training error has good true error. This of course assumes that there is an $h \in \mathcal{H}$ such that $err(h) = 0$, which is not always the case; take the example of a curved decision boundary when our hypothesis space consists of linear boundaries. This can happen if our kernel choice does not match the true $c^\ast$. In that case the above argument does not work and we have to use other tools.

Theorem (Hoeffding bounds)

Let $X_1,\ldots, X_n$ be IID Bernoulli$(\theta)$ r.v.'s and let $\overline{X}_n = \frac{1}{n}\sum_{i=1}^n X_i$. Then for any $0 \leq \alpha \leq 1$ $$P(\overline{X}_n > \theta+\alpha) \leq e^{-2n\alpha^2}, $$ $$P(\overline{X}_n < \theta-\alpha) \leq e^{-2n\alpha^2}. $$

We will not prove this now, but using this we can get the following result concerning uniform convergence of the error.
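
First, though, a quick simulation check of the Hoeffding bound itself (the parameter values are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
theta, n, alpha, trials = 0.5, 100, 0.1, 100_000

X_bar = rng.binomial(n, theta, trials) / n        # mean of n Bernoullis
print((X_bar > theta + alpha).mean())             # simulated tail probability
print(np.exp(-2 * n * alpha ** 2))                # Hoeffding bound
```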

Theorem (Uniform convergence)

Let $(\Omega,P,F)$ be a probability space, and let $\mathcal{H}$ be a finite hypothesis class. Fix $\epsilon > 0$ and let $S = \{X_1,\ldots,X_n\}$ be IID sampled from $P$. Then $$P(\{|err(h) - err_S(h)| \leq \epsilon, \forall h \in \mathcal{H}\}) > 1-2|\mathcal{H}|e^{-2n\epsilon^2}$$

This tells us that the convergence in probability has a uniform rate!!!

Proof

Fix a hypothesis $h \in \mathcal{H}$ and define $Y_i$ as in the proof of the finite hypothesis class theorem; then $Y_i \sim$ Bernoulli$(\theta)$ with $\theta = err(h)$, being $1$ if $h$ makes a mistake on $X_i$ and $0$ if not. For a single hypothesis we thus have $err_S(h) = \overline{Y}_n$, and the Hoeffding bound gives us $$ P(\overline{Y}_n > err(h) + \epsilon) \leq e^{-2n\epsilon^2}, $$ $$ P(\overline{Y}_n < err(h) - \epsilon) \leq e^{-2n\epsilon^2}, $$ i.e. $$ P(|err_S(h) - err(h)| > \epsilon) \leq 2 e^{-2n\epsilon^2}. $$ Since this holds for each single hypothesis we can again use the union bound to get $$ P(\{\exists h \in \mathcal{H} : |err_S(h) - err(h)| > \epsilon\}) \leq 2 |\mathcal{H}| e^{-2n\epsilon^2}, $$ and taking the complement event we get the result of the theorem.

The case of infinite hypothesis spaces

How do we generalize this to infinite hypothesis classes?

The first thing we are going to do is to define data-dependent equivalence classes.

We start with a symmetrization lemma by Vapnik and Chervonenkis, in the book you can find this hidden in the proof of Theorem 5.14.

Symmetrization Lemma (Vapnik, Chervonenkis, 1971): Let $S$ and $S'$ be two independent samples of size $n$ from $P$. Then for $n \epsilon^2 \geq 2$ $$ \mathbb{P}\left[\sup_{h \in \mathcal{H}}\left|err(h) - err_S(h)\right| > \epsilon\right] \leq 2\mathbb{P}\left[\sup_{h \in \mathcal{H}}\left|err_S(h) - err_{S'}(h)\right| > \frac{\epsilon}{2}\right] $$

Now by the union bound we get $$ \begin{aligned} \mathbb{P}\left[\sup_{h \in \mathcal{H}_{S\cup S'}}\left|err_S(h) - err_{S'}(h)\right|>\frac{\epsilon}{2}\right] \leq s(\mathcal{H},N) \sup_{h \in \mathcal{H}} \mathbb{P}\left[\left|err_S(h) - err_{S'}(h)\right|>\frac{\epsilon}{2}\right] \end{aligned} $$ where $\mathcal{H}_D$ denotes the set of distinct labelings of a point set $D$ realized by hypotheses in $\mathcal{H}$, and $s(\mathcal{H},N)$ is an upper bound on $|\mathcal{H}_{S \cup S'}|$, called $\pi_{\mathcal H}(n)$ in the book.

Growth function (shattering number)

The largest size of $\mathcal{H}_D$ over point sets $D$ of size $N$ is called the shattering number of $\mathcal{H}$ given $N$: $$s(\mathcal{H},N) = \sup_{x_1,\ldots, x_N} |\mathcal{H}_{\{x_1,\ldots,x_N\}}|$$

Note: this is a combinatorial quantity and does not depend on $P$.

Can we bound $s(\mathcal{H},N)$?

Vapnik Chervonenkis dimension

Def: We say that $\mathcal{H}$ shatters a set $D = \{x_i,i=1,\ldots, N\}$ if for any disjoint split $D_1,D_{-1}$ of $D$ we can find $h \in \mathcal{H}$ such that $h(D_1) = 1$, $h(D_{-1})=-1$.

Def: The VC-dimension of $\mathcal{H}$, denoted by $\text{VC-dim}(\mathcal{H})$ , equals the largest integer $n$ such that there exists a set of cardinality $n$ that is shattered by $\mathcal{H}$.

Sauer–Shelah lemma (1972)

Let $k = \text{VC-dim}(\mathcal{H})$. Then for $N > 0$ we have $$ s(\mathcal{H},N) \leq \sum_{i=0}^{k} {N \choose i} $$

This is polynomial in $N$: $$s(\mathcal{H},N) \leq \left ( \frac{Ne}{k}\right )^k \quad \text{for } N \geq k. $$

Proof: Let $\lambda = k/N$ and assume $\lambda \leq 1/2$ (for $\lambda > 1/2$ the bound is immediate, since then $(eN/k)^k \geq 2^N$). Then $$ \begin{aligned} 1 &= (\lambda + (1-\lambda))^N \\ &\geq \sum_{i=0}^{\lambda N} {N \choose i} \lambda^i(1-\lambda)^{N-i} \\ &\geq \sum_{i=0}^{\lambda N} {N \choose i} \left (\frac{\lambda}{1-\lambda} \right )^{\lambda N} (1-\lambda)^N, \end{aligned} $$ where the last step uses that $\lambda/(1-\lambda) \leq 1$ and $i \leq \lambda N$. Thus $$ \begin{aligned} \sum_{i=0}^{\lambda N} {N \choose i} &\leq e^{-N(\lambda \log \lambda + (1-\lambda)\log(1-\lambda))} \\ &\leq e^{N(\lambda-\lambda \log \lambda)} \\ &= \left ( \frac{e N}{\lambda N} \right )^{\lambda N}, \end{aligned} $$ where the second inequality uses $-(1-\lambda)\log(1-\lambda) \leq \lambda$. With $k = \lambda N$ we have our result.

Putting it all together

$$ \begin{aligned} \mathbb{P}&\left[\sup_{h \in \mathcal{H}}\left|err(h) - err_{S}(h)\right| > \epsilon\right] \\ &\leq 2\mathbb{P}\left[\sup_{h \in \mathcal{H}}\left|err_{S}(h) - err_{S'}(h)\right| > \frac{\epsilon}{2}\right] \quad \text{(symmetrization lemma)}\\ &\leq 2 s(\mathcal{H},2 N) \sup_{h \in \mathcal{H}} \mathbb{P}\left[\left|err_{S}(h) - err_{S'}(h)\right|>\frac{\epsilon}{2}\right] \quad \text{(union bound over } \mathcal{H}_{S \cup S'})\\ &\leq 2 s(\mathcal{H},2 N) \sup_{h \in \mathcal{H}} \left ( \mathbb{P}\left[\left|err(h) - err_{S}(h)\right|>\frac{\epsilon}{4}\right] + \mathbb{P}\left[\left|err(h) - err_{S'}(h)\right|>\frac{\epsilon}{4}\right] \right ) \quad \text{(triangle inequality)}\\ &\leq 8 s(\mathcal{H}, 2N)\exp \left (-\frac{N\epsilon^2}{8} \right ) \quad \text{(Hoeffding's inequality)} \end{aligned} $$

Finally we have proved the VC inequality

VC inequality (1971)

$$ \begin{aligned} \mathbb{P}\left[\sup_{h \in \mathcal{H}}\left|err(h) - err_S(h)\right| > \epsilon\right] \leq 8 s(\mathcal{H},2N) \exp \left (-\frac{N\epsilon^2}{8} \right ) \end{aligned} $$

Choosing the value of $\epsilon$ wisely, we get the following.

VC generalization bound

Thus we can for any $\delta$ find that with probability $(1-\delta)$ the following estimate holds

$$ err(h) \leq err_{S} + \sqrt{\frac{8 \ln(s(\mathcal{H},2N))+8\ln\frac{8}{\delta}}{N}} $$

for any $h \in \mathcal{H}$.

Using the bound on the growth function when we have a fixed VC dimension, the above bound becomes

VC generalization bound

We can for any $\delta$ find that with probability $(1-\delta)$ the following estimate holds, with $k = \text{VC-dim}(\mathcal{H})$: $$ err(h) \leq err_{S} + \sqrt{\frac{8 k\ln\left ( \frac{2Ne}{k}\right )+8\ln\frac{8}{\delta}}{N}} $$ We need $k \lesssim N$ in order for the estimate to be useful.

We can plot how many data-points are needed just to make the quantity on the right hand side less than $1$.
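
The original plot is not shown, but a sketch computing the required $N$ for a few VC dimensions might look like this (with $\delta = 0.05$ as an arbitrary choice):

```python
import numpy as np

def vc_bound(N, k, delta=0.05):
    """Second term of the VC generalization bound for VC-dimension k."""
    return np.sqrt((8 * k * np.log(2 * N * np.e / k)
                    + 8 * np.log(8 / delta)) / N)

# Smallest N (on a log grid) making the bound drop below 1.
for k in (3, 10, 100):
    N = next(n for n in np.logspace(1, 8, 200).astype(int)
             if vc_bound(n, k) < 1)
    print(k, N)
```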

The above implies that for a model with many parameters we need a vast amount of data points. Thus these theoretical guarantees are quite loose.

Testing true error

In the context of the perceptron and our ideas of train-test splits, we came across the general problem that, given a dataset, we find the best-fitting model to reduce an error rate. The question on our minds: how well does this model perform in general, i.e. what is the true error rate? We showed that if the class of models is simple, then with high probability the training error rate is close to the true error rate.

Question: What happens if the model is complicated?

Question: The inequalities that we used can be quite imprecise due to the general nature of the problem. Is there a way to perform statistical tests instead?

General problem

Call the training set $\mathcal{T}$ and the test set $\mathcal{V}$. We use the training data and some learning algorithm to obtain our estimated classifier $\hat h$. Then we use the test data to estimate the true error of $\hat h$, i.e. we compute $$ \hat L(\hat h) = \frac{1}{m} \sum_{(X_i,l_i) \in \mathcal{V}} I(\hat h(X_i) \neq l_i) $$ where $m = |\mathcal{V}|$.

Note that this is just another way to write what we had before, where we looked at the symmetric difference between sets instead. This just measures the fraction of labels $\hat h$ gets wrong over the test set $\mathcal{V}$.

It should be noted that this is the proper way of estimating the true error, since we are not testing on any samples that we trained on. The reason we separate them completely is that we care about prediction, so we should measure the error on "new" data.

Question: how do we estimate $err(h_S)$?

k-fold cross validation

  1. Randomly divide the data into $K$ chunks of approximately equal size.
  2. For $k=1$ to $K$, do the following:
    1. Delete chunk $k$ from the data
    2. Compute the "classifier" $\hat h_{(k)}$ from the rest of the data
    3. Use $\hat h_{(k)}$ to predict the data in chunk $k$. Let $\hat L_{(k)}$ denote the observed error rate
  3. Let $$ \hat L = \frac{1}{K}\sum_{k=1}^K \hat L_{(k)} $$

Let's try it!

The following dataset comes from the Coronary Risk-Factor Study (CORIS). It involves 462 males between the ages of 15 and 64 from three rural areas in South Africa (Coronary risk factor screening in three rural communities. The CORIS baseline study, J E Rossouw, J P Du Plessis, A J Benadé, P C Jordaan, J P Kotzé, P L Jooste, J J Ferreira). The outcome variable is the presence ($Y=1$) or absence ($Y=0$) of coronary heart disease. There are 9 covariates.
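
The original analysis cell is not shown; a sketch of 10-fold cross-validation with logistic regression as the classifier might look like this (the file name coris.csv and its column layout are hypothetical, and logistic regression is our stand-in for whatever classifier the notebook used):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Hypothetical file layout: 9 covariate columns plus the response
# column "chd" (presence/absence of coronary heart disease).
df = pd.read_csv("coris.csv")
X, y = df.drop(columns="chd").values, df["chd"].values

def cv_error(X, y, K=10):
    """K-fold cross-validation estimate of the misclassification rate."""
    errors = []
    for train_idx, test_idx in KFold(n_splits=K, shuffle=True,
                                     random_state=0).split(X):
        h = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
        errors.append(1 - h.score(X[test_idx], y[test_idx]))  # hat L_(k)
    return np.mean(errors)

print(cv_error(X, y))
```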

Big whoop! How does this help me?

Good question. You can use this in different ways

  1. Provide an estimate of the true error of our model
  2. Use as a way to select between different models
    1. We could in each fold train many types of models and compute the mean of all the errors per type
    2. Choose the model class / type that has the smallest mean error.

Question: how does this differ from the earlier setting, where we considered the model as fixed and tested it on held-out test data?

Using it to select models

Let's revisit the problem above with the CORIS dataset and see how many features we should include.
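
The original cell is not shown; reusing cv_error and the data from the sketch above, one hypothetical version is the following (adding covariates in the order they appear in the dataset, which is a simplification of a proper feature-selection procedure):

```python
import matplotlib.pyplot as plt

# CV error as a function of the number of included covariates.
n_features = range(1, X.shape[1] + 1)
errors = [cv_error(X[:, :p], y) for p in n_features]

plt.plot(n_features, errors, marker="o")
plt.xlabel("number of features")
plt.ylabel("CV error")
plt.show()
```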

We can now use the above plot to find that the best choice is 7 features.

Question: Can we use the resulting smallest error as an estimate for the total error for that model?

It should be fairly clear from the construction that this gives a downward-biased estimate of the true error, so if we do model selection as above, we need new data to perform the final error estimate.

This is basically a combination of the initial train-test split and the cross-validation method used to select the model.