©2021 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
Let’s start by talking about a few examples of supervised learning problems. Suppose we have a dataset giving the living areas and prices of 47 houses from Portland, Oregon:
import csv

data = []
header = []
with open('data/portland.csv', mode='r') as f:
    reader = csv.reader(f)
    header = tuple(next(reader))
    for row in reader:
        try:
            data.append((int(row[0]), int(row[1]), int(row[2])))
        except ValueError as e:
            print(e)

print("The data consists of %d observations" % len(data))
print("")
print("%s \t %s \t %s" % header)
for row in data[:5]:
    print("%d \t\t\t\t\t %d \t\t\t %d" % row)
The data consists of 47 observations

Size of the house (in square feet) 	 Number of bedrooms 	 Price of the house
2104 					 3 			 399900
1600 					 3 			 329900
2400 					 3 			 369000
1416 					 2 			 232000
3000 					 4 			 539900
In the case of simple linear regression we could set $x$ to be the size in square feet and $y$ to be the price; the goal would then be to find a function $f(x)$ that is close to $y$ in some sense.
In the context of machine learning the following terminology is often used: let $x^{(i)}$ denote the features (living area) and let $y^{(i)}$ denote the target (price); a pair $(x^{(i)},y^{(i)})$ is then called a training example.
In this terminology the set of observations $\{(x^{(i)},y^{(i)}),\, i=1,\ldots,m\}$ is called a training set.
In this context the goal is statistical prediction.
Contrast this with the statistical estimation viewpoint of linear regression, where the goal is to estimate the parameters.
Why this difference? Basically it is one of explainability. Estimation is often used as a tool to explain something through its statistical model and the estimated parameters of the model. Let's assume that there is a linear relationship between fat percentage and BMI, but that we do not know the parameters. Then, by simply taking a few observations and performing parameter estimation under a given loss, such as with the maximum likelihood estimator (MLE), we can do hypothesis tests to check whether the parameters are positive, or test between different proposed values of said parameters. The goal in statistical machine learning, on the other hand, is often one of prediction, and as you will see, the models that are in use often do not allow us to explain much. The prediction is still accomplished by first estimating parameters of a model, but with the explicit goal of predicting future observations from past ones.
In conclusion, in statistical machine learning we are often using weaker model assumptions, but since we are focusing on prediction we do not really have a problem. In contrast, in classical statistical decision problems, the focus is on stronger model assumptions and the goal is to extract more detailed information about the relationships between features and targets to obtain a better explainable understanding of the underlying data generating process.
Think of the name, machine learning: the focus is on the behavior of the machine, i.e. on prediction.
It is important to bear in mind that estimation for explainability and estimation for predictability are both formally statistical decision problems. Here, we take such a mathematical approach.
The schematic overview of supervised learning is the picture below
To describe the supervised learning problem slightly more formally, our goal is, given a training set, to learn a function $h : \mathcal{X} \to \mathcal{Y}$ so that $h(x)$ is a “good” predictor for the corresponding value of $y$. For historical reasons, this function $h$ is called a hypothesis in the machine learning community and should not be confused with hypothesis testing from the classical statistics community. The class of all functions that we are searching over is called the hypothesis class in this nomenclature (denoted $\mathcal{H}$).
The process of learning can for many supervised algorithms be written as follows: $$ \arg\min_{h \in \mathcal{H}} \sum_{i=1}^m L(h(x^{(i)}),y^{(i)}) $$ where $L$ is a so-called loss function or cost function; we will see how this loss function in most cases is the negative log-likelihood for some underlying model. In order to describe this process we need to dig a bit deeper into the concept of regression:
In the case of linear regression we assumed that the regression function $r(x)$ $$ r(x) = E(Y | X=x) = \int y \, f(y|x) dy $$ was linear, i.e. $r(x) = \beta_0 + \beta_1 x$, furthermore we assumed that $$ V(Y | X=x)=\sigma^2, \quad \text{independent of $X$} $$ We assumed that $$ \boxed{\displaystyle{\epsilon_i | X_i \sim Normal(0,\sigma^2) \quad \text{ i.e., }\quad Y_i|X_i \sim Normal(\mu_i,\sigma^2), \quad \text{ where } \quad \mu_i = \beta_0+\beta_1 X_i }} $$ and from this we got that the conditional likelihood is $$ \boxed{ l(\beta_0,\beta_1,\sigma) \quad =\quad \displaystyle{-n \log(\sigma) -\frac{1}{2 \sigma^2} \sum_{i=1}^n\left(Y_i-\mu_i\right)^2 } } $$
If we in this case denote $L(a,b) = (a-b)^2$ then we can now phrase the linear regression problem as $$ \arg\min_{h \in \mathcal{H}} \sum_{i=1}^m L(h(x^{(i)}),y^{(i)}) $$
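As a sanity check of this reformulation, here is a minimal sketch (an addition for illustration, not part of the original notebook) that computes the arg min of the squared loss for the Portland data directly via ordinary least squares with numpy; it assumes the data list loaded above and should agree with the MLE of $(\beta_0,\beta_1)$ under the Gaussian model.
import numpy as np
x = np.array([row[0] for row in data], dtype=float)   # living area
y = np.array([row[2] for row in data], dtype=float)   # price
A = np.column_stack([np.ones_like(x), x])             # design matrix with intercept column
beta, *_ = np.linalg.lstsq(A, y, rcond=None)          # minimises ||A beta - y||^2
print("beta0 = %.2f, beta1 = %.2f" % (beta[0], beta[1]))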
Note the shift in notation and how we deal with random versus observed quantities. In the machine learning community one often works with "observed values" and just plugs in the values. But if one wants to make statistical machine learning rigorous, we need to use our terminology with random variables and consider the "data" as a set of random variables, formally made precise by an underlying probability triple.
def showURL(url, ht=500):
    """Return an IFrame of the url to show in notebook with height ht"""
    from IPython.display import IFrame
    return IFrame(url, width='95%', height=ht)
showURL('https://scikit-learn.org/stable/',600)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
#?LinearRegression
In order to use scikit-learn's framework to "train" a linear regression model we first have to prepare the data in the way that it expects. The format is as follows:
import numpy as np
X = np.array([row[0] for row in data]).reshape(-1,1) # since we only have one feature, the shape must be (n_samples, 1)
Y = np.array([row[2] for row in data])
Let's note the shapes of X and Y now.
X.shape
(47, 1)
Y.shape
(47,)
lr.fit(X,Y)
LinearRegression()
This now gives us a fitted model for this particular data, so let's plot it.
P = points([(x,y) for x,y in zip (X,Y)])
P += points([(x,lr.predict(x.reshape(-1,1))) for x,y in zip (X,Y)],color='red')
show(P)
Of course we could use all available features to perform multiple linear regression as follows:
import numpy as np
X2 = np.array([(row[0],row[1]) for row in data])
Y = np.array([row[2] for row in data])
lr2 = LinearRegression()
lr2.fit(X2,Y)
LinearRegression()
P = points([(x[0],y) for x,y in zip (X2,Y)]) # x[0] is size of the house
P += points([(x[0],lr2.predict(x.reshape(1,-1))) for x,y in zip (X2,Y)],color='red')
show(P)
As we can see here, since the x-axis is size and the y-axis is price, there is an underlying variable (the number of bedrooms) that is not shown, which is why the predictions do not fall on a single straight line.
Let's also plot the number of bedrooms x[1] against the price (y-axis) next, to appreciate the other, discrete feature. But remember, this is a linear model, so if we considered the full 3-d space the predictions would lie on a plane instead of a line (a sketch of this plane follows after the next plot).
P = points([(x[1],y) for x,y in zip (X2,Y)]) # x[1] is number of bedrooms
P += points([(x[1],lr2.predict(x.reshape(1,-1))) for x,y in zip (X2,Y)],color='red')
show(P)
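To see the plane itself, here is a rough sketch (an addition for illustration, assuming lr2, X2 and Y from above); the aspect ratio is chosen by hand since the three axes have very different scales.
scatter3 = point3d([(x[0], x[1], y) for x, y in zip(X2, Y)], size=5)
pred_plane = plot3d(lambda s, b: lr2.intercept_ + lr2.coef_[0]*s + lr2.coef_[1]*b,
                    (800, 4500), (1, 5), color='red', opacity=0.5)
show(scatter3 + pred_plane, aspect_ratio=[1, 1000, 0.01])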
Now that we have seen linear regression being viewed as a machine learning model through the concept of MLE, we can actually derive most other models that are being used in the machine learning world. But in order to describe the next model, let us first consider another example problem: the classical wine dataset.
This dataset is actually built into sklearn and we can load it as follows
import ssl # this hack of unverified ssl context is not recommended and unnecessary if you do: 'sage --pip install certifi' in your bash shell after installing SageMath
ssl._create_default_https_context = ssl._create_unverified_context
import sklearn.datasets as datasets
X, Y = datasets.load_wine(return_X_y=True)
Data Set Characteristics:

- Number of Instances: 178 (50 in each of three classes)
- Number of Attributes: 13 numeric, predictive attributes and the class
- Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
- class:
    - class_0
    - class_1
    - class_2
The wines have been grown by three different cultivators in Italy; the goal is to predict which cultivator actually made the wine based on what we can measure. We will simplify this problem by making sure that we only have two possible classes: let's try to differentiate between class_2 and the other cultivators. Let's convert our data so that this is the case.
Y_binary = (Y > 1)*1
Y_binary
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
X.shape # 178 samples and 13 features
(178, 13)
Y_binary.shape # 178 samples
(178,)
Thus we know that $Y$ is binary: it can only take the values 1 and 0. Let's apply the ideas of linear regression but with some changes, i.e. let us again assume that $$ r(x) = \beta_0 + \beta_1 x $$ and remember $r(x) = E(Y | X=x)$, and again we write $$ Y_i \mid X_i \sim \text{Bernoulli}(\theta(X_i)), \text{ where $\theta(X_i) = \beta_0 + \beta_1 X_i$} $$
Thus the conditional likelihood (see 12.ipynb) of some observations $(x^{(i)},y^{(i)})$, $i = 1,\ldots, m$ is given by
$$
L(\beta_0,\beta_1) = \prod_{i=1}^m \theta(x^{(i)})^{y^{(i)}} (1-\theta(x^{(i)}))^{1-y^{(i)}}
$$
Taking the logarithm gives the log-likelihood $$ l(\beta_0,\beta_1) = \sum_{i=1}^m y^{(i)} \log(\theta(x^{(i)})) + (1-y^{(i)}) \log(1-\theta(x^{(i)})) $$
Let's try to numerically optimize it!
import numpy as np
from scipy import optimize

# define the objective/cost/loss function we want to minimise
def f(x):
    return -np.sum(Y_binary*log(x[0] + x[1]*X[:,0])+(1-Y_binary)*log(1-x[0] - x[1]*X[:,0]))

# multi-dimensional optimisation is syntactically similar to 1D,
# but we are using Gradient and Hessian information from numerical evaluation of f to
# iteratively improve the solution along the steepest direction, etc.
# The 'L-BFGS-B' method used here is one you will see again in scientific computing
parameter_bounding_box=((0.0001, 0.1), (0.0001, 0.1)) # specify the constraints for each parameter
initial_arguments = np.array([0.0001, 0.0001]) # point in 2D to initialise the minimize algorithm
optimize.minimize(f, initial_arguments, bounds=parameter_bounding_box,) # just call the minimize method!
/ext/sage/sage-9.1/local/lib/python3.7/site-packages/sage/functions/log.py:436: RuntimeWarning: invalid value encountered in log return ln(args[0], **kwds)
      fun: nan
 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
      jac: array([ -33829.47056139, -444334.41454999])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 63
      nit: 0
   status: 2
  success: False
        x: array([0.0001, 0.0001])
Hmm... this does not work so well. Play around with the bounds in the above to try to get a feeling for why.
b1, b2 = var('b1 b2')
f = sum([y*log(b1 + b2*x[0]) + (1-y)*log(1-b1-b2*x[0]) for x,y in zip(X,Y_binary)])
plot3d(f(b1,b2),(b1,0.001,0.2),(b2,0.001,0.1), frame=False, color='purple', opacity=0.8,aspect_ratio=[10,10,1])
Well, basically the problem is that our model for $\theta$ does not stay within $[0,1]$. How do we fix that?
Well, one way is to consider the following logistic function $\frac{1}{1+e^{-x}}$
plot(1/(1+e^(-x)),-10,10)
Let's revisit our problem with the following model:
$$ Y_i \mid X_i \sim \text{Bernoulli}(\theta(X_i)), \text{ where $\theta(X_i) = G(\beta_0 + \beta_1 X_i)$} $$ where $G(x) = \frac{1}{1+e^{-x}}$. The conditional log-likelihood becomes
$$ l(\beta_0,\beta_1) = \sum_{i=1}^m y^{(i)} \log(\theta(x^{(i)})) + (1-y^{(i)}) \log(1-\theta(x^{(i)})) $$ To make this expression simpler and more numerically stable we can simplify a bit by denoting $f(x) = \beta_0 + \beta_1 x$: $$ \left \{ \begin{aligned} y &= 1, &&\log(\theta(x^{(i)})) = \log\left(1/(1+e^{-f(x^{(i)})})\right) = -\log(1+e^{-f(x^{(i)})}) \\ y &= 0, &&\log(1-\theta(x^{(i)})) = \log\left(e^{-f(x^{(i)})}/(1+e^{-f(x^{(i)})})\right) = -\log(1+e^{f(x^{(i)})}) \end{aligned} \right . $$
With this simplification we can rewrite our log-likelihood, if we relabel $y=0$ as $y=-1$, as $$ l(\beta_0,\beta_1) = \sum_{i=1}^m -\log(1+e^{-y^{(i)} f(x^{(i)})}) $$
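As a quick numerical sanity check (an illustration with arbitrary made-up values of $f(x)$), the $\pm 1$ form above agrees with the $0/1$ form of the log-likelihood term:
import numpy as np
f_vals = np.array([-2.0, -0.3, 0.5, 1.5, 3.0])      # arbitrary values of f(x)
for y01 in (0, 1):                                   # label in {0,1}
    y_pm = 2*y01 - 1                                 # the same label relabelled as -1/+1
    theta = 1/(1 + np.exp(-f_vals))                  # G(f(x))
    lhs = y01*np.log(theta) + (1 - y01)*np.log(1 - theta)
    rhs = -np.log(1 + np.exp(-y_pm*f_vals))
    print(y01, np.allclose(lhs, rhs))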
Now, you might wonder, why the specific form of $G(x)$ other than the fact that it outputs numbers between 0 and 1? To see why this formula is used, consider the log-odds ratio given $X$, i.e. $$ \log \left ( \frac{P(Y = 1 | X)}{P(Y = 0 | X)} \right ) = \log \left ( \frac{\theta(X)}{1-\theta(X)} \right ) = \log \left ( \frac{G(f(X))}{1-G(f(X))} \right ) = \log(e^{f(X)}) = f(X) = \beta_0 + \beta_1 X $$
Thus logistic regression is linear regression on the log-odds ratio.
Let us revisit the problem, but this time we are going to do two things. First we are going to put everything on unit scale (for us this is only to simplify plotting). The rescaling is done using StandardScaler, which takes each feature in X and rescales it so that the mean is zero and the standard deviation is $1$. The second thing we are going to do is to consider only one feature: $X$ has $13$ features, and I have for your pleasure chosen the feature with index $9$ (the most visual).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)
Y1 = 2*Y_binary-1 # Transform into +-1
# Uncomment and run to understand what it does
#?StandardScaler
With only one feature, our loss function (the negative log-likelihood) as a function of the two parameters looks like
b1, b2 = var('b1 b2')
f = sum([log(1+exp(-y*(b1+b2*x))) for x,y in zip(X_sc[:,9],Y1)])
plot3d(f(b1,b2),(b1,-4,0),(b2,0,4), frame=True, color='purple', opacity=0.8,aspect_ratio=[5,5,1])
import numpy as np
from scipy import optimize

# define the objective/cost/loss function we want to minimise
def f(x):
    return np.sum(np.log(1+np.exp(-Y1*(x[0] + x[1]*X_sc[:,9]))))

# multi-dimensional optimisation is syntactically similar to 1D,
# but we are using Gradient and Hessian information from numerical evaluation of f to
# iteratively improve the solution along the steepest direction, etc.
# The 'L-BFGS-B' method used here is one you will see again in scientific computing
parameter_bounding_box=((-10, 2), (-10, 2)) # specify the constraints for each parameter
initial_arguments = np.array([0, 0]) # point in 2D to initialise the minimize algorithm
result = optimize.minimize(f, initial_arguments, bounds=parameter_bounding_box,) # just call the minimize method!
result
      fun: 67.32689020081042
 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
      jac: array([-3.55271368e-05,  2.13162821e-05])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 30
      nit: 8
   status: 0
  success: True
        x: array([-1.43278954,  1.84018362])
result_func(z) = 1/(1+exp(-result.x[0]-result.x[1]*z))
result_func
z |--> 1/(e^(-1.8401836221193224*z + 1.432789536610928) + 1)
P = points(zip(X_sc[:,9],Y_binary))
P+= plot(result_func,-10,10)
P+= plot(0.5,color='grey')
show(P)
Let's try to do the same thing with the ready-made LogisticRegression in sklearn.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(penalty='none')
logreg.fit(X_sc[:,9].reshape(-1,1),Y_binary)
LogisticRegression(penalty='none')
(logreg.coef_,logreg.intercept_)
(array([[1.84018447]]), array([-1.43278738]))
As we can see, we get the same result and it is very easy to write the code.
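As a small consistency check (assuming result.x and logreg from the cells above), the hand-rolled optimum and the sklearn fit give essentially the same predicted probabilities $P(Y=1 \mid x)$ on a grid of standardised feature values:
import numpy as np
z_grid = np.linspace(-2, 2, 5).reshape(-1, 1)                       # a few standardised feature values
p_manual = 1/(1 + np.exp(-(result.x[0] + result.x[1]*z_grid)))      # our fitted logistic function
p_sklearn = logreg.predict_proba(z_grid)[:, 1].reshape(-1, 1)       # sklearn's P(Y=1|x)
print(np.max(np.abs(p_manual - p_sklearn)))                         # should be tiny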
The conclusion is that for predicting a numerical value, like the price of a house, we can use a model with normally distributed noise, that is, the conditional distribution of the target variable is normal.
However, when we have a binary target, as in our wine problem, it is reasonable to assume that the conditional distribution of the target is Bernoulli.
So, they both come from the same place.
Goal: predict $Y$ based on $X$.
- Assume $Y \mid X=x \sim \mu(\theta(x))$, for some $\mu$ that fits the problem, with parameter $\theta(x)$.
- Prescribe a model for the parameter $\theta(x)$: it could be linear as in linear regression, the logistic function as in logistic regression, or something else. IMPORTANT: the model we choose for $\theta(x)$ should only produce values of $\theta(x)$ that are admissible for the distribution $\mu(\theta)$.
- Derive the log-likelihood.
- Either analytically or numerically find the maximum of the log-likelihood (a minimal sketch of the whole recipe follows below).
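As a minimal sketch of this recipe (an illustration, not the notebook's own code), we can redo the Portland house-price fit the long way: assume $Y \mid X=x \sim Normal(\beta_0+\beta_1 x, \sigma^2)$, write down the conditional log-likelihood, and maximise it numerically. This assumes the Portland data list from the beginning is still in memory, and works on standardised scales to keep the optimisation well conditioned.
import numpy as np
from scipy import optimize
x_raw = np.array([row[0] for row in data], dtype=float)   # living area
y_raw = np.array([row[2] for row in data], dtype=float)   # price
x_s = (x_raw - x_raw.mean())/x_raw.std()                  # standardise for numerical stability
y_s = (y_raw - y_raw.mean())/y_raw.std()
def negLogLik(params):
    b0, b1, log_sigma = params
    sigma = np.exp(log_sigma)                             # reparametrise so that sigma > 0
    mu = b0 + b1*x_s
    return np.sum(np.log(sigma) + (y_s - mu)**2/(2*sigma**2))
res = optimize.minimize(negLogLik, np.array([0.0, 0.0, 0.0]))
print(res.x)   # (beta0, beta1, log sigma) on the standardised scale; beta1 equals the sample correlation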
We will illustrate this with a house-price example in the same spirit as the Portland data we saw earlier, using the larger California housing dataset below.
# Just some code copied from 11.ipynb about nonparametric estimation
def makeEMFHidden(myDataList):
    '''Make an empirical mass function from a data list.
    Param myDataList, list of data to make emf from.
    Return list of tuples comprising (data value, relative frequency) ordered by data value.'''
    sortedUniqueValues = sorted(list(set(myDataList)))
    freqs = [myDataList.count(i) for i in sortedUniqueValues]
    relFreqs = [ZZ(fr)/len(myDataList) for fr in freqs] # use a list comprehension
    return list(zip(sortedUniqueValues, relFreqs))
from pylab import array

def makeEDFHidden(myDataList, offset=0):
    '''Make an empirical distribution function from a data list.
    Param myDataList, list of data to make ecdf from.
    Param offset is an offset to adjust the edf by, used for doing confidence bands.
    Return list of tuples comprising (data value, cumulative relative frequency) ordered by data value.'''
    sortedUniqueValues = sorted(list(set(myDataList)))
    freqs = [myDataList.count(i) for i in sortedUniqueValues]
    from pylab import cumsum
    cumFreqs = list(cumsum(freqs))
    cumRelFreqs = [ZZ(i)/len(myDataList) for i in cumFreqs] # get cumulative relative frequencies as rationals
    if offset > 0: # an upper band
        cumRelFreqs = [min(i+offset, 1) for i in cumRelFreqs] # use a list comprehension
    if offset < 0: # a lower band
        cumRelFreqs = [max(i+offset, 0) for i in cumRelFreqs] # use a list comprehension
    return list(zip(sortedUniqueValues, cumRelFreqs))
# EPMF plot
def epmfPlot(samples):
    '''Returns an empirical probability mass function plot from samples data.'''
    epmf_pairs = makeEMFHidden(samples)
    epmf = point(epmf_pairs, rgbcolor = "blue", pointsize="20")
    for k in epmf_pairs: # for each tuple in the list
        kkey, kheight = k # unpack tuple
        epmf += line([(kkey, 0),(kkey, kheight)], rgbcolor="blue", linestyle=":")
    # padding
    epmf += point((0,1), rgbcolor="black", pointsize="0")
    return epmf
# ECDF plot
def ecdfPlot(samples):
    '''Returns an empirical cumulative distribution function plot from samples data.'''
    ecdf_pairs = makeEDFHidden(samples)
    ecdf = point(ecdf_pairs, rgbcolor = "red", faceted = false, pointsize="20")
    for k in range(len(ecdf_pairs)):
        x, kheight = ecdf_pairs[k] # unpack tuple
        previous_x = 0
        previous_height = 0
        if k > 0:
            previous_x, previous_height = ecdf_pairs[k-1] # unpack previous tuple
        ecdf += line([(previous_x, previous_height),(x, previous_height)], rgbcolor="grey")
        ecdf += points((x, previous_height),rgbcolor = "white", faceted = true, pointsize="20")
        ecdf += line([(x, previous_height),(x, kheight)], rgbcolor="grey", linestyle=":")
    # padding
    ecdf += line([(ecdf_pairs[0][0]-0.2, 0),(ecdf_pairs[0][0], 0)], rgbcolor="grey")
    max_index = len(ecdf_pairs)-1
    ecdf += line([(ecdf_pairs[max_index][0], ecdf_pairs[max_index][1]),(ecdf_pairs[max_index][0]+0.2, ecdf_pairs[max_index][1])],rgbcolor="grey")
    return ecdf
def calcEpsilon(alphaE, nE):
    '''Return confidence band epsilon calculated from parameters alphaE > 0 and nE > 0.'''
    return sqrt(1/(2*nE)*log(2/alphaE))
# ECDF plot given a list of points to plot
def ecdfPointsPlot(listOfPoints, colour='grey', lines_only=False):
    '''Returns an empirical cumulative distribution function plot from a list of points to plot.
    Param listOfPoints is the list of points to plot.
    Param colour is used for plotting the lines, defaulting to grey.
    Param lines_only controls whether only lines are plotted (true) or points are added (false, the default value).
    Returns an ecdf plot graphic.'''
    ecdfP = point((0,0), pointsize="0")
    if not lines_only: ecdfP = point(listOfPoints, rgbcolor = "red", faceted = false, pointsize="20")
    for k in range(len(listOfPoints)):
        x, kheight = listOfPoints[k] # unpack tuple
        previous_x = 0
        previous_height = 0
        if k > 0:
            previous_x, previous_height = listOfPoints[k-1] # unpack previous tuple
        ecdfP += line([(previous_x, previous_height),(x, previous_height)], rgbcolor=colour)
        ecdfP += line([(x, previous_height),(x, kheight)], rgbcolor=colour, linestyle=":")
        if not lines_only:
            ecdfP += points((x, previous_height),rgbcolor = "white", faceted = true, pointsize="20")
    # padding
    max_index = len(listOfPoints)-1
    ecdfP += line([(listOfPoints[0][0]-0.2, 0),(listOfPoints[0][0], 0)], rgbcolor=colour)
    ecdfP += line([(listOfPoints[max_index][0], listOfPoints[max_index][1]),(listOfPoints[max_index][0]+0.2,\
        listOfPoints[max_index][1])],rgbcolor=colour)
    return ecdfP
def makeEDFPoints(myDataList, offset=0):
    '''Make a list of empirical distribution plotting points from a data list.
    Param myDataList, list of data to make ecdf from.
    Param offset is an offset to adjust the edf by, used for doing confidence bands.
    Return list of tuples comprising (data value, cumulative relative frequency(with offset))
    ordered by data value.'''
    sortedUniqueValues = sorted(list(set(myDataList)))
    freqs = [myDataList.count(i) for i in sortedUniqueValues]
    from pylab import cumsum
    cumFreqs = list(cumsum(freqs))
    cumRelFreqs = [ZZ(i)/len(myDataList) for i in cumFreqs] # get cumulative relative frequencies as rationals
    if offset > 0: # an upper band
        cumRelFreqs = [min(i+offset ,1) for i in cumRelFreqs]
    if offset < 0: # a lower band
        cumRelFreqs = [max(i+offset, 0) for i in cumRelFreqs]
    return list(zip(sortedUniqueValues, cumRelFreqs))
The goal we have in mind is to get some estimate of how good our model actually is at fitting the data. We will take a different approach from the traditional ideas of Fisher information and MLE-based inference, and instead adopt a nonparametric approach. In the simplest case we do the following: randomly split the data into a training part and a test part, fit the model on the training part, and then study the residuals on the held-out test part.
To illustrate what we mean let us jump into some code that splits the data into two parts randomly.
def randomSplit(X,Y,proportion=0.7):
    '''Randomly splits the pairs X,Y into two disjoint sets,
    the first containing a fraction `proportion` of the samples and the
    second containing the remaining fraction 1-proportion.
    '''
    assert type(X) == np.ndarray
    assert len(X.shape) == 2
    assert len(Y.shape) == 1
    assert X.shape[0] == Y.shape[0]
    numSamples = X.shape[0]
    numSamplesFirstPart = int(numSamples*proportion)
    #numSamplesSecondPart = numSamples - numSamplesFirstPart
    indexes = np.arange(numSamples)
    np.random.shuffle(indexes)
    firstPartIndexes = indexes[:numSamplesFirstPart]
    secondPartIndexes = indexes[numSamplesFirstPart:]
    return X[firstPartIndexes],Y[firstPartIndexes],X[secondPartIndexes],Y[secondPartIndexes]
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import sklearn.datasets as datasets
california_housing = datasets.fetch_california_housing()
print(california_housing['DESCR'])
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census block group.
A block group is the smallest geographical unit for which the U.S. Census Bureau publishes
sample data (a block group typically has a population of 600 to 3,000 people).

It can be downloaded/loaded using the :func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297
X = california_housing.data
Y = california_housing.target
X_train,Y_train,X_test,Y_test = randomSplit(X,Y,proportion=0.9)
X_train,Y_train will now be different from X_test,Y_test. What this means is that if we assume that the original data is IID, we can consider the two samples independent. So, let us train a simple linear regression model.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,Y_train)
LinearRegression()
residual = Y_test-lr.predict(X_test)
len(residual)
2064
def plotResidualECDFBand(residuals,alpha=0.05):
    if type(residuals) == np.ndarray:
        residual = residuals.tolist()
    elif type(residuals) == list:
        residual = residuals
    residualPoints = makeEDFPoints(residual)
    p = ecdfPointsPlot(residualPoints,lines_only=True)
    epResidual = calcEpsilon(alpha,len(residual))
    residualPointsLower = makeEDFPoints(residual, offset=-epResidual)
    residualPointsUpper = makeEDFPoints(residual, offset=epResidual)
    p += ecdfPointsPlot(residualPointsLower,lines_only=True)
    p += ecdfPointsPlot(residualPointsUpper,lines_only=True)
    show(p)
plotResidualECDFBand(residual)
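The half-width of the band above comes from the DKW inequality: for confidence level $1-\alpha$ it is $\varepsilon = \sqrt{\frac{1}{2n}\log\frac{2}{\alpha}}$, which for $\alpha=0.05$ and our $n=2064$ test residuals is roughly $0.03$.
print(calcEpsilon(0.05, len(residual)))  # the DKW band half-width used by plotResidualECDFBand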
We can perform a plug-in estimation of the mean of the residuals and approximate its confidence interval using the idea of the bootstrap.
bootstrap_means = [np.mean(np.random.choice(residual,size=len(residual),replace=True)) for i in range(10000)]
histogram(bootstrap_means)
lower95BootstrapCIForMean = np.percentile(bootstrap_means,2.5)
upper95BootstrapCIForMean = np.percentile(bootstrap_means,97.5)
print ("The inner 95% percentile based Confidence Interval for the mean = ")
print ("[ "+str(lower95BootstrapCIForMean) + " , " + str(upper95BootstrapCIForMean) +" ]")
The inner 95% percentile based Confidence Interval for the mean = [ -0.010323753406153989 , 0.05347325551326631 ]
We can do the same for the variance of the residual
bootstrap_vars = [np.var(np.random.choice(residual,size=len(residual),replace=True)) for i in range(1000)]
show(histogram(bootstrap_vars))
lower95BootstrapCIForVar = np.percentile(bootstrap_vars,2.5)
upper95BootstrapCIForVar = np.percentile(bootstrap_vars,97.5)
print ("The inner 95% percentile based Confidence Interval for the variance = ")
print ("[ "+str(lower95BootstrapCIForVar) + " , " + str(upper95BootstrapCIForVar) +" ]")
The inner 95% percentile based Confidence Interval for the variance = [ 0.4947251198872258 , 0.6000747572077545 ]
The coefficient of determination, or explained variance, is defined as follows:
$$R^2 = 1 - \frac{\mathrm{MSE}}{\mathrm{Var}(y)}$$ where MSE, the mean squared error, is the mean of the squared residuals.
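Before bootstrapping, note (as an aside, assuming residual, Y_test and the fitted lr from above) that a point estimate of $R^2$ on the test set can be computed directly from the residuals; sklearn's lr.score returns the same quantity.
print(1 - np.mean(residual**2)/np.var(Y_test))   # R^2 from the definition above
print(lr.score(X_test, Y_test))                  # sklearn's built-in R^2 on the test set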
Let us estimate this using Bootstrapping
RsquaredBoot = []
for i in range(1000):
    indices = np.random.choice(np.arange(len(residual)),size=len(residual),replace=True)
    res_boot = residual[indices]
    residual_squares = np.mean(res_boot**2)
    y_boot = Y_test[indices]   # use the test targets that the residuals were computed from
    y_boot_variance = np.var(y_boot)
    RsquaredBoot.append(1-residual_squares/y_boot_variance)
histogram(RsquaredBoot)
lower95BootstrapCIForVar = np.percentile(RsquaredBoot,2.5)
upper95BootstrapCIForVar = np.percentile(RsquaredBoot,97.5)
print ("The inner 95% percentile based Confidence Interval for R^2 = ")
print ("[ "+str(lower95BootstrapCIForVar) + " , " + str(upper95BootstrapCIForVar) +" ]")
The inner 95% percentile based Confidence Interval for R^2 = [ 0.5271436909419323 , 0.6248155291456209 ]
In our derivation, we might as well have considered multiple features, as in multiple linear regression. The extension is the same: $\beta_0$ is still a number, but $\beta_1$ and $x$ are now vectors in $\mathbb{R}^d$, where $d$ is the number of features, and $f(x) = \beta_0 + \beta_1 \cdot x$. With this simple extension we can consider a more interesting example. Consider a dataset of 8x8 bitmaps representing handwritten digits, which looks as follows:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
digits = load_digits()
fig, ax = plt.subplots(2,5)
plt.gray()
for i in range(10):
    row = floor(i/5)
    column = i % 5
    ax[row,column].imshow(digits['data'][i,:].reshape(8,8))
Let's first build a classifier that distinguishes the top row from the bottom row (digits 0-4 versus 5-9), so let us construct the target for this problem:
target = (digits['target'] >= 5)*1
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(digits['data'],target)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
StandardScaler()
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(sc.transform(X_train),Y_train)
LogisticRegression()
logReg.score(sc.transform(X_train),Y_train)
0.9101707498144024
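The score above is computed on the training data; an arguably more honest check (assuming X_test and Y_test from the split above) is the held-out accuracy:
logReg.score(sc.transform(X_test),Y_test)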
We can with the same methods as before construct confidence bands around the residual ECDF using the DKW inequality:
plotResidualECDFBand(logReg.predict(X_test)-Y_test)
The above example naturally leads us to wanting to model multiple outputs. That is, instead of the Bernoulli we could consider the DeMoivre$(p_1,\ldots,p_m)$ distribution over $m$ classes. What we want is the following:
$$ \sum_{i=1}^m p_i = 1 $$ and $Y_i \mid X_i \sim \text{DeMoivre}(\theta(X_i))$, where $\theta(X_i) \in [0,1]^m$. But how do we find a good model for $\theta$?
Let us model each log-ratio as a linear function $$ \log\left ( \frac{P(Y = i \mid X)}{P(Y = m \mid X)}\right ) = w_{i} \cdot x, \quad \forall i=1,\ldots,m-1. $$ Now fix $i$ and exponentiate both sides to get $$ P(Y = i \mid X) = e^{w_i \cdot x} \, P(Y = m \mid X), \quad i=1,\ldots,m-1. $$
Since $$ \sum_{i=1}^m P(Y = i \mid X) = 1, $$ we get $$ P(Y = m \mid X) = 1-\sum_{i=1}^{m-1} P(Y = i \mid X) = 1-\sum_{i=1}^{m-1} e^{w_i \cdot x} P(Y = m \mid X). $$ Hence $$ P(Y = m \mid X) = \frac{1}{1+\sum_{i=1}^{m-1} e^{w_i \cdot x}} $$
Plugging back in gives $$ P(Y = i \mid X) = \frac{e^{w_i \cdot x}}{1+\sum_{j=1}^{m-1} e^{w_j \cdot x}}, \quad i=1,\ldots,m-1 $$
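As a small sketch of the formula just derived (an illustration with arbitrary weight vectors, not the notebook's own code), the implied class probabilities are nonnegative and sum to one for every $x$:
import numpy as np
np.random.seed(0)
m, d = 4, 3                                 # m classes, d features
W = np.random.randn(m - 1, d)               # one weight vector w_i per class i = 1,...,m-1
x_vec = np.random.randn(d)                  # an arbitrary feature vector
scores = np.exp(W.dot(x_vec))               # e^{w_i . x}, i = 1,...,m-1
denom = 1 + scores.sum()
probs = np.append(scores/denom, 1/denom)    # P(Y=1|X),...,P(Y=m-1|X), P(Y=m|X)
print(probs, probs.sum())                   # the probabilities sum to 1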
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(digits['data'],digits.target)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
StandardScaler()
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(sc.transform(X_train),Y_train)
LogisticRegression()
logReg.score(sc.transform(X_train),Y_train)
0.9985152190051967
We can with the same methods as before construct confidence bands around the residual ECDF using the DKW inequality:
plotResidualECDFBand(logReg.predict(X_test)-Y_test)