Introduction to Data Science: A Comp-Math-Stat Approach

1MS041, 2021

©2021 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

Singular value decomposition

In the book the Singular Value Decomposition is explained differently. The concept of singular value decomposition is tightly connected to Principal Component Analysis, and the methods used to compute the SVD are how we obtain the components in PCA. We will therefore switch between explaining the ideas in the context of PCA and using the "geometric" concepts of linear algebra. That way, those of you who know PCA but not SVD will have a hook, and those of you who know linear algebra will understand the ideas of PCA.

Consider a set of points in $\mathbb{R}^d$, for instance the following plot (in $\mathbb{R}^2$):

When we studied linear regression we considered linear approximations and minimized the squared error to find our function. That is, we only measured the error in the y-component.

Background

Projections of vectors

Consider two vectors $a$ and $b$; we would like to project $a$ onto $b$, i.e.

From the above picture we can think of the decomposition of $a$ in terms of $b$ as two vectors, namely $a_1$ (along $b$) and $a_2$; they are orthogonal, which we write as $a_1 \perp a_2$.

Let's say that we wish to approximate the data using a low-dimensional subspace, think of a low-dimensional hyperplane. In the 2d case there are only 1d hyperplanes (lines), but if the data is, say, 100-dimensional we could consider the best fitting 10-dimensional hyperplane. What we mean by best fitting is that the distance from each point to its projection onto our subspace is as small as possible. Think of our 2d example above: we would like to find the line such that the orthogonal projection gives the smallest error. Just looking at the plot we would take the line $y=x$.

But how do we formulate this rigorously?

Consider a line given by the unit vector $v$ and a point $x$. Then the projection of $x$ onto $v$ is, as above, given by $$ (v \cdot x) v $$
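A minimal numpy sketch of this projection (the vectors `v` and `x` below are just illustrative values):

```python
import numpy as np

v = np.array([1.0, 1.0]) / np.sqrt(2)   # a unit vector along the line y = x
x = np.array([2.0, 1.0])                # a point to project (illustrative values)

proj = (v @ x) * v                      # (v . x) v, the projection of x onto v
print(proj)                             # [1.5 1.5]
```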

We will now apply these ideas to IID samples of points $\{X_1,\ldots,X_n\} \subset \mathbb{R}^m$ with zero expectation. Let $v$ be a unit vector. Consider the projection of each $X_i$ onto $v$, but keep only the scalar coefficient $X_i \cdot v$, and define $$ Y_i = (X_i \cdot v) $$ The line with maximal empirical variance can then be written as $$ v_1:=\arg\max_{\|v\|=1} \frac{1}{n} \sum_{i=1}^n (Y_i - \overline{Y}_n)^2 = \arg\max_{\|v\|=1} \frac{1}{n} \sum_{i=1}^n ((X_i \cdot v) - \overline{X \cdot v}_n)^2 = \arg\max_{\|v\|=1} \sum_{i=1}^n |X_i \cdot v|^2 $$ where the last equality uses that the data has zero mean (so the sample mean term drops out) and that the factor $1/n$ does not change the maximizer.

If we construct a matrix $A$ of size $n \times m$ with rows $X_i$ then we can rewrite $$ \sum_{i=1}^n |X_i \cdot v|^2 = |Av|^2 $$ and our problem reduces to the linear algebra problem of, given an $n \times m$ matrix $A$, finding the direction that is most "expanded/least contracted" by $A$, in the following sense $$ \arg\max_{\|v\|=1} |Av| $$
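As a quick sanity check of the identity $\sum_i |X_i \cdot v|^2 = |Av|^2$, here is a small sketch using a randomly generated, centred matrix `A_demo` (an illustrative stand-in for the data matrix):

```python
import numpy as np

rng = np.random.default_rng(0)
A_demo = rng.normal(size=(100, 2))         # rows play the role of the points X_i
A_demo = A_demo - A_demo.mean(axis=0)      # centre so the sample mean is zero

v = np.array([1.0, 0.0])                   # any unit vector
lhs = np.sum((A_demo @ v) ** 2)            # sum_i |X_i . v|^2
rhs = np.linalg.norm(A_demo @ v) ** 2      # |A v|^2
print(np.isclose(lhs, rhs))                # True
```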

Note that the singular vectors are not necessarily unique; in fact if $v$ is a singular vector, then so is $-v$. We can also have ties, in which case we arbitrarily pick one. In the following we will, as in the book, assume that the singular vectors can be picked uniquely, for instance by requiring no ties and fixing the sign so as to make the vector unique.

Definition

The vector $v_1 \in \mathbb{R}^m$ of the ($n \times m$) matrix $A$, defined as $$v_1:= \arg\max_{\|v\|=1} |Av|$$ is called the first singular vector of $A$. The value $\sigma_1(A)$ defined as $$\sigma_1(A) := |Av_1|$$ is called the first singular value of $A$.

Note

In the context where $A$ is constructed from our IID vectors $X_1,\ldots,X_n$, we see that $\sigma_1^2/n$ is the empirical variance in the direction of the first singular vector, i.e. $\sigma_1$ is, up to the factor $\sqrt{n}$, the standard deviation in that direction.

Now that we have defined the first singular vector, we can define the second singular vector. This is simply a vector orthogonal to $v_1$ that again solves our maximization problem, i.e. $$ v_2 := \arg\max_{\|v\|=1, v \perp v_1} |Av| $$

We can interpret this as follows: consider the plane that has the first singular vector $v_1$ as its normal. Then our new problem is to find the vector $v$ that maximizes $|(P_1A)v|$, where $P_1A$ is the matrix with rows $P_1A_{1,\cdot},\ldots,P_1A_{n,\cdot}$ and $P_1$ is the projection of a vector onto the plane $v_1 \cdot x = 0$.

Theorem (Greedy Algorithm)

Let $A$ be an $n \times m$ matrix with singular vectors $v_1,\ldots,v_r$. For $1 \leq k \leq r$, let $V_k$ be the subspace spanned by $v_1,\ldots,v_k$. For each $k$, $V_k$ is the best fit $k$-dimensional subspace for $A$.

What is $V_k$? $$ V_k = \{\alpha_1 v_1 + \ldots + \alpha_k v_k: (\alpha_1,\ldots,\alpha_k) \in \mathbb{R}^k\} =: span(\{v_1,\ldots,v_k\}). $$ What do we mean by best fit? Let $\tilde V_k$ be another $k$-dimensional subspace, spanned by an orthonormal basis $\tilde v_1, \ldots, \tilde v_k$. The distance from a point $p$ to $\tilde V_k$ can be seen to be $$ \|p - proj_{\tilde V_k} p\| = \|p - \sum_{i=1}^k (\tilde v_i \cdot p)\tilde v_i\| $$ By best fit we mean that $V_k$ is the $k$-dimensional subspace that minimizes $$ \sum_{i=1}^n \|X_i - proj_{\tilde V_k} X_i\|^2 $$

But we can use the Pythagorean theorem to get $$ \sum_{i=1}^n \left (\|proj_{\tilde V_k}X_i\|^2 + \|X_i - proj_{\tilde V_k}X_i\|^2 \right ) = \sum_{i=1}^n \|X_i\|^2 $$

and thus we get $$ \sum_{i=1}^n \left (\|X_i\|^2 - \|proj_{\tilde V_k}X_i\|^2 \right ) = \sum_{i=1}^n \|X_i - proj_{\tilde V_k}X_i\|^2 $$

Since $\sum_{i=1}^n \|X_i\|^2$ is fixed, we see from the above that the best fitting subspace is the subspace that maximizes the "variance" in the sense that we have seen. The point is that finding $v_1,\ldots,v_k$ greedily by maximizing the variance is the same as directly minimizing the total squared deviation from the subspace.

We can now use the code above to find the singular vectors. Let us take a look at the example data I plotted above.
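The original notebook cell is not reproduced here; below is a minimal sketch of how the first singular vector could be found for two-dimensional data by brute force, assuming the plotted example points are stored in an array `A` with one point per row:

```python
import numpy as np

def first_singular_vector_2d(A, n_angles=1000):
    """Brute-force search over directions in the plane for
    argmax_{|v|=1} |Av| (only feasible in very low dimensions)."""
    angles = np.linspace(0, np.pi, n_angles)              # v and -v give the same |Av|
    vs = np.stack([np.cos(angles), np.sin(angles)], axis=1)
    norms = np.linalg.norm(A @ vs.T, axis=0)              # |Av| for every candidate v
    return vs[np.argmax(norms)]

v1 = first_singular_vector_2d(A)   # A: the centred 2-d example data
print(v1)
```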

As we can see this is pretty much identical to the vector in the 45 degree direction. To find the second singular vector, we simply project our data onto the plane that has $v_1$ as its normal, as sketched below.
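A small sketch of that projection, assuming `A` and `v1` from the sketch above: removing the component along $v_1$ from every row leaves the data in the plane with normal $v_1$.

```python
# Remove the v1-component from every row of A (projection onto the plane v1 . x = 0)
A_perp = A - np.outer(A @ v1, v1)
```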

It's clear which direction this is headed. Let us also look at what happens when we project the data onto the plane with normal $v_2$.

Question: What have we done, two projections in a row? What is the projection of a projection?

It should be clear from the definition and the "Greedy Algorithm" theorem that $proj_{V_m} A = A$. That is, if we use all possible singular vectors, then we can represent the data from $A$ as points in $V_m$; in other words, any row of $A$ can be written as a linear combination of all the singular vectors.

We can now implement findSingularVectors using our methods above
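The original implementation is not shown here; one possible sketch greedily maximizes $|Xv|$ numerically with `scipy.optimize` over unit vectors orthogonal to the ones already found (not necessarily how the notebook did it):

```python
import numpy as np
from scipy.optimize import minimize

def findSingularVectors(X, k=1):
    """Greedily find k right singular vectors of X by numerically maximizing
    |Xv| over unit vectors v orthogonal to those already found (a sketch)."""
    n, m = X.shape
    found = []
    rng = np.random.default_rng(0)

    def orthonormalize(v):
        # Remove components along previously found vectors, then normalize.
        for u in found:
            v = v - (v @ u) * u
        return v / np.linalg.norm(v)

    for _ in range(k):
        def objective(v):
            return -np.linalg.norm(X @ orthonormalize(v))
        res = minimize(objective, rng.normal(size=m))
        found.append(orthonormalize(res.x))
    return np.array(found)
```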

Running findSingularVectors on X is as simple as
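Assuming the data from the plots is stored in an array `X` (one point per row):

```python
v1, v2 = findSingularVectors(X, k=2)   # X: the centred data matrix, one point per row
print(v1, v2)
```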

Question: Is this a good way of finding the singular vectors?

YouTry:

Test and see what happens if we try to apply our findSingularVectors to something with more dimensions, say 10 dimensions, and we would like 5 singular vectors. What happens? Why?

Singular Value Decomposition of a Matrix

Remember that we said that if we compute $m$ singular vectors of the $n \times m$ dimensional matrix $A$, then $$ proj_{V_m} A = A $$ This implies that we can write each row in $A$ as $X_i = \sum_{j=1}^m (X_i \cdot v_j) v_j$, which we can rewrite as $$ A = \sum_{j=1}^m A v_j v_j^T $$ Denoting $u_j:=\frac{A v_j}{\sigma_j}$ we see that the above expression becomes $$ A = \sum_{j=1}^m \sigma_j u_j v_j^T $$

This is the singular value decomposition, i.e. we have decomposed $A$ into a sum of matrices: each $u_j v_j^T$ is an $n \times m$ matrix.

Rewriting the above equation in matrix format we get $$ A = UDV^T $$ where $U$ is the matrix with $u_1,\ldots,u_m$ as columns, $D$ is the diagonal matrix with $\sigma_1,\ldots,\sigma_m$ on the diagonal, and $V$ is the matrix with $v_1,\ldots,v_m$ as columns.

Definition

The vectors $u_i$ are called the left singular vectors.

Power method

Now let us consider the matrix $A^TA$. This matrix has dimension $m \times m$, and using our decomposition above we obtain $$ A^T A = (UDV^T)^T(UDV^T) = VDU^T UDV^T = VD^2V^T $$ since $U^TU = I$, which comes from the fact that the columns of $U$ are orthonormal. This means that for any column $v_i$ of $V$ $$ A^T A v_i = VD^2V^T v_i = \sigma_i^2 v_i $$ so we see that $v_i$ is the $i$:th eigenvector of $A^T A$ with eigenvalue $\sigma_i^2$. We can thus find the singular vectors by finding the eigenvectors of $A^T A$.

How do we find the eigenvectors of $B = A^T A$? First note that $$ B^k = (V D^2 V^T)^k = V D^{2k} V^T $$ by the same argument as above, i.e. $V^T V = I$. Thus we see that if $\sigma_1 > \sigma_2$ and we let $k$ be large enough, then $$ B^k \approx (\sigma_1)^{2k} v_1 v_1^T. $$
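A sketch of this idea in numpy: instead of forming $B^k$ explicitly, an equivalent (and numerically friendlier) variant applies $B$ repeatedly to a random start vector and renormalizes. The data-matrix name `A` is assumed to be the centred example data from above.

```python
import numpy as np

def power_method(A, iterations=100):
    """Approximate the first right singular vector of A by power iteration
    on B = A^T A (a sketch of the idea described in the text)."""
    B = A.T @ A
    x = np.random.default_rng(1).normal(size=A.shape[1])
    for _ in range(iterations):
        x = B @ x
        x = x / np.linalg.norm(x)    # renormalize to avoid huge numbers
    return x

v1_approx = power_method(A)          # A: the centred data matrix, one point per row
print(v1_approx)
```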

Let us check what result we got

The vector we get is supposed to be an eigenvector of $B$, let us check if that is true
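A possible check, assuming `A` and `v1_approx` from the power-method sketch above: if $v$ is an eigenvector, $Bv - \lambda v$ should vanish, where $\lambda$ is estimated by the Rayleigh quotient.

```python
B = A.T @ A
lam = v1_approx @ (B @ v1_approx)        # Rayleigh quotient, estimates sigma_1^2
print(B @ v1_approx - lam * v1_approx)   # all components should be close to zero
```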

OK, so most components are of order $10^{-7}$, which is fairly accurate. Let us check with numpy's built-in SVD and see what we get

SVD from numpy returns $U,D,V^T$ so that we can write Z1 = U@np.diag(D)@V.T. Remember that the columns of $V$ are the singular vectors, so we can extract them as follows
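A sketch of that extraction, assuming the data matrix is called `Z1` as above (the thin SVD is used so the shapes match, and the rows of the returned $V^T$ are the singular vectors):

```python
import numpy as np

U, D, Vt = np.linalg.svd(Z1, full_matrices=False)   # U: n x m, D: length m, Vt: m x m
v1 = Vt[0]   # first right singular vector = first row of V^T = first column of V
v2 = Vt[1]
```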

We can also check if we get back our original matrix $Z1$ if we multiply everything together
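For instance, with the thin SVD from above the check could look like:

```python
print(np.allclose(U @ np.diag(D) @ Vt, Z1))   # True up to floating point error
```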

PCA

What is PCA? Basically it is a coordinate transformation from the original coordinates to the coordinate system given by the singular vectors. Since $V$ is orthonormal the transformation is as simple as a matrix product, i.e.

$$ A = UDV^T $$

Recall that each row of $A$ is a data point, i.e. an $m$-dimensional vector, and that the columns of $V$ form an orthonormal basis. We project each point onto each basis vector $v_j$ using dot products, as in $(X_i \cdot v_j)v_j$; the coordinate in the basis is just $X_i \cdot v_j$, and as such we get $$ PCA(A) = AV = UDV^TV = UD $$ Let's try it out
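A quick sketch of both sides of this identity, assuming `U`, `D`, `Vt` from the thin SVD of `Z1` above:

```python
coords_projection = Z1 @ Vt.T     # A V: project each row onto the singular vectors
coords_svd = U * D                # U D: scale column j of U by sigma_j
print(np.allclose(coords_projection, coords_svd))   # True
```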

PCA class

Since we don't all have sklearn, let's implement our own PCA class that we can use from now on.
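One possible implementation (a sketch mimicking a small subset of sklearn's interface; the original notebook class may differ):

```python
import numpy as np

class PCA:
    """Minimal PCA via the SVD of the centred data matrix."""

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, X):
        self.mean_ = X.mean(axis=0)
        Xc = X - self.mean_
        U, D, Vt = np.linalg.svd(Xc, full_matrices=False)
        self.components_ = Vt[:self.n_components]       # rows are singular vectors
        self.singular_values_ = D[:self.n_components]
        return self

    def transform(self, X):
        # Coordinates of X in the basis of singular vectors
        return (X - self.mean_) @ self.components_.T

    def fit_transform(self, X):
        return self.fit(X).transform(X)

    def inverse_transform(self, Y):
        # Back to the original coordinates
        return Y @ self.components_ + self.mean_
```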

SVD in Action

This is all cool and such, but what can you do with it?

Singular value decomposition can be used in the following ways

Factor Analysis

Example on compressing data

What we can see is that even with only 10 components we were able to represent the digits fairly well, although it is clear that some are not so easy.

Explained variance

Explained variance is the fraction of the total variance that is captured by our singular vectors. Remember the interpretation of the singular values in terms of standard deviation; the variance explained by the first $k$ components is just the sum of the squares of the first $k$ singular values divided by the total variance (the sum of all squared singular values).
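A sketch of the computation, assuming `D` holds the singular values from the SVD above:

```python
k = 2
explained_ratio = np.sum(D[:k] ** 2) / np.sum(D ** 2)   # fraction of variance explained
print(explained_ratio)
```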

Decorrelation

Note that since the singular vectors are orthogonal, we immediately have that the transformed values always have zero correlation.
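A quick numerical check of this, assuming the centred data and its thin SVD from above: with $A$ centred, the covariance matrix of the coordinates $UD$ is $\frac{1}{n-1} D U^T U D = \frac{1}{n-1} D^2$, which is diagonal.

```python
coords = U * D                                      # the PCA coordinates
print(np.round(np.cov(coords, rowvar=False), 8))    # off-diagonal entries ~ 0
```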

Another example on compressing data, building a classifier

From the above plot, it should be possible to use logistic regression to solve this problem

Cool, let's create a class to do LogisticRegression

Ok, so what happens if we try our code on the full data-set?

OK, so that's a problem. Why? Basically it is because the log-likelihood function is not numerically stable, in the sense that even for moderate values $e^x$ is outside the computable range. There are ways around this using approximations, because remember that $\log(1+e^{-x})$ is actually not such a bad function
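For instance, a standard trick (a sketch, not necessarily what the notebook used) is to evaluate $\log(1+e^{-x})$ as `np.logaddexp(0, -x)`, which never forms the overflowing exponential:

```python
import numpy as np

x = np.array([-1000.0, -10.0, 0.0, 10.0, 1000.0])

naive = np.log(1 + np.exp(-x))    # overflows to inf (with a warning) for x = -1000
stable = np.logaddexp(0, -x)      # log(e^0 + e^{-x}) = log(1 + e^{-x}), no overflow
print(naive)
print(stable)
```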

The implementation in sklearn solves much of this and it allows us to complete our model fitting on the full dataset.

Comments

You would think that since both PCA and Logistic regression are linear, PCA+LogisticRegression would be the same as just LogisticRegression in the first place. This is not true, as PCA and LogisticRegression are minimizing different things; remember, for instance, the way we measure error (vertical vs orthogonal).

We can also illustrate it here.

Intriguing, so we see that it is something else. Perhaps at this point you also remember that PCA does not use the labels; it just represents the data $X$ in different coordinates.

Scaling data

Conclusion

The PCA scaler does a better job than the standard scaler in this case. In fact it is more in tune with the shape of the data and as such it makes more sense. This is a key tool that you can use in your life as a data scientist.

Recommender engines

Let us say that we have a library of $m$ movies. Furthermore assume that we have $n$ users watching these movies, each indicating a like whenever they liked a movie. We can represent this as an $n \times m$ matrix with a $1$ in position $(i,j)$ if user $i$ liked movie $j$.

There is an idea that is called collaborative filtering:

Note that from the singular value decomposition and what we did with PCA we can do the following

$$ A = U \Sigma V^T = \begin{bmatrix} U_1 & U_2 \end{bmatrix} \begin{bmatrix} \Sigma_r & 0 \\ 0 & 0 \end{bmatrix} \begin{bmatrix} V_1^T \\ V_2^T \end{bmatrix} = U_1 \left(\Sigma_r V_1^T\right) . $$

This method is a type of rank factorization: note that with $C = U_1$ and $F = \Sigma_r V_1^T$, the product $CF$ is a low-rank factorization (approximation) of $A$.

Let's choose a small value for the rank, say $100$.
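A sketch of the truncated factorization, assuming the like-matrix is stored in an array `A` of shape $n \times m$:

```python
import numpy as np

r = 100                                  # the chosen rank
U, S, Vt = np.linalg.svd(A, full_matrices=False)
C = U[:, :r]                             # n x r, one row of "taste" coordinates per user
F = np.diag(S[:r]) @ Vt[:r]              # r x m, one column per movie
A_approx = C @ F                         # the rank-r approximation C F of A
```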

Now the idea is that $CF$ is the prediction of the scores, and we would construct a recommendation for each user based on what is predicted in $CF$ but the user has not yet seen. We could set a threshold on the predicted score, say $>1/2$, or we could simply list the recommendations in order of descending predicted score.
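A sketch of the second option, assuming `A` and `A_approx` from above:

```python
# For each user, rank the movies they have not yet liked by predicted score
scores = A_approx.copy()
scores[A > 0] = -np.inf                    # mask movies the user already liked
recommended = np.argsort(-scores, axis=1)  # movie indices, best predictions first
top10 = recommended[:, :10]                # e.g. the ten best suggestions per user
```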

This is a simple recommender system; recommender systems are a fairly large field of study.