import numpy as np
dimensions = range(1,20,1) # Try dimensions 1 to 19 (range excludes the upper bound 20)
numberOfExperiments = 10000 # Using 10000 experiments to estimate probability
# Common factor sqrt(1/(2*pi)), evaluated numerically, that multiplies every sampled coordinate
scale = sqrt(1/(2*pi)).n(digits=8)
# One array per dimension n, of shape (n, numberOfExperiments): each column is one sampled vector
normals = [np.random.normal(size=(n,numberOfExperiments))*scale for n in dimensions]


norms = [np.linalg.norm(x,axis=0) for x in normals] # Compute the length of every sampled vector (column)


probInsideUnitBall = [np.mean(x < 1) for x in norms] # Estimate probability: fraction of vectors with length < 1


probInsideUnitBall

[0.9871,
 0.9596,
 0.8969,
 0.8209,
 0.7177,
 0.6066,
 0.4849,
 0.386,
 0.2866,
 0.2138,
 0.1471,
 0.0998,
 0.0607,
 0.0406,
 0.0235,
 0.0149,
 0.0073,
 0.0042,
 0.0035]


display = line(list(zip(dimensions,probInsideUnitBall)))
# The 20 is just for scale, recall the constant c 
# and the arbitrary \epsilon in our estimate
display+=plot(20/(pi*x),xmin=1,xmax=20,color='red') 
show(display)


meanNorm = [np.mean(x) for x in norms]
display = line(zip(dimensions,meanNorm))
display += plot(sqrt(x/(2*pi)),xmin=1,xmax=20,color='red')
show(display)


P=plot((1-0.1)^x,1,10)
P+=plot(exp(-0.1*x),1,10,color='green')
P.show()


def showURL(url, ht=500):
    """Build an IFrame for url, 95% wide and ht high, for display in the notebook"""
    from IPython.display import IFrame
    frame = IFrame(url, height=ht, width='95%')
    return frame
showURL('https://en.wikipedia.org/wiki/Change_of_variables#Coordinate_transformation',300)


showURL('https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates',300)


x = var('x')
integrate(exp(-x^2),x,-infinity,infinity)

sqrt(pi)


points([(d,2*pi^(d/2)/(d*factorial(d/2-1))) for d in range(2,20,2)])


showURL('https://en.wikipedia.org/wiki/Gamma_function',400)


XY = np.random.uniform(-1,1,size=(10000,2))


points(XY)


XY = XY / np.linalg.norm(XY,axis=1).reshape(-1,1)
points(XY,aspect_ratio=1)


import pylab
_=pylab.hist(np.arctan2(XY[:,1],XY[:,0]),bins=20)


showURL('https://numpy.org/doc/stable/reference/generated/numpy.arctan2.html',300)


XY = np.random.uniform(-1,1,size=(10000,2))
XY_inCircle = XY[np.linalg.norm(XY,axis=1) < 1]

XY_inCircle = XY_inCircle / np.linalg.norm(XY_inCircle,axis=1).reshape(-1,1)
points(XY_inCircle,aspect_ratio=1)


import pylab
_=pylab.hist(np.arctan2(XY_inCircle[:,1],XY_inCircle[:,0]),bins=20)


XY = np.random.normal(size=(10000,2))

XY = XY / np.linalg.norm(XY,axis=1).reshape(-1,1)
_=pylab.hist(np.arctan2(XY[:,1],XY[:,0]),bins=20)


import numpy as np
XY = np.random.normal(size=(100000,2)) # Spherical Gaussian with unit variance in each coordinate in R^2

XY = XY / np.linalg.norm(XY,axis=1).reshape(-1,1) # Make all vector unit length

r = np.random.uniform(size=(XY.shape[0],1)) # Sample the radii uniformly from [0,1]
uniform_ball = np.sqrt(r)*XY # Consider the correctly scaled radius


def sage_histogram2d(data_x,data_y, bins=10, normed=False):
    """Show a 3D bar chart of the 2D histogram of the points (data_x, data_y).

    Input:
    data_x, data_y = sequences with the x- and y-coordinates of the samples
    bins = bin specification, passed unchanged to np.histogram2d
    normed = if True the bar heights are the estimated density instead of raw counts
    Output:
    nothing is returned, the bars are displayed with show()
    """
    import numpy as np
    from sage.plot.plot3d.shapes import Box
    # The `normed` keyword of np.histogram2d was deprecated and then removed
    # from NumPy; `density` is the supported keyword with the same meaning.
    counts, x_edges, y_edges = np.histogram2d(data_x,data_y,density=normed,bins=bins)
    width = np.mean(np.diff(x_edges))
    depth = np.mean(np.diff(y_edges))
    # counts[i, j] belongs to x-bin i and y-bin j, so the grid of lower-left
    # bin corners must use 'ij' indexing; the default 'xy' indexing flattens
    # in the transposed order and pairs heights with the wrong bins.
    XX,YY = np.meshgrid(x_edges[:-1],y_edges[:-1],indexing='ij')
    # Box takes half side lengths and is centered at the origin, so each bar is
    # moved to the center of its bin and lifted by half its height.
    bars = [Box((width/2,depth/2,c/2)).translate(x+width/2,y+depth/2,c/2)
            for x,y,c in zip(XX.reshape(-1),YY.reshape(-1),counts.reshape(-1))
            if c > 0] # empty bins get no bar
    show(sum(bars))


sage_histogram2d(uniform_ball[:,0],uniform_ball[:,1],normed=True,bins=20)


d = 10^2
n_samples = 10000
dGaussian = np.random.normal(size=(n_samples,d))
dGaussianNorm = np.linalg.norm(dGaussian,axis=1)
histogram(dGaussianNorm,bins=100)


# For simplicity we will construct a small classification dataset with 5 samples, 10 features and two classes

X_data = np.array([[ 0.36731074, -0.26731719, -0.72426635, -0.54930901, -0.47614201,
                     0.1327083 ,  1.30847308,  0.19501328, -0.99805696,  0.14979754],
                   [ 0.37311239,  0.56515267, -0.1917514 , -0.14742026,  0.2890942 ,
                    -0.02590534, -0.53987907,  0.70816002, -0.92185638,  0.92011316],
                   [-0.30774856,  0.84222474,  1.36755686,  0.2035808 ,  0.91745894,
                     2.39470366, -0.11227247, -0.36218045,  0.96482992,  0.94849202],
                   [-0.66770717, -0.93943336,  1.12547913, -0.48933722, -0.21269764,
                    -0.80459114, -0.33914025,  0.31216994,  1.78198367, -0.54211499],
                   [ 0.40071561,  0.40020999, -2.30125766, -0.33763234, -0.7319695 ,
                     1.25647226,  0.66023155, -0.35087189, -1.34430587, -1.96996738]])

y_data = np.array([0,0,1,1,0])


XX = X_data.reshape(X_data.shape[0],1,X_data.shape[1])-X_data.reshape(1,X_data.shape[0],X_data.shape[1])
distance_matrix = np.linalg.norm(XX,axis=2)


distance_matrix

array([[0.        , 2.45515562, 4.52856743, 4.10265945, 3.10454658],
       [2.45515562, 0.        , 3.77333293, 3.96272646, 4.27715699],
       [4.52856743, 3.77333293, 0.        , 4.33026799, 5.73698516],
       [4.10265945, 3.96272646, 4.33026799, 0.        , 5.69825883],
       [3.10454658, 4.27715699, 5.73698516, 5.69825883, 0.        ]])


# Let's simulate
import numpy as np
@interact
def _(d=(100,(100..2000)),k=(2,(2..100))):
    np.random.seed(1)
    v_pre = np.random.normal(size=d)
    v = v_pre / np.linalg.norm(v_pre)

    print("v has length: %.2f" % np.linalg.norm(v))

    num_simulations = 300

    error = []

    for i in range(num_simulations):
        uis = np.random.normal(size=(k,d))
        f = uis@v
        error.append(abs(np.linalg.norm(f)-np.linalg.norm(v)*sqrt(k))/(np.linalg.norm(v)*sqrt(k)))
    P=histogram(error)
    P.xmax(1)
    P.show()


import numpy as np
d = 1000
n = 100
vis = np.random.normal(size=(n,d))


import csv
features = [] # one float feature vector per sample
labels = [] # 1 when the last column of the row is 'ALL', otherwise 0
# newline='' is the way the csv module asks files to be opened, so that
# line endings inside quoted fields are handled by the reader itself.
with open('data/leukemia.csv',mode='r',newline='') as f:
    reader = csv.reader(f)
    # Consume the header through the reader (not the raw file) so it is parsed
    # with the same quoting rules as the data; header is the list of column names.
    header=next(reader)
    for row in reader:
        if not row:
            # The reader yields [] for blank lines; row[-1] would raise IndexError
            continue
        features.append(np.array(row[:-1],dtype=float))
        labels.append((row[-1] == 'ALL')*1)


X = np.stack(features,axis=0)
Y = np.array(labels)


X.shape

(72, 7129)


def knn_distances(xTrain,xTest,k):
    """
    Finds the k nearest neighbors in xTrain of every row of xTest.
    Input:
    xTrain = n x d matrix. n=rows and d=features
    xTest = m x d matrix. m=rows and d=features (same amount of features as xTrain)
    k = number of nearest neighbors to be found
    Output (returned in this order):
    indices = k x m matrix, column j holds the row numbers in xTrain of the
              k nearest neighbors of xTest[j], closest first
    distances = k x m matrix with the Euclidean distances belonging to indices
    """
    import numpy as np
    # Squared Euclidean distances for all pairs (n x m) through the expansion
    # |a-b|^2 = |a|^2 - 2*a.b + |b|^2, which avoids an n x m x d intermediate.
    squared = -2 * xTrain@xTest.T + np.sum(xTest**2,axis=1) + np.sum(xTrain**2,axis=1)[:, np.newaxis]
    # Because of float precision, tiny squared distances can come out negative;
    # clamp them to 0 before taking the square root.
    distances = np.maximum(squared, 0)**.5
    # Sort every column once and reuse the ordering for the distances instead
    # of sorting the matrix a second time.
    indices = np.argsort(distances, 0)
    distances = np.take_along_axis(distances, indices, 0)
    # Keep the k closest training points per test point.
    return indices[0:k,:], distances[0:k,:]

def knn_predictions(xTrain,yTrain,xTest=None,k=3):
    """
    Uses xTrain and yTrain to predict xTest by majority vote among the k nearest neighbors.
    Input:
    xTrain = n x d matrix. n=rows and d=features
    yTrain = n x 1 array. n=rows with label value
    xTest = m x d matrix. m=rows and d=features (same amount of features as xTrain).
            If omitted, the training points themselves are predicted.
    k = number of nearest neighbors to be found
    Output:
    predictions = predicted labels, ie preds(i) is the predicted label of xTest(i,:)
    """
    import numpy as np
    # `xTest == None` compares elementwise when xTest is an array, and the
    # resulting array has no single truth value; test identity instead.
    if xTest is None:
        xTest = xTrain

    indices, _ = knn_distances(xTrain,xTest,k)
    # Entry (i, j) is the label of the i-th nearest neighbor of test point j.
    neighbor_labels = np.asarray(yTrain).flatten()[indices]
    predictions = []
    for j in range(neighbor_labels.shape[1]):
        votes = list(neighbor_labels[:, j])
        # Most frequent label; on a tie the label met first (closest neighbor) wins.
        predictions.append(max(votes,key=votes.count))
    return np.array(predictions)

def score(prediction,true_values):
    """Return the fraction of entries of prediction that equal true_values."""
    n_correct = np.sum(prediction == true_values)
    n_total = len(prediction)
    return n_correct/n_total


score(knn_predictions(X,Y,k=5),Y)

0.9583333333333334


%%timeit

score(knn_predictions(X,Y,k=5),Y)

The slowest run took 17.36 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 5: 4.66 ms per loop


k = 3
X_proj_kd = random_projection(X,X.shape[1],k)


score(knn_predictions(X_proj_kd,Y,k=5),Y)

0.8472222222222222


%%timeit

score(knn_predictions(X_proj_kd,Y,k=5),Y)

1000 loops, best of 5: 1.04 ms per loop


def standardScaler(X_in):
    '''Takes an array of shape (n_samples,n_features) and centers and normalizes the data

    Every feature (column) is shifted to mean 0 and divided by its standard
    deviation. A constant feature has standard deviation 0; it is only
    centered (all its values become 0) instead of turning into NaN through 0/0.
    '''
    std = np.std(X_in,axis=0)
    # (std == 0) counts as 1 exactly where the deviation is 0, so those
    # entries become 1 and every other entry is left unchanged.
    safe_std = std + (std == 0)
    X_out = (X_in-np.mean(X_in,axis=0))/safe_std
    return X_out


if (k in [2,3]):
    X_proj_kd_rescale = standardScaler(X_proj_kd)
    class0 = X_proj_kd_rescale[Y==0]
    class1 = X_proj_kd_rescale[Y==1]
    P=points(class0,color='blue',size=20)
    P+=points(class1,color='red',size=20)
    P.show()

Introduction to Data Science: A Comp-Math-Stat Approach ¶

1MS041, 2021¶

16. High-Dimensional Space¶

Topics¶

Some tail inequalities¶

Markov's inequality¶

Proof¶

Chebyshev's inequality¶

Proof¶

LLN (Law of large numbers)¶

Proof¶

The LLN and volume of the unit ball in $d$ dimensions¶

Let's try it¶

Let's also look at the average length¶

The geometry of high dimension¶

Some preliminaries¶

Properties of the unit ball¶

Generating points uniformly at random from a ball¶

Uniform at random on the unit sphere (better version)¶

How do we generate uniform at random from the unit ball?¶

Gaussian Annulus theorem¶

Proof¶

K-Nearest Neighbors Algorithm¶

Random Projection and Johnson-Lindenstrauss Lemma¶

Random Projection Theorem¶

What can you get from the above simulation?¶

You try at home¶

Proof¶

Johnson-Lindenstrauss Lemma¶

Proof¶

YouTry:¶

High dimensional data-set and nearest neighbor search¶

Description¶

Introduction to Data Science: A Comp-Math-Stat Approach¶

1MS041, 2021¶

16. High-Dimensional Space¶

Topics¶

Some tail inequalities¶

Markov's inequality¶

Proof¶

Chebyshev's inequality¶

Proof¶

LLN (Law of large numbers)¶

Proof¶

The LLN and volume of the unit ball in $d$ dimensions¶

Let's try it¶

Let's also look at the average length¶

The geometry of high dimension¶

Some preliminaries¶

Properties of the unit ball¶

Generating points uniformly at random from a ball¶

Uniform at random on the unit sphere (better version)¶

How do we generate uniform at random from the unit ball?¶

Gaussian Annulus theorem¶

Proof¶

K-Nearest Neighbors Algorithm¶

Random Projection and Johnson-Lindenstrauss Lemma¶

Random Projection Theorem¶

What can you get from the above simulation?¶

You try at home¶

Proof¶

Johnson-Lindenstrauss Lemma¶

Proof¶

YouTry:¶

High dimensional data-set and nearest neighbor search¶

Description¶

Introduction to Data Science: A Comp-Math-Stat Approach ¶