Introduction to Data Science ¶

1MS041, 2023¶

Generating random variables in different forms¶

from Utils import linConGen
import numpy as np
import matplotlib.pyplot as plt
from Utils import discrete_histogram

m,a,b = (2**32, 1103515245,12345)
seed = 1
np.array(linConGen(m,a,b,seed,10))/m

array([2.32830644e-10, 2.56935039e-01, 5.87870652e-01, 1.54325758e-01,
       7.67266943e-01, 9.73813963e-01, 5.85868151e-01, 8.51115584e-01,
       6.13215341e-01, 7.47386723e-01])

def random():
    """Generates one random sample from the uniform [0,1] distribution"""
    global seed
    seed = linConGen(m,a,b,seed,2)[1]
    return seed/m

unif_x = [random() for i in range(1000)]

_=plt.hist(unif_x,bins=10)

Generating Bernoulli rv.s¶

def unif_to_bernoulli(x,p):
    from math import floor
    return floor(x+p)

bernoulli_x = [unif_to_bernoulli(random(),0.9) for i in range(1000)]

discrete_histogram(bernoulli_x,normed=True)

Generating integers with different probabilites¶

import numpy as np

p = np.array([0.1,0.2,0.5,0.2])

p_cumsum = np.cumsum(p)

p_cumsum

array([0.1, 0.3, 0.8, 1. ])

uniform_x = np.array([random() for i in range(10000)])
#np.argmax(uniform_x-p_cumsum,)

xp = (uniform_x.reshape(-1,1)-p_cumsum.reshape(1,-1)) # (10000,1), (1,4) -> (10000,4)
xp.shape

(10000, 4)

xp <= 0
x - p1
x - p1 - p2
x - p1 - p2 -p3
...
(0,p1),(p1,p1+p2),...,(p1+p2+p3,1]
0, 1, ..., 3

from Utils import discrete_histogram
discrete_histogram(np.argmax(xp <= 0,axis=1),normed=True)

Shuffling an array¶

-- To shuffle an array a of n elements (indices 0..n-1): for i from n−1 downto 1 do j ← random integer such that 0 ≤ j ≤ i exchange a[j] and a[i]

$$ F^{-1} (y) = \min\{x: F(x) \geq y\} $$

def randint(b):
    """Producing random integers between 0 and b inclusive"""
    u = random()
    from math import floor
    return floor(u*(b+1))

from Utils import plotEDF,makeEDF
random_samples_int = [randint(10) for i in range(10000)]
discrete_histogram(random_samples_int)
plotEDF(makeEDF(random_samples_int))

arr = np.arange(0,100)
n = len(arr)
for i in range(n-1,0,-1):
    j = randint(i) # Random integer between 0 and i inclusive
    tmp = arr[j]
    arr[j] = arr[i]
    arr[i] = tmp

arr

array([68, 42, 71, 11, 65, 75, 74, 85, 60, 91, 80, 56, 87, 30, 35, 89, 93,
       58, 17,  4, 13, 16, 39, 57,  2, 36, 37, 96, 19,  0, 62, 29,  3,  6,
       64, 66, 88, 77, 38, 24, 83, 90, 20, 81, 10, 84, 61, 86, 63, 95, 45,
       41, 98, 26, 44, 54, 22, 82, 14, 49, 76, 23, 21, 79, 51, 32, 28,  7,
       46, 97, 53, 43, 18,  1, 55,  9, 78, 48, 52, 59, 12,  8, 50, 31, 73,
       33, 70, 25,  5, 34, 92, 72, 94, 99, 67, 15, 27, 47, 69, 40])

def random_shuffle(arr):
    """Shuffles an array in place"""
    n = len(arr)
    for i in range(n-1,0,-1):
        j = randint(i) # Random integer between 0 and i inclusive
        tmp = arr[j]
        arr[j] = arr[i]
        arr[i] = tmp
    return arr

aa = np.arange(10)
random_shuffle(aa)
aa

array([3, 7, 4, 9, 5, 6, 8, 1, 2, 0])

from sklearn.datasets import make_classification
X,y = make_classification()

index = np.arange(0,len(y))
random_shuffle(index)
X_shuffle = X[index,:]
y_shuffle = y[index]

Permutation testing¶

See chapter 10 in All of Statistics

from math import floor
X = np.array([floor(random()+0.5) for i in range(100)])
Y = np.array([floor(random()+0.2) for i in range(100)])
XY = np.concatenate([X,Y])

def compute_diff(arr):
    return np.abs(np.mean(arr[:100])-np.mean(arr[100:]))

compute_diff(XY)

0.35

differences = np.array([compute_diff(random_shuffle(XY)) for i in range(10000)])

from Utils import plotEDF,makeEDF
plotEDF(makeEDF(differences))

Box Muller¶

Suppose that $U_1,U_2 \overset{\text{IID}}{\sim} \text{Uniform}([0,1])$, then \begin{align*} Z_0 &= \sqrt{-2\ln(U_1)} \cos(2\pi U_2) \\ Z_1 &= \sqrt{-2\ln(U_1)} \sin(2\pi U_2) \\ \end{align*} are independent random variables, and $Z_0,Z_1 \sim \mathcal{N}(0,1)$.

uniform_sequence = np.array([random() for i in range(10000)])
re_unif = uniform_sequence.reshape(-1,2)
Z1 = np.sqrt(-2*np.log(re_unif[:,0]))*np.cos(2*np.pi*re_unif[:,1])
Z2 = np.sqrt(-2*np.log(re_unif[:,0]))*np.sin(2*np.pi*re_unif[:,1])
Z = np.column_stack([Z1,Z2])

import matplotlib.pyplot as plt
plt.scatter(Z1,Z2,alpha=0.1)

<matplotlib.collections.PathCollection at 0x15f458550>

from Utils import makeEDF,plotEDF
from scipy.stats import norm
x_plot = np.linspace(-3,3)
plotEDF(makeEDF(Z.flatten()),force_display=False)
plt.plot(x_plot,norm.cdf(x_plot),color='red')

[<matplotlib.lines.Line2D at 0x161e98dc0>]

_=plt.hist(Z.flatten())

Rejection sampling¶

input
- a target density $f(x)$
- a sampling density $g(x)$ that satisfies $f(x) \leq M g(x)$
output: a sequence of samples $x_0, \ldots$ with distribution $f$
Sample initial state $X^{(0)}$ from $g$

REPEAT

At iteration $t$,
Generate $x$ from $g$ and compute the ratio $r(x) = \frac{f(x)}{Mg(x)}$
Draw $U \sim \text{uniform}([0,1])$ and set $X^{{t+1}} = x$, if $U \leq r(x)$, otherwise goto 2?

UNTIL desired number of samples are obtained.

Let us sample from the uniform distribution on the unit disk, our proposal distribution can be taken to be the uniform distribution on the 2-square. The area of the disk is $r^2 \pi$, i.e. the density $f$ should be $1/\pi$ on the circle and 0 elsewhere. The 2-square has area $4$ so the density of $g = 1/4$, we thus need $M = 4/\pi$

samples = []
rejections = 0
while (len(samples) < 100000):
    # We know that the ratio is 1 when x is in the circle and 0 if it is outside
    (X1,X2) = (2*random()-1,2*random()-1)
    if (np.linalg.norm([X1,X2]) <= 1): # We are inside the circle
        samples.append((X1,X2))
    else:
        rejections+=1

100000/(100000+rejections)

0.7847137756503315

np.pi/4

0.7853981633974483

samp_arr = np.array(samples)

plt.scatter(samp_arr[:,0],samp_arr[:,1])

<matplotlib.collections.PathCollection at 0x10ef84460>

_=plt.hist2d(samp_arr[:,0],samp_arr[:,1])

f = np.array([0.1,0.2,0.5,0.2])
g = np.array([1/4,1/4,1/4,1/4])
M = 2
f-M*g

array([-0.275, -0.175,  0.125, -0.175])

samples = []

r = f/(M*g)
rejections = 0
while (len(samples) < 10000):
    x = randint(3)
    r[x]
    U = random()
    if (U <= r[x]):
        samples.append(x)
    else:
        rejections+=1

rejections/(10000+rejections)

0.4117647058823529

discrete_histogram(samples,normed=True)

Sampling from the Poisson¶

The Poisson has distribution function $$ F(x) = e^{-\lambda} \sum_{j=0}^{\lfloor x \rfloor} \frac{\lambda^j}{j!} $$

def sample_poisson():
    from scipy.special import factorial
    Y = random()
    l = 2

    F = 0
    for i in range(1000):
        F = F + np.exp(-l)*np.power(l,i)/factorial(i)
        if (F > Y):
            break
    return i

poisson_samples = [sample_poisson() for i in range(1000)]
f = []
for i in range(13):
    f.append(np.exp(-l)*np.power(l,i)/factorial(i))
F = np.cumsum(f)
plt.plot(F,color='black')
plotEDF(makeEDF(poisson_samples))
discrete_histogram(poisson_samples)

Introduction to Data Science¶