Introduction to Data Science

1MS041, 2023

©2023 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

Generating random variables in different forms

In [76]:
from Utils import linConGen
import numpy as np
import matplotlib.pyplot as plt
from Utils import discrete_histogram
In [77]:
m,a,b = (2**32, 1103515245,12345)
seed = 1
np.array(linConGen(m,a,b,seed,10))/m
Out[77]:
array([2.32830644e-10, 2.56935039e-01, 5.87870652e-01, 1.54325758e-01,
       7.67266943e-01, 9.73813963e-01, 5.85868151e-01, 8.51115584e-01,
       6.13215341e-01, 7.47386723e-01])
In [78]:
def random():
    """Generates one random sample from the uniform [0,1] distribution"""
    global seed
    seed = linConGen(m,a,b,seed,2)[1]
    return seed/m
In [81]:
unif_x = [random() for i in range(1000)]
In [82]:
_=plt.hist(unif_x,bins=10)

Generating Bernoulli rv.s

In [83]:
def unif_to_bernoulli(x,p):
    from math import floor
    return floor(x+p)
In [84]:
bernoulli_x = [unif_to_bernoulli(random(),0.9) for i in range(1000)]
In [85]:
discrete_histogram(bernoulli_x,normed=True)

Generating integers with different probabilites

In [86]:
import numpy as np
In [87]:
p = np.array([0.1,0.2,0.5,0.2])
In [88]:
p_cumsum = np.cumsum(p)
In [89]:
p_cumsum
Out[89]:
array([0.1, 0.3, 0.8, 1. ])
In [96]:
uniform_x = np.array([random() for i in range(10000)])
#np.argmax(uniform_x-p_cumsum,)
In [97]:
xp = (uniform_x.reshape(-1,1)-p_cumsum.reshape(1,-1)) # (10000,1), (1,4) -> (10000,4)
xp.shape
Out[97]:
(10000, 4)
In [ ]:
xp <= 0
x - p1
x - p1 - p2
x - p1 - p2 -p3
...
(0,p1),(p1,p1+p2),...,(p1+p2+p3,1]
0, 1, ..., 3
In [98]:
from Utils import discrete_histogram
discrete_histogram(np.argmax(xp <= 0,axis=1),normed=True)

Shuffling an array

-- To shuffle an array a of n elements (indices 0..n-1): for i from n−1 downto 1 do j ← random integer such that 0 ≤ j ≤ i exchange a[j] and a[i]

$$ F^{-1} (y) = \min\{x: F(x) \geq y\} $$
In [99]:
def randint(b):
    """Producing random integers between 0 and b inclusive"""
    u = random()
    from math import floor
    return floor(u*(b+1))
In [101]:
from Utils import plotEDF,makeEDF
random_samples_int = [randint(10) for i in range(10000)]
discrete_histogram(random_samples_int)
plotEDF(makeEDF(random_samples_int))
In [106]:
arr = np.arange(0,100)
n = len(arr)
for i in range(n-1,0,-1):
    j = randint(i) # Random integer between 0 and i inclusive
    tmp = arr[j]
    arr[j] = arr[i]
    arr[i] = tmp
In [107]:
arr
Out[107]:
array([68, 42, 71, 11, 65, 75, 74, 85, 60, 91, 80, 56, 87, 30, 35, 89, 93,
       58, 17,  4, 13, 16, 39, 57,  2, 36, 37, 96, 19,  0, 62, 29,  3,  6,
       64, 66, 88, 77, 38, 24, 83, 90, 20, 81, 10, 84, 61, 86, 63, 95, 45,
       41, 98, 26, 44, 54, 22, 82, 14, 49, 76, 23, 21, 79, 51, 32, 28,  7,
       46, 97, 53, 43, 18,  1, 55,  9, 78, 48, 52, 59, 12,  8, 50, 31, 73,
       33, 70, 25,  5, 34, 92, 72, 94, 99, 67, 15, 27, 47, 69, 40])
In [108]:
def random_shuffle(arr):
    """Shuffles an array in place"""
    n = len(arr)
    for i in range(n-1,0,-1):
        j = randint(i) # Random integer between 0 and i inclusive
        tmp = arr[j]
        arr[j] = arr[i]
        arr[i] = tmp
    return arr
In [109]:
aa = np.arange(10)
random_shuffle(aa)
aa
Out[109]:
array([3, 7, 4, 9, 5, 6, 8, 1, 2, 0])
In [112]:
from sklearn.datasets import make_classification
X,y = make_classification()
In [114]:
index = np.arange(0,len(y))
random_shuffle(index)
X_shuffle = X[index,:]
y_shuffle = y[index]

Permutation testing

See chapter 10 in All of Statistics

In [115]:
from math import floor
X = np.array([floor(random()+0.5) for i in range(100)])
Y = np.array([floor(random()+0.2) for i in range(100)])
XY = np.concatenate([X,Y])
In [116]:
def compute_diff(arr):
    return np.abs(np.mean(arr[:100])-np.mean(arr[100:]))
In [117]:
compute_diff(XY)
Out[117]:
0.35
In [118]:
differences = np.array([compute_diff(random_shuffle(XY)) for i in range(10000)])
In [119]:
from Utils import plotEDF,makeEDF
plotEDF(makeEDF(differences))

Box Muller

Suppose that $U_1,U_2 \overset{\text{IID}}{\sim} \text{Uniform}([0,1])$, then \begin{align*} Z_0 &= \sqrt{-2\ln(U_1)} \cos(2\pi U_2) \\ Z_1 &= \sqrt{-2\ln(U_1)} \sin(2\pi U_2) \\ \end{align*} are independent random variables, and $Z_0,Z_1 \sim \mathcal{N}(0,1)$.

In [120]:
uniform_sequence = np.array([random() for i in range(10000)])
re_unif = uniform_sequence.reshape(-1,2)
Z1 = np.sqrt(-2*np.log(re_unif[:,0]))*np.cos(2*np.pi*re_unif[:,1])
Z2 = np.sqrt(-2*np.log(re_unif[:,0]))*np.sin(2*np.pi*re_unif[:,1])
Z = np.column_stack([Z1,Z2])
In [121]:
import matplotlib.pyplot as plt
plt.scatter(Z1,Z2,alpha=0.1)
Out[121]:
<matplotlib.collections.PathCollection at 0x15f458550>
In [122]:
from Utils import makeEDF,plotEDF
from scipy.stats import norm
x_plot = np.linspace(-3,3)
plotEDF(makeEDF(Z.flatten()),force_display=False)
plt.plot(x_plot,norm.cdf(x_plot),color='red')
Out[122]:
[<matplotlib.lines.Line2D at 0x161e98dc0>]
In [123]:
_=plt.hist(Z.flatten())

Rejection sampling

  1. input
    • a target density $f(x)$
    • a sampling density $g(x)$ that satisfies $f(x) \leq M g(x)$
  2. output: a sequence of samples $x_0, \ldots$ with distribution $f$

  3. Sample initial state $X^{(0)}$ from $g$

REPEAT

  1. At iteration $t$,
  2. Generate $x$ from $g$ and compute the ratio $r(x) = \frac{f(x)}{Mg(x)}$
  3. Draw $U \sim \text{uniform}([0,1])$ and set $X^{{t+1}} = x$, if $U \leq r(x)$, otherwise goto 2?

UNTIL desired number of samples are obtained.

Let us sample from the uniform distribution on the unit disk, our proposal distribution can be taken to be the uniform distribution on the 2-square. The area of the disk is $r^2 \pi$, i.e. the density $f$ should be $1/\pi$ on the circle and 0 elsewhere. The 2-square has area $4$ so the density of $g = 1/4$, we thus need $M = 4/\pi$

In [132]:
samples = []
rejections = 0
while (len(samples) < 100000):
    # We know that the ratio is 1 when x is in the circle and 0 if it is outside
    (X1,X2) = (2*random()-1,2*random()-1)
    if (np.linalg.norm([X1,X2]) <= 1): # We are inside the circle
        samples.append((X1,X2))
    else:
        rejections+=1
In [138]:
100000/(100000+rejections)
Out[138]:
0.7847137756503315
In [139]:
np.pi/4
Out[139]:
0.7853981633974483
In [125]:
samp_arr = np.array(samples)
In [126]:
plt.scatter(samp_arr[:,0],samp_arr[:,1])
Out[126]:
<matplotlib.collections.PathCollection at 0x10ef84460>
In [127]:
_=plt.hist2d(samp_arr[:,0],samp_arr[:,1])
In [156]:
f = np.array([0.1,0.2,0.5,0.2])
g = np.array([1/4,1/4,1/4,1/4])
M = 2
f-M*g
Out[156]:
array([-0.275, -0.175,  0.125, -0.175])
In [ ]:
 
In [157]:
samples = []

r = f/(M*g)
rejections = 0
while (len(samples) < 10000):
    x = randint(3)
    r[x]
    U = random()
    if (U <= r[x]):
        samples.append(x)
    else:
        rejections+=1
In [158]:
rejections/(10000+rejections)
Out[158]:
0.4117647058823529
In [159]:
discrete_histogram(samples,normed=True)

Sampling from the Poisson

The Poisson has distribution function $$ F(x) = e^{-\lambda} \sum_{j=0}^{\lfloor x \rfloor} \frac{\lambda^j}{j!} $$

In [ ]:
def sample_poisson():
    from scipy.special import factorial
    Y = random()
    l = 2

    F = 0
    for i in range(1000):
        F = F + np.exp(-l)*np.power(l,i)/factorial(i)
        if (F > Y):
            break
    return i
In [ ]:
poisson_samples = [sample_poisson() for i in range(1000)]
f = []
for i in range(13):
    f.append(np.exp(-l)*np.power(l,i)/factorial(i))
F = np.cumsum(f)
plt.plot(F,color='black')
plotEDF(makeEDF(poisson_samples))
discrete_histogram(poisson_samples)
In [ ]: