Introduction to Data Science¶
1MS041, 2024¶
©2024 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
Generating random variables in different forms¶
from Utils import linConGen
import numpy as np
import matplotlib.pyplot as plt
from Utils import discrete_histogram
m,a,b = (2**32, 1103515245,12345)
seed = 1
np.array(linConGen(m,a,b,seed,10))/m
array([2.32830644e-10, 2.56935039e-01, 5.87870652e-01, 1.54325758e-01,
7.67266943e-01, 9.73813963e-01, 5.85868151e-01, 8.51115584e-01,
6.13215341e-01, 7.47386723e-01])
def random():
"""Generates one random sample from the uniform [0,1] distribution"""
global seed
seed = linConGen(m,a,b,seed,2)[1]
return seed/m
unif_x = [random() for i in range(1000)]
_=plt.hist(unif_x,bins=10)
Generating Bernoulli rv.s¶
def unif_to_bernoulli(x,p):
from math import floor
return floor(x+p)
bernoulli_x = [unif_to_bernoulli(random(),0.9) for i in range(1000)]
discrete_histogram(bernoulli_x,normed=True)
Generating integers with different probabilites¶
import numpy as np
p = np.array([0.1,0.2,0.5,0.2])
p_cumsum = np.cumsum(p)
p_cumsum
array([0.1, 0.3, 0.8, 1. ])
uniform_x = np.array([random() for i in range(10000)])
#np.argmax(uniform_x-p_cumsum,)
xp = (uniform_x.reshape(-1,1)-p_cumsum.reshape(1,-1)) # (10000,1), (1,4) -> (10000,4)
xp.shape
(10000, 4)
xp <= 0
x - p1
x - p1 - p2
x - p1 - p2 -p3
...
(0,p1),(p1,p1+p2),...,(p1+p2+p3,1]
0, 1, ..., 3
from Utils import discrete_histogram
discrete_histogram(np.argmax(xp <= 0,axis=1),normed=True)
Shuffling an array¶
--
for i from n−1 downto 1 do
j ← random integer such that 0 ≤ j ≤ i
exchange a[j] and a[i]```
$$ F^{-1} (y) = \min\{x: F(x) \geq y\} $$
def randint(b):
"""Producing random integers between 0 and b inclusive"""
u = random()
from math import floor
return floor(u*(b+1))
from Utils import plotEDF,makeEDF
random_samples_int = [randint(10) for i in range(10000)]
discrete_histogram(random_samples_int)
plotEDF(makeEDF(random_samples_int))
arr = np.arange(0,100)
n = len(arr)
for i in range(n-1,0,-1):
j = randint(i) # Random integer between 0 and i inclusive
tmp = arr[j]
arr[j] = arr[i]
arr[i] = tmp
arr
array([68, 42, 71, 11, 65, 75, 74, 85, 60, 91, 80, 56, 87, 30, 35, 89, 93,
58, 17, 4, 13, 16, 39, 57, 2, 36, 37, 96, 19, 0, 62, 29, 3, 6,
64, 66, 88, 77, 38, 24, 83, 90, 20, 81, 10, 84, 61, 86, 63, 95, 45,
41, 98, 26, 44, 54, 22, 82, 14, 49, 76, 23, 21, 79, 51, 32, 28, 7,
46, 97, 53, 43, 18, 1, 55, 9, 78, 48, 52, 59, 12, 8, 50, 31, 73,
33, 70, 25, 5, 34, 92, 72, 94, 99, 67, 15, 27, 47, 69, 40])
def random_shuffle(arr):
"""Shuffles an array in place"""
n = len(arr)
for i in range(n-1,0,-1):
j = randint(i) # Random integer between 0 and i inclusive
tmp = arr[j]
arr[j] = arr[i]
arr[i] = tmp
return arr
aa = np.arange(10)
random_shuffle(aa)
aa
array([3, 7, 4, 9, 5, 6, 8, 1, 2, 0])
from sklearn.datasets import make_classification
X,y = make_classification()
index = np.arange(0,len(y))
random_shuffle(index)
X_shuffle = X[index,:]
y_shuffle = y[index]
Permutation testing¶
See chapter 10 in All of Statistics
from math import floor
X = np.array([floor(random()+0.5) for i in range(100)])
Y = np.array([floor(random()+0.2) for i in range(100)])
XY = np.concatenate([X,Y])
def compute_diff(arr):
return np.abs(np.mean(arr[:100])-np.mean(arr[100:]))
compute_diff(XY)
0.35
differences = np.array([compute_diff(random_shuffle(XY)) for i in range(10000)])
from Utils import plotEDF,makeEDF
plotEDF(makeEDF(differences))
Box Muller¶
Suppose that $U_1,U_2 \overset{\text{IID}}{\sim} \text{Uniform}([0,1])$, then \begin{align*} Z_0 &= \sqrt{-2\ln(U_1)} \cos(2\pi U_2) \\ Z_1 &= \sqrt{-2\ln(U_1)} \sin(2\pi U_2) \\ \end{align*} are independent random variables, and $Z_0,Z_1 \sim \mathcal{N}(0,1)$.
uniform_sequence = np.array([random() for i in range(10000)])
re_unif = uniform_sequence.reshape(-1,2)
Z1 = np.sqrt(-2*np.log(re_unif[:,0]))*np.cos(2*np.pi*re_unif[:,1])
Z2 = np.sqrt(-2*np.log(re_unif[:,0]))*np.sin(2*np.pi*re_unif[:,1])
Z = np.column_stack([Z1,Z2])
import matplotlib.pyplot as plt
plt.scatter(Z1,Z2,alpha=0.1)
<matplotlib.collections.PathCollection at 0x15f458550>
from Utils import makeEDF,plotEDF
from scipy.stats import norm
x_plot = np.linspace(-3,3)
plotEDF(makeEDF(Z.flatten()),force_display=False)
plt.plot(x_plot,norm.cdf(x_plot),color='red')
[<matplotlib.lines.Line2D at 0x161e98dc0>]
_=plt.hist(Z.flatten())
Rejection sampling¶
input
- a target density $f(x)$
- a sampling density $g(x)$ that satisfies $f(x) \leq M g(x)$
output: a sequence of samples $x_0, \ldots$ with distribution $f$
Sample initial state $X^{(0)}$ from $g$
REPEAT
- At iteration $t$,
- Generate $x$ from $g$ and compute the ratio $r(x) = \frac{f(x)}{Mg(x)}$
- Draw $U \sim \text{uniform}([0,1])$ and set $X^{{t+1}} = x$, if $U \leq r(x)$, otherwise goto 2?
UNTIL desired number of samples are obtained.
Let us sample from the uniform distribution on the unit disk, our proposal distribution can be taken to be the uniform distribution on the 2-square. The area of the disk is $r^2 \pi$, i.e. the density $f$ should be $1/\pi$ on the circle and 0 elsewhere. The 2-square has area $4$ so the density of $g = 1/4$, we thus need $M = 4/\pi$
samples = []
rejections = 0
while (len(samples) < 100000):
# We know that the ratio is 1 when x is in the circle and 0 if it is outside
(X1,X2) = (2*random()-1,2*random()-1)
if (np.linalg.norm([X1,X2]) <= 1): # We are inside the circle
samples.append((X1,X2))
else:
rejections+=1
100000/(100000+rejections)
0.7847137756503315
np.pi/4
0.7853981633974483
samp_arr = np.array(samples)
plt.scatter(samp_arr[:,0],samp_arr[:,1])
<matplotlib.collections.PathCollection at 0x10ef84460>
_=plt.hist2d(samp_arr[:,0],samp_arr[:,1])
f = np.array([0.1,0.2,0.5,0.2])
g = np.array([1/4,1/4,1/4,1/4])
M = 2
f-M*g
array([-0.275, -0.175, 0.125, -0.175])
samples = []
r = f/(M*g)
rejections = 0
while (len(samples) < 10000):
x = randint(3)
r[x]
U = random()
if (U <= r[x]):
samples.append(x)
else:
rejections+=1
rejections/(10000+rejections)
0.4117647058823529
discrete_histogram(samples,normed=True)
Sampling from the Poisson¶
The Poisson has distribution function $$ F(x) = e^{-\lambda} \sum_{j=0}^{\lfloor x \rfloor} \frac{\lambda^j}{j!} $$
def sample_poisson():
from scipy.special import factorial
Y = random()
l = 2
F = 0
for i in range(1000):
F = F + np.exp(-l)*np.power(l,i)/factorial(i)
if (F > Y):
break
return i
poisson_samples = [sample_poisson() for i in range(1000)]
f = []
for i in range(13):
f.append(np.exp(-l)*np.power(l,i)/factorial(i))
F = np.cumsum(f)
plt.plot(F,color='black')
plotEDF(makeEDF(poisson_samples))
discrete_histogram(poisson_samples)