Introduction to Data Science¶

1MS041, 2024¶

©2024 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

Generating random variables in different forms¶

In [76]:
from Utils import linConGen
import numpy as np
import matplotlib.pyplot as plt
from Utils import discrete_histogram
In [77]:
m,a,b = (2**32, 1103515245,12345)
seed = 1
np.array(linConGen(m,a,b,seed,10))/m
Out[77]:
array([2.32830644e-10, 2.56935039e-01, 5.87870652e-01, 1.54325758e-01,
       7.67266943e-01, 9.73813963e-01, 5.85868151e-01, 8.51115584e-01,
       6.13215341e-01, 7.47386723e-01])
In [78]:
def random():
    """Generates one random sample from the uniform [0,1] distribution"""
    global seed
    seed = linConGen(m,a,b,seed,2)[1]
    return seed/m
In [81]:
unif_x = [random() for i in range(1000)]
In [82]:
_=plt.hist(unif_x,bins=10)
No description has been provided for this image

Generating Bernoulli rv.s¶

In [83]:
def unif_to_bernoulli(x,p):
    from math import floor
    return floor(x+p)
In [84]:
bernoulli_x = [unif_to_bernoulli(random(),0.9) for i in range(1000)]
In [85]:
discrete_histogram(bernoulli_x,normed=True)
No description has been provided for this image

Generating integers with different probabilites¶

In [86]:
import numpy as np
In [87]:
p = np.array([0.1,0.2,0.5,0.2])
In [88]:
p_cumsum = np.cumsum(p)
In [89]:
p_cumsum
Out[89]:
array([0.1, 0.3, 0.8, 1. ])
In [96]:
uniform_x = np.array([random() for i in range(10000)])
#np.argmax(uniform_x-p_cumsum,)
In [97]:
xp = (uniform_x.reshape(-1,1)-p_cumsum.reshape(1,-1)) # (10000,1), (1,4) -> (10000,4)
xp.shape
Out[97]:
(10000, 4)
In [ ]:
xp <= 0
x - p1
x - p1 - p2
x - p1 - p2 -p3
...
(0,p1),(p1,p1+p2),...,(p1+p2+p3,1]
0, 1, ..., 3
In [98]:
from Utils import discrete_histogram
discrete_histogram(np.argmax(xp <= 0,axis=1),normed=True)
No description has been provided for this image

Shuffling an array¶

--
for i from n−1 downto 1 do
     j ← random integer such that 0 ≤ j ≤ i
     exchange a[j] and a[i]```

$$ F^{-1} (y) = \min\{x: F(x) \geq y\} $$

In [99]:
def randint(b):
    """Producing random integers between 0 and b inclusive"""
    u = random()
    from math import floor
    return floor(u*(b+1))
In [101]:
from Utils import plotEDF,makeEDF
random_samples_int = [randint(10) for i in range(10000)]
discrete_histogram(random_samples_int)
plotEDF(makeEDF(random_samples_int))
No description has been provided for this image
No description has been provided for this image
In [106]:
arr = np.arange(0,100)
n = len(arr)
for i in range(n-1,0,-1):
    j = randint(i) # Random integer between 0 and i inclusive
    tmp = arr[j]
    arr[j] = arr[i]
    arr[i] = tmp
In [107]:
arr
Out[107]:
array([68, 42, 71, 11, 65, 75, 74, 85, 60, 91, 80, 56, 87, 30, 35, 89, 93,
       58, 17,  4, 13, 16, 39, 57,  2, 36, 37, 96, 19,  0, 62, 29,  3,  6,
       64, 66, 88, 77, 38, 24, 83, 90, 20, 81, 10, 84, 61, 86, 63, 95, 45,
       41, 98, 26, 44, 54, 22, 82, 14, 49, 76, 23, 21, 79, 51, 32, 28,  7,
       46, 97, 53, 43, 18,  1, 55,  9, 78, 48, 52, 59, 12,  8, 50, 31, 73,
       33, 70, 25,  5, 34, 92, 72, 94, 99, 67, 15, 27, 47, 69, 40])
In [108]:
def random_shuffle(arr):
    """Shuffles an array in place"""
    n = len(arr)
    for i in range(n-1,0,-1):
        j = randint(i) # Random integer between 0 and i inclusive
        tmp = arr[j]
        arr[j] = arr[i]
        arr[i] = tmp
    return arr
In [109]:
aa = np.arange(10)
random_shuffle(aa)
aa
Out[109]:
array([3, 7, 4, 9, 5, 6, 8, 1, 2, 0])
In [112]:
from sklearn.datasets import make_classification
X,y = make_classification()
In [114]:
index = np.arange(0,len(y))
random_shuffle(index)
X_shuffle = X[index,:]
y_shuffle = y[index]

Permutation testing¶

See chapter 10 in All of Statistics

In [115]:
from math import floor
X = np.array([floor(random()+0.5) for i in range(100)])
Y = np.array([floor(random()+0.2) for i in range(100)])
XY = np.concatenate([X,Y])
In [116]:
def compute_diff(arr):
    return np.abs(np.mean(arr[:100])-np.mean(arr[100:]))
In [117]:
compute_diff(XY)
Out[117]:
0.35
In [118]:
differences = np.array([compute_diff(random_shuffle(XY)) for i in range(10000)])
In [119]:
from Utils import plotEDF,makeEDF
plotEDF(makeEDF(differences))
No description has been provided for this image

Box Muller¶

Suppose that $U_1,U_2 \overset{\text{IID}}{\sim} \text{Uniform}([0,1])$, then \begin{align*} Z_0 &= \sqrt{-2\ln(U_1)} \cos(2\pi U_2) \\ Z_1 &= \sqrt{-2\ln(U_1)} \sin(2\pi U_2) \\ \end{align*} are independent random variables, and $Z_0,Z_1 \sim \mathcal{N}(0,1)$.

In [120]:
uniform_sequence = np.array([random() for i in range(10000)])
re_unif = uniform_sequence.reshape(-1,2)
Z1 = np.sqrt(-2*np.log(re_unif[:,0]))*np.cos(2*np.pi*re_unif[:,1])
Z2 = np.sqrt(-2*np.log(re_unif[:,0]))*np.sin(2*np.pi*re_unif[:,1])
Z = np.column_stack([Z1,Z2])
In [121]:
import matplotlib.pyplot as plt
plt.scatter(Z1,Z2,alpha=0.1)
Out[121]:
<matplotlib.collections.PathCollection at 0x15f458550>
No description has been provided for this image
In [122]:
from Utils import makeEDF,plotEDF
from scipy.stats import norm
x_plot = np.linspace(-3,3)
plotEDF(makeEDF(Z.flatten()),force_display=False)
plt.plot(x_plot,norm.cdf(x_plot),color='red')
Out[122]:
[<matplotlib.lines.Line2D at 0x161e98dc0>]
No description has been provided for this image
In [123]:
_=plt.hist(Z.flatten())
No description has been provided for this image

Rejection sampling¶

  1. input

    • a target density $f(x)$
    • a sampling density $g(x)$ that satisfies $f(x) \leq M g(x)$
  2. output: a sequence of samples $x_0, \ldots$ with distribution $f$

  3. Sample initial state $X^{(0)}$ from $g$

REPEAT

  1. At iteration $t$,
  2. Generate $x$ from $g$ and compute the ratio $r(x) = \frac{f(x)}{Mg(x)}$
  3. Draw $U \sim \text{uniform}([0,1])$ and set $X^{{t+1}} = x$, if $U \leq r(x)$, otherwise goto 2?

UNTIL desired number of samples are obtained.

Let us sample from the uniform distribution on the unit disk, our proposal distribution can be taken to be the uniform distribution on the 2-square. The area of the disk is $r^2 \pi$, i.e. the density $f$ should be $1/\pi$ on the circle and 0 elsewhere. The 2-square has area $4$ so the density of $g = 1/4$, we thus need $M = 4/\pi$

In [132]:
samples = []
rejections = 0
while (len(samples) < 100000):
    # We know that the ratio is 1 when x is in the circle and 0 if it is outside
    (X1,X2) = (2*random()-1,2*random()-1)
    if (np.linalg.norm([X1,X2]) <= 1): # We are inside the circle
        samples.append((X1,X2))
    else:
        rejections+=1
In [138]:
100000/(100000+rejections)
Out[138]:
0.7847137756503315
In [139]:
np.pi/4
Out[139]:
0.7853981633974483
In [125]:
samp_arr = np.array(samples)
In [126]:
plt.scatter(samp_arr[:,0],samp_arr[:,1])
Out[126]:
<matplotlib.collections.PathCollection at 0x10ef84460>
No description has been provided for this image
In [127]:
_=plt.hist2d(samp_arr[:,0],samp_arr[:,1])
No description has been provided for this image
In [156]:
f = np.array([0.1,0.2,0.5,0.2])
g = np.array([1/4,1/4,1/4,1/4])
M = 2
f-M*g
Out[156]:
array([-0.275, -0.175,  0.125, -0.175])
In [ ]:
 
In [157]:
samples = []

r = f/(M*g)
rejections = 0
while (len(samples) < 10000):
    x = randint(3)
    r[x]
    U = random()
    if (U <= r[x]):
        samples.append(x)
    else:
        rejections+=1
In [158]:
rejections/(10000+rejections)
Out[158]:
0.4117647058823529
In [159]:
discrete_histogram(samples,normed=True)
No description has been provided for this image

Sampling from the Poisson¶

The Poisson has distribution function $$ F(x) = e^{-\lambda} \sum_{j=0}^{\lfloor x \rfloor} \frac{\lambda^j}{j!} $$

In [ ]:
def sample_poisson():
    from scipy.special import factorial
    Y = random()
    l = 2

    F = 0
    for i in range(1000):
        F = F + np.exp(-l)*np.power(l,i)/factorial(i)
        if (F > Y):
            break
    return i
In [ ]:
poisson_samples = [sample_poisson() for i in range(1000)]
f = []
for i in range(13):
    f.append(np.exp(-l)*np.power(l,i)/factorial(i))
F = np.cumsum(f)
plt.plot(F,color='black')
plotEDF(makeEDF(poisson_samples))
discrete_histogram(poisson_samples)
In [ ]: