©2023 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
%%bash
ls data
%%bash
head -n 2 data/NYPowerBall.csv
import csv
with open("data/NYPowerBall.csv",mode='r') as f:
reader = csv.reader(f)
header = next(reader)
#data = [i for i in reader]
data = list(reader)
data[:2]
list_list_list = [line[1].split(' ') for line in data]  # column 1 holds the six winning numbers, space-separated
list_list_list[:2]
flattened_list = sum(list_list_list, [])  # concatenate all the sublists into one flat list
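A note on this flattening: `sum(list_list_list, [])` builds a new list at every step, so it is quadratic in the number of draws; `itertools.chain.from_iterable` gives the same result in linear time.
from itertools import chain
flattened_list = list(chain.from_iterable(list_list_list))  # same list, linear time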
import numpy as np
list_list_list_arr = np.array(list_list_list)  # shape (n_draws, 6), entries still strings
list_list_list_arr.flatten().shape  # same number of entries as the flattened list
int_list = [int(i) for i in flattened_list]
int_list[:2]
int_arr = np.array(int_list)
int_arr.shape
np.max(int_arr)
np.min(int_arr)
len(np.unique(int_arr))
Compute a confidence interval of what? Let us first inspect the structure of each draw.
reshaped_int_arr = int_arr.reshape(-1,6)
first_draws = reshaped_int_arr[:,0]
list_list_list[:10]
np.unique(first_draws)
# Double check: the sixth entry (index 5) of each draw should be the Powerball
np.unique([int(i[5]) for i in list_list_list])
power_ball = reshaped_int_arr[:,-1]
power_ball_mean = np.mean(power_ball)
power_ball_mean
39/2  # rough midpoint of the Powerball range 1..39
Hoeffding's inequality: if $P(X \in [a,b]) = 1$, then $$ P(|\overline{X}_n - E[X]| > \epsilon) \leq 2 e^{-\frac{2 n \epsilon^2}{(b-a)^2}} $$
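Setting the right-hand side equal to $\alpha$ and solving for $\epsilon$ gives $$ \epsilon = \sqrt{\frac{(b-a)^2}{2n} \ln\left(\frac{2}{\alpha}\right)}, $$ which is exactly what the next cell computes.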
def compute_epsilon(n, a, b, alpha):
    # Invert Hoeffding's bound: solve 2*exp(-2*n*eps**2/(b-a)**2) = alpha for eps
    return np.sqrt((b-a)**2/(2*n) * np.log(2/alpha))
alpha = 0.05
a = 1
b = 39
n = len(power_ball)
epsilon = compute_epsilon(n,a,b,alpha)
epsilon
(power_ball_mean-epsilon,power_ball_mean+epsilon)
(39+1)/2  # the true mean if the Powerball were uniform on {1,...,39}
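A quick sanity check under that uniformity assumption: does the value $20$ fall inside the interval we just computed?
true_mean = (39+1)/2  # modeling assumption: uniform draw on {1,...,39}
print(power_ball_mean - epsilon <= true_mean <= power_ball_mean + epsilon)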
from Utils import discrete_histogram
discrete_histogram(power_ball)
%%bash
ls data
%%bash
head -n 2 data/earthquakes.csv
import csv
with open("data/earthquakes.csv",mode='r') as f:
reader = csv.reader(f,skipinitialspace=True)
header = next(reader)
#data = [i for i in reader]
data = list(reader)
print(header[2],data[0][2])
format_string = "%Y-%m-%dT%H:%M:%S.%fZ"
from datetime import datetime
datetime.strptime(data[0][2],format_string)
origin_time = [datetime.strptime(line[2],format_string) for line in data]
(origin_time[0]-origin_time[1]).total_seconds()
origin_time_arr = np.array(origin_time)
sorted_origin_time = np.sort(origin_time_arr)
sorted_origin_time[1:]-sorted_origin_time[:-1]
time_between = np.diff(sorted_origin_time)
time_between_seconds = [time.total_seconds() for time in time_between]
import matplotlib.pyplot as plt
_=plt.hist(time_between_seconds,bins=200)
We construct the model space $\mathcal{M} = \{f_\lambda(x) = \lambda e^{-\lambda x}: \lambda > 0\}$
Now define the log-loss $$ L(f_\lambda,x) = -\ln(f_\lambda(x)) $$
The risk is the expected loss $$ R(f_\lambda) = E[L(f_\lambda,X)] $$
We instead minimize the empirical risk, given i.i.d. data $X = \{X_1,\ldots,X_n\}$ $$ \hat R_n(f_\lambda) = \frac{1}{n} \sum_{i=1}^n L(f_\lambda,X_i) $$
The goal is to minimize $\hat R_n(f_\lambda)$ w.r.t. $\lambda$.
Since $$ \ln(f_\lambda(x)) = \ln(\lambda) - \lambda x, $$ the empirical risk is $$ \hat R_n(f_\lambda) = -\ln(\lambda) + \lambda \frac{1}{n} \sum_{i=1}^n X_i. $$ Setting its derivative with respect to $\lambda$ to zero, $-1/\lambda + \frac{1}{n}\sum_{i=1}^n X_i = 0$, gives $$ \frac{1}{\hat \lambda} = \frac{1}{n} \sum_{i=1}^n X_i $$
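A minimal numerical check of this minimizer (the grid range is an assumption suited to inter-arrival times measured in seconds): evaluate the empirical risk on a grid of $\lambda$ values and confirm the argmin is close to $1/\overline{X}_n$.
x = np.array(time_between_seconds)
lambdas = np.linspace(1e-6, 1e-3, 2000)  # assumed range for the rate
emp_risk = [np.mean(-(np.log(lam) - lam*x)) for lam in lambdas]
print(lambdas[np.argmin(emp_risk)], 1/np.mean(x))  # the two should be close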
import matplotlib.pyplot as plt
_=plt.hist(time_between_seconds,bins=200,density=True)
hat_lambda = 1/np.mean(time_between_seconds)
x_plot = np.linspace(0,20000,100)
plt.plot(x_plot,hat_lambda*np.exp(-hat_lambda*x_plot))
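As a cross-check (assuming scipy is available), `scipy.stats.expon.fit` with the location fixed at zero returns the maximum-likelihood scale, which should equal the sample mean $1/\hat \lambda$.
from scipy import stats
loc, scale = stats.expon.fit(time_between_seconds, floc=0)
print(scale, 1/hat_lambda)  # both equal the sample mean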
Suppose we want to predict the time to the next earthquake. A natural point prediction based on our estimate $\hat \lambda$ is the mean $1/\hat \lambda$.
Train/test split: since the inter-arrival times are modeled as i.i.d., we can simply split the dataset into two halves, fit on one and evaluate on the other (a random-split sketch follows the code below).
n_train = int(len(time_between_seconds)/2)
train_set = time_between_seconds[:n_train]
test_set = time_between_seconds[n_train:]
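Under the i.i.d. assumption a random split is equally valid; a minimal sketch (the names train_rand/test_rand and the seed are arbitrary choices):
rng = np.random.default_rng(0)  # fixed seed only for reproducibility
shuffled = rng.permutation(time_between_seconds)
train_rand, test_rand = shuffled[:n_train], shuffled[n_train:]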
hat_lambda_train = 1/np.mean(train_set)
guess = 1/hat_lambda_train
guess
_=plt.hist(np.abs(guess-test_set),bins=100)
average_error = np.mean(np.abs(guess-test_set))
print(average_error)
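As a consistency check under the fitted exponential model: for $X \sim \mathrm{Exp}(\lambda)$ the mean absolute deviation about the mean is $E|X - 1/\lambda| = 2/(e\lambda)$, so the theoretical counterpart of the average error above is $(2/e) \cdot 1/\hat \lambda$.
theoretical_mae = (2/np.e)*guess  # E|X - 1/lambda| = 2/(e*lambda) for an exponential
print(theoretical_mae)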