©2022 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
!ls data
!head -n 3 data/NYPowerBall.csv
import csv
data = []
# read the Powerball draws, keeping the header row separate from the data rows
with open('data/NYPowerBall.csv', mode='r') as f:
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    for line in csv_reader:
        data.append(line)
[1,2] + [2,3]  # '+' concatenates lists; sum(..., start=[]) below uses this to flatten
# parse the space-separated winning numbers (column 1) of each draw into a list of ints
list_of_lists = [[int(str_number) for str_number in d[1].split(' ')] for d in data]
# flatten the list of lists into one long list of drawn numbers
numbers = sum(list_of_lists, start=[])
len(numbers)
import numpy as np
arr = np.array(list_of_lists)
numbers_arr = arr.flatten()
np.unique(numbers_arr)
np.max(numbers_arr)
np.min(numbers_arr)
Let's say we fix a level $\alpha \in (0,1)$ and solve the equation $$ \alpha = 2 e^{-\frac{2n\epsilon^2}{(b-a)^2}} $$ for $\epsilon$.
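Rearranging (taking logarithms and solving for $\epsilon$) gives $$ \epsilon = (b-a)\sqrt{\frac{\ln(2/\alpha)}{2n}}, $$ which is exactly what `compute_epsilon` below returns.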
def compute_epsilon(alpha, n, a, b):
    return np.sqrt((b-a)**2*np.log(2/alpha)/(2*n))
alpha = 0.05
n = numbers_arr.shape[0]
a = 1   # lower bound on the drawn numbers (cf. np.min above)
b = 69  # upper bound on the drawn numbers (cf. np.max above)
delta = compute_epsilon(alpha, n, a, b)
delta
conf_interval = (np.mean(numbers_arr)-delta, np.mean(numbers_arr)+delta)
print("Confidence interval for the mean is: ", conf_interval)
import Utils
import matplotlib.pyplot as plt
Utils.discrete_histogram(numbers_arr)
!head -n 10 data/earthquakes.csv
import csv
data = []
with open('data/earthquakes.csv', mode='r') as f:
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    for line in csv_reader:
        data.append(line)
header[2]
data[0][2]
import datetime
# the origintime strings look like ...-05-17T12:19:35.516Z, i.e. the format "%Y-%m-%dT%H:%M:%S.%fZ"
datetime.datetime.strptime(data[0][2],"%Y-%m-%dT%H:%M:%S.%fZ")
origin_time = [datetime.datetime.strptime(d[2],"%Y-%m-%dT%H:%M:%S.%fZ") for d in data]
or_time_arr = np.array(origin_time)
sort_time_arr = np.sort(or_time_arr)
# inter-arrival times between consecutive earthquakes, converted to seconds
time_between_eq = np.diff(sort_time_arr)
time_between_eq_arr = np.array([d.total_seconds() for d in time_between_eq])
_=plt.hist(time_between_eq_arr,bins=200)
If the inter-arrival times follow an exponential distribution with rate $\lambda^\ast$, the density is given by $$ f(x) = \lambda^\ast e^{-\lambda^\ast x}, \qquad x \ge 0 $$
# shuffle so the 50/50 train/test split below is random
np.random.shuffle(time_between_eq_arr)
total_n = len(time_between_eq_arr)
train_n = int(total_n*0.5)
test_n = total_n-train_n
train_data, test_data = (time_between_eq_arr[:train_n],time_between_eq_arr[train_n:])
train_data
test_data
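To estimate $\lambda^\ast$ from the training data we minimise the average negative log-likelihood (the empirical risk). Since $-\log f(x) = -\log \lambda + \lambda x$, the quantity to minimise is $$ \hat R_{train}(\lambda) = \frac{1}{n_{train}} \sum_{X_i \in \text{ training data}} \left(-\log \lambda + \lambda X_i\right), $$ which is what `empirical_risk_train` below computes.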
# define the objective/cost/loss function we want to minimise
def empirical_risk_train(l):
    return np.mean(-np.log(l) + l*train_data)
empirical_risk_train(0.001)
from scipy import optimize
result = optimize.minimize(empirical_risk_train, 0.0001, method='Nelder-Mead')
l_hat = result['x'][0]
l_hat
1/l_hat  # estimated mean inter-arrival time in seconds (the mean of an exponential is 1/lambda)
Let's consider the loss $$ L(c,x) = |c-x| $$ The risk then becomes $$ R(c) = E[L(c,X)] = E[|c-X|] $$
So, for our problem, since we have estimated $\hat \lambda$, we can test it as a method of prediction by estimating the following quantity: $$ E[|1/\hat \lambda - X| \mid \hat \lambda] $$
We can use our testing data to estimate the above, i.e. $$ \frac{1}{n_{test}} \sum_{X_i \in \text{ testing data}} |1/\hat \lambda - X_i| $$
# estimated prediction risk when predicting with 1/lambda_hat fitted on the training data
np.mean(np.abs(1/l_hat - test_data))
# for comparison: the same risk when predicting with the mean of the test data itself
np.mean(np.abs(test_data - np.mean(test_data)))
!head -n 10 data/CORIS.csv
# Load data
# Standard scale
# Convert labels to -1,1
# Solve the likelihood problem
import numpy as np
from scipy import optimize
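The preprocessing steps outlined above are not shown here, but the objective below needs `X_sc` (standard-scaled features) and `Y1` (labels in $\{-1,1\}$). The following is a minimal sketch of those steps; it assumes `data/CORIS.csv` has a header row, all-numeric columns and the 0/1 label `chd` in the last column, so the column layout (and hence which variable `X_sc[:,9]` picks out) should be checked against the `!head` output above.
import csv
import numpy as np

# Load data -- ASSUMPTION: header row, all-numeric columns, last column is the 0/1 label chd
with open('data/CORIS.csv', mode='r') as f:
    csv_reader = csv.reader(f)
    coris_header = next(csv_reader)
    coris_rows = np.array([[float(v) for v in row] for row in csv_reader])

X = coris_rows[:, :-1]   # every column except the last is treated as a feature
Y = coris_rows[:, -1]    # last column: chd label in {0, 1}

# Standard scale: zero mean and unit standard deviation for each feature column
X_sc = (X - X.mean(axis=0)) / X.std(axis=0)

# Convert labels to -1, 1 so the logistic loss below can be written as log(1+exp(-Y1*(...)))
Y1 = 2*Y - 1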
# define the objective/cost/loss function we want to minimise
def f(x):
    return np.sum(np.log(1+np.exp(-Y1*(x[0] + x[1]*X_sc[:,9]))))
# multi-dimensional optimisation is syntactically similar to 1D,
# but gradient information (estimated numerically from f) and an approximate Hessian are used
# to iteratively improve the solution along a descent direction, etc.
# This is the 'L-BFGS-B' method you will often see in scientific computing
# (scipy's default when bounds are supplied)
parameter_bounding_box=((-10, 2), (-10, 2)) # specify the constraints for each parameter
initial_arguments = np.array([0, 0]) # point in 2D to initialise the minimize algorithm
result = optimize.minimize(f, initial_arguments, bounds=parameter_bounding_box,) # just call the minimize method!
result
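The fitted parameters sit in `result.x` (equivalently `result['x']`, as used earlier). A minimal usage sketch, assuming the logistic model implied by the loss above; the helper `p_hat` is hypothetical, not part of the course code:
# unpack the fitted intercept and coefficient for the single feature used in f
b0_hat, b1_hat = result.x

# estimated probability of chd = 1 at a standard-scaled feature value z
def p_hat(z):
    return 1/(1 + np.exp(-(b0_hat + b1_hat*z)))

p_hat(0.0)  # probability at the feature's mean (z = 0 after standard scaling)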