Introduction to Data Science: A Comp-Math-Stat Approach

1MS041, 2021

©2021 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

11. Non-parametric Estimation and Testing

Topics

Inference and Estimation: The Big Picture

The Big Picture is about inference and estimation, and especially inference and estimation problems where computational techniques are helpful.

                         Point estimation            Set estimation                 Hypothesis Testing

Parametric               MLE of finitely many        Asymptotically Normal          Wald Test from
                         parameters (done)           Confidence Intervals (done)    Confidence Interval (done)

Non-parametric           about to see ...            about to see ...               about to see ...
(infinite-dimensional
parameter space)

So far we have seen parametric models, for example

In all these cases the parameter space (the space within which the parameter(s) can take values) is finite dimensional:

For parametric experiments, we can use the maximum likelihood principle and estimate the parameters using the Maximum Likelihood Estimator (MLE), for instance.

Non-parametric estimation

What if we don't know what the distribution function (DF) is? Then we are not trying to estimate some fixed but unknown parameter $\theta^*$ for an RV we assume to be $Bernoulli(\theta^*)$; we are trying to estimate the DF itself. In real life, data does not come neatly labeled "I am a realisation of a $Bernoulli$ RV" or "I am a realisation of an $Exponential$ RV": an important part of inference and estimation is to make inferences about the DF itself from our observations.

Observations from some unknown process

Consider the following non-parametric product experiment:

$$X_1, X_2, \ldots, X_n\ \overset{IID}{\sim} F^* \in \{\text{all DFs}\}$$

We want to produce a point estimate for $F^*$, which is allowed to be any DF ("lives in the set of all DFs"), i.e., $F^* \in \{\text{all DFs}\}$

Crucially, $\{\text{all DFs}\}$, i.e., the set of all distribution functions over $\mathbb{R}$ is infinite dimensional.

We have already seen an estimate, made using the data, of a distribution function: the empirical or data-based distribution function (or empirical cumulative distribution function). This can be formalized as the following process of adding indicator functions of the half-lines beginning at the data points $[X_1,+\infty),[X_2,+\infty),\ldots,[X_n,+\infty)$:

$$\widehat{F}_n (x) = \frac{1}{n} \sum_{i=1}^n \mathbf{1}_{[X_i,+\infty)}(x)$$

where,

$$\mathbf{1}_{[X_i,+\infty)}(x) := \begin{cases} 1 & \text{ if } X_i \leq x \\ 0 & \text{ if } X_i > x \end{cases}$$
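As a quick sanity check, the indicator-sum definition of $\widehat{F}_n$ translates directly into plain Python (the sample xs below is made up for illustration):

```python
def edf(sample, x):
    """Empirical distribution function at x: the fraction of sample points <= x."""
    return sum(1 for xi in sample if xi <= x) / len(sample)

xs = [3, 1, 4, 1, 5]     # a made-up sample
print(edf(xs, 1))        # two of the five points are <= 1, so this is 0.4
print(edf(xs, 4.5))      # four of the five points are <= 4.5, so this is 0.8
```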

First let us evaluate a set of functions that will help us conceptualize faster:

Let us continue with the concepts

We can remind ourselves of this for a small sample of $de\,Moivre(k=5)$ RVs:

We can use the empirical cumulative distribution function $\widehat{F}_n$ as our non-parametric estimate, because this kind of estimation is possible in infinite-dimensional contexts due to the following two theorems:

Glivenko-Cantelli Theorem

Let $X_1, X_2, \ldots, X_n \overset{IID}{\sim} F^* \in \{\text{all DFs}\}$

and the empirical distribution function (EDF) is $\widehat{F}_n(x) := \displaystyle\frac{1}{n} \sum_{i=1}^n \mathbf{1}_{[X_i,+\infty)}(x)$, then

$$\sup_x { | \widehat{F}_n(x) - F^*(x) | } \overset{P}{\rightarrow} 0$$

Remember that the EDF is a statistic of the data, a statistic is an RV, and (from our work on the convergence of random variables) $\overset{P}{\rightarrow}$ means "converges in probability". The proof is beyond the scope of this course, but we can gain an appreciation of what it means by looking at what happens to the ECDF for $n$ simulations from:

It is clear that, as $n$ increases, the ECDF $\widehat{F}_n$ gets closer and closer to the true DF $F^*$: $\displaystyle\sup_x { | \widehat{F}_n(x) - F^*(x) | } \overset{P}{\rightarrow} 0$.

This will hold no matter what the (possibly unknown) $F^*$ is. Thus, $\widehat{F}_n$ is a point estimate of $F^*$.

We need to add the DKW Inequality to be able to get confidence sets, or a 'confidence band', that traps $F^*$ with high probability.

Dvoretzky-Kiefer-Wolfowitz (DKW) Inequality

Let $X_1, X_2, \ldots, X_n \overset{IID}{\sim} F^* \in \{\text{all DFs}\}$

and the empirical distribution function (EDF) is $\widehat{F}_n(x) := \displaystyle\frac{1}{n} \sum_{i=1}^n \mathbf{1}_{[X_i,+\infty)}(x)$,

then, for any $\varepsilon > 0$,

$$P\left( \sup_x { | \widehat{F}_n(x) - F^*(x) | } > \varepsilon \right) \leq 2 \exp(-2n\varepsilon^2) $$

We can use this inequality to get a $1-\alpha$ confidence band $C_n(x) := \left[\underline{C}_n(x), \overline{C}_n(x)\right]$ about our point estimate $\widehat{F}_n$ of our possibly unknown $F^*$, such that $F^*$ is 'trapped' by the band with probability at least $1-\alpha$.

$$\begin{aligned} \underline{C}_{\, n}(x) &=& \max \{ \widehat{F}_n(x)-\varepsilon_n, 0 \}, \notag \\ \overline{C}_{\, n}(x) &=& \min \{ \widehat{F}_n(x)+\varepsilon_n, 1 \}, \notag \\ \varepsilon_n &=& \sqrt{ \frac{1}{2n} \log \left( \frac{2}{\alpha}\right)} \\ \end{aligned}$$

and

$$P\left(\underline{C}_n(x) \leq F^*(x) \leq \overline{C}_n(x)\right) \geq 1-\alpha$$
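The band above is easy to compute. Here is a minimal sketch (our own helper, not the course's hidden code) that returns the band limits at a point where the EDF takes a given value:

```python
from math import sqrt, log

def dkw_band(edf_value, n, alpha=0.05):
    """1 - alpha DKW confidence band limits at a point where the EDF equals edf_value."""
    eps = sqrt(log(2.0 / alpha) / (2.0 * n))   # epsilon_n from the DKW inequality
    return max(edf_value - eps, 0.0), min(edf_value + eps, 1.0)

lo, hi = dkw_band(0.5, n=100)   # 95% band about an EDF value of 0.5 when n = 100
```

Note that the band is clipped to $[0,1]$, since a DF cannot leave that range.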

YouTry in class

Try this out for a simple sample from the $Uniform(0,1)$, which you can generate using random. First we will just make the point estimate for $F^*$, the EDF $\widehat{F}_n$

In one of the assessments, you did a question that took you through the steps for getting the list of points that you would plot for an empirical distribution function (EDF). We will do exactly the same thing here.

First we find the unique values in the sample, in order from smallest to largest, and get the frequency with which each unique value occurs:

Then we accumulate the frequencies to get the cumulative frequencies:

And then the relative cumulative frequencies:

And finally zip these up with the sorted unique values to get a list of points we can plot:
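The four steps above can be sketched in plain Python as follows (the variable names echo those used in this notebook, and sampleValues is a made-up sample):

```python
from itertools import accumulate

sampleValues = [0.3, 0.1, 0.3, 0.7, 0.1]      # a made-up sample
n = len(sampleValues)

# 1. sorted unique values and the frequency of each
sortedUniqueValues = sorted(set(sampleValues))
freqs = [sampleValues.count(v) for v in sortedUniqueValues]

# 2. accumulate the frequencies to get cumulative frequencies
cumFreqs = list(accumulate(freqs))

# 3. relative cumulative frequencies
cumRelFreqs = [cf / n for cf in cumFreqs]

# 4. zip these up with the sorted unique values to get plottable points
ecdfPoints = list(zip(sortedUniqueValues, cumRelFreqs))
print(ecdfPoints)    # [(0.1, 0.4), (0.3, 0.8), (0.7, 1.0)]
```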

Here is a function that you can just use to do an ECDF plot:

This makes the plot of the $\widehat{F}_{10}$, the point estimate for $F^*$ for these $n=10$ simulated samples.

What about adding those confidence bands? You will do essentially the same thing, but adjusting for the required $\varepsilon$. First we need to decide on an $\alpha$ and calculate the $\varepsilon$ corresponding to this alpha. Here is some of our code to calculate the $\varepsilon$ corresponding to $\alpha=0.05$ (95% confidence bands), using a hidden function calcEpsilon:
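The function calcEpsilon itself is hidden, but from the formula for $\varepsilon_n$ it might look something like this (a sketch only; the hidden implementation may differ):

```python
from math import sqrt, log

def calcEpsilon(alpha, n):
    """epsilon_n = sqrt((1/(2n)) * log(2/alpha)) from the DKW inequality."""
    return sqrt(log(2.0 / alpha) / (2.0 * n))

epsilon = calcEpsilon(0.05, 10)   # about 0.43 for alpha = 0.05 and n = 10
```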

See if you can write your own code to do this calculation, $\varepsilon_n = \sqrt{ \frac{1}{2n} \log \left( \frac{2}{\alpha}\right)}$. For completeness, do the whole thing: assign the value 0.05 to a variable named alpha, and then use this and the variable called n that we have already declared to calculate a value for $\varepsilon$. Call the variable to which you assign the value for $\varepsilon$ epsilon, so that it replaces the value we calculated in the cell above (you should get the same value as us!).

Now we need to use this to adjust the EDF plot. In the two cells below we first of all do the adjustment for $\underline{C}_{\,n}(x) =\max \{ \widehat{F}_n(x)-\varepsilon_n, 0 \}$, and then use zip again to get the points to actually plot for the lower boundary of the 95% confidence band.

Similarly, in the two cells below we do the adjustment for $\overline{C}_{\,n}(x) =\min \{ \widehat{F}_n(x)+\varepsilon_n, 1 \}$, and then use zip again to get the points to actually plot for the upper boundary of the 95% confidence band.

We carefully gave our ecdfPointsPlot function the flexibility to be able to plot bands, by having a colour parameter (which defaults to 'grey') and a lines_only parameter (which defaults to false). Here we can plot the lower bound of the confidence interval by adding ecdfPointsPlot(ecdfPointsUniformLower, colour='green', lines_only=true) to the previous plot:

YouTry

You try writing the code to create the list of points needed for plotting the upper band $\overline{C}_{\,n}(x) =\min \{ \widehat{F}_n(x)+\varepsilon_n, 1 \}$. You will need to first of all get the upper heights (call them say cumRelFreqsUniformUpper) and then zip them up with the sortedUniqueValuesUniform to get the points to plot.

Once you have done this you can add them to the plot by altering the code below:

(end of YouTry)


If we are doing lots of collections of EDF points we may as well define a function to do it, rather than repeating the same code again and again. We use an offset parameter to give us the flexibility to use this to make points for confidence bands as well.

NZ Earthquakes

Now we will try looking at the Earthquakes data we have used before to get a confidence band around an EDF for that. We start by bringing in the data and the function we wrote earlier to parse that data.

First check whether you have already unzipped the data/earthquakes.csv.zip file by dropping into a shell via %%sh.

There is a lot of data here, so let's use an interactive plot to do the non-parametric DF estimation just for some of the most recent data:

Plug-in Estimator: A nonparametric Point Estimator

A function whose argument is itself another function is called a functional. A statistical functional $T(F)$ is any function of the distribution function $F$.

Suppose we are not directly interested in estimating the unknown $F^*$ in the experiment with $n$ IID samples: $$X_1,\ldots,X_n \overset{IID}{\sim}F^* \in \{ \text{ all DFs } \}$$ But rather in estimating a statistical functional of interest, say, $\theta^* := T(F^*)$.

NOTE: Here $\theta^*$ is not a parameter, but just some unknown quantity of interest that can be obtained as a statistical functional $T(F^*)$ of the unknown $F^*$.

Some examples of statistical functionals you have already seen include:

By substituting in the point estimate $\widehat{F}_n$ for the unknown $F^*$, we can obtain the plug-in estimator as: $$ \boxed{ \widehat{\Theta}_n = T(\widehat{F}_n) \quad \text{ of the quantity of interest } \, \theta^* := T(F^*) } $$ In other words, just plug-in $\widehat{F}_n$ for the unknown $F^*$ in $T(F^*) =: \theta^*$.

If $T(F)=\int r(x) \, d F(x)$ for some function $r(x)$ then $T$ is called a linear functional since $T$ is linear in its arguments: $T(aF+bG)=aT(F)+bT(G)$, where $(a,b) \in \mathbb{R}^2$.

Thus, the plug-in estimator for any linear statistical functional $\theta^* = T(F^*)=\int r(x) dF^*(x)$ is:

$$ \boxed{\widehat{\Theta}_n = T(\widehat{F}_n) = \frac{1}{n} \sum_{i=1}^n r(x_i)} $$

because: $$ \widehat{\Theta}_n = T(\widehat{F}_n) = \int r(x) d \widehat{F}_n(x) = \sum_{i=1}^n r(x_i) \, P(X=x_i) = \sum_{i=1}^n r(x_i) \, f(x_i) = \sum_{i=1}^n r(x_i) \, \frac{1}{n} = \frac{1}{n} \sum_{i=1}^n r(x_i) $$
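For instance, with $r(x)=x$ the linear functional $T(F)=\int x\,dF(x)$ is the expectation, and its plug-in estimate is the sample mean. A minimal sketch with made-up data:

```python
def plug_in_linear(data, r):
    """Plug-in estimate of the linear functional T(F) = integral of r(x) dF(x)."""
    return sum(r(x) for x in data) / len(data)

data = [1.0, 2.0, 3.0, 4.0]                               # made-up sample
mean_hat = plug_in_linear(data, lambda x: x)              # sample mean: 2.5
second_moment_hat = plug_in_linear(data, lambda x: x**2)  # sample second moment: 7.5
```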

Sample mean and sample variance are actually examples of plug-in estimators of the statistical functionals $E(X)$ and $V(X)$ (the mean $E(X)$ is a linear functional, while the variance $V(X) = E(X^2) - (E(X))^2$ is a function of linear functionals)

Quantiles generalise the notion of a median and are very useful statistics when we want to summarise the values taken by a random variable. For a given $q \in (0,1)$, the $q$-th quantile of a random variable with DF $F$ is the statistical functional: $$T(F)={F}^{[-1]}(q) := \inf\{x: F(x) \geq q\}$$ We can use the plug-in estimator: $$\widehat{F}_n^{[-1]}(q) := \inf\{x: \widehat{F}_n(x) \geq q\}$$ to estimate $T(F)={F}^{[-1]}(q)$, and $\widehat{F}_n^{[-1]}(q)$ is called the $q$-th sample quantile
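The definition $\widehat{F}_n^{[-1]}(q) := \inf\{x: \widehat{F}_n(x) \geq q\}$ translates directly into code: walk the sorted data until the EDF first reaches $q$. A minimal sketch (made-up data):

```python
def sample_quantile(data, q):
    """q-th sample quantile: inf{x : EDF(x) >= q}."""
    xs = sorted(data)
    n = len(xs)
    for i, x in enumerate(xs, start=1):
        if i / n >= q:        # the EDF jumps to i/n at the i-th order statistic
            return x
    return xs[-1]

sample_quantile([5, 1, 3, 2, 4], 0.5)   # returns 3, the sample median
```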

Example: Estimating quantiles of inter-earthquake times

We can simply call the quantile function knowing that it is just a plug-in estimator of the $q$-th quantile.

Quantiles and Percentiles

If $q \in [0,100]$ then we call the $(\frac{q}{100})$-th quantile the $q$-th percentile.

So only $5\%=0.05$ of the inter-EQ times are longer than 75 minutes. If we assume this non-parametric model, which treats the times between EQs as IID RVs, then the probability that two consecutive inter-EQ times each exceed 75 minutes, so that there are no earthquakes in New Zealand for longer than 150 minutes (during the observation period), is $0.05 \times 0.05 = 0.0025 = 1/400$, a pretty unlikely event!

Quantiles as plug-in estimates can give us a lot of basic insights into the data under the more general nonparametric view.

Bootstrap for Confidence Sets in Nonparametric Experiments

This is an adaptation of All of Statistics (2003) by Wasserman of Carnegie Mellon University, Pittsburgh, PA, USA (one of the first universities that started presenting an integrated research and educational view of mathematical statistics and machine learning, along both theoretical and applied fronts).

Plug-in estimators are point estimators. Next we will see a very general way to obtain set estimators of $\theta^* = T(F^*)$, some statistical functional of interest, when we do not make any parametric assumptions about the underlying unknown distribution function $F^*$ in our IID experiment:

$$X_1,\ldots,X_n \overset{IID}{\sim}F^* \in \{ \text{ all DFs } \}$$

NOTE: To get confidence sets (or intervals in 1D) we need to obtain estimated standard error $\widehat{se}_n$ of our point estimator, such as the plug-in estimator, $\widehat{\Theta}_n$ of $\theta^*=T(F^*)$. This is generally not possible. However, the bootstrap due to Efron gives us a helping hand.

The bootstrap is a statistical method for estimating standard errors and confidence sets of statistics, such as estimators.

Let $T_n := T_n((X_1,X_2,\ldots,X_n))$ be a statistic, i.e., any function of the data $X_1,X_2,\ldots,X_n \overset{IID}{\sim} F^*$.

Suppose we want to know its variance $V_{F^*}(T_n)$, which clearly depends on the fixed and possibly unknown DF $F^*$ (hence the subscripting by $F^*$ to emphasise this dependence).

If our statistic $T_n$ is one with an analytically unknown variance, then we can use the bootstrap to estimate it.

The bootstrap idea has the following two basic steps with their underlying assumptions:

For example, if $T_n=\overline{X}_n$, in $\mathsf{Step~1}$, $V_{\widehat{F}_n}(T_n) = s_n^2/n$, where $s_n^2=n^{-1} \sum_{i=1}^n (x_i-\overline{x}_n)^2$ is the sample variance and $\overline{x}_n$ is the sample mean. In this case, $\mathsf{Step~1}$ is enough. However, when the statistic $T_n$ is more complicated (e.g. the sample median $T_n=\widetilde{X}_n = \widehat{F}_n^{[-1]}(0.5)$), then we may not be able to find a simple expression for $V_{\widehat{F}_n}(T_n)$ and may need $\mathsf{Step~2}$ of the bootstrap.

$$ \boxed{ \begin{aligned} \text{Real World Data come from} & F^* \quad \implies X_1,X_2,\ldots,X_n & \implies T_n((X_1,X_2,\ldots,X_n))=t_n \notag \\ \text{Bootstrap World Data come from} & \widehat{F}_n \quad \implies X^{\bullet}_1,X^{\bullet}_2,\ldots,X^{\bullet}_n & \implies T_n((X^{\bullet}_1,X^{\bullet}_2,\ldots,X^{\bullet}_n))=t^{\bullet}_n \notag \end{aligned} } $$

Observe that drawing an observation from the empirical DF $\widehat{F}_n$ is equivalent to drawing one point at random from the original data (think of the indices $[n] := \{ 1,2,\ldots,n \}$ of the original data $X_1,X_2,\ldots,X_n$ being drawn according to the equi-probable $de~Moivre(1/n,1/n,\ldots,1/n)$ RV on $[n]$). Thus, to simulate $X^{\bullet}_1,X^{\bullet}_2,\ldots,X^{\bullet}_n$ from $\widehat{F}_n$, it is enough to draw $n$ observations with replacement from the dataset $\{X_1,X_2,\ldots,X_n\}$.

In summary, the algorithm for Bootstrap Variance Estimation is:

  • $\mathsf{Step~1}$: Draw $X^{\bullet}_1,X^{\bullet}_2,\ldots,X^{\bullet}_n \sim \widehat{F}_n$
  • $\mathsf{Step~2}$: Compute $t_n^{\bullet} = T_n((X^{\bullet}_1,X^{\bullet}_2,\ldots,X^{\bullet}_n))$
  • $\mathsf{Step~3}$: Repeat $\mathsf{Step~1}$ and $\mathsf{Step~2}$ $B$ times, for some large $B$, say $B>1000$, to get $t_{n,1}^{\bullet}, t_{n,2}^{\bullet},\ldots,t_{n,B}^{\bullet}$
  • $\mathsf{Step~4}$: Several ways of estimating the bootstrap confidence intervals are possible, we use the one based on percentiles:
    • The $1-\alpha$ percentile-based bootstrap confidence interval is: $$ C_n=[\widehat{G^{\bullet}}_{n}^{-1}(\alpha/2),\widehat{G^{\bullet}}_{n}^{-1}(1-\alpha/2)] , $$ where $\widehat{G^{\bullet}}_{n}$ is the empirical DF of the bootstrapped $t_{n,1}^{\bullet}, t_{n,2}^{\bullet},\ldots,t_{n,B}^{\bullet}$ and $\widehat{G^{\bullet}}_{n}^{-1}(q)$ is the $q^{\text{th}}$ sample quantile of $t_{n,1}^{\bullet}, t_{n,2}^{\bullet},\ldots,t_{n,B}^{\bullet}$.
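The four steps above can be sketched in plain Python (a minimal illustration with made-up data; the crude index-based quantiles here stand in for a proper sample-quantile routine):

```python
import random

def bootstrap_percentile_ci(data, statistic, B=1000, alpha=0.05, seed=0):
    """Percentile-based bootstrap 1 - alpha confidence interval for a statistic."""
    rng = random.Random(seed)
    n = len(data)
    # Steps 1-3: B resamples with replacement, computing the statistic for each
    boot_stats = sorted(
        statistic([rng.choice(data) for _ in range(n)]) for _ in range(B)
    )
    # Step 4: the alpha/2 and 1 - alpha/2 sample quantiles of the bootstrapped values
    return boot_stats[int(B * alpha / 2)], boot_stats[int(B * (1 - alpha / 2)) - 1]

data = [2.1, 0.4, 1.7, 3.3, 0.9, 2.8, 1.1, 0.6]   # made-up sample
sample_mean = lambda xs: sum(xs) / len(xs)
lo, hi = bootstrap_percentile_ci(data, sample_mean)
```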

Confidence Interval via Bootstrap - Median of inter-EQ Times in Minutes

Use the bootstrap to obtain a 95% confidence interval for the median inter-earthquake time in minutes. Use this interval to test the null hypothesis that the median inter-earthquake time is 10 minutes.

Using numpy to bootstrap, a faster method

Testing via Bootstrap

We can use a bootstrap-based confidence interval in conjunction with the Wald test in nonparametric experiments, just as we used the confidence interval from an asymptotically normal estimator, such as the MLE, in parametric experiments.

The $\mathsf{size}$-$\alpha$ Wald test from the bootstrapped $1-\alpha$ Confidence Interval $C_n$ for a statistic $t$

The $\mathsf{size}$ $\alpha$ Wald test rejects:

$$ \boxed{ \text{ $H_0: t^*=t_0$ versus $H_1: t^* \neq t_0$ if and only if $t_0 \notin C_n := [\widehat{G^{\bullet}}_{n}^{-1}(\alpha/2),\widehat{G^{\bullet}}_{n}^{-1}(1-\alpha/2)]$. }} $$$$\boxed{\text{Therefore, testing the hypothesis is equivalent to verifying whether the null value $t_0$ is in the bootstrapped confidence interval.}}$$

Example: Wald Test that median of inter-EQ times is 10 minutes

We reject the null hypothesis that the median is $10$ minutes, if $10$ is not inside the 95% confidence interval for the median we just obtained via bootstrap. This is just like the Wald test except that we use the bootstrap instead of the asymptotic normality of the maximum likelihood estimator.

Let us see this more explicitly in the next cell.

Making a Generic Bootstrap Function for $1-\alpha$ Confidence Interval of any Statistic

Making a generic function makeBootstrappedConfidenceIntervalOfStatisticT as done below can be helpful in other problems.
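The notebook's own implementation lives in a code cell; a numpy-based sketch of such a generic function might look like this (our choices of parameter names and defaults; the original may differ):

```python
import numpy as np

def makeBootstrappedConfidenceIntervalOfStatisticT(dataset, statT, alpha=0.05, B=1000, seed=0):
    """Generic percentile-bootstrap 1 - alpha confidence interval for the statistic statT."""
    rng = np.random.default_rng(seed)
    data = np.asarray(dataset)
    # draw all B resamples at once: a (B, n) array of indices sampled with replacement
    resamples = data[rng.integers(0, data.size, size=(B, data.size))]
    bootStats = np.array([statT(row) for row in resamples])
    return (np.percentile(bootStats, 100 * alpha / 2),
            np.percentile(bootStats, 100 * (1 - alpha / 2)))

statTMedian = lambda dataset: np.percentile(dataset, 50.0)   # the statistic of interest
iq = np.array([3.0, 12.5, 7.1, 0.4, 22.8, 5.5, 9.9, 1.2])    # made-up inter-EQ times
lo, hi = makeBootstrappedConfidenceIntervalOfStatisticT(iq, statTMedian)
```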

Demonstrating the use of bootstrap for different statistics of interest

We can obtain the bootstrap-based $1-\alpha$ confidence intervals quite easily for any statistic and any dataset as demonstrated below.

Bootstrapped $1-\alpha$ Confidence Interval for Sample Median of inter-EQ Time in Minutes

All we need to do is follow these three steps to perform a bootstrap:

  1. define a lambda expression for the statistic of interest: statTMedian = lambda dataset : np.percentile(dataset,50.0)

CAUTION: Bootstrap is justified if two of its assumptions are satisfied:

Recall assumptions in the two basic steps in the Bootstrap from above (notebook 11.ipynb):

  1. $\mathsf{Step~1}$: $n$ is large enough, so that the empirical DF $\widehat{F}_n$ is a good approximation for the unknown $F^*$ in the model $X_1,X_2,\ldots,X_n \overset{IID}{\sim} F^*$, and

So it is a good idea to increase $B$ to $1000$ or perhaps more (depending on the problem and available computational resources) to see how the results are affected. Remember, the confidence intervals will change slightly due to the random seed in the simulation and your values for $n$ and $B$.

NOTE: When the dataset is much larger in size (order of billions or hundreds of billions) or not IID real-valued but perhaps dependent (eg. time series models) then there are other sub-sampling schemes that one may need to use under further assumptions on the model.

Sample Exam Problem 7

Obtain the plug-in estimate of the population median of inter-EQ times in minutes stored in the array iQMinutes. Using $B = 1000$ bootstrap replicates, obtain the 95% confidence interval for the median inter-EQ time in minutes. Using a Wald-like Test based on the bootstrapped 95% confidence interval, test the null hypothesis that the median inter-EQ time is 20 minutes (just report the finding of your test through the boolean RejectedH0_sampleMedianIs20min).

NOTE: Make Sure you run the REQUIRED-CELL before trying the Problem.

Sample Exam Problem 7 Solution

Correlation: A Bivariate Nonparametric Bootstrap

Here is a classical data set used by Bradley Efron at Stanford University's Statistics Department (the inventor of bootstrap) to illustrate the method.

The data are LSAT (Law School Admission Test in the USA) scores and GPA (grade point average) of just fifteen individuals. The inference task involved assisting the Admissions Office with their evidence-based investigations on their admissions policies.

Thus, we have bivariate data of the form $(Y_i,Z_i)$, where $Y_i={\rm LSAT}_i$ and $Z_i={\rm GPA}_i$. For example, the first individual had an LSAT score of $y_1=576$ and a GPA of $z_1=3.39$ while the fifteenth individual had an LSAT score of $y_{15}=594$ and a GPA of $z_{15}=3.96$.

We suppose that the bivariate data has the following IID bivariate model:

$$\boxed{(Y_1,Z_1),(Y_2,Z_2),\ldots,(Y_{15},Z_{15}) \overset{IID}{\sim} F^* \in \{ \text{all bivariate DFs} \}}$$

This is a bivariate nonparametric experiment and its bivariate data is plotted below.

Plug-in Point Estimate of the Correlation

The law school was interested in the correlation between the GPA and LSAT scores:

$$ \boxed{ \theta^* = \frac{\int \int (y-E(Y))(z-E(Z))dF(y,z)}{\sqrt{\int (y-E(Y))^2 dF(y) \int (z-E(Z))^2 dF(z)}} } $$

The plug-in estimate of the population correlation $\theta^*$ is the sample correlation: $$ \boxed{ \widehat{\Theta}_n = \frac{\sum_{i=1}^n(Y_i-\overline{Y}_n)(Z_i-\overline{Z}_n)}{\sqrt{\sum_{i=1}^n(Y_i-\overline{Y}_n)^2 \sum_{i=1}^n(Z_i-\overline{Z}_n)^2}} } $$
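The sample correlation above is straightforward to compute with numpy; here is a small sketch with made-up paired data (not the LSAT/GPA values):

```python
import numpy as np

def sample_correlation(y, z):
    """Plug-in (sample) correlation between paired samples y and z."""
    y, z = np.asarray(y, dtype=float), np.asarray(z, dtype=float)
    dy, dz = y - y.mean(), z - z.mean()
    return float((dy * dz).sum() / np.sqrt((dy**2).sum() * (dz**2).sum()))

y = [1.0, 2.0, 3.0, 4.0, 5.0]     # made-up paired data
z = [2.0, 1.0, 4.0, 3.0, 5.0]
r_hat = sample_correlation(y, z)  # agrees with np.corrcoef(y, z)[0, 1]
```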

The Bootstrapped $1-\alpha$ Confidence Interval of the Sample Correlation and a Test

In order to obtain the $1-\alpha$ confidence interval of the plug-in estimate of the population correlation, we can quickly tap into our generic function with a thousand bootstrapped data sets.

Then we can reject (or fail to reject) the null hypothesis that the true correlation coefficient is $0$ if $0$ is not contained in the bootstrapped 95% confidence interval (akin to a Wald Test, but with the nonparametric model).

Why is testing whether the correlation is 0 of interest?

Correlation Versus Causation - Proceed with Extreme Caution!

The following image from https://en.wikipedia.org/wiki/Correlation_and_dependence shows:

Several sets of (x, y) points, with the (Pearson's) correlation coefficient of x and y for each set. Note that the correlation reflects the noisiness and direction of a linear relationship (top row), but not the slope of that relationship (middle), nor many aspects of nonlinear relationships (bottom). N.B.: the figure in the center has a slope of 0 but in that case the correlation coefficient is undefined because the variance of Y is zero.


Nonparametric Hypothesis Testing

Are two samples from the same distribution or not?

What if we are not interested in estimating $F^*$ itself, but instead in scientifically investigating whether two distributions are the same? For example, whether the distribution of earthquake magnitudes was the same in April as it was in March, or whether human subjects in a clinical trial are reacting differently to a new drug to treat some disease (typically you have a control group that does not get the treatment and a treated group).

You already have the means to do this by using a Wald test of, say, the difference in sample means of the two samples, or the correlation coefficient, etc., for parametric or nonparametric experiments.

We will next see a useful nonparametric approach to this problem.

Permutation Testing

A Permutation Test is a non-parametric exact method for testing whether two distributions are the same, based on samples from each of them. Analogues and variants of permutation testing are known as A/B Testing in industry today.

What do we mean by "non-parametric exact"? It is non-parametric because we do not impose any parametric assumptions. It is exact because it works for any sample size.

Formally, we suppose that: $$ X_1,X_2,\ldots,X_m \overset{IID}{\sim} F^* \quad \text{and} \quad X_{m+1}, X_{m+2},\ldots,X_{m+n} \overset{IID}{\sim} G^* \enspace , $$ are two sets of independent samples where the possibly unknown DFs $F^*,\,G^* \in \{ \text{all DFs} \}$.

(Notice that we have written it so that the subscripts on the $X$s run from 1 to $m+n$.)

Now, consider the following hypothesis test: $$H_0: F^*=G^* \quad \text{versus} \quad H_1: F^* \neq G^* \enspace . $$

Our test statistic uses the observations in both samples. We want a test statistic that is sensible for this test, i.e., one that will be large when $F^*$ is 'too different' from $G^*$

So, let our test statistic $T(X_1,\ldots,X_m,X_{m+1},\ldots,X_{m+n})$ be say: $$ T:=T(X_1,\ldots,X_m,X_{m+1},\ldots,X_{m+n})= \text{abs} \left( \frac{1}{m} \sum_{i=1}^m X_i - \frac{1}{n} \sum_{i=m+1}^{m+n} X_i \right) \enspace . $$

(In words, we have chosen a test statistic that is the absolute value of the difference in the sample means. Note the limitation of this: if $F^*$ and $G^*$ have the same mean but different variances, our test statistic $T$ will not be large.)

Then the idea of a permutation test is as follows:

$$ \text{P-value} = \mathbf{P}_0 \left( T \geq t_{obs} \right) = \frac{1}{N!} \left( \sum_{j=1}^{N!} \mathbf{1} (t_j \geq t_{obs}) \right), \qquad \mathbf{1} (t_j \geq t_{obs}) = \begin{cases} 1 & \text{if } \quad t_j \geq t_{obs} \\ 0 & \text{otherwise} \end{cases} $$

This will make more sense if we look at some real data.

Permutation Testing with Shell Data

In 2008, Guo Yaozong and Chen Shun collected data on the diameters of coarse venus shells from New Brighton beach for a course project. They recorded the diameters for two samples of shells, one from each side of the New Brighton Pier. The data is given in the following two cells.

NOTE - Real Data for Really Applied Statistics: Under my guidance and assistance, students collected the shells on either side of the pier, cleaned the shells with warm soap water, rinsed and treated with a diluted chlorine solution, used a taxonomy guide to identify the mollusc species and classified them manually. Then they focused on Dosinia anus or coarse venus shells. After the diluted chlorine treatment to disinfect microflora/fauna, the shells were washed, dried, placed on a graph paper, etched around the base with a mechanical pencil, and finally measured with a scale for the largest diameter within the outline.

$(115 + 139)!$ is a very big number. Let's start small, and take a subselection of the shell data to demonstrate the permutation test concept: the first two shells from the left of the pier and the first one from the right:

So now we are testing the hypotheses

$$\begin{array}{lcl}H_0&:& X_1,X_2,X_3 \overset{IID}{\sim} F^*=G^* \\H_1&:&X_1, X_2 \overset{IID}{\sim} F^*, \,\,X_3 \overset{IID}{\sim} G^*, F^* \neq G^*\end{array}$$

With the test statistic $$\begin{array}{lcl}T(X_1,X_2,X_3) &=& \text{abs} \left(\displaystyle\frac{1}{2}\displaystyle\sum_{i=1}^2X_i - \displaystyle\frac{1}{1}\displaystyle\sum_{i=2+1}^3X_i\right) \\ &=&\text{abs}\left(\displaystyle\frac{X_1+ X_2}{2} - \displaystyle\frac{X_3}{1}\right)\end{array}$$

Our observed data $x_{obs} = (x_1, x_2, x_3) = (52, 54, 58)$

and the realisation of the test statistic for this data is $t_{obs} = \text{abs}\left(\displaystyle\frac{52+54}{2} - \frac{58}{1}\right) = \text{abs}\left(53 - 58\right) = \text{abs}(-5) = 5$

Now we need to tabulate the permutations and their probabilities. There are $3! = 6$ possible permutations of three items. For larger samples, you could use the factorial function to calculate this:

We said that under the null hypothesis (the samples have the same DF) each permutation is equally likely, so each permutation has probability $\displaystyle\frac{1}{6}$.

There is a way in Python (the language under the hood in Sage) to get all the permutations of a sequence:

We can tabulate the permutations, their probabilities, and the value of the test statistic that would be associated with each permutation:

Permutation     $t$   $\mathbf{P}_0(T=t)$ (probability under null)
(52, 54, 58)     5    $\frac{1}{6}$
(52, 58, 54)     1    $\frac{1}{6}$
(54, 52, 58)     5    $\frac{1}{6}$
(54, 58, 52)     4    $\frac{1}{6}$
(58, 52, 54)     1    $\frac{1}{6}$
(58, 54, 52)     4    $\frac{1}{6}$

To calculate the P-value for our test statistic $t_{obs} = 5$, we need to look at how many permutations would give rise to test statistics that are at least as big, and add up their probabilities.

$$ \begin{array}{lcl}\text{P-value} &=& \mathbf{P}_0(T \geq t_{obs}) \\&=&\mathbf{P}_0(T \geq 5)\\&=&\frac{1}{6} + \frac {1}{6} \\&=&\frac{2}{6}\\ &=&\frac{1}{3} \\ &\approx & 0.333\end{array} $$

We could write ourselves a little bit of code to do this in SageMath. As you can see, we could easily improve this to make it more flexible so that we could use it for different numbers of samples, but it will do for now.
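Such a little bit of code might look like the following sketch in plain Python (the notebook's own version lives in a code cell and may differ):

```python
from itertools import permutations

def permutation_test_pvalue(left, right):
    """Exact permutation-test P-value for T = abs(mean of first m - mean of last n)."""
    m, n = len(left), len(right)
    pooled = list(left) + list(right)
    t_obs = abs(sum(left) / m - sum(right) / n)
    perms = list(permutations(pooled))               # all (m + n)! orderings
    count = sum(1 for p in perms
                if abs(sum(p[:m]) / m - sum(p[m:]) / n) >= t_obs)
    return count / len(perms)

permutation_test_pvalue((52, 54), (58,))   # 2/6 = 1/3, as in the tabulation above
```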

This means that there is little or no evidence against the null hypothesis (that the shell diameter observations are from the same DF).

Pooled sample size

The lowest possible P-value for a pooled sample of size $N=m+n$ is $\displaystyle\frac{1}{N!}$. Can you see why this is?

So with our small sub-samples the smallest possible P-value would be $\frac{1}{6} \approx 0.167$. If we are looking for P-value $\leq 0.01$ to constitute very strong evidence against $H_0$, then we have to have a large enough pooled sample for this to be possible. Since $5! = 5 \times 4 \times 3 \times 2 \times 1 = 120$, it is good to have $N \geq 5$

YouTry in class

Try copying and pasting our code and then adapting it to deal with a sub-sample (52, 54, 60) from the left of the pier and (58, 54) from the right side of the pier.

You will have to think about:

(add more cells if you need them)

(end of You Try)


We can use the sample function and the Python method for making permutations to experiment with a larger sample, say 5 of each.

We have met sample briefly already: it is part of the Python random module and it does exactly what you would expect from the name: it samples a specified number of elements randomly from a sequence.

As you can see from the length of time it takes to do the calculation for $(5+5)! = 10!$ permutations, we will be here a long time if we try to do this on all of the two shell data sets.

Monte Carlo methods to the rescue!

We can use Monte Carlo integration to calculate an approximate P-value, and this will be our next topic.

You try

Try working out the P-value for a sub-sample (58, 63) from the left of the pier and (61) from the right (the two last values in the left-side data set and the last value in the right-side one). Do it as you would if given a similar question in the exam: you choose how much you want to use Sage to help and how much you do just with pen and paper.

Example: Permutation test for shell diameters using the bootstrapped/permuted confidence interval

So do we reject the Null Hypothesis that the diameters were the same on either side of the New Brighton Pier?

To answer this question we simply need to see whether the observed Test Statistic of the absolute difference between the sample means between the left and right side of the pier lies:

We fail to reject the null hypothesis because $0.164216452925 \in [0.05, 0.82]$: there is no evidence that the individuals of Dosinia anus have different diameter distributions on either side of the Pier.