Introduction to Data Science

1MS041, 2022

©2022 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

In [1]:
from Utils import showURL
In [2]:
showURL('https://en.wikipedia.org/wiki/Normal_distribution')
Out[2]:
In [3]:
import numpy as np
In [4]:
lam = 2
This is the exponential density with rate $\lambda$ (here `lam = 2`); the goal of the next few cells is to simulate from it by inverting its CDF.
$$f(x) = \lambda e^{-\lambda x}$$
In [5]:
from sympy import var, integrate, exp
In [6]:
l = var('l')
x = var('x')
In [7]:
f = l*exp(-l*x)
f
Out[7]:
$\displaystyle l e^{- l x}$
In [8]:
y = var('y')
f_y = integrate(f,(x,0,y))
In [9]:
f_y
Out[9]:
$\displaystyle 1 - e^{- l y}$
In [10]:
from sympy import solve
In [13]:
# -(1/l)*ln(-x+1) = y
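The commented line is the inverse of the CDF computed above: setting $x = 1 - e^{-l y}$ and solving for $y$ gives $y = -\frac{1}{l}\ln(1-x)$. A minimal sketch of how `solve` (imported above) can verify this; the `positive=True` assumptions are mine, and SymPy may return an equivalent but differently written expression.
In [ ]:
from sympy import var, exp, solve
l, x, y = var('l x y', positive=True)
# solve x = 1 - exp(-l*y) for y; expect something equivalent to -log(1 - x)/l
solve(x - (1 - exp(-l*y)), y)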
In [14]:
# draw 10000 Uniform(0,1) samples
x = np.random.uniform(0,1,10000)
In [15]:
# inverse transform: push the uniforms through the inverse CDF, -(1/lam)*log(1-x)
y = -(1/lam)*np.log(-x+1)
In [16]:
import matplotlib.pyplot as plt
# histogram of the simulated values with the true density overlaid
_=plt.hist(y,density=True,bins=100)
z = np.linspace(0,4,100)
plt.plot(z,lam*np.exp(-lam*z))
Out[16]:
[<matplotlib.lines.Line2D at 0x161693a60>]
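As a quick cross-check (my addition, reusing `lam`, `z` and `plt` from the cells above), NumPy's built-in exponential sampler should produce an essentially identical histogram; note that `np.random.exponential` is parameterised by the scale $1/\lambda$, not by the rate.
In [ ]:
# sanity check against NumPy's own Exp(lam) sampler (scale = 1/rate)
y_np = np.random.exponential(scale=1/lam, size=10000)
_ = plt.hist(y_np, density=True, bins=100)
_ = plt.plot(z, lam*np.exp(-lam*z))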
In [17]:
showURL('https://www.sympy.org/en/index.html')
Out[17]:
In [18]:
!ls data
CORIS.csv                  final.csv.zip
NYPowerBall.csv            final.tgz
co2_mm_mlo.txt             flights.csv
digits.csv                 leukemia.csv
earthquakes.csv            portland.csv
earthquakes.csv.zip        pride_and_prejudice.txt
earthquakes.tgz            rainfallInChristchurch.csv
earthquakes_small.csv      ratings.csv
final.csv                  spam.csv
In [19]:
!head -n 100 data/co2_mm_mlo.txt
# --------------------------------------------------------------------
# USE OF NOAA ESRL DATA
# 
# These data are made freely available to the public and the
# scientific community in the belief that their wide dissemination
# will lead to greater understanding and new scientific insights.
# The availability of these data does not constitute publication
# of the data.  NOAA relies on the ethics and integrity of the user to
# ensure that ESRL receives fair credit for their work.  If the data 
# are obtained for potential use in a publication or presentation, 
# ESRL should be informed at the outset of the nature of this work.  
# If the ESRL data are essential to the work, or if an important 
# result or conclusion depends on the ESRL data, co-authorship
# may be appropriate.  This should be discussed at an early stage in
# the work.  Manuscripts using the ESRL data should be sent to ESRL
# for review before they are submitted for publication so we can
# insure that the quality and limitations of the data are accurately
# represented.
# 
# Contact:   Pieter Tans (303 497 6678; pieter.tans@noaa.gov)
# 
# File Creation:  Thu Dec  6 13:26:20 2018
# 
# RECIPROCITY
# 
# Use of these data implies an agreement to reciprocate.
# Laboratories making similar measurements agree to make their
# own data available to the general public and to the scientific
# community in an equally complete and easily accessible form.
# Modelers are encouraged to make available to the community,
# upon request, their own tools used in the interpretation
# of the ESRL data, namely well documented model code, transport
# fields, and additional information necessary for other
# scientists to repeat the work and to run modified versions.
# Model availability includes collaborative support for new
# users of the models.
# --------------------------------------------------------------------
#  
#  
# See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.
#  
# Data from March 1958 through April 1974 have been obtained by C. David Keeling
# of the Scripps Institution of Oceanography (SIO) and were obtained from the
# Scripps website (scrippsco2.ucsd.edu).
#
# The "average" column contains the monthly mean CO2 mole fraction determined
# from daily averages.  The mole fraction of CO2, expressed as parts per million
# (ppm) is the number of molecules of CO2 in every one million molecules of dried
# air (water vapor removed).  If there are missing days concentrated either early
# or late in the month, the monthly mean is corrected to the middle of the month
# using the average seasonal cycle.  Missing months are denoted by -99.99.
# The "interpolated" column includes average values from the preceding column
# and interpolated values where data are missing.  Interpolated values are
# computed in two steps.  First, we compute for each month the average seasonal
# cycle in a 7-year window around each monthly value.  In this way the seasonal
# cycle is allowed to change slowly over time.  We then determine the "trend"
# value for each month by removing the seasonal cycle; this result is shown in
# the "trend" column.  Trend values are linearly interpolated for missing months.
# The interpolated monthly mean is then the sum of the average seasonal cycle
# value and the trend value for the missing month.
#
# NOTE: In general, the data presented for the last year are subject to change, 
# depending on recalibration of the reference gas mixtures used, and other quality
# control procedures. Occasionally, earlier years may also be changed for the same
# reasons.  Usually these changes are minor.
#
# CO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm
#
#  (-99.99 missing data;  -1 no data for #daily means in month)
#
#            decimal     average   interpolated    trend    #days
#             date                             (season corr)
1958   3    1958.208      315.71      315.71      314.62     -1
1958   4    1958.292      317.45      317.45      315.29     -1
1958   5    1958.375      317.50      317.50      314.71     -1
1958   6    1958.458      -99.99      317.10      314.85     -1
1958   7    1958.542      315.86      315.86      314.98     -1
1958   8    1958.625      314.93      314.93      315.94     -1
1958   9    1958.708      313.20      313.20      315.91     -1
1958  10    1958.792      -99.99      312.66      315.61     -1
1958  11    1958.875      313.33      313.33      315.31     -1
1958  12    1958.958      314.67      314.67      315.61     -1
1959   1    1959.042      315.62      315.62      315.70     -1
1959   2    1959.125      316.38      316.38      315.88     -1
1959   3    1959.208      316.71      316.71      315.62     -1
1959   4    1959.292      317.72      317.72      315.56     -1
1959   5    1959.375      318.29      318.29      315.50     -1
1959   6    1959.458      318.15      318.15      315.92     -1
1959   7    1959.542      316.54      316.54      315.66     -1
1959   8    1959.625      314.80      314.80      315.81     -1
1959   9    1959.708      313.84      313.84      316.55     -1
1959  10    1959.792      313.26      313.26      316.19     -1
1959  11    1959.875      314.80      314.80      316.78     -1
1959  12    1959.958      315.58      315.58      316.52     -1
1960   1    1960.042      316.43      316.43      316.51     -1
1960   2    1960.125      316.97      316.97      316.47     -1
1960   3    1960.208      317.58      317.58      316.49     -1
1960   4    1960.292      319.02      319.02      316.86     -1
1960   5    1960.375      320.03      320.03      317.24     -1
1960   6    1960.458      319.59      319.59      317.36     -1
In [20]:
# skip the '#' comment header; after the loop, current_line holds the first data row
with open('data/co2_mm_mlo.txt',mode='r') as f:
    current_line = f.readline()
    while (current_line[0] == '#'):
        current_line = f.readline()
In [21]:
current_line
Out[21]:
'1958   3    1958.208      315.71      315.71      314.62     -1\n'
In [22]:
[d for d in current_line.split(' ') if len(d) > 0]
Out[22]:
['1958', '3', '1958.208', '315.71', '315.71', '314.62', '-1\n']
In [23]:
import re
In [24]:
# collapse repeated spaces, strip the newline, then split on single spaces
data_line = re.sub('\n','',re.sub(' +',' ',current_line)).split(' ')
data_line
Out[24]:
['1958', '3', '1958.208', '315.71', '315.71', '314.62', '-1']
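As an aside (my addition), plain `str.split()` with no argument splits on runs of whitespace and drops empty strings, so it handles both the repeated spaces and the trailing newline without regular expressions.
In [ ]:
# equivalent whitespace splitting without re
current_line.split()
# expected: ['1958', '3', '1958.208', '315.71', '315.71', '314.62', '-1']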
In [25]:
# one converter per column: year, month, decimal date, average, interpolated, trend, #days
schema = [int,int,float,float,float,float,int]
In [26]:
[sch(d) for sch,d in zip(schema,data_line)]
Out[26]:
[1958, 3, 1958.208, 315.71, 315.71, 314.62, -1]
In [27]:
# parse the whole file: skip the '#' header, then convert each row using the schema
data = []
with open('data/co2_mm_mlo.txt',mode='r') as f:
    current_line = f.readline()
    while (current_line[0] == '#'):
        current_line = f.readline()
    # first data row (left over from the header-skipping loop)
    data_line = re.sub('\n','',re.sub(' +',' ',current_line)).split(' ')
    data_line_typed = [sch(d) for sch,d in zip(schema,data_line)]
    data.append(data_line_typed)
    # remaining rows
    for line in f:
        data_line = re.sub('\n','',re.sub(' +',' ',line)).split(' ')
        data_line_typed = [sch(d) for sch,d in zip(schema,data_line)]
        data.append(data_line_typed)
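For comparison (an alternative I am adding, not the notebook's approach), NumPy can read a whitespace-delimited file with `#` comment lines directly. Assuming the non-comment rows are purely numeric, `np.genfromtxt` should give the same 729 x 7 array as the loop above, with every column read as float.
In [ ]:
import numpy as np
# comments='#' skips the header block; the default delimiter is any whitespace
data_array_alt = np.genfromtxt('data/co2_mm_mlo.txt', comments='#')
data_array_alt.shape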
In [28]:
import numpy as np
data_array = np.array(data,dtype=float)
data_array
Out[28]:
array([[ 1.958000e+03,  3.000000e+00,  1.958208e+03, ...,  3.157100e+02,
         3.146200e+02, -1.000000e+00],
       [ 1.958000e+03,  4.000000e+00,  1.958292e+03, ...,  3.174500e+02,
         3.152900e+02, -1.000000e+00],
       [ 1.958000e+03,  5.000000e+00,  1.958375e+03, ...,  3.175000e+02,
         3.147100e+02, -1.000000e+00],
       ...,
       [ 2.018000e+03,  9.000000e+00,  2.018708e+03, ...,  4.055100e+02,
         4.090900e+02,  2.900000e+01],
       [ 2.018000e+03,  1.000000e+01,  2.018792e+03, ...,  4.060000e+02,
         4.093800e+02,  3.000000e+01],
       [ 2.018000e+03,  1.100000e+01,  2.018875e+03, ...,  4.080200e+02,
         4.099800e+02,  2.400000e+01]])
In [29]:
data_array.shape
Out[29]:
(729, 7)
In [30]:
data_array[:,4:5].reshape(-1).shape
Out[30]:
(729,)
In [31]:
# column 4 is the "interpolated" monthly mean CO2 (ppm); unlike column 3 it has no -99.99 missing codes
average = data_array[:,4]
In [32]:
import matplotlib.pyplot as plt
_=plt.hist(average)
In [33]:
from Utils import basic_stats
basic_stats(average)
mean: 353.79	std: 27.53	skew: 0.34	kurtosis: 1.93
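For reference (my addition; `basic_stats` is a course utility whose exact conventions I am guessing at, e.g. Pearson rather than Fisher kurtosis), a similar summary can be computed with NumPy and `scipy.stats`.
In [ ]:
from scipy.stats import skew, kurtosis
print("mean:", np.mean(average))
print("std:", np.std(average))
print("skew:", skew(average))
print("kurtosis:", kurtosis(average, fisher=False))  # Pearson kurtosis (normal = 3)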
In [34]:
from Utils import makeEMF, makeEDF, plotEDF
In [35]:
plotEDF(makeEDF(average))
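To make the plot above concrete (a sketch of my own; `makeEDF`/`plotEDF` are course utilities and may differ in details such as step style), the empirical distribution function of a sample jumps by $1/n$ at each sorted observation, which can be drawn directly with NumPy and matplotlib.
In [ ]:
# empirical CDF by hand: sorted values vs. cumulative proportion
xs = np.sort(average)
ys = np.arange(1, len(xs) + 1) / len(xs)
_ = plt.step(xs, ys, where='post')
_ = plt.xlabel('CO2 (ppm)')
_ = plt.ylabel('empirical CDF')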