Introduction to Data Science

1MS041, 2023

©2023 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)

CountVectorizer and TFIDFVectorizer

In [115]:
from sklearn.feature_extraction.text import CountVectorizer
In [116]:
import numpy as np
X_test = np.array(['test of stuff','something of test','stuff of something'])
In [117]:
cv = CountVectorizer()
In [118]:
cv.fit(X_test)
Out[118]:
CountVectorizer()
In [119]:
cv.get_feature_names_out()
Out[119]:
array(['of', 'something', 'stuff', 'test'], dtype=object)
In [120]:
cv.transform(X_test).todense()
Out[120]:
matrix([[1, 0, 1, 1],
        [1, 1, 0, 1],
        [1, 1, 1, 0]])
In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [122]:
tfidf = TfidfVectorizer()
tfidf.fit(X_test)
Out[122]:
TfidfVectorizer()
In [123]:
tfidf.get_feature_names_out()
Out[123]:
array(['of', 'something', 'stuff', 'test'], dtype=object)
In [127]:
tfidf_m = tfidf.transform(X_test).todense()
tfidf_m
Out[127]:
matrix([[0.48133417, 0.        , 0.61980538, 0.61980538],
        [0.48133417, 0.61980538, 0.        , 0.61980538],
        [0.48133417, 0.61980538, 0.61980538, 0.        ]])
In [125]:
np.linalg.norm(tfidf_m[0,:])
Out[125]:
1.0
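
The numbers in Out[127] can be reproduced by hand. Below is a minimal check, assuming scikit-learn's default TfidfVectorizer settings (smooth_idf=True, sublinear_tf=False, norm='l2'), where the smoothed idf is $\mathrm{idf}(t) = \ln\frac{1+n}{1+\mathrm{df}(t)} + 1$ and every row is l2-normalized afterwards, which is why the norm computed above is 1.0.

In [ ]:
import numpy as np

n = 3                                      # number of documents
idf_of   = np.log((1 + n) / (1 + 3)) + 1   # 'of' occurs in all 3 documents  -> 1.0
idf_test = np.log((1 + n) / (1 + 2)) + 1   # 'test' and 'stuff' occur in 2   -> ~1.2877

# Document 'test of stuff': each term occurs once, so the raw tf-idf weights are the idfs.
row = np.array([idf_of, idf_test, idf_test])   # columns 'of', 'stuff', 'test'
row / np.linalg.norm(row)                      # ~[0.4813, 0.6198, 0.6198], the nonzero entries of the first row above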
In [128]:
import pandas as pd
df = pd.read_csv('data/spam.csv',encoding='Latin-1')
X = df['v2']
Y = df['v1']
In [129]:
X
Out[129]:
0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object
In [130]:
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y)
In [131]:
tfidf.fit(X_train)
Out[131]:
TfidfVectorizer()
In [132]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(tfidf.transform(X_train),Y_train)
Out[132]:
LogisticRegression()
In [133]:
from Utils import classification_report_interval
print(classification_report_interval(Y_test,lr.predict(tfidf.transform(X_test))))
            labels           precision             recall

              spam  0.99 : [0.84,1.00] 0.74 : [0.61,0.87]
               ham  0.96 : [0.91,1.00] 1.00 : [0.94,1.00]

          accuracy                                        0.96 : [0.91,1.00]

Feature engineering

In [134]:
import pandas as pd
df = pd.read_csv('data/auto.csv')
df = df.dropna()
df.head(5)
Out[134]:
   mpg  cylinders  displacement  horsepower  weight  acceleration  model-year
0  18.0         8         307.0       130.0    3504          12.0          70
1  15.0         8         350.0       165.0    3693          11.5          70
2  18.0         8         318.0       150.0    3436          11.0          70
3  16.0         8         304.0       150.0    3433          12.0          70
4  17.0         8         302.0       140.0    3449          10.5          70
In [135]:
X = df['horsepower']
Y = df['mpg']
In [136]:
import matplotlib.pyplot as plt
plt.scatter(X,Y)
Out[136]:
<matplotlib.collections.PathCollection at 0x1633196a0>

Horsepower vs. miles per gallon.

Miles per gallon is the inverse of gallons per mile, which is the fuel consumption; this suggests adding 1/horsepower as an extra feature, as done a few cells below.

In [138]:
import numpy as np
X = df['horsepower'].to_numpy().reshape(-1,1)
Y = df['mpg'].to_numpy()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)

import matplotlib.pyplot as plt
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
np.mean((lr.predict(X_test)-Y_test)**2)
Out[138]:
27.728220057440613
In [139]:
import numpy as np
X = df['horsepower'].to_numpy().reshape(-1,1)
X = np.concatenate([X,1/X],axis=1) # X = (n_samples,n_features)
Y = df['mpg'].to_numpy()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)

import matplotlib.pyplot as plt
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
np.mean((lr.predict(X_test)-Y_test)**2)
Out[139]:
24.01416793195116
In [140]:
plt.scatter(X_test[:,0],Y_test)
plt.scatter(X_test[:,0],lr.predict(X_test))
Out[140]:
<matplotlib.collections.PathCollection at 0x1660e4820>

Improving optimization

In floating-point arithmetic only a fixed number of significant digits is stored, so adding numbers of very different magnitude loses the smaller one. Writing $1.123412\mathrm{e}6 = 1.123412 \cdot 10^6$ and keeping only the digits shown,
$$ 1.234 + 1.123412\mathrm{e}6 \approx 0.000000\mathrm{e}6 + 1.123412\mathrm{e}6 = 1.123412\mathrm{e}6 . $$

In [ ]:
1.23123e-300 = 1.23123*10^(-300)
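
Both notes are about the limits of floating-point precision. A small, purely illustrative demonstration (not part of the lecture code), using float32 so the effect is easy to see:

In [ ]:
import numpy as np

# float32 keeps roughly 7 significant decimal digits, so adding a small number
# to a much larger one loses most (or all) of the small number.
print(np.float32(1.123412e6) + np.float32(1.234))              # ~1123413.2, not 1123413.234
print(np.float32(1e8) + np.float32(1.234) == np.float32(1e8))  # True: 1.234 is lost completely

# Very small magnitudes underflow instead: in float64 this square is exactly 0.0.
print((1.23123e-300)**2)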
In [141]:
import scipy.io as so
import numpy as np
data = so.loadmat('data/mammography.mat')

np.random.seed(0)
shuffle_index = np.arange(0,len(data['X']))
np.random.shuffle(shuffle_index)

X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()

Let's make the features very different in scale and see what happens.

In [142]:
X[:,0] = X[:,0] + 1000
In [143]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,Y)
lr.score(X,Y)
/Users/avelin/opt/miniconda3/envs/sage_new/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[143]:
0.983814718769561
In [144]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
lr = LogisticRegression()
lr.fit(sc.fit_transform(X),Y)
lr.score(sc.transform(X),Y)
Out[144]:
0.9837252973262989
$$ \min L(w) $$
$$ \min L(w) + \lambda \|w\|^2 $$
$$ f_w(x) = w_0 + w_1 x $$

How scale and regularization affect each other

Let's try to train two models with fairly high regularization and see what happens when we change the scale of the features.
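
The interaction can be read off from the linear model above: if a feature is rescaled, $x \mapsto c\,x$, the corresponding weight must shrink to $w_1/c$ for $f_w$ to produce the same predictions, so the penalty on that weight shrinks by a factor $c^2$:

$$ w_0 + \frac{w_1}{c}\,(c\,x) = w_0 + w_1 x, \qquad \lambda \left(\frac{w_1}{c}\right)^2 = \frac{\lambda}{c^2}\, w_1^2 $$

Multiplying the features by $100$, as in the next cell, therefore makes the same regularization strength (in scikit-learn $\lambda$ corresponds to $1/C$, and here $C = 1/10000$) act roughly $10^4$ times more weakly.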

In [145]:
X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()
In [146]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1/10000)
lr.fit(X*100,Y)
lr.score(X*100,Y)
Out[146]:
0.9837252973262989
In [147]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1/10000)
lr.fit(X,Y)
lr.score(X,Y)
Out[147]:
0.9767504247518555

Transforming target

In [148]:
from sklearn.datasets import fetch_california_housing
X,Y = fetch_california_housing(return_X_y=True)
In [149]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X,Y)
lr.score(X,Y)
Out[149]:
0.606232685199805
In [150]:
import matplotlib.pyplot as plt
plt.scatter(lr.predict(X),Y,alpha=0.05)
plt.scatter(lr.predict(X),lr.predict(X))
plt.xlim(-1,10)
plt.ylim(-1,10)
Out[150]:
(-1.0, 10.0)
In [ ]:
log(X) - log(Y) = log(X/Y)  # an additive error on the log scale is a relative (multiplicative) error on the original scale
In [151]:
import matplotlib.pyplot as plt
lr = LinearRegression()
lr.fit(X,np.log(Y))
plt.scatter(np.exp(lr.predict(X)),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
lr.score(X,np.log(Y))
Out[151]:
0.6143678372037653
In [ ]:
import matplotlib.pyplot as plt
lr = LinearRegression()
lr.fit(X,np.sqrt(Y))
plt.scatter(np.power(lr.predict(X),2),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
lr.score(X,np.sqrt(Y))
#plt.scatter(np.exp(lr.predict(X)),np.exp(lr.predict(X)))

We see that the largest prices seem to be hard to predict: the target is capped at its top value, which is why the cells below filter with Y < 5. Let's see what happens if we remove them.

In [ ]:
_=plt.hist(Y,bins=60)
In [ ]:
_=plt.hist(np.sqrt(Y[Y < 5]),bins=60)
In [ ]:
from sklearn.preprocessing import power_transform
_=plt.hist(power_transform(Y[Y < 5].reshape(-1,1)),bins=50)

Playing around with transformations

In [ ]:
X_new = X[Y < 5,:]
Y_new = Y[Y < 5]
In [ ]:
from sklearn.preprocessing import PowerTransformer
pw = PowerTransformer()
pw.fit(Y_new.reshape(-1,1))
In [ ]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())

import matplotlib.pyplot as plt
plt.scatter(pw.inverse_transform(lr.predict(X_new).reshape(-1,1)),Y_new,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)
In [ ]:
lr.score(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())
In [ ]:
pw2 = PowerTransformer()
pw2.fit(X_new)
lr = LinearRegression()
lr.fit(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())

import matplotlib.pyplot as plt
plt.scatter(pw.inverse_transform(lr.predict(pw2.transform(X_new)).reshape(-1,1)),Y_new,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)
In [ ]:
lr.score(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())

Concentration

In [152]:
from ipywidgets import interact, IntSlider, FloatSlider
from Utils import discrete_histogram, bennett_epsilon, epsilon_bounded
import numpy as np
@interact 
def concentration(n=IntSlider(1,1,100,5),p=FloatSlider(value=0.5, min=0,max=1,step=0.1)):
    import matplotlib.pyplot as plt
    X = np.random.binomial(1,p,size=(n,10000))
    means = np.mean(X,axis=0)
    #print("P(mean > mu + 0.3 ) = %.5f <= Chebychev %.5f" % (np.mean(means > 0.5+0.3),(1/4)/(0.3**2*n)))
    #print("P(mean > mu + 0.3 ) = %.5f <= Hoeffding %.5f" % (np.mean(means > 0.5+0.3),np.exp(-2*n*0.3**2)))
    print(np.quantile(means,0.025),np.quantile(means,0.975))
    epsilon1 = epsilon_bounded(n,1,0.05)
    epsilon2 = bennett_epsilon(n,1,np.sqrt((1/2)*p*(1-p)),0.05)
    print("95%% confidence interval Hoeffding [%.2f, %.2f] for n=%d" % (np.mean(means)-epsilon1,np.mean(means)+epsilon1,n))
    print("95%% confidence interval Bennett [%.2f, %.2f] for n=%d" % (np.mean(means)-epsilon2,np.mean(means)+epsilon2,n))
    discrete_histogram(means,normed=True)
    plt.xlim(-0.1,1.1)
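
For reference, the two commented-out print statements above compare the empirical tail probability with the Chebyshev and Hoeffding bounds for the sample mean $\bar{X}_n$ of $n$ i.i.d. Bernoulli($p$) variables, evaluated at $\epsilon = 0.3$ and using the worst-case variance $\sigma^2 \le 1/4$:

$$ P\left(\bar{X}_n \ge p + \epsilon\right) \le \frac{\sigma^2}{n\,\epsilon^2} \le \frac{1/4}{n\,\epsilon^2}, \qquad P\left(\bar{X}_n \ge p + \epsilon\right) \le e^{-2 n \epsilon^2} $$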

Spam vs not spam, more complete

In [ ]:
import pandas as pd
df = pd.read_csv('data/spam.csv',encoding='Latin-1')

X = df['v2']
Y = df['v1']

from Utils import train_test_validation

X_train,X_test, X_valid, Y_train, Y_test, Y_valid = train_test_validation(X,Y)

from sklearn.pipeline import Pipeline
p = Pipeline([('tfidf',TfidfVectorizer()),('model',LogisticRegression())])

p.fit(X_train,Y_train)

If we have no specific cost in mind

Then simply compute confidence intervals on the standard metrics

In [ ]:
from Utils import classification_report_interval
print(classification_report_interval(Y_test,p.predict(X_test)))

Let's define a cost

If we say something is spam but it's not spam, that is quite bad: we could miss important emails. We say that costs $100$.

If, on the other hand, spam is classified as not spam, that is merely annoying: we have to delete it manually. Let's say that incurs a cost of $10$.

That is, if we define the random variable $$E_1 = 1_{Y=0, g(X)=1} = 1_{Y=0} 1_{g(X) = 1}$$ and the random variable $$E_2 = 1_{Y=1, g(X)=0} = 1_{Y=1} 1_{g(X) = 0} = (1-1_{Y=0})(1-1_{g(X)=1}) = 1 - 1_{Y=0} - 1_{g(X)=1} + E_1$$

Then the cost of a randomly chosen sms is the random variable $$ C = 100 E_1 + 10 E_2 $$

In [ ]:
Y_01 = (Y_test == 'spam')*1 # This makes Y_01 into 0 for ham and 1 for spam
g_01 = (p.predict(X_test) == 'spam')*1 # This makes g_01 into 0 for ham and 1 for spam
In [ ]:
Y_0 = 1-Y_01
g_1 = g_01
E_1  = Y_0*g_1
E_2 = 1-Y_0-g_1+E_1
C = 100*E_1 + 10*E_2

We are interested in the expected (average) cost of an sms, so we need to estimate $E[C]$.

We assume that all the sms are i.i.d., so we can use Hoeffding's inequality:

  1. What do we know about $C$? The only thing we know is that it is bounded: $0 \leq C \leq 100$.
  2. Use Hoeffding's inequality to get a confidence interval (a sketch of the underlying bound is given right after this list).
  3. We will use the epsilon_bounded function from Utils to do this.
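
As a reference for step 2, here is a minimal sketch of the half-width we expect, assuming epsilon_bounded implements the standard two-sided Hoeffding bound for i.i.d. variables in $[0, b]$ (here $b = 100$); the sample size 1000 below is only a placeholder.

In [ ]:
import numpy as np

def hoeffding_epsilon(n, b, alpha):
    # Solve 2*exp(-2*n*eps^2/b^2) = alpha for eps (Hoeffding, i.i.d. variables in [0, b]).
    return b * np.sqrt(np.log(2 / alpha) / (2 * n))

hoeffding_epsilon(1000, 100, 0.05)   # half-width of a 95% confidence interval for a hypothetical n = 1000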
In [ ]:
from Utils import epsilon_bounded, print_confidence_interval
eps = epsilon_bounded(len(C),100,0.05)
mean = np.mean(C)
print_confidence_interval(mean,eps,min_value=0,max_value=100)

However, we can adjust the decision threshold of our model, and perhaps the cost is lower at a different threshold.

In [ ]:
def cost(threshold):
    Y_01 = (Y_test == 'spam')*1 # This makes Y_01 into 0 for ham and 1 for spam
    g_01 = (p.predict_proba(X_test)[:,1] >= threshold)*1 # This makes g_01 into 0 for ham and 1 for spam
    Y_0 = 1-Y_01
    g_1 = g_01
    E_1  = Y_0*g_1
    E_2 = 1-Y_0-g_1+E_1
    C = 100*E_1 + 10*E_2
    
    return np.mean(C)
In [ ]:
thresholds = np.linspace(0,1,100)
costs = [cost(t) for t in thresholds]
In [ ]:
import matplotlib.pyplot as plt
plt.plot(thresholds, np.log(costs))

Now we can compute the confidence interval in the same way, but on the validation data:
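
Below is a minimal sketch of that final step, assuming the pipeline p, the thresholds and costs arrays, and the X_valid, Y_valid split defined above (as before, column 1 of predict_proba is taken to be the spam probability):

In [ ]:
import numpy as np
from Utils import epsilon_bounded, print_confidence_interval

# Pick the threshold with the lowest estimated cost on the test split ...
best_threshold = thresholds[np.argmin(costs)]

# ... and estimate the expected cost at that threshold on the held-out validation split.
Y_01 = (Y_valid == 'spam')*1
g_01 = (p.predict_proba(X_valid)[:,1] >= best_threshold)*1
Y_0 = 1-Y_01
g_1 = g_01
E_1 = Y_0*g_1
E_2 = 1-Y_0-g_1+E_1
C_valid = 100*E_1 + 10*E_2

eps = epsilon_bounded(len(C_valid), 100, 0.05)
print_confidence_interval(np.mean(C_valid), eps, min_value=0, max_value=100)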