Introduction to Data Science¶
1MS041, 2024¶
©2024 Raazesh Sainudiin, Benny Avelin. Attribution 4.0 International (CC BY 4.0)
CountVectorizer and TFIDFVectorizer¶
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
X_test = np.array(['test of stuff','something of test','stuff of something'])
cv = CountVectorizer()
cv.fit(X_test)
CountVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CountVectorizer()
cv.get_feature_names_out()
array(['of', 'something', 'stuff', 'test'], dtype=object)
cv.transform(X_test).todense()
matrix([[1, 0, 1, 1],
[1, 1, 0, 1],
[1, 1, 1, 0]])
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(X_test)
TfidfVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
TfidfVectorizer()
tfidf.get_feature_names_out()
array(['of', 'something', 'stuff', 'test'], dtype=object)
tfidf_m = tfidf.transform(X_test).todense()
tfidf_m
matrix([[0.48133417, 0. , 0.61980538, 0.61980538],
[0.48133417, 0.61980538, 0. , 0.61980538],
[0.48133417, 0.61980538, 0.61980538, 0. ]])
np.linalg.norm(tfidf_m[0,:])
1.0
import pandas as pd
df = pd.read_csv('data/spam.csv',encoding='Latin-1')
X = df['v2']
Y = df['v1']
X
0 Go until jurong point, crazy.. Available only ...
1 Ok lar... Joking wif u oni...
2 Free entry in 2 a wkly comp to win FA Cup fina...
3 U dun say so early hor... U c already then say...
4 Nah I don't think he goes to usf, he lives aro...
...
5567 This is the 2nd time we have tried 2 contact u...
5568 Will Ì_ b going to esplanade fr home?
5569 Pity, * was in mood for that. So...any other s...
5570 The guy did some bitching but I acted like i'd...
5571 Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y)
tfidf.fit(X_train)
TfidfVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
TfidfVectorizer()
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(tfidf.transform(X_train),Y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
from Utils import classification_report_interval
print(classification_report_interval(Y_test,lr.predict(tfidf.transform(X_test))))
labels precision recall
spam 0.99 : [0.84,1.00] 0.74 : [0.61,0.87]
ham 0.96 : [0.91,1.00] 1.00 : [0.94,1.00]
accuracy 0.96 : [0.91,1.00]
Feature engineering¶
import pandas as pd
df = pd.read_csv('data/auto.csv')
df = df.dropna()
df.head(5)
| mpg | cylinders | displacement | horsepower | weight | acceleration | model-year | |
|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 |
X = df['horsepower']
Y = df['mpg']
import matplotlib.pyplot as plt
plt.scatter(X,Y)
<matplotlib.collections.PathCollection at 0x1633196a0>
Power, vs miles per gallon.
Miles per gallon is the inverse of gallon per miles which is fuel consumption.
import numpy as np
X = df['horsepower'].to_numpy().reshape(-1,1)
Y = df['mpg'].to_numpy()
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)
import matplotlib.pyplot as plt
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
np.mean((lr.predict(X_test)-Y_test)**2)
27.728220057440613
import numpy as np
X = df['horsepower'].to_numpy().reshape(-1,1)
X = np.concatenate([X,1/X],axis=1) # X = (n_samples,n_features)
Y = df['mpg'].to_numpy()
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)
import matplotlib.pyplot as plt
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
np.mean((lr.predict(X_test)-Y_test)**2)
24.01416793195116
plt.scatter(X_test[:,0],Y_test)
plt.scatter(X_test[:,0],lr.predict(X_test))
<matplotlib.collections.PathCollection at 0x1660e4820>
Improving optimization¶
1.123412e6 = 1.123412*10^6 1.234 + 1.123412e6 = 0.000000e6 + 1.123412e6 = 1.123412e6
1.23123e-300 = 1.23123*10^(-300)
import scipy.io as so
import numpy as np
data = so.loadmat('data/mammography.mat')
np.random.seed(0)
shuffle_index = np.arange(0,len(data['X']))
np.random.shuffle(shuffle_index)
X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()
Lets make the features very different in scale and see what happens
X[:,0] = X[:,0] + 1000
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,Y)
lr.score(X,Y)
/Users/avelin/opt/miniconda3/envs/sage_new/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
0.983814718769561
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
lr = LogisticRegression()
lr.fit(sc.fit_transform(X),Y)
lr.score(sc.transform(X),Y)
0.9837252973262989
$$ \min L(w) $$
$$ \min L(w) + \lambda \|w\|^2 $$
$$ f_w(x) = w_0 + w_1 x $$
How scale and regularization affect eachother¶
Lets try to train two models with fairly high regularization and see what happens when we change the scale of the features
X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1/10000)
lr.fit(X*100,Y)
lr.score(X*100,Y)
0.9837252973262989
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1/10000)
lr.fit(X,Y)
lr.score(X,Y)
0.9767504247518555
Transforming target¶
from sklearn.datasets import fetch_california_housing
X,Y = fetch_california_housing(return_X_y=True)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X,Y)
lr.score(X,Y)
0.606232685199805
import matplotlib.pyplot as plt
plt.scatter(lr.predict(X),Y,alpha=0.05)
plt.scatter(lr.predict(X),lr.predict(X))
plt.xlim(-1,10)
plt.ylim(-1,10)
(-1.0, 10.0)
log(X)-log(Y) = log(X/Y)
import matplotlib.pyplot as plt
lr = LinearRegression()
lr.fit(X,np.log(Y))
plt.scatter(np.exp(lr.predict(X)),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
lr.score(X,np.log(Y))
0.6143678372037653
import matplotlib.pyplot as plt
lr = LinearRegression()
lr.fit(X,np.sqrt(Y))
plt.scatter(np.power(lr.predict(X),2),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
lr.score(X,np.sqrt(Y))
#plt.scatter(np.exp(lr.predict(X)),np.exp(lr.predict(X)))
We see that the largest prices seem to be hard to predict, lets see what happens if we remove them
_=plt.hist(Y,bins=60)
_=plt.hist(np.sqrt(Y[Y < 5]),bins=60)
from sklearn.preprocessing import power_transform
_=plt.hist(power_transform(Y[Y < 5].reshape(-1,1)),bins=50)
Playing around with transformations¶
X_new = X[Y < 5,:]
Y_new = Y[Y < 5]
from sklearn.preprocessing import PowerTransformer
pw = PowerTransformer()
pw.fit(Y_new.reshape(-1,1))
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())
import matplotlib.pyplot as plt
plt.scatter(pw.inverse_transform(lr.predict(X_new).reshape(-1,1)),Y_new,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)
lr.score(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())
pw2 = PowerTransformer()
pw2.fit(X_new)
lr = LinearRegression()
lr.fit(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())
import matplotlib.pyplot as plt
plt.scatter(pw.inverse_transform(lr.predict(pw2.transform(X_new)).reshape(-1,1)),Y_new,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)
lr.score(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())
Concentration¶
from ipywidgets import interact, IntSlider, FloatSlider
from Utils import discrete_histogram, bennett_epsilon, epsilon_bounded
import numpy as np
@interact
def concentration(n=IntSlider(1,1,100,5),p=FloatSlider(value=0.5, min=0,max=1,step=0.1)):
import matplotlib.pyplot as plt
X = np.random.binomial(1,p,size=(n,10000))
means = np.mean(X,axis=0)
#print("P(mean > mu + 0.3 ) = %.5f <= Chebychev %.5f" % (np.mean(means > 0.5+0.3),(1/4)/(0.3**2*n)))
#print("P(mean > mu + 0.3 ) = %.5f <= Hoeffding %.5f" % (np.mean(means > 0.5+0.3),np.exp(-2*n*0.3**2)))
print(np.quantile(means,0.025),np.quantile(means,0.975))
epsilon1 = epsilon_bounded(n,1,0.05)
epsilon2 = bennett_epsilon(n,1,np.sqrt((1/2)*p*(1-p)),0.05)
print("95%% confidence interval Hoeffding [%.2f, %.2f] for n=%d" % (np.mean(means)-epsilon1,np.mean(means)+epsilon1,n))
print("95%% confidence interval Bennett [%.2f, %.2f] for n=%d" % (np.mean(means)-epsilon2,np.mean(means)+epsilon2,n))
discrete_histogram(means,normed=True)
plt.xlim(-0.1,1.1)
interactive(children=(IntSlider(value=1, description='n', min=1, step=5), FloatSlider(value=0.5, description='…
Spam vs not spam, more complete¶
import pandas as pd
df = pd.read_csv('data/spam.csv',encoding='Latin-1')
X = df['v2']
Y = df['v1']
import pandas as pd
df = pd.read_csv('data/spam.csv',encoding='Latin-1')
X = df['v2']
Y = df['v1']
from Utils import train_test_validation
X_train,X_test, X_valid, Y_train, Y_test, Y_valid = train_test_validation(X,Y)
from sklearn.pipeline import Pipeline
p = Pipeline([('tfidf',TfidfVectorizer()),('model',LogisticRegression())])
p.fit(X_train,Y_train)
If we have no specific cost in mind¶
Then simply compute confidence intervals on the standard metrics
from Utils import classification_report_interval
print(classification_report_interval(Y_test,p.predict(X_test)))
Lets define a cost¶
If we say something is spam but its not spam, that is quite bad and we could miss important emails. We could say that costs $100$.
If on the other hand we have spam that is classified as not spam, then that is annoying and we have to manually delete it. Lets say that incurrs a cost of $10$.
That is, if we define the random variable $$E_1 = 1_{Y=0, g(X)=1} = 1_{Y=0} 1_{g(X) = 1}$$ and the random variable $$E_2 = 1_{Y=1, g(X)=0} = 1_{Y=1} 1_{g(X) = 0} = (1-1_{Y=0})(1-1_{g(X)=1}) = 1 - 1_{Y=0} - 1_{g(X)=1} + E_1$$
Then the cost of a randomly chosen sms is the random variable $$ C = 100 E_1 + 10 E_2 $$
Y_01 = (Y_test == 'spam')*1 # This makes Y_01 into 0 for ham and 1 for spam
g_01 = (p.predict(X_test) == 'spam')*1 # This makes g_01 into 0 for ham and 1 for spam
Y_0 = 1-Y_01
g_1 = g_01
E_1 = Y_0*g_1
E_2 = 1-Y_0-g_1+E_1
C = 100*E_1 + 10*E_2
We are interested in the expected (average) cost of an sms, we need to estimate $E[C]$.
We assume that all the sms are i.i.d. so we can use Hoeffdings inequality:
- What do we know about $C$? Well the only thing we know is that it is bounded by $0 \leq C \leq 100$.
- Use Hoeffdings inequality to get a confidence interval
- We will use the
epsilon_boundedfunction fromUtilsto do this
from Utils import epsilon_bounded, print_confidence_interval
eps = epsilon_bounded(len(C),100,0.05)
mean = np.mean(C)
print_confidence_interval(mean,eps,min_value=0,max_value=100)
However, we know that we can adjust the threshold of our model, and perhaps the cost will be different for another threshold?
def cost(threshold):
Y_01 = (Y_test == 'spam')*1 # This makes Y_01 into 0 for ham and 1 for spam
g_01 = (p.predict_proba(X_test)[:,1] >= threshold)*1 # This makes g_01 into 0 for ham and 1 for spam
Y_0 = 1-Y_01
g_1 = g_01
E_1 = Y_0*g_1
E_2 = 1-Y_0-g_1+E_1
C = 100*E_1 + 10*E_2
return np.mean(C)
thresholds = np.linspace(0,1,100)
costs = [cost(t) for t in thresholds]
import matplotlib.pyplot as plt
plt.plot(thresholds, np.log(costs))