from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
# Toy corpus of three short documents used to illustrate the vectorizers.
X_test = np.array(['test of stuff','something of test','stuff of something'])

cv = CountVectorizer()

# Fit on the toy corpus; feature names and the transformed matrix are inspected next.
cv.fit(X_test)

CountVectorizer()

CountVectorizer()

# Feature names produced by the fitted CountVectorizer (one per output column).
cv.get_feature_names_out()

array(['of', 'something', 'stuff', 'test'], dtype=object)

# Transform the toy corpus and densify the result so the full matrix is displayed.
cv.transform(X_test).todense()

matrix([[1, 0, 1, 1],
        [1, 1, 0, 1],
        [1, 1, 1, 0]])

from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy corpus, now fitted with a TfidfVectorizer instead of a CountVectorizer.
tfidf = TfidfVectorizer()
tfidf.fit(X_test)

TfidfVectorizer()

TfidfVectorizer()

# Feature names produced by the fitted TfidfVectorizer (one per output column).
tfidf.get_feature_names_out()

array(['of', 'something', 'stuff', 'test'], dtype=object)

# Dense TF-IDF matrix of the toy corpus; kept in tfidf_m for the norm check that follows.
tfidf_m = tfidf.transform(X_test).todense()
tfidf_m

matrix([[0.48133417, 0.        , 0.61980538, 0.61980538],
        [0.48133417, 0.61980538, 0.        , 0.61980538],
        [0.48133417, 0.61980538, 0.61980538, 0.        ]])

# Norm (np.linalg.norm with default settings) of the first document's TF-IDF row.
np.linalg.norm(tfidf_m[0,:])

1.0

import pandas as pd
# Load the spam data set; the file is decoded as Latin-1.
df = pd.read_csv('data/spam.csv',encoding='Latin-1')
X = df['v2']  # message text
Y = df['v1']  # label column ('spam' / 'ham')

# Display the message-text column.
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

from sklearn.model_selection import train_test_split
# Hold out a test set. The seed is fixed (random_state=0, matching the other
# splits in this notebook) so the split and the reported metrics are
# reproducible from run to run.
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)

# Refit the TF-IDF vectorizer on the training messages only, so nothing is
# learned from the held-out test messages.
tfidf.fit(X_train)

TfidfVectorizer()

TfidfVectorizer()

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# Train on the TF-IDF features of the training messages.
lr.fit(tfidf.transform(X_train),Y_train)

LogisticRegression()

LogisticRegression()

from Utils import classification_report_interval
# Evaluate on the held-out messages, vectorized with the already-fitted tfidf.
# classification_report_interval is a project helper; per the recorded output it
# reports per-label precision/recall and accuracy, each with an interval.
print(classification_report_interval(Y_test,lr.predict(tfidf.transform(X_test))))

            labels           precision             recall

              spam  0.99 : [0.84,1.00] 0.74 : [0.61,0.87]
               ham  0.96 : [0.91,1.00] 1.00 : [0.94,1.00]

          accuracy                                        0.96 : [0.91,1.00]

import pandas as pd
df = pd.read_csv('data/auto.csv')
# Drop rows that contain missing values before modelling.
df = df.dropna()
df.head(5)

# Feature (horsepower) and target (mpg) for the regression examples.
X = df['horsepower']
Y = df['mpg']

import matplotlib.pyplot as plt
# Raw relationship between horsepower and mpg.
plt.scatter(X,Y)

<matplotlib.collections.PathCollection at 0x1633196a0>

import numpy as np
# Reshape horsepower into a single-column 2-D array, i.e. (n_samples, 1).
X = df['horsepower'].to_numpy().reshape(-1,1)
Y = df['mpg'].to_numpy()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.model_selection import train_test_split
# Seeded split so the reported error is reproducible.
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)

import matplotlib.pyplot as plt
# Predicted vs. actual mpg on the test set; the second scatter draws the
# y = x reference (actual vs. actual).
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
# Mean squared error on the test set.
np.mean((lr.predict(X_test)-Y_test)**2)

27.728220057440613

import numpy as np
X = df['horsepower'].to_numpy().reshape(-1,1)
# Feature engineering: append the reciprocal of horsepower as a second column.
X = np.concatenate([X,1/X],axis=1) # X = (n_samples,n_features)
Y = df['mpg'].to_numpy()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.model_selection import train_test_split
# Same seed as the single-feature fit, so the two test errors are comparable.
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)
lr.fit(X_train,Y_train)

import matplotlib.pyplot as plt
# Predicted vs. actual mpg on the test set, plus the y = x reference.
plt.scatter(lr.predict(X_test),Y_test)
plt.scatter(Y_test,Y_test)
# Mean squared error on the test set.
np.mean((lr.predict(X_test)-Y_test)**2)

24.01416793195116

# Test points and model predictions plotted against horsepower (column 0 of X_test).
plt.scatter(X_test[:,0],Y_test)
plt.scatter(X_test[:,0],lr.predict(X_test))

<matplotlib.collections.PathCollection at 0x1660e4820>

1.23123e-300 = 1.23123*10^(-300)

import scipy.io as so
import numpy as np
data = so.loadmat('data/mammography.mat')

# Shuffle the row order with a fixed seed so it is reproducible.
np.random.seed(0)
shuffle_index = np.arange(0,len(data['X']))
np.random.shuffle(shuffle_index)

X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()

# Add a constant offset of 1000 to the first feature. With this shifted
# feature the default fit below stops at its iteration limit (see the
# ConvergenceWarning in the recorded output).
X[:,0] = X[:,0] + 1000

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,Y)
# Score on the same data that was used for fitting.
lr.score(X,Y)

/Users/avelin/opt/miniconda3/envs/sage_new/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

0.983814718769561

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
lr = LogisticRegression()
# Same model, but fitted on standardized features; sc is fitted here and
# reused to transform the data for scoring.
lr.fit(sc.fit_transform(X),Y)
lr.score(sc.transform(X),Y)

0.9837252973262989

# Rebuild X and Y from the loaded data, discarding the +1000 shift applied to
# the previous X.
X = data['X'][shuffle_index,:]
Y = data['y'][shuffle_index,:].flatten()

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Small C (strong regularization in scikit-learn's parametrization), fitted on
# features multiplied by 100.
lr = LogisticRegression(C=1/10000)
lr.fit(X*100,Y)
lr.score(X*100,Y)

0.9837252973262989

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Same C as the previous fit, but on the features at their original scale, to
# compare how feature scale interacts with the regularization strength.
lr = LogisticRegression(C=1/10000)
lr.fit(X,Y)
lr.score(X,Y)

0.9767504247518555

from sklearn.datasets import fetch_california_housing
X,Y = fetch_california_housing(return_X_y=True)

from sklearn.linear_model import LinearRegression
# Baseline: linear regression on the untransformed target.
lr = LinearRegression()
lr.fit(X,Y)
# Score on the same data that was used for fitting.
lr.score(X,Y)

0.606232685199805

import matplotlib.pyplot as plt
# Predicted vs. actual target; the second scatter (prediction vs. prediction)
# draws the y = x reference.
plt.scatter(lr.predict(X),Y,alpha=0.05)
plt.scatter(lr.predict(X),lr.predict(X))
plt.xlim(-1,10)
plt.ylim(-1,10)

(-1.0, 10.0)

log(X)-log(Y) = log(X/Y)

import matplotlib.pyplot as plt
# Fit on the log of the target, then map predictions back with exp for plotting.
lr = LinearRegression()
lr.fit(X,np.log(Y))
plt.scatter(np.exp(lr.predict(X)),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
# NOTE: this score is computed against log(Y), so it is measured on the log
# scale and is not directly comparable with the score of the untransformed fit.
lr.score(X,np.log(Y))

0.6143678372037653

import matplotlib.pyplot as plt
# Fit on the square root of the target, then square predictions for plotting.
lr = LinearRegression()
lr.fit(X,np.sqrt(Y))
plt.scatter(np.power(lr.predict(X),2),Y,alpha=0.05)
plt.scatter(Y,Y)
plt.xlim(0,7)
plt.ylim(0,7)
# NOTE: scored against sqrt(Y), i.e. on the transformed scale.
lr.score(X,np.sqrt(Y))
#plt.scatter(np.exp(lr.predict(X)),np.exp(lr.predict(X)))

# Histogram of the raw target.
_=plt.hist(Y,bins=60)

# Histogram of sqrt(target), restricted to Y < 5.
# NOTE(review): the Y < 5 filter presumably removes a cluster of capped values
# at the top of the range; confirm against the first histogram.
_=plt.hist(np.sqrt(Y[Y < 5]),bins=60)

from sklearn.preprocessing import power_transform
# Histogram of the power-transformed target (default settings), same filter.
_=plt.hist(power_transform(Y[Y < 5].reshape(-1,1)),bins=50)

# Keep only the rows whose target is below 5.
X_new = X[Y < 5,:]
Y_new = Y[Y < 5]

from sklearn.preprocessing import PowerTransformer
# pw is fitted on the target (as a single column) and reused below for both
# the forward and the inverse transform.
pw = PowerTransformer()
pw.fit(Y_new.reshape(-1,1))

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())

import matplotlib.pyplot as plt
# Predictions mapped back to the original target scale vs. the actual target.
plt.scatter(pw.inverse_transform(lr.predict(X_new).reshape(-1,1)),Y_new,alpha=0.05)
# y = x reference, drawn from the unfiltered Y.
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)

# Score on the transformed target scale.
lr.score(X_new,pw.transform(Y_new.reshape(-1,1)).flatten())

# Additionally power-transform the features: pw2 handles X, pw (fitted on the
# target earlier) handles Y.
pw2 = PowerTransformer()
pw2.fit(X_new)
lr = LinearRegression()
lr.fit(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())

import matplotlib.pyplot as plt
# Predictions mapped back to the original target scale vs. the actual target.
plt.scatter(pw.inverse_transform(lr.predict(pw2.transform(X_new)).reshape(-1,1)),Y_new,alpha=0.05)
# y = x reference, drawn from the unfiltered Y.
plt.scatter(Y,Y)
plt.xlim(-1,7)
plt.ylim(-1,7)

# Score on the transformed target scale.
lr.score(pw2.transform(X_new),pw.transform(Y_new.reshape(-1,1)).flatten())

from ipywidgets import interact, IntSlider, FloatSlider
from Utils import discrete_histogram, bennett_epsilon, epsilon_bounded
import numpy as np
@interact
def concentration(n=IntSlider(1,1,100,5),p=FloatSlider(value=0.5, min=0,max=1,step=0.1)):
    """Interactive demo of how the sample mean of Bernoulli(p) draws concentrates.

    Simulates 10000 experiments of ``n`` draws each, prints the empirical
    2.5% and 97.5% quantiles of the per-experiment means, prints two 95%
    intervals (labelled Hoeffding and Bennett) centered at the average of
    the simulated means, and plots a histogram of the means.

    Args:
        n: Number of draws per experiment (slider).
        p: Success probability of each draw (slider).
    """
    import matplotlib.pyplot as plt

    # One experiment per column: n draws in each of 10000 columns.
    draws = np.random.binomial(1, p, size=(n, 10000))
    means = draws.mean(axis=0)

    #print("P(mean > mu + 0.3 ) = %.5f <= Chebychev %.5f" % (np.mean(means > 0.5+0.3),(1/4)/(0.3**2*n)))
    #print("P(mean > mu + 0.3 ) = %.5f <= Hoeffding %.5f" % (np.mean(means > 0.5+0.3),np.exp(-2*n*0.3**2)))

    # Empirical central 95% range of the simulated means.
    print(np.quantile(means, 0.025), np.quantile(means, 0.975))

    # Interval half-widths come from the project helpers in Utils.
    # NOTE(review): their exact definitions are not visible here; confirm the
    # argument conventions (range, scale, significance level) against Utils.
    hoeffding_eps = epsilon_bounded(n, 1, 0.05)
    bennett_eps = bennett_epsilon(n, 1, np.sqrt((1/2)*p*(1-p)), 0.05)

    center = np.mean(means)
    print("95%% confidence interval Hoeffding [%.2f, %.2f] for n=%d"
          % (center - hoeffding_eps, center + hoeffding_eps, n))
    print("95%% confidence interval Bennett [%.2f, %.2f] for n=%d"
          % (center - bennett_eps, center + bennett_eps, n))

    discrete_histogram(means, normed=True)
    plt.xlim(-0.1, 1.1)

interactive(children=(IntSlider(value=1, description='n', min=1, step=5), FloatSlider(value=0.5, description='…

import pandas as pd
# Reload the spam data set (decoded as Latin-1) for the end-to-end spam example.
df = pd.read_csv('data/spam.csv',encoding='Latin-1')

X = df['v2']  # message text
Y = df['v1']  # label column ('spam' / 'ham')

from Utils import train_test_validation

# Project helper producing three parts each for X and Y.
# NOTE(review): the return order is assumed from the unpacked names
# (train, test, validation); confirm against Utils.
X_train,X_test, X_valid, Y_train, Y_test, Y_valid = train_test_validation(X,Y)

from sklearn.pipeline import Pipeline
# Vectorizer and classifier chained into one estimator, so raw message text
# can be passed directly to fit/predict.
p = Pipeline([('tfidf',TfidfVectorizer()),('model',LogisticRegression())])

p.fit(X_train,Y_train)

from Utils import classification_report_interval
# Report for the pipeline's predictions on the test part of the split.
print(classification_report_interval(Y_test,p.predict(X_test)))

Y_01 = (Y_test == 'spam')*1 # This makes Y_01 into 0 for ham and 1 for spam
g_01 = (p.predict(X_test) == 'spam')*1 # This makes g_01 into 0 for ham and 1 for spam

Y_0 = 1-Y_01 # 1 where the true label is ham
g_1 = g_01 # 1 where the prediction is spam
E_1  = Y_0*g_1 # 1 where a ham message is predicted as spam
# 1 - Y_0 - g_1 + Y_0*g_1 = (1-Y_0)*(1-g_1): 1 where a spam message is predicted as ham
E_2 = 1-Y_0-g_1+E_1
# Per-message cost: 100 for ham predicted as spam, 10 for spam predicted as ham.
C = 100*E_1 + 10*E_2

from Utils import epsilon_bounded, print_confidence_interval
# Interval half-width for the mean cost. The per-message cost C takes values
# in [0, 100].
# NOTE(review): the arguments are presumably (sample size, range, significance
# level); confirm against Utils.
eps = epsilon_bounded(len(C),100,0.05)
mean = np.mean(C)
print_confidence_interval(mean,eps,min_value=0,max_value=100)

def cost(threshold):
    """Return the mean misclassification cost on the test data for a threshold.

    A message is predicted as spam when the pipeline's predicted probability
    in column 1 (the 'spam' class for the 'ham'/'spam' labels used here) is
    at least ``threshold``. Ham predicted as spam costs 100; spam predicted
    as ham costs 10. Reads the notebook globals ``p``, ``X_test`` and
    ``Y_test``.
    """
    # 1 where the true label is ham, 0 where it is spam.
    is_ham = 1 - (Y_test == 'spam')*1
    # 1 where the model flags the message as spam at this threshold.
    flagged = (p.predict_proba(X_test)[:,1] >= threshold)*1

    # Ham that was flagged as spam.
    false_alarm = is_ham*flagged
    # 1 - is_ham - flagged + is_ham*flagged = (1-is_ham)*(1-flagged): spam let through.
    missed_spam = 1 - is_ham - flagged + false_alarm

    return np.mean(100*false_alarm + 10*missed_spam)

# Evaluate the mean cost on 100 evenly spaced thresholds in [0, 1].
thresholds = np.linspace(0,1,100)
costs = [cost(t) for t in thresholds]

import matplotlib.pyplot as plt
# Plot the logarithm of the mean cost against the threshold.
plt.plot(thresholds, np.log(costs))

	mpg	cylinders	displacement	horsepower	weight	acceleration	model-year
0	18.0	8	307.0	130.0	3504	12.0	70
1	15.0	8	350.0	165.0	3693	11.5	70
2	18.0	8	318.0	150.0	3436	11.0	70
3	16.0	8	304.0	150.0	3433	12.0	70
4	17.0	8	302.0	140.0	3449	10.5	70

Introduction to Data Science ¶

1MS041, 2024¶

CountVectorizer and TFIDFVectorizer¶

Feature engineering¶

Improving optimization¶

How scale and regularization affect each other¶

Transforming target¶

Playing around with transformations¶

Concentration¶

Spam vs not spam, more complete¶

If we have no specific cost in mind¶

Let's define a cost¶

Now we can compute the confidence interval in the same way, but on the validation data¶

Introduction to Data Science¶

1MS041, 2024¶

CountVectorizer and TFIDFVectorizer¶

Feature engineering¶

Improving optimization¶

How scale and regularization affect each other¶

Transforming target¶

Playing around with transformations¶

Concentration¶

Spam vs not spam, more complete¶

If we have no specific cost in mind¶

Let's define a cost¶

Now we can compute the confidence interval in the same way, but on the validation data¶

Introduction to Data Science ¶