Reminder:
To-do now:
# importing useful packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn import datasets
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.formula.api as sm
# Load the iris dataset; the bare expression echoes the raw feature matrix
# when this is run interactively.
datasets.load_iris()['data']
iris = datasets.load_iris()

# Keep feature columns 2 and 3 only and standardize them.
X = iris['data'][:, [2, 3]]
scaler = StandardScaler()
Xstan = scaler.fit_transform(X)

# Scaled features plus the class label in one frame; rows of class 2 are
# discarded so that a two-class problem remains.
data = pd.DataFrame(data=Xstan, columns=['petal length', 'petal width'])
data['target'] = iris['target']
keep_rows = data['target'] != 2  # we will only focus on Iris-setosa and Iris-Versicolor
data = data[keep_rows]
data.head()

# Scatter plot of the two remaining classes in the scaled feature plane.
sns.lmplot(
    x='petal length',
    y='petal width',
    hue='target',
    data=data,
    fit_reg=False,
    legend=False,
)
plt.legend(['Iris-Setosa', 'Iris-Versicolor'], fontsize=14)
plt.xlabel('petal length (scaled)', fontsize=18)
plt.ylabel('petal width (scaled)', fontsize=18)
# Fit a linear-kernel SVM with C=1 on the two scaled petal features.
svc = svm.SVC(kernel='linear', C=1)
svc.fit(data[['petal length', 'petal width']].values, data['target'].values)

# get the parameters: the decision function is w0*x0 + w1*x1 + b
w0, w1 = svc.coef_[0]
b = svc.intercept_[0]
x0 = np.linspace(-1.7, 0.7, num=100)
# decision boundary: w0*x0 + w1*x1 + b = 0 solved for x1
x1_decision = -b/w1 - w0/w1*x0
# +1 margin: w0*x0 + w1*x1 + b = +1
x1_plus = x1_decision + 1/w1
# -1 margin: w0*x0 + w1*x1 + b = -1
x1_minus = x1_decision - 1/w1

sns.lmplot(x='petal length', y='petal width', hue='target', data=data, fit_reg=False, legend=False)
ax = plt.gca()
boundary_line, = ax.plot(x0, x1_decision, color='grey')
plus_line, minus_line = ax.plot(x0, x1_plus, x0, x1_minus, color='grey', linestyle='--')
# lmplot draws one scatter collection per hue level (target 0, then target 1).
setosa_pts, versicolor_pts = ax.collections[:2]
# Bind every label to its artist explicitly: a bare list of labels is matched
# to artists in an order that depends on the matplotlib version, which can
# silently attach the line labels to the scatter points.
ax.legend(
    [boundary_line, plus_line, minus_line, setosa_pts, versicolor_pts],
    ['decision boundary', 'margin', 'margin', 'Iris-Setosa', 'Iris-Versicolor'],
    fontsize=14, loc='center left', bbox_to_anchor=(1.05, 0.5),
)
plt.xlabel('petal length (scaled)', fontsize=18)
plt.ylabel('petal width (scaled)', fontsize=18)
plt.title('C = 1', fontsize=20)
plt.ylim(-1.6, 1)
plt.xlim(-1.7, 0.8)
# Same fit as before, but with a much larger C.
svc = svm.SVC(kernel='linear', C=1000)  # let's change C to a much larger value
svc.fit(data[['petal length', 'petal width']].values, data['target'].values)

# get the parameters: the decision function is w0*x0 + w1*x1 + b
w0, w1 = svc.coef_[0]
b = svc.intercept_[0]
x0 = np.linspace(-1.7, 0.7, num=100)
# decision boundary: w0*x0 + w1*x1 + b = 0 solved for x1
x1_decision = -b/w1 - w0/w1*x0
# +1 margin: w0*x0 + w1*x1 + b = +1
x1_plus = x1_decision + 1/w1
# -1 margin: w0*x0 + w1*x1 + b = -1
x1_minus = x1_decision - 1/w1

sns.lmplot(x='petal length', y='petal width', hue='target', data=data, fit_reg=False, legend=False)
ax = plt.gca()
boundary_line, = ax.plot(x0, x1_decision, color='grey')
plus_line, minus_line = ax.plot(x0, x1_plus, x0, x1_minus, color='grey', linestyle='--')
# lmplot draws one scatter collection per hue level (target 0, then target 1).
setosa_pts, versicolor_pts = ax.collections[:2]
# Bind every label to its artist explicitly: a bare list of labels is matched
# to artists in an order that depends on the matplotlib version, which can
# silently attach the line labels to the scatter points.
ax.legend(
    [boundary_line, plus_line, minus_line, setosa_pts, versicolor_pts],
    ['decision boundary', 'margin', 'margin', 'Iris-Setosa', 'Iris-Versicolor'],
    fontsize=14, loc='center left', bbox_to_anchor=(1.05, 0.5),
)
plt.xlabel('petal length (scaled)', fontsize=18)
plt.ylabel('petal width (scaled)', fontsize=18)
plt.title('C = 1000', fontsize=20)
plt.ylim(-1.6, 1)
plt.xlim(-1.7, 0.8)
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset.
# fix random_state to make sure it produces the same dataset everytime.
# Remove it if you want randomized dataset.
X, y = make_moons(noise=0.1, random_state=0)
data = pd.DataFrame(data=X, columns=['x1', 'x2'])
data['y'] = y
data.head()

# `height` is the current name of lmplot's facet-size argument; the former
# `size` keyword is not accepted by recent seaborn releases.
sns.lmplot(x='x1', y='x2', hue='y', data=data, fit_reg=False, legend=True, height=4, aspect=4/3)
plt.xlabel('x1', fontsize=18)
plt.ylabel('x2', fontsize=18)
plt.show()
# transform the features, here we use 3rd degree polynomials
print('Shape of X before tranformation:', X.shape)
poly = PolynomialFeatures(degree=3, include_bias=False)
poly.fit(X)
Xpoly = poly.transform(X)
print('Shape of X after tranformation:', Xpoly.shape)

# standardize the expanded feature matrix
scaler = StandardScaler()
scaler.fit(Xpoly)
Xpolystan = scaler.transform(Xpoly)

# linear-kernel SVM on the standardized polynomial features
svm_clf = svm.SVC(kernel='linear', C=1)
svm_clf.fit(Xpolystan, y)
print(svm_clf.intercept_, svm_clf.coef_)
# preparing to plot decision boundary of the classifier
def make_meshgrid(x, y, h=.02, margin=1):
    """Build a rectangular evaluation grid that covers the given points.

    Parameters
    ----------
    x, y : array-like exposing ``.min()`` and ``.max()``
        Coordinates of the data along the first and second axis.
    h : float, optional
        Step between neighbouring grid points (same on both axes).
    margin : float, optional
        Padding added on every side of the data range. The default of 1
        reproduces the previously hard-coded padding.

    Returns
    -------
    xx, yy : ndarray
        Coordinate matrices as produced by ``np.meshgrid``.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.
    """
    if h <= 0:
        # np.arange would yield an empty grid (h < 0) or fail obscurely (h == 0).
        raise ValueError('h must be a positive step size')
    x_min, x_max = x.min() - margin, x.max() + margin
    y_min, y_max = y.min() - margin, y.max() + margin
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy
# create grids
X0, X1 = X[:, 0], X[:, 1]
xx0, xx1 = make_meshgrid(X0, X1)
# polynomial transformation and standardization on the grids, reusing the
# transformers already fitted on the training data
xgrid = np.c_[xx0.ravel(), xx1.ravel()]
xgridpoly = poly.transform(xgrid)
xgridpolystan = scaler.transform(xgridpoly)
# prediction: raw decision value wx + b for every grid point
Z = xgridpolystan.dot(svm_clf.coef_[0].reshape(-1, 1)) + svm_clf.intercept_[0]
Z = Z.reshape(xx0.shape)
# plotting prediction contours - decision boundary (Z=0), and two margins (Z = 1 or -1)
# `height` is the current name of lmplot's facet-size argument; the former
# `size` keyword is not accepted by recent seaborn releases.
sns.lmplot(x='x1', y='x2', hue='y', data=data, fit_reg=False, legend=True, height=4, aspect=4/3)
CS = plt.contour(xx0, xx1, Z, alpha=0.5, levels=[-1, 0, 1])
plt.clabel(CS, inline=1, levels=[-1.0, 0, 1.0], fmt='%1.1f', fontsize=12, manual=[(1.5, 0.3), (0.5, 0.0), (-0.5, -0.2)])
plt.xlim(-1.2, 2.2)
plt.ylim(-1, 1.5)
plt.title('C=1', fontsize=20)
plt.xlabel('x1', fontsize=18)
plt.ylabel('x2', fontsize=18)
plt.show()
# Refit the linear SVM on the standardized polynomial features with C=1000.
svm_clf = svm.SVC(kernel='linear', C=1000)
svm_clf.fit(Xpolystan, y)
# prediction: raw decision value wx + b for every grid point
Z = xgridpolystan.dot(svm_clf.coef_[0].reshape(-1, 1)) + svm_clf.intercept_[0]
Z = Z.reshape(xx0.shape)
# plotting prediction contours - decision boundary (Z=0), and two margins (Z = 1 or -1)
# `height` is the current name of lmplot's facet-size argument; the former
# `size` keyword is not accepted by recent seaborn releases.
sns.lmplot(x='x1', y='x2', hue='y', data=data, fit_reg=False, legend=True, height=4, aspect=4/3)
CS = plt.contour(xx0, xx1, Z, alpha=0.5, levels=[-1, 0, 1])
plt.clabel(CS, inline=1, levels=[-1.0, 0, 1.0], fmt='%1.1f', fontsize=12, manual=[(1.5, 0.1), (0.5, 0.0), (-0.5, 0.0)])
plt.xlim(-1.2, 2.2)
plt.ylim(-1, 1.5)
plt.title('C=1000', fontsize=20)
plt.xlabel('x1', fontsize=18)
plt.ylabel('x2', fontsize=18)
# Render the figure, as the C=1 plot does.
plt.show()
# Read sheet 'Table S6' of the supplementary workbook, skipping its first
# two rows.
protein = pd.read_excel('./aar3247_Cohen_SM_Tables-S1-S11.xlsx',
                        sheet_name='Table S6', skiprows=2)
# dropping the last 4 rows:
# NOTE(review): the row labels are hard-coded; confirm they still match the
# trailing non-data rows if the workbook changes.
protein = protein.drop([1817, 1818, 1819, 1820], axis=0)

# saving the CancerSEEK score and labels before those columns are removed:
CanserSEEK_score = protein['CancerSEEK Logistic Regression Score']
CanserSEEK_label = protein['CancerSEEK Test Result']
protein = protein.drop(['Patient ID #', 'Sample ID #', 'AJCC Stage',
                        'CancerSEEK Logistic Regression Score',
                        'CancerSEEK Test Result'], axis=1)

# Independent copy of the score column for later use.
CS_score_copy = copy.deepcopy(CanserSEEK_score)
CS_score_copy.shape

# Some cells carry a '*' marker next to the number, so every column is
# handled as text, the marker is stripped, and the value is parsed as float.
protein = protein.astype(str)
# Every column except the first is treated as a numeric feature
# (presumably the first one is 'Tumor type' - verify against the sheet).
features = protein.columns[1:]
for col in features:
    # regex=False: '*' must be matched literally; as a regular expression it
    # is not a valid pattern, and the default of `regex` differs between
    # pandas versions.
    protein[col] = protein[col].str.replace('*', '', regex=False).astype(float)

# Fill missing measurements with the mean of their column.
col_with_nan = protein.columns[protein.isna().sum() != 0]
for col in col_with_nan:
    protein[col] = protein[col].fillna(protein[col].mean())
protein.isna().sum().sum()
# Split by the parity of the row label: even rows train, odd rows test.
is_even_row = (protein.index % 2) == 0
x_training = protein[features][is_even_row]
x_test = protein[features][~is_even_row]

types = np.unique(protein['Tumor type'])
# Binary target: 0 for 'Normal', 1 for every other tumor type. A direct
# comparison yields a clean integer column, whereas chaining `replace`
# calls on a string column relies on pandas silently downcasting the
# result and can leave an object-dtype Series behind.
labels = (protein['Tumor type'] != 'Normal').astype(int)
np.unique(labels)

y_training = labels[is_even_row]
y_test = labels[~is_even_row]
# SVC with default settings, trained on the raw (unscaled) features.
SVC_model = svm.SVC(probability=True)
SVC_model.fit(x_training, y_training)  # training
predictions = SVC_model.predict_proba(x_test)  # predicting probabilities
pred_labels = SVC_model.predict(x_test)  # predicting labels

# calculating the false positive and true positive rates:
fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
# Keep the CancerSEEK scores of the odd-indexed (test) rows only; applying
# the same filter again later is harmless.
CanserSEEK_score = CanserSEEK_score[(CanserSEEK_score.index % 2) != 0]
fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

# plotting the two ROC curves together with the chance diagonal:
plt.figure(figsize=(8, 8))
plt.plot(fpr_mine, tpr_mine, color='forestgreen',
         label='SVC_proba:' + str(auc_mine))
plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
         label='CancerSEEK:' + str(auc_CancerSEEK))
plt.plot([0, 1], [0, 1], '--', c='k')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend()

# calculating the confusion matrix (named conf_mat so that the imported
# sklearn.metrics.confusion_matrix function is not overwritten)
conf_mat = pd.crosstab(y_test, pred_labels,
                       rownames=['Real type'], colnames=['Predicted type'])
print(conf_mat)
# visualizing it on a heatmap in its own figure; without a new figure the
# heatmap would be drawn on top of the ROC axes
plt.figure(figsize=(8, 8))
plt.title('Confusion matrix of SVC (unscaled features)')
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
plt.show()
# Standardize the features: the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
x_training_sc = scaler.fit_transform(x_training)
x_test_sc = scaler.transform(x_test)

# SVC with default settings on the standardized features.
SVC_model = svm.SVC(probability=True)
SVC_model.fit(x_training_sc, y_training)  # training
predictions = SVC_model.predict_proba(x_test_sc)  # predicting probabilities
pred_labels = SVC_model.predict(x_test_sc)  # predicting labels

# calculating the false positive and true positive rates:
fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
# Keep the CancerSEEK scores of the odd-indexed (test) rows only; the filter
# is idempotent, so repeating it is harmless.
CanserSEEK_score = CanserSEEK_score[(CanserSEEK_score.index % 2) != 0]
fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

# plotting the two ROC curves together with the chance diagonal:
plt.figure(figsize=(8, 8))
plt.plot(fpr_mine, tpr_mine, color='forestgreen',
         label='SVC_proba:' + str(auc_mine))
plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
         label='CancerSEEK:' + str(auc_CancerSEEK))
plt.plot([0, 1], [0, 1], '--', c='k')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend()

# calculating the confusion matrix (named conf_mat so that the imported
# sklearn.metrics.confusion_matrix function is not overwritten)
conf_mat = pd.crosstab(y_test, pred_labels,
                       rownames=['Real type'], colnames=['Predicted type'])
print(conf_mat)
# visualizing it on a heatmap in its own figure; without a new figure the
# heatmap would be drawn on top of the ROC axes
plt.figure(figsize=(8, 8))
plt.title('Confusion matrix of SVC (standardized features)')
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
plt.show()
# SVC with a linear kernel on the standardized features.
SVC_model = svm.SVC(kernel='linear', probability=True)
SVC_model.fit(x_training_sc, y_training)  # training
predictions = SVC_model.predict_proba(x_test_sc)  # predicting probabilities
pred_labels = SVC_model.predict(x_test_sc)  # predicting labels

# calculating the false positive and true positive rates:
fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
# Keep the CancerSEEK scores of the odd-indexed (test) rows only; the filter
# is idempotent, so repeating it is harmless.
CanserSEEK_score = CanserSEEK_score[(CanserSEEK_score.index % 2) != 0]
fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

# plotting the two ROC curves together with the chance diagonal:
plt.figure(figsize=(8, 8))
plt.plot(fpr_mine, tpr_mine, color='forestgreen',
         label='SVC_proba:' + str(auc_mine))
plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
         label='CancerSEEK:' + str(auc_CancerSEEK))
plt.plot([0, 1], [0, 1], '--', c='k')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend()

# calculating the confusion matrix (named conf_mat so that the imported
# sklearn.metrics.confusion_matrix function is not overwritten)
conf_mat = pd.crosstab(y_test, pred_labels,
                       rownames=['Real type'], colnames=['Predicted type'])
print(conf_mat)
# visualizing it on a heatmap
plt.figure(figsize=(8, 8))
plt.title('Confusion matrix of SVC (linear kernel)')
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
plt.show()
# SVC with a polynomial kernel on the standardized features.
SVC_model = svm.SVC(kernel='poly', probability=True)
SVC_model.fit(x_training_sc, y_training)  # training
predictions = SVC_model.predict_proba(x_test_sc)  # predicting probabilities
pred_labels = SVC_model.predict(x_test_sc)  # predicting labels

# calculating the false positive and true positive rates:
fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
# Keep the CancerSEEK scores of the odd-indexed (test) rows only; the filter
# is idempotent, so repeating it is harmless.
CanserSEEK_score = CanserSEEK_score[(CanserSEEK_score.index % 2) != 0]
fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

# plotting the two ROC curves together with the chance diagonal:
plt.figure(figsize=(8, 8))
plt.plot(fpr_mine, tpr_mine, color='forestgreen',
         label='SVC_proba:' + str(auc_mine))
plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
         label='CancerSEEK:' + str(auc_CancerSEEK))
plt.plot([0, 1], [0, 1], '--', c='k')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend()

# calculating the confusion matrix (named conf_mat so that the imported
# sklearn.metrics.confusion_matrix function is not overwritten)
conf_mat = pd.crosstab(y_test, pred_labels,
                       rownames=['Real type'], colnames=['Predicted type'])
print(conf_mat)
# visualizing it on a heatmap
plt.figure(figsize=(8, 8))
plt.title('Confusion matrix of SVC (poly kernel)')
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
plt.show()
# SVC with a sigmoid kernel on the standardized features.
SVC_model = svm.SVC(kernel='sigmoid', probability=True)
SVC_model.fit(x_training_sc, y_training)  # training
predictions = SVC_model.predict_proba(x_test_sc)  # predicting probabilities
pred_labels = SVC_model.predict(x_test_sc)  # predicting labels

# calculating the false positive and true positive rates:
fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
# Keep the CancerSEEK scores of the odd-indexed (test) rows only; the filter
# is idempotent, so repeating it is harmless.
CanserSEEK_score = CanserSEEK_score[(CanserSEEK_score.index % 2) != 0]
fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

# plotting the two ROC curves together with the chance diagonal:
plt.figure(figsize=(8, 8))
plt.plot(fpr_mine, tpr_mine, color='forestgreen',
         label='SVC_proba:' + str(auc_mine))
plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
         label='CancerSEEK:' + str(auc_CancerSEEK))
plt.plot([0, 1], [0, 1], '--', c='k')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend()

# calculating the confusion matrix (named conf_mat so that the imported
# sklearn.metrics.confusion_matrix function is not overwritten)
conf_mat = pd.crosstab(y_test, pred_labels,
                       rownames=['Real type'], colnames=['Predicted type'])
print(conf_mat)
# visualizing it on a heatmap
plt.figure(figsize=(8, 8))
plt.title('Confusion matrix of SVC (sigmoid kernel)')
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
plt.show()
# Shuffle the samples reproducibly, then cut the permutation into
# 15% test / 15% validation / 70% training.
np.random.seed(0)
rand_idx = np.random.permutation(labels.shape[0])
data_rand = protein[features].values[rand_idx]
target_rand = labels.values[rand_idx]
N = data_rand.shape[0]
n1 = int(N*0.15)
n2 = int(N*0.30)
testidx = np.arange(0, n1)    # positions of the test fraction
validx = np.arange(n1, n2)    # positions of the validation fraction
trainidx = np.arange(n2, N)   # positions of the training fraction
testidx[-3:], validx[:3], validx[-3:], trainidx[:3]

x_test = data_rand[testidx]
x_val = data_rand[validx]
x_train = data_rand[trainidx]
y_test = target_rand[testidx]
y_val = target_rand[validx]
y_train = target_rand[trainidx]
x_test.shape, x_val.shape, x_train.shape, y_test.shape, y_val.shape, y_train.shape

# The scaler is fitted on the training set only; test and validation data
# are transformed with those same statistics. Re-fitting on the validation
# set would scale it differently from the data the model was trained on.
scaler = StandardScaler()
x_train_sc = scaler.fit_transform(x_train)
x_test_sc = scaler.transform(x_test)
x_val_sc = scaler.transform(x_val)
def SVM_model_predict_tune( x_train, x_test, y_train, y_test, kernel='poly',
                            gamma='auto', C=1.0, cache_size=200, random_state=None,
                            tol=0.001, coef0=0.0 ):
    """Train an SVC with the given hyperparameters and return its ROC AUC.

    The classifier is fitted on ``x_train`` / ``y_train`` with probability
    estimates enabled. The probabilities in column 1 of ``predict_proba``
    for ``x_test`` are scored against ``y_test``, and the area under the
    resulting ROC curve is returned.
    """
    svc_kwargs = dict(
        gamma=gamma,
        C=C,
        kernel=kernel,
        probability=True,
        cache_size=cache_size,
        random_state=random_state,
        tol=tol,
        coef0=coef0,
    )
    model = svm.SVC(**svc_kwargs)
    model.fit(x_train, y_train)
    class1_proba = model.predict_proba(x_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, class1_proba)
    return metrics.auc(false_pos_rate, true_pos_rate)
%%time
# Sweep gamma for the RBF kernel and record the validation AUC of each value.
gammas = np.linspace(1e-5, 1, 100)
auc_tune = np.array(
    [
        SVM_model_predict_tune(x_train_sc, x_val_sc, y_train, y_val, kernel='rbf', gamma=g)
        for g in gammas
    ],
    dtype=float,
)
gamma_best = gammas[auc_tune.argmax(axis=0)]

# AUC as a function of gamma, on a logarithmic x axis.
fig = plt.figure(figsize=(6, 4), dpi=100)
plt.title('Tuning the hyperparameters of SVM ', fontsize=20)
plt.plot(gammas, auc_tune, label='AUC')
plt.ylabel('AUC value', fontsize=15)
plt.xlabel('Gamma parameter', fontsize=15)
plt.xscale('log')
plt.grid()
plt.legend()
gamma_best
# The value found by the sweep is replaced by a hand-picked one.
gamma_best = 0.04
%%time
# Sweep C for the RBF kernel (gamma fixed to gamma_best) and record the
# validation AUC of each value.
Cs = np.linspace(1e-5, 1000, 100)
auc_tune = np.empty(Cs.shape[0], dtype=float)
for i, c_value in enumerate(Cs):
    auc_tune[i] = SVM_model_predict_tune(x_train_sc, x_val_sc, y_train, y_val,
                                         kernel='rbf', C=c_value, gamma=gamma_best)
Cs_best = Cs[np.argmax(auc_tune, axis=0)]

fig = plt.figure(figsize=(6, 4), dpi=100)
plt.title('Tuning the hyperparameters of SVM ', fontsize=20)
# The x values must be the swept C grid; plotting against `gammas` would put
# the AUC curve on the wrong axis range under a 'C parameter' label.
plt.plot(Cs, auc_tune, label='AUC')
plt.ylabel('AUC value', fontsize=15)
plt.xlabel('C parameter', fontsize=15)
plt.xscale('log')
plt.grid()
plt.legend()
Cs_best
# The value found by the sweep is replaced by a hand-picked one.
Cs_best = 10.10
%%time
# Sweep the stopping tolerance for the RBF kernel (gamma and C fixed to the
# chosen values) and record the validation AUC of each value.
tols = np.linspace(1e-5, 100, 100)
auc_tune = np.array(
    [
        SVM_model_predict_tune(x_train_sc, x_val_sc, y_train, y_val, kernel='rbf',
                               gamma=gamma_best, C=Cs_best, tol=t)
        for t in tols
    ],
    dtype=float,
)
tol_best = tols[auc_tune.argmax(axis=0)]

# AUC as a function of the tolerance, on a logarithmic x axis.
fig = plt.figure(figsize=(6, 4), dpi=100)
plt.title('Tuning the hyperparameters of SVM ', fontsize=20)
plt.plot(tols, auc_tune, label='AUC')
plt.ylabel('AUC value', fontsize=15)
plt.xlabel('Toleration parameter', fontsize=15)
plt.xscale('log')
plt.grid()
plt.legend()
# The value found by the sweep is replaced by the default.
tol_best = 0.001 # default
def SVM_model_predict( x_train, y_train, x_test, y_test,kernel,
                       gamma='auto', C=1.0, tol=0.001 ):
    """Fit an SVC, then plot its ROC curve and confusion matrix on the test set.

    The ROC curve of the fitted model is drawn next to the ROC curve of the
    CancerSEEK scores for the same test samples. The confusion matrix of the
    predicted labels is printed and shown as a heatmap. Nothing is returned.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : test features and labels.
    kernel, gamma, C, tol : forwarded to ``svm.SVC``.

    Notes
    -----
    Reads the module-level names ``CS_score_copy``, ``rand_idx`` and
    ``testidx`` to select the CancerSEEK scores, so ``x_test`` / ``y_test``
    must be the samples selected by ``rand_idx[testidx]``.
    """
    clf = svm.SVC(gamma=gamma, C=C, kernel=kernel, probability=True, tol=tol)
    clf.fit(x_train, y_train)
    predictions = clf.predict_proba(x_test)
    pred_labels = clf.predict(x_test)  # predicting labels

    # calculating the false positive and true positive rates:
    fpr_mine, tpr_mine, _ = metrics.roc_curve(y_test, predictions[:, 1])
    CanserSEEK_score = CS_score_copy.values[rand_idx][testidx]
    fpr_CanserSEEK, tpr_CancerSEEK, _ = metrics.roc_curve(y_test, CanserSEEK_score)
    auc_mine = np.round(metrics.roc_auc_score(y_test, predictions[:, 1]), 3)
    auc_CancerSEEK = np.round(metrics.roc_auc_score(y_test, CanserSEEK_score), 3)

    # plotting the two ROC curves together with the chance diagonal:
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_mine, tpr_mine, color='forestgreen',
             label='SVC_proba:' + str(auc_mine))
    plt.plot(fpr_CanserSEEK, tpr_CancerSEEK, color='magenta',
             label='CancerSEEK:' + str(auc_CancerSEEK))
    plt.plot([0, 1], [0, 1], '--', c='k')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    plt.legend()

    # calculating the confusion matrix
    conf_mat = pd.crosstab(y_test, pred_labels,
                           rownames=['Real type'], colnames=['Predicted type'])
    print(conf_mat)
    # visualizing it on a heatmap; the title names the model actually used
    plt.figure(figsize=(8, 8))
    plt.title('Confusion matrix of SVC (' + str(kernel) + ' kernel)')
    sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d': show counts as integers
    plt.show()


# Final evaluation on the held-out test set with the chosen hyperparameters.
SVM_model_predict( x_train_sc, y_train, x_test_sc, y_test,
                   kernel='rbf', gamma=gamma_best, C=Cs_best, tol=tol_best )