implement ordinary least squares (OLS) by hand and compare its coefficients and intercept to those of the LinearRegression class from the sklearn library, they should be within tolerance 1e-6 to each other; generate the data via the from sklearn.datasets import make_regression API with parameters n_samples=1000 and n_features=20
download the Communities and Crime Data Set from UCI, the task includes understanding the dataset: naming the appropriate data fields, handling missing values, etc.
split the data into training/test sets and fit a LinearRegression model with 5-fold cross-validation on top of it - compare training and testing scores (R^2 by default) for the different CV splits, print the mean score and its standard deviation
fit the best Lasso regression model with 5-fold grid search cross-validation (GridSearchCV) on the parameters alpha, normalize, max_iter and show the best parameters
interpret Lasso models based on their descriptive parameters via the shrinkage method described during the lecture (make a plot and check the names of the features that are not eliminated by the penalty parameter) on the data we have here [ this is an exploratory data analysis problem, try to be creative ]
fit Ridge models and apply the shrinkage method as well, did you get what you expected?
do you think normalization is needed here? if so, do not forget to use it in the next tasks
Split the data into training and test sets and do recursive feature elimination until 10 remaining predictors with 5-fold cross-validated regressors (RidgeCV, LassoCV, ElasticNetCV) on the training set, plot their names and look up some of their meanings [ recursive feature elimination is part of sklearn but you can do it with a for loop if you wish ]
Do all models provide the same descriptors? Check their performance on the test set! Plot all model predictions compared to the y_test on 3 different plots, which model seems to be the best?
visualize the surface of the parameters corresponding to the L1 and L2 regularizations. Select the best possible combination of the hyper-parameters that minimize the objective (clue: from scipy.optimize import minimize)
interpret the overall results, do you think regularization is necessary at all? do you think linear models are powerful enough on this dataset?
implement ordinary least squares (OLS) by hand and compare its coefficients and intercept to those of the LinearRegression class from the sklearn library, they should be within tolerance 1e-6 to each other; generate the data via the from sklearn.datasets import make_regression API with parameters n_samples=1000 and n_features=20
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import (LinearRegression, Ridge,
Lasso, ElasticNet,
LassoCV, RidgeCV, ElasticNetCV)
from sklearn.datasets import make_regression
from sklearn.model_selection import (train_test_split,
RandomizedSearchCV,
GridSearchCV,
cross_validate, cross_val_score,
cross_val_predict)
import pandas as pd
import requests
import io
import re
def ols(X, y):
"""
Addign intercept to X as trailing 1s,
by default the weight will include the
coefficient in the last parameter and
(X.T X).inv()X.T y will solve the problem
for the weights.
"""
_X = np.hstack((X, np.ones(shape=(X.shape[0], 1))))
scores = np.matmul(_X.T, _X)
scores_inv = np.linalg.inv(scores)
unnormalized_weights= np.matmul(scores_inv, _X.T)
weights = np.dot(unnormalized_weights, y)
return weights[:-1], weights[-1]
X, y = make_regression(n_samples=1000, n_features=20)
coef, intercept = ols(X, y)
regr = LinearRegression().fit(X, y)
assert np.allclose(regr.coef_, coef,
                   atol=1e-6), 'Coefficients should be close'
assert np.allclose(regr.intercept_, intercept,
                   atol=1e-6), 'Intercepts must match'
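As an aside, explicitly inverting the Gram matrix can be numerically fragile when X.T X is ill-conditioned; a minimal sketch of the same fit via np.linalg.lstsq (SVD-based), assuming the same X, y as above:

def ols_lstsq(X, y):
    # Same intercept trick; lstsq solves min ||_X w - y||_2 via SVD,
    # avoiding the explicit inverse of the Gram matrix.
    _X = np.hstack((X, np.ones(shape=(X.shape[0], 1))))
    weights, *_ = np.linalg.lstsq(_X, y, rcond=None)
    return weights[:-1], weights[-1]

coef_lstsq, intercept_lstsq = ols_lstsq(X, y)
assert np.allclose(coef, coef_lstsq, atol=1e-6)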
download the Communities and Crime Data Set from UCI, the task includes understanding the dataset: naming the appropiate data fields, handling missing values, etc.
split the data in training/test sets and fit a LinearRegression
model with 5-fold cross-validation on top of it - compare training and testing scores (R^2 by default) for the different CV splits, print the mean score and its standard deviation
fit the best Lasso
regression model with 5-fold grid search cross validation (GridSearchCV
) on the parameters: alpha, normalize, max_iter
and show the best parameters
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data"
s = requests.get(data_url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), na_values='?')
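One caveat worth flagging: read_csv defaults to header=0, so the first record of the file (which has no header line of its own) is consumed as column names and later overwritten when the columns are renamed below; the describe() output showing 1993 rows rather than the 1994 instances listed by UCI is consistent with this. A hedged alternative that keeps every record:

df = pd.read_csv(io.StringIO(s.decode('utf-8')), na_values='?',
                 header=None)  # header=None: treat the first line as data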
The dataset itself is loaded from the URL provided, but the column names are still needed. Here I load the description file via the same method, search for @attribute <att-name> <att-type> substrings, and name the columns based on the matches. You obviously could have done this part manually as well.
description_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names"
s = requests.get(description_url).content
attributes = re.findall(r'@attribute.*',
s.decode('utf-8'))
cols = [attr.split(' ')[1] for attr in attributes]
df.columns = cols
df.head()
state | county | community | communityname | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | ... | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | ViolentCrimesPerPop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53 | NaN | NaN | Tukwilacity | 1 | 0.00 | 0.16 | 0.12 | 0.74 | 0.45 | ... | 0.02 | 0.12 | 0.45 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.67 |
1 | 24 | NaN | NaN | Aberdeentown | 1 | 0.00 | 0.42 | 0.49 | 0.56 | 0.17 | ... | 0.01 | 0.21 | 0.02 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.43 |
2 | 34 | 5.0 | 81440.0 | Willingborotownship | 1 | 0.04 | 0.77 | 1.00 | 0.08 | 0.12 | ... | 0.02 | 0.39 | 0.28 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.12 |
3 | 42 | 95.0 | 6096.0 | Bethlehemtownship | 1 | 0.01 | 0.55 | 0.02 | 0.95 | 0.09 | ... | 0.04 | 0.09 | 0.02 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.03 |
4 | 6 | NaN | NaN | SouthPasadenacity | 1 | 0.02 | 0.28 | 0.06 | 0.54 | 1.00 | ... | 0.01 | 0.58 | 0.10 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.14 |
5 rows × 128 columns
plt.figure(figsize=(20, 5))
plt.imshow(df.isna().T)
plt.show()
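To quantify what the heatmap shows, a quick per-column tally of the missing values (my addition, not part of the original notebook); the police- and budget-related fields dominate, consistent with the counts of ~318 in the describe() output below:

# Count NaNs per column and list the worst offenders
missing = df.isna().sum().sort_values(ascending=False)
print(missing[missing > 0].head(10))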
df.describe()
state | county | community | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | ... | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | ViolentCrimesPerPop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1993.000000 | 820.000000 | 817.000000 | 1993.000000 | 1993.000000 | 1993.000000 | 1993.000000 | 1993.000000 | 1993.000000 | 1993.000000 | ... | 1993.000000 | 1993.000000 | 1993.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 1993.000000 | 318.000000 | 1993.000000 |
mean | 28.693929 | 58.826829 | 46188.336597 | 5.496237 | 0.057526 | 0.463462 | 0.179709 | 0.753643 | 0.153698 | 0.144009 | ... | 0.065203 | 0.232840 | 0.161666 | 0.163428 | 0.076824 | 0.697956 | 0.440252 | 0.093939 | 0.195252 | 0.237998 |
std | 16.395117 | 126.420560 | 25299.726569 | 2.872650 | 0.126903 | 0.163731 | 0.253480 | 0.244079 | 0.208929 | 0.232549 | ... | 0.109480 | 0.203142 | 0.229111 | 0.215038 | 0.140413 | 0.213981 | 0.406434 | 0.240335 | 0.164948 | 0.233042 |
min | 1.000000 | 1.000000 | 70.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 12.000000 | 9.000000 | 25065.000000 | 3.000000 | 0.010000 | 0.350000 | 0.020000 | 0.630000 | 0.040000 | 0.010000 | ... | 0.020000 | 0.100000 | 0.020000 | 0.040000 | 0.020000 | 0.620000 | 0.000000 | 0.000000 | 0.110000 | 0.070000 |
50% | 34.000000 | 23.000000 | 48090.000000 | 5.000000 | 0.020000 | 0.440000 | 0.060000 | 0.850000 | 0.070000 | 0.040000 | ... | 0.040000 | 0.170000 | 0.070000 | 0.080000 | 0.030000 | 0.750000 | 0.500000 | 0.000000 | 0.155000 | 0.150000 |
75% | 42.000000 | 59.500000 | 66660.000000 | 8.000000 | 0.050000 | 0.540000 | 0.230000 | 0.940000 | 0.170000 | 0.160000 | ... | 0.070000 | 0.280000 | 0.190000 | 0.197500 | 0.060000 | 0.840000 | 1.000000 | 0.000000 | 0.220000 | 0.330000 |
max | 56.000000 | 840.000000 | 94597.000000 | 10.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 127 columns
Several columns have loads of missing values, and it seems reasonable to drop them. I set the threshold of the .dropna method to half the length of the dataset and then checked all rows that still contain NaN values.
df.dropna(axis=1, thresh=len(df) // 2, inplace=True)
df[df.isna().any(axis=1)]
state | communityname | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | ... | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | ViolentCrimesPerPop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
129 | 28 | Natchezcity | 1 | 0.02 | 0.38 | 0.98 | 0.22 | 0.01 | 0.01 | 0.44 | ... | 0.01 | 0.84 | 0.7 | 0.83 | 0.77 | 0.04 | 0.12 | 0.05 | 0.0 | 0.23 |
1 rows × 104 columns
df.fillna(df.mean(numeric_only=True), inplace=True)
df[df.isna().any(axis=1)]
state | communityname | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | ... | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | ViolentCrimesPerPop |
---|
0 rows × 104 columns
Only one row with NaN values remained, so I filled its missing values with the corresponding column means. Moving on to the data types: all columns except one are numeric. That one contains community names, and almost all rows hold distinct names (~1.8k out of ~1.9k rows), so I'll drop that column as well, since one-hot encoding it would not provide any additional information.
df.dtypes
state                    int64
communityname           object
fold                     int64
population             float64
householdsize          float64
                        ...
LandArea               float64
PopDens                float64
PctUsePubTrans         float64
LemasPctOfficDrugUn    float64
ViolentCrimesPerPop    float64
Length: 104, dtype: object
np.unique(df['communityname'].values).shape
(1828,)
df.drop(columns=['communityname'], inplace=True)
Also dropping the fold column, since it contains no predictive value; it encodes predefined splits and could be used for automatic cross-validation instead.
df.drop(columns=['fold'], inplace=True)
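If one did want those predefined splits, sklearn's PredefinedSplit turns such a column into a CV iterator. A sketch with placeholder labels (the real ones lived in df['fold'] before the drop above):

from sklearn.model_selection import PredefinedSplit, cross_val_score

# Placeholder fold labels for illustration only; each distinct label
# becomes the test set of exactly one split.
fold_labels = np.arange(len(df)) % 10
ps = PredefinedSplit(test_fold=fold_labels)
scores = cross_val_score(LinearRegression(), df.values[:, :-1],
                         df.values[:, -1], cv=ps)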
With cross-validation for linear regression we can check how much the training and testing scores vary across splits.
X = df.values[:, :-1]
y = df.values[:, -1]
linreg = LinearRegression()
results = cross_validate(linreg, X, y, cv=5,
return_train_score=True)
test_mean, test_std = np.mean(results['test_score']),\
np.std(results['test_score'])
train_mean, train_std = np.mean(results['train_score']),\
np.std(results['train_score'])
print('Cross-validation R^2 [training] : %.3f ± %.3f' % (train_mean,
                                                          train_std))
print('Cross-validation R^2 [testing]  : %.3f ± %.3f' % (test_mean,
                                                          test_std))
Cross-validation R^2 [training] : 0.701 ± 0.009
Cross-validation R^2 [testing]  : 0.651 ± 0.037
lasso = Lasso()
dist = dict(alpha=np.linspace(1e-6, .05, 50),
normalize=[True, False],
max_iter=[10_000, 50_000])
clf = GridSearchCV(lasso, dist, cv=5, verbose=1,
return_train_score=True, n_jobs=8)
search = clf.fit(X, y)
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 34 tasks | elapsed: 3.2s
[Parallel(n_jobs=8)]: Done 960 tasks | elapsed: 5.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed: 5.3s finished
search.best_params_
{'alpha': 1e-06, 'max_iter': 10000, 'normalize': True}
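Note that the normalize argument has since been deprecated and removed from sklearn's linear models. A rough modern equivalent is scaling inside a Pipeline (StandardScaler standardizes rather than applying the old per-column l2-norm scaling, so the best alpha will shift); a sketch, not what was run above:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scaling lives inside the CV loop, so test folds never leak into the scaler.
pipe = Pipeline([('scale', StandardScaler()),
                 ('lasso', Lasso(max_iter=10_000))])
param_grid = {'lasso__alpha': np.linspace(1e-6, .05, 50)}
clf_pipe = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True)

Calling clf_pipe.fit(X, y) would then rerun the search under the new scaling.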
interpret Lasso models based on their descriptive parameters via the shrinkage method described during the lecture (make a plot and check the names of the features that are not eliminated by the penalty parameter) on the data we have here [this is an exploratory data analysis problem, try to be creative]
fit Ridge models and apply the shrinkage method as well, did you get what you expected?
%%time
coeffs = []
alphas = np.linspace(1e-6, .05, 200)
for alpha in alphas:
lasso = Lasso(alpha=alpha,
max_iter=10_000,
normalize=True)
lasso.fit(X, y)
coeffs.append(lasso.coef_)
CPU times: user 2.03 s, sys: 40 ms, total: 2.07 s
Wall time: 639 ms
def plot_shrinkage(alphas, coeffs, limit=2):
coeffs = np.array(coeffs)
plt.figure(figsize=(16, 12))
plt.plot(alphas, coeffs)
non_null = np.where(coeffs[limit])
if len(non_null[0]) < 10:
for _y, ind in zip(np.linspace(-.5, .5, len(non_null[0])), non_null[0]):
plt.text(alphas[limit], _y, s=df.columns[ind], fontsize=12)
plt.vlines(alphas[limit], ymin=-1, ymax=1, label='alpha = %.5f' % alphas[limit])
plt.legend()
plt.xscale('log')
plt.xlabel('Penalty')
plt.ylabel('Coeff')
plt.show()
plot_shrinkage(alphas, coeffs)
Here I chose alpha=0.00628, where 8 coefficients are non-zero; their names are shown on the plot. The share of white residents, per-capita illegal activity, etc. seem to be good descriptors of the predicted value.
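The same path can also come straight from sklearn instead of the manual loop; a sketch using lasso_path, whose coefficient array has shape (n_features, n_alphas), hence the transpose. Note this skips the normalize=True scaling of the loop above, so the curves will not match exactly:

from sklearn.linear_model import lasso_path

# Computes the whole regularization path in one call (alphas are processed
# in decreasing order internally).
path_alphas, path_coefs, _ = lasso_path(X, y, alphas=alphas)
plot_shrinkage(path_alphas, path_coefs.T)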
%%time
coeffs = []
alphas = np.linspace(1e-6, 2000., 500)
for alpha in alphas:
    ridge = Ridge(alpha=alpha,
                  max_iter=10_000,
                  normalize=True)
    ridge.fit(X, y)
    coeffs.append(ridge.coef_)
CPU times: user 3.98 s, sys: 83.8 ms, total: 4.07 s
Wall time: 1.04 s
plot_shrinkage(alphas, coeffs)
%%time
coeffs = []
alphas = np.linspace(1e-6, 1., 500)
for alpha in alphas:
en = ElasticNet(alpha=alpha,
max_iter=10_000,
normalize=True)
en.fit(X, y)
coeffs.append(en.coef_)
CPU times: user 3.99 s, sys: 59.8 ms, total: 4.05 s
Wall time: 1.08 s
plot_shrinkage(alphas, coeffs)
Split the data into training and test sets and do recursive feature elimination until 10 remaining predictors with 5-fold cross-validated regressors (RidgeCV, LassoCV, ElasticNetCV) on the training set, plot their names and look up some of their meanings [recursive feature elimination is part of sklearn but you can do it with a for loop if you wish]
Do all models provide the same descriptors? Check their performance on the test set! Plot all model predictions compared to the y_test on 3 different plots, which model seems to be the best?
from sklearn.feature_selection import RFE
def do_rfe_with_cv(regressor, X, y):
rfe = RFE(estimator=regressor, step=1, n_features_to_select=10)
rfe = rfe.fit(X, y)
return rfe
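As the task notes, the elimination can also be written as a plain loop; a sketch (not used below) that repeatedly drops the feature with the smallest absolute coefficient until 10 remain:

def manual_rfe(regressor, X, y, n_keep=10):
    # Indices of the columns still in play; shrink until n_keep remain.
    keep = list(range(X.shape[1]))
    while len(keep) > n_keep:
        regressor.fit(X[:, keep], y)
        weakest = np.argmin(np.abs(regressor.coef_))
        keep.pop(weakest)
    return keep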
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
def get_explaining_variables(selector, original_feature_columns, X_test, y_test):
    return sorted(original_feature_columns[selector.support_].values.tolist()),\
           selector.score(X_test, y_test),\
           selector.predict(X_test)
exp_lasso, lasso_test_score, lasso_preds = get_explaining_variables(
    do_rfe_with_cv(LassoCV(cv=5, normalize=True), X_train, y_train),
    df.columns[:-1], X_test, y_test)
exp_ridge, ridge_test_score, ridge_preds = get_explaining_variables(
    do_rfe_with_cv(RidgeCV(cv=5, normalize=True), X_train, y_train),
    df.columns[:-1], X_test, y_test)
exp_elastic_net, elastic_net_test_score, elastic_net_pred = get_explaining_variables(
    do_rfe_with_cv(ElasticNetCV(cv=5, normalize=True), X_train, y_train),
    df.columns[:-1], X_test, y_test)
explain_df = pd.DataFrame(columns=["model"] + ['var%d' % i for i in range(1, 11)] + ["test_score"])
explain_df.loc[0] = ["lasso"] + exp_lasso + [lasso_test_score]
explain_df.loc[1] = ["ridge"] + exp_ridge + [ridge_test_score]
explain_df.loc[2] = ["elastic"] + exp_elastic_net + [elastic_net_test_score]
explain_df
model | var1 | var2 | var3 | var4 | var5 | var6 | var7 | var8 | var9 | var10 | test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | lasso | HousVacant | MalePctDivorce | MedRent | NumIlleg | NumStreet | PctIlleg | PctKids2Par | PctPersDenseHous | RentLowQ | racepctblack | 0.625684 |
1 | ridge | HousVacant | MalePctDivorce | NumStreet | PctFam2Par | PctIlleg | PctKids2Par | PctPersDenseHous | agePct12t29 | racePctWhite | racepctblack | 0.610449 |
2 | elastic | HousVacant | MalePctDivorce | MedRent | NumImmig | NumStreet | PctIlleg | PctKids2Par | PctPersDenseHous | RentLowQ | racepctblack | 0.626936 |
fig, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(15, 5))
axes = axes.flatten()
axes[0].scatter(y_test, lasso_preds, label='Lasso | score : %.3f' % lasso_test_score)
axes[1].scatter(y_test, ridge_preds, label='Ridge | score : %.3f' % ridge_test_score )
axes[2].scatter(y_test, elastic_net_pred, label='ElasticNet | score : %.3f' % elastic_net_test_score)
for ax in axes:
ax.legend()
ax.set_xticks([0., .5, 1.])
ax.set_yticks([0., .5, 1.])
ax.plot(np.linspace(0, 1, 200), np.linspace(0, 1, 200), 'r-')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()
To me the Lasso model seems the best, and not by sheer scores alone: the Ridge model scored worse while ElasticNet scored high, so the L1 penalty is probably more useful for this dataset than the L2 penalty of the Ridge model. You essentially get the same result in the next exercise.
visualize the surface of the parameters corresponding to the L1 and L2 regularizations. Select the best possible combination of the hyper-parameters that minimize the objective (clue: from scipy.optimize import minimize)
from scipy.optimize import minimize
def score_surface(penalty, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
alpha, beta = penalty
elastic = ElasticNet(alpha=alpha, l1_ratio=beta, max_iter=15_000, normalize=True).fit(X_train, y_train)
y_preds = elastic.predict(X_test)
return np.mean(np.square(y_preds - y_test))
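For reference, sklearn's ElasticNet minimizes (per its documentation)

$$\frac{1}{2n}\lVert y - Xw\rVert_2^2 + \alpha \cdot \text{l1\_ratio} \cdot \lVert w\rVert_1 + \frac{\alpha\,(1-\text{l1\_ratio})}{2}\lVert w\rVert_2^2,$$

so the effective L1 strength is $\alpha \cdot \text{l1\_ratio}$ and the L2 strength is $\alpha(1-\text{l1\_ratio})$; this is why the surface plot below puts betas*alphas on the L1 axis.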
alphas = np.linspace(1e-6, 0.5, 100)
betas = np.linspace(0.1, 1, 100)
scores = np.array([[score_surface((alpha, beta)) for alpha in alphas]
                   for beta in betas])
min_penalty = minimize(score_surface, x0=[1e-4, 1e-6], bounds=[(1e-10, 1), (1e-10, 1)]).x
min_penalty
array([8.10253678e-06, 8.53783635e-08])
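As a sanity check (my addition, not in the original run), the argmin of the precomputed grid should land near the optimizer's solution:

# scores is indexed [beta, alpha] by construction above.
i, j = np.unravel_index(scores.argmin(), scores.shape)
print('grid minimum: alpha=%.4g, l1_ratio=%.4g, MSE=%.4g'
      % (alphas[j], betas[i], scores[i, j]))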
alpha_grid, beta_grid = np.meshgrid(alphas, betas*alphas)
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(alpha_grid, beta_grid, scores, rstride=1, cstride=1,
cmap=plt.cm.jet, linewidth=0, antialiased=False)
ax.set_xlabel('L2', fontsize=15)
ax.set_ylabel('L1', fontsize=15)
ax.set_zlabel('MSE', fontsize=15)
ax.plot(np.repeat(min_penalty[0], 100), np.repeat(min_penalty[1], 100), np.linspace(0, 0.05, 100))
ax.set_title('ElasticNet MSE surface')
Text(0.5, 0.92, 'ElasticNet MSE surface')
We can see here that the ElasticNet performs best when the weight penalties are essentially zero. :)