 
Using the KNN algorithm to estimate photometric redshifts
02. Lab exercise: Supervised learning, KNN
1. Implement naive K nearest neighbour regression as a function using only Python and NumPy. The signature of the function should be:
def knn_regression(x2pred, x_train, y_train, k=10):
    """Return prediction with knn regression."""
    return y_pred
2. Apply the KNN regressor to photometric redshift estimation using the provided photoz_mini.csv file. Use an 80-20% train-test split. Calculate the mean absolute error of the predictions, and plot the true and predicted values on a scatterplot.
3. Apply the KNN regressor to photometric redshift estimation using the provided photoz_mini.csv file. Use 5-fold cross-validation. Estimate the mean and standard deviation of the MAE of the predictions.
4. Repeat exercise (3.) with the KNN regression class from sklearn. Compare the predictions and the runtime.
5. Implement weighted KNN regression and apply it to the same data (the weighted-mean formula is sketched after this list). Use 5-fold cross-validation. Estimate the mean and standard deviation of the MAE of the predictions. Plot the true and predicted values from one fold on a scatterplot.
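For reference, the two estimators implemented below follow the standard definitions: plain KNN regression predicts the unweighted mean of the k nearest targets, while the weighted variant uses inverse-distance weights,

$$\hat{y}(x) = \frac{1}{k}\sum_{i=1}^{k} y_{(i)}, \qquad \hat{y}_w(x) = \frac{\sum_{i=1}^{k} y_{(i)}/d_{(i)}}{\sum_{i=1}^{k} 1/d_{(i)}},$$

where $y_{(i)}$ and $d_{(i)}$ are the target value and Euclidean distance of the $i$-th nearest training point.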
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
dataset = pd.read_csv("photoz_mini.csv")
print(dataset.columns.values)
colors = ['u', 'g', 'r', 'i', 'z']
['Unnamed: 0' 'id' 'u' 'g' 'r' 'i' 'z' 'redshift']
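The positional slice in the next cell relies on the column order printed above; an equivalent selection by column name (a small sketch using the colors list defined above, not used further below) would be:

features = dataset[colors].values        # the five ugriz magnitude columns
redshifts = dataset['redshift'].values   # the regression target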
# split u, g, r, i, z channels
sets = np.split(dataset.values[:,2:7], 5)
test = sets[0]
train = np.concatenate(sets[1:5])
# split redshifts
sets = np.split(dataset.values[:,7], 5)
test_redshift = sets[0]
train_redshift = np.concatenate(sets[1:5])
# get shapes
print(train.shape, train_redshift.shape)
print(test.shape, test_redshift.shape)
(800, 5) (800,)
(200, 5) (200,)
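Note that this is a sequential 80-20% split, which implicitly assumes the rows are in no particular order. A shuffled variant would look like the sketch below (not used further, so the printed results stay reproducible; default_rng assumes NumPy >= 1.17, and the seed is an arbitrary choice):

# Shuffle the rows before splitting so the test fold is a random sample
rng = np.random.default_rng(0)
perm = rng.permutation(len(dataset))
features = dataset[['u', 'g', 'r', 'i', 'z']].values[perm]
redshifts = dataset['redshift'].values[perm]
n_test = len(dataset) // 5               # 20% held out for testing
x_test_s, x_train_s = features[:n_test], features[n_test:]
y_test_s, y_train_s = redshifts[:n_test], redshifts[n_test:]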
def knn_regression(x2pred, x_train, y_train, k=10):
    """Return predictions for x2pred with naive KNN regression."""
    y_pred = []
    for dp in x2pred:
        # Euclidean distance from this query point to every training point
        dists = np.array([np.sqrt(np.sum(np.square(dp-value))) for value in x_train])
        # argpartition puts the indices of the k smallest distances first
        ind = np.argpartition(dists, k)
        neighbors_redshift = y_train[ind[:k]]
        # Plain KNN: the unweighted mean of the k nearest targets
        y_pred.append(np.mean(neighbors_redshift))
    return np.array(y_pred)
pred_redshift = knn_regression(test, train, train_redshift)
# Plot predicted vs. measured redshifts
plt.figure(figsize=(10.,8.))
plt.scatter(pred_redshift, test_redshift, c="b")
plt.xlabel("Predicted redshifts")
plt.ylabel("Measured redshifts")
# Plot the 45-degree identity line as an accuracy reference
x = np.linspace(np.min(pred_redshift), np.max(pred_redshift), num=1000)
plt.scatter(x, x, c="r", marker=".")
[Figure: scatter of predicted vs. measured redshifts for the hand-written KNN, with the 45-degree identity line in red]
# get the mean squared error
MSE = np.sum(np.square(pred_redshift-test_redshift))/test_redshift.shape[0]
print("Mean squared error is %.5f" % MSE)
Mean squared error is 0.00766
# get the mean absolute error
MAE = np.sum(np.absolute(pred_redshift-test_redshift))/test_redshift.shape[0]
print("Mean absolute error is %.5f" % MAE)
Mean absolute error is 0.05062
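For completeness, these are the standard metric definitions computed above:

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2, \qquad \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert\hat{y}_i - y_i\rvert.$$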
def cross_validate(algo, dataset, splits=5, k=10):
    """Cross-validate `algo` on the dataset and print MAE/MSE statistics and runtime."""
    MAEs = []
    MSEs = []
    start = time.time()
    for i in range(splits):
        # Features: fold i becomes the test set, the rest form the training set
        sets = np.split(dataset[['u','g','r','i','z']].values, splits)
        x_test = sets[i]
        x_train = np.concatenate([st for ind, st in enumerate(sets) if ind != i])
        # Targets: the same fold assignment for the redshifts
        sets = np.split(dataset['redshift'].values, splits)
        y_test = sets[i]
        y_train = np.concatenate([st for ind, st in enumerate(sets) if ind != i])
        y_pred = algo(x_test, x_train, y_train, k)
        MAEs.append(np.sum(np.absolute(y_pred-y_test))/y_test.shape[0])
        MSEs.append(np.sum(np.square(y_pred-y_test))/y_test.shape[0])
    print("Mean MAE : %.10f" % np.mean(MAEs))
    print("Standard deviation of MAE : %.10f" % np.std(MAEs))
    print("Mean MSE : %.10f" % np.mean(MSEs))
    print("Standard deviation of MSE : %.10f" % np.std(MSEs))
    end = time.time()
    print("The operation took %.5f seconds. " % (end-start))
cross_validate(knn_regression, dataset, splits=10, k=13)
Mean MAE : 0.0486281296
Standard deviation of MAE : 0.0036387536
Mean MSE : 0.0066927752
Standard deviation of MSE : 0.0017580858
The operation took 7.55900 seconds. 
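The fold bookkeeping above could also be delegated to sklearn; a minimal sketch with KFold (shuffle left off, so the contiguous folds match the np.split folds used above):

from sklearn.model_selection import KFold

X = dataset[['u', 'g', 'r', 'i', 'z']].values
y = dataset['redshift'].values
maes = []
for train_idx, test_idx in KFold(n_splits=5).split(X):
    # Reuse the hand-written regressor on sklearn-generated folds
    y_pred = knn_regression(X[test_idx], X[train_idx], y[train_idx], k=7)
    maes.append(np.mean(np.abs(y_pred - y[test_idx])))
print("Mean MAE: %.10f, std: %.10f" % (np.mean(maes), np.std(maes)))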
from sklearn.neighbors import NearestNeighbors
def sklearn_knn_regression(x2pred, x_train, y_train, k=10):
    # NearestNeighbors only performs the neighbour lookup; the averaging happens below
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(x_train)
    y_pred = []
    for indices in neigh.kneighbors(x2pred, return_distance=False):
        y_pred.append(np.mean(y_train[indices]))
    return np.array(y_pred)
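The exercise mentions the KNN regression class from sklearn; the dedicated class is KNeighborsRegressor, which does the lookup and the averaging in one step. An equivalent sketch:

from sklearn.neighbors import KNeighborsRegressor

def sklearn_class_knn_regression(x2pred, x_train, y_train, k=10):
    # With the default weights='uniform' this averages the k nearest targets,
    # exactly like the manual loop above
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(x_train, y_train)
    return reg.predict(x2pred)
    # e.g. sklearn_class_knn_regression(test, train, train_redshift)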
sklearn_pred_redshift = sklearn_knn_regression(test, train, train_redshift)
# Plot predicted vs. measured redshifts
plt.figure(figsize=(10.,8.))
plt.scatter(sklearn_pred_redshift, test_redshift, c="b")
plt.xlabel("Predicted redshifts")
plt.ylabel("Measured redshifts")
# Plot the 45-degree identity line as an accuracy reference
x = np.linspace(np.min(sklearn_pred_redshift), np.max(sklearn_pred_redshift), num=1000)
plt.scatter(x, x, c="r", marker=".")
[Figure: scatter of predicted vs. measured redshifts for the sklearn-based KNN, with the identity line in red]
cross_validate(sklearn_knn_regression, dataset, splits=5, k=7)
Mean MAE : 0.0498563174
Standard deviation of MAE : 0.0023512523
Mean MSE : 0.0071803898
Standard deviation of MSE : 0.0019106446
The operation took 0.01940 seconds. 
# I re-run the hand-written version here with the same parameters, just to have the two result sets directly below each other.
cross_validate(knn_regression, dataset, splits=5, k=7)
Mean MAE : 0.0498563174
Standard deviation of MAE : 0.0023512523
Mean MSE : 0.0071803898
Standard deviation of MSE : 0.0019106446
The operation took 6.69437 seconds. 
All praise the sklearn package. I am AMAZED. :O The predictions agree to all 10 printed decimal places, and the built-in neighbour search is roughly 345 times faster (6.69 s vs. 0.019 s) than the one that I wrote from scratch.
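Most of that gap presumably comes from the per-point Python loop, since sklearn runs its neighbour search in compiled code. A fully vectorised NumPy version (a sketch) closes much of the distance by computing all pairwise distances with broadcasting:

def knn_regression_vectorized(x2pred, x_train, y_train, k=10):
    # All pairwise squared Euclidean distances at once, shape (n_pred, n_train);
    # the sqrt is skipped since it does not change the distance ranking
    d2 = np.sum((x2pred[:, None, :] - x_train[None, :, :]) ** 2, axis=-1)
    # Indices of the k smallest distances in each row (unordered within the k)
    ind = np.argpartition(d2, k, axis=1)[:, :k]
    return np.mean(y_train[ind], axis=1)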
def weighted_knn_regression(x2pred, x_train, y_train, k=10):
    y_pred = []
    for dp in x2pred:
        # Euclidean distance from this query point to every training point
        dists = np.array([np.sqrt(np.sum(np.square(dp-value))) for value in x_train])
        ind = np.argpartition(dists, k)
        neighbors_redshifts = y_train[ind[:k]]
        neighbors_distances = dists[ind[:k]]
        # Inverse-distance weighted mean of the k nearest targets
        y_pred.append(np.sum(neighbors_redshifts/neighbors_distances)/np.sum(1./neighbors_distances))
    return np.array(y_pred)
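One caveat with inverse-distance weights: if a query point coincides exactly with a training point, some $d_{(i)} = 0$ and the weights divide by zero. That is unlikely here, since the test and training folds contain different objects, but a guarded helper could look like this sketch (the eps floor is an arbitrary choice; with it, an exact match dominates the weighted mean):

def safe_inverse_distance_mean(values, distances, eps=1e-12):
    # Floor the distances at eps so an exact match (d = 0) cannot divide by zero
    d = np.maximum(distances, eps)
    return np.sum(values / d) / np.sum(1. / d)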
weighted_pred_redshift = weighted_knn_regression(test, train, train_redshift)
# Plot predicted vs. measured redshifts
plt.figure(figsize=(10.,8.))
plt.scatter(weighted_pred_redshift, test_redshift, c="b")
plt.xlabel("Predicted redshifts")
plt.ylabel("Measured redshifts")
# Plot the 45-degree identity line as an accuracy reference
x = np.linspace(np.min(weighted_pred_redshift), np.max(weighted_pred_redshift), num=1000)
plt.scatter(x, x, c="r", marker=".")
[Figure: scatter of predicted vs. measured redshifts for the hand-written weighted KNN, with the identity line in red]
cross_validate(weighted_knn_regression, dataset, splits=5, k=7)
Mean MAE : 0.0490389669
Standard deviation of MAE : 0.0023706854
Mean MSE : 0.0070522085
Standard deviation of MSE : 0.0019837564
The operation took 6.79338 seconds. 
def sklearn_weighted_knn_regression(x2pred, x_train, y_train, k=10):
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(x_train)
    y_pred = []
    # kneighbors returns, for each query point, the distances and indices
    # of its k nearest training points
    distances, indices = neigh.kneighbors(x2pred, return_distance=True)
    for inds, dists in zip(indices, distances):
        # Inverse-distance weighted mean of the neighbours' redshifts
        y_pred.append(np.sum(y_train[inds]/dists)/np.sum(1./dists))
    return np.array(y_pred)
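The same weighting is also built into KNeighborsRegressor via its weights parameter; an equivalent sketch (weights='distance' uses 1/d weights and handles the d = 0 case internally):

from sklearn.neighbors import KNeighborsRegressor

def sklearn_class_weighted_knn_regression(x2pred, x_train, y_train, k=10):
    # weights='distance' reproduces the inverse-distance mean computed above
    reg = KNeighborsRegressor(n_neighbors=k, weights='distance')
    reg.fit(x_train, y_train)
    return reg.predict(x2pred)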
sklearn_weighted_pred_redshift = sklearn_weighted_knn_regression(test, train, train_redshift)
# Plot predicted vs. measured redshifts
plt.figure(figsize=(10.,8.))
plt.scatter(sklearn_weighted_pred_redshift, test_redshift, c="b")
plt.xlabel("Predicted redshifts")
plt.ylabel("Measured redshifts")
# Plot the 45-degree identity line as an accuracy reference
x = np.linspace(np.min(sklearn_weighted_pred_redshift), np.max(sklearn_weighted_pred_redshift), num=1000)
plt.scatter(x, x, c="r", marker=".")
[Figure: scatter of predicted vs. measured redshifts for the sklearn-based weighted KNN, with the identity line in red]
cross_validate(sklearn_weighted_knn_regression, dataset, splits=5, k=7)
Mean MAE : 0.0490389669
Standard deviation of MAE : 0.0023706854
Mean MSE : 0.0070522085
Standard deviation of MSE : 0.0019837564
The operation took 0.02641 seconds. 
