DBSCAN clustering on human brain cell data


Human schizophrenic brain samples

I acquired data representing human brain cells, with each cell's diameter/length and x, y position. Using density-based clustering (DBSCAN) from scikit-learn, I calculated the cluster size distribution and visualized the clustered data for a range of distance cut-off (eps) parameters.

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

# Load the cell lengths and coordinates from the measurement spreadsheet.
xlsx = pd.ExcelFile('S05_161ID12720CRcaudatecoordinatescleared4_60um.xlsx')
df1 = pd.read_excel(xlsx, 'Sheet1')
df1.columns = ['length', 'x', 'y']

The first few rows of the data file:

df1.head(n=5)

   length      x      y
0     4.1  25633  29230
1     4.1  31532  30709
2     4.1  32346  25639
3     4.3   8605  26359
4     4.3  24773  19835
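Since DBSCAN's eps is measured in the same units as the input features, it helps to check the value ranges before choosing a cut-off. A minimal sanity check on the df1 from above:

# x and y span tens of thousands of units while length is only a few,
# so a sensible eps for spatial clustering is in the hundreds.
print(df1[['x', 'y', 'length']].agg(['min', 'max']))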
import matplotlib.pyplot as plt

Visualizing the data points, using cell length as the marker size.

plt.scatter(df1['x'].values, df1['y'].values, s=df1['length'].values, marker='.', color=[.9, .6, .9, 1])

[Figure: scatter plot of cell positions, marker size scaled by cell length]

Inspecting the attributes of the fitted classifier: each of the 1862 data points is assigned a cluster label, and there are 476 core samples. The core samples are certainly not all in separate clusters, since there are only 115 distinct labels. Among these unique labels there is one special value, -1, which marks outliers: points that were not assigned to any cluster. These noise points are left out of the histogram creation below so they do not pollute the significant data.

clf = DBSCAN(eps=450, min_samples=5).fit(df1[['x','y']])
clf.labels_.shape, clf.components_.shape, clf.core_sample_indices_.shape, len(set(clf.labels_))
((1862,), (476, 2), (476,), 115)
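Out of curiosity, the number of noise points at this eps can be read off the labels directly; a quick check (not part of the original run above):

# Points labeled -1 are noise/outliers in DBSCAN.
n_noise = np.sum(clf.labels_ == -1)
print('%d of %d points are noise' % (n_noise, len(clf.labels_)))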

Calculating the cluster size distribution over a range of cut-off (eps) values.

def cluster_size(data, fromEps=300, toEps=700):
    plt.figure(figsize=(18, 19))

    eps = np.linspace(fromEps, toEps, 9)

    for i, E in zip(range(1, 10), eps):

        plt.subplot(3, 3, i)

        clf = DBSCAN(eps=E, min_samples=5).fit(data)

        n_clusters_ = len(set(clf.labels_)) - (1 if -1 in clf.labels_ else 0)

        # Group the point coordinates by cluster label (using the
        # passed-in data, not the global DataFrame).
        labeledData = dict()
        for label in set(clf.labels_):
            labeledData[label] = []
        for ind, label in enumerate(clf.labels_):
            labeledData[label].append(data.values[ind, :])

        # Record the size of every real cluster; skip the noise points
        # (label -1) so they do not distort the histogram.
        cluster_sizes = []
        for label in set(clf.labels_):
            if label != -1:
                cluster_sizes.append(len(labeledData[label]))

        plt.title('Cluster size distro using eps = %.2f\n%d clusters' % (E, n_clusters_))
        plt.ylabel("# of records")
        plt.xlabel("Cluster size")
        plt.hist(np.array(cluster_sizes))
    plt.show()
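As an aside, the per-label bookkeeping above can be compressed: DBSCAN assigns consecutive integer labels starting at 0 (with -1 reserved for noise), so np.bincount yields the same cluster sizes in two lines. A sketch, assuming a fitted clf as before:

labels = clf.labels_
cluster_sizes = np.bincount(labels[labels != -1])  # one entry per cluster, noise excluded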

There are not enough data points to tell whether these are well-behaved distributions, but just by eyeballing, the distribution at eps = 500, for example, looks roughly log-normal.

cluster_size(df1[['x', 'y']])

[Figure: 3 x 3 grid of cluster size histograms for eps between 300 and 700]
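To go beyond eyeballing, one could fit a log-normal to the cluster sizes at eps = 500 and compare its parameters against the histogram. A sketch of that idea using scipy.stats.lognorm (scipy is an extra dependency, not used elsewhere in this post):

from scipy import stats

clf = DBSCAN(eps=500, min_samples=5).fit(df1[['x', 'y']])
labels = clf.labels_
sizes = np.bincount(labels[labels != -1])

# Fit with the location fixed at zero; with this few clusters the
# estimate is indicative at best.
shape, loc, scale = stats.lognorm.fit(sizes, floc=0)
print('sigma = %.2f, median cluster size = %.1f' % (shape, scale))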

Moving on to visualization, I plotted the clusters over an eps range that can be set by hand, clustering the points on their x, y coordinates only. Right after this part I validate that using length as a clustering parameter is not significant.

def cluster_visualization(data, fromEps=350, toEps=750):

    plt.figure(figsize=(18,15))

    eps = np.linspace(fromEps, toEps, 9)
    
    for i, E in zip(range(1, 10), eps):
        
        plt.subplot(3, 3, i)
        
        clf = DBSCAN(eps=E, min_samples=5).fit(data[['x', 'y']])

        core_samples_mask = np.zeros_like(clf.labels_, dtype=bool)
        core_samples_mask[clf.core_sample_indices_] = True

        n_clusters_ = len(set(clf.labels_)) - (1 if -1 in clf.labels_ else 0)

        unique_labels = set(clf.labels_)
        colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    
        for k, col in zip(unique_labels, colors):
        
            if k == -1:
                # Black used for noise.
                col = [0, 0, 0, .5]

            class_member_mask = (clf.labels_ == k)

            # Core samples first, then border (non-core) samples,
            # both sized by cell length (column 0 of df1).
            xy = data.values[class_member_mask & core_samples_mask]
            plt.scatter(xy[:, 1], xy[:, 2], marker='.', color=tuple(col), s=xy[:, 0])

            xy = data.values[class_member_mask & ~core_samples_mask]
            plt.scatter(xy[:, 1], xy[:, 2], marker='.', color=tuple(col), s=xy[:, 0])

        plt.title('Number of clusters = %d, using eps = %.2f' % (n_clusters_, E))
        plt.xticks(np.linspace(5000, 35000, 5))

    plt.show()

cluster_visualization(df1, fromEps=400, toEps=750)

[Figure: 3 x 3 grid of cluster scatter plots for eps between 400 and 750, noise points in black]
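A more quantitative way to compare these eps settings would be a cluster-validity index; one possible (and here untested) approach is the silhouette score computed on the non-noise points:

from sklearn.metrics import silhouette_score

for E in (400, 500, 600, 700):
    labels = DBSCAN(eps=E, min_samples=5).fit_predict(df1[['x', 'y']])
    mask = labels != -1  # silhouette is only defined for clustered points
    if len(set(labels[mask])) > 1:
        score = silhouette_score(df1[['x', 'y']].values[mask], labels[mask])
        print('eps = %d -> silhouette = %.3f' % (E, score))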

Clustering on the length alone lumps all the data into one giant cluster, so length does not seem to be a significant clustering parameter, as the quick check below and the plots confirm.
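The conclusion can be checked numerically before plotting: clustering the one-dimensional length column with almost any reasonable eps yields a single label. A quick sketch (the eps value here is an arbitrary small choice):

labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(df1[['length']].values)
print(set(labels))  # expected: essentially a single cluster label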

def cluster_visualization(data_for_fit, data_for_viz, fromEps=350, toEps=750):

    plt.figure(figsize=(18,15))

    eps = np.linspace(fromEps, toEps, 9)
    
    for i, E in zip(range(1, 10), eps):
        
        plt.subplot(3, 3, i)
        
        # Fit on the single length column, reshaped to the 2-D array sklearn expects.
        clf = DBSCAN(eps=E, min_samples=5).fit(data_for_fit.values.reshape(-1, 1))

        core_samples_mask = np.zeros_like(clf.labels_, dtype=bool)
        core_samples_mask[clf.core_sample_indices_] = True

        n_clusters_ = len(set(clf.labels_)) - (1 if -1 in clf.labels_ else 0)

        unique_labels = set(clf.labels_)
        colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    
        for k, col in zip(unique_labels, colors):
        
            if k == -1:
                # Black used for noise.
                col = [0, 0, 0, .5]

            class_member_mask = (clf.labels_ == k)

            xy = data_for_viz.values[class_member_mask & core_samples_mask]
            plt.plot(xy[:, 0], xy[:, 1], '.', markerfacecolor=tuple(col),
                     markeredgecolor='k', markersize=14)

            xy = data_for_viz.values[class_member_mask & ~core_samples_mask]
            plt.plot(xy[:, 0], xy[:, 1], '.', markerfacecolor=tuple(col),
                     markeredgecolor='k', markersize=6)

        plt.title('Number of clusters = %d, using eps = %.2f' % (n_clusters_, E))
        plt.xticks(np.linspace(5000, 35000, 5))

    plt.show()

cluster_visualization(df1['length'], df1[['x','y']], fromEps=10, toEps=1000)

[Figure: 3 x 3 grid of x, y scatter plots colored by length-only clustering, showing one giant cluster]

Regards, Alex
