Code for Clustering Algorithms in Machine Learning with Python Tutorial


View on Github

kmeans_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from matplotlib import pyplot

# 2 features, 2 informative, 0 redundant, 1 cluster per class
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10) 

# 2 clusters
m = KMeans(n_clusters=2) 
# fit the model
m.fit(X)
# predict the cluster for each data point
p = m.predict(X) 
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('K-means (No. of Clusters = 3)')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

minibatch_kmeans_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import MiniBatchKMeans
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# 3 clusters
m = MiniBatchKMeans(n_clusters=3) 
# fit the model
m.fit(X)
# predict the cluster for each data point
p = m.predict(X) 
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Mini Batch K-means')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

time_diff_minibatch_and_kmeans.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from matplotlib import pyplot
import timeit

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# start timer for Mini Batch K-Means
t1_mkm = timeit.default_timer() 
m = MiniBatchKMeans(n_clusters=2)
m.fit(X)
p = m.predict(X)
# stop timer for Mini Batch K-Means
t2_mkm = timeit.default_timer()
# start timer for K-Means
t1_km = timeit.default_timer()
m = KMeans(n_clusters=2)
m.fit(X)
p = m.predict(X)
# stop timer for K-Means
t2_km = timeit.default_timer()
# print time difference
print("Time difference between Mini Batch K-Means and K-Means = ",
      (t2_km-t1_km)-(t2_mkm-t1_mkm))

affinity_propagation.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)

# initialize the model
m = AffinityPropagation(damping=0.9)
# fit the model
m.fit(X)
# predict the cluster for each data point
p = m.predict(X)
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Affinity Propagation Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

dbscan_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model
m = DBSCAN(eps=0.05, min_samples=10)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X) 
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('DBSCAN Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

optics.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import OPTICS
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)

# init the model
m = OPTICS(eps=0.5, min_samples=10)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X)
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('OPTICS Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

birch.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import Birch
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model with 2 clusters
m = Birch(threshold=0.05, n_clusters=2)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X) 
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Birch Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

agglomerative_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model with 3 clusters
m = AgglomerativeClustering(n_clusters=3)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X) 
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Agglomerative Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

meanshift_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import MeanShift
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model
m = MeanShift()
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X)
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Mean Shift Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

spectral_clustering.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import SpectralClustering
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model with 3 clusters
m = SpectralClustering(n_clusters=3)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X)
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Spectral Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

gmm.py

import numpy as np
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=10)
# init the model with 2 components
m = GaussianMixture(n_components=2)
# predict the cluster for each data point after fitting the model
p = m.fit_predict(X)
# unique clusters
cl = np.unique(p)
# plot the data points and cluster centers
for c in cl:
    r = np.where(c == p)
    pyplot.title('Gaussian Mixture Clustering')
    pyplot.scatter(X[r, 0], X[r, 1])
# show the plot
pyplot.show()

metrics.py

from sklearn import metrics

y_true = [5, 3, 5, 4, 4, 5]
y_pred = [3, 5, 5, 4, 3, 4]
# homogeneity: each cluster contains only members of a single class.
print(metrics.homogeneity_score(y_true, y_pred))
# completeness: all members of a given class are assigned to the same cluster.
print(metrics.completeness_score(y_true, y_pred))
# v-measure: harmonic mean of homogeneity and completeness
print(metrics.v_measure_score(y_true, y_pred))