What is Unsupervised Learning?

Unlike supervised learning, unsupervised learning works with unlabeled data. The algorithm discovers hidden patterns, structures, and relationships within the data without predefined outcomes.

It's like exploring a new city without a map - you discover neighborhoods, landmarks, and connections on your own.

Main Categories

  • Clustering: Group similar data points together (customer segmentation)
  • Dimensionality Reduction: Reduce features while preserving information (visualization, compression)
  • Association Rules: Find relationships between variables (market basket analysis)
  • Anomaly Detection: Identify unusual data points (fraud detection)

K-Means Clustering

K-Means is the most popular clustering algorithm. It partitions the data into K clusters by assigning each point to its nearest cluster centroid:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Standardize the features first: K-Means is distance-based, so features on
# larger scales would otherwise dominate the clustering.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Fit K-Means with 3 clusters (random_state fixed for reproducibility) and
# read back the label assigned to every sample.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_scaled)
clusters = kmeans.labels_

# Keep the cluster label next to the original rows
df['cluster'] = clusters

# Report the centroids mapped back to the original (unscaled) feature units
print("Cluster Centers:")
print(scaler.inverse_transform(kmeans.cluster_centers_))

# 2D view: first two scaled features coloured by cluster, with the
# centroids overlaid as large red crosses
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='viridis')
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    s=300,
    c='red',
    marker='X',
    label='Centroids',
)
plt.title('K-Means Clustering')
plt.legend()
plt.show()

Finding Optimal K (Elbow Method and Silhouette Score)

# Elbow Method: fit K-Means for K = 1..10 and record each model's inertia.
# The "elbow" is the K after which adding clusters stops reducing inertia
# by much.
K_range = range(1, 11)
inertias = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)

# Draw inertia against K
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertias, 'bo-')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()

# Silhouette Score
import numpy as np  # needed for np.argmax below
from sklearn.metrics import silhouette_score

# The silhouette score requires at least 2 clusters, so the search starts
# at K=2. The range is named once so the scores and the final lookup cannot
# drift apart.
silhouette_k_range = range(2, 11)

silhouette_scores = []
for k in silhouette_k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)
    print(f"K={k}: Silhouette Score = {score:.4f}")

# np.argmax gives the position of the best score; map it back to its K
optimal_k = silhouette_k_range[np.argmax(silhouette_scores)]
print(f"\nOptimal K: {optimal_k}")

DBSCAN (Density-Based Clustering)

from sklearn.cluster import DBSCAN

# DBSCAN takes no cluster count; it is configured with a neighbourhood
# radius (eps) and a density threshold (min_samples) instead.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit(X_scaled).labels_

# Noise points carry the label -1, which is not a real cluster, so it is
# counted separately and left out of the cluster total.
n_noise = int((clusters == -1).sum())
n_clusters = len(set(clusters) - {-1})

print(f"Number of clusters: {n_clusters}")
print(f"Noise points: {n_noise}")

# Plot the first two scaled features coloured by DBSCAN label
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='viridis')
plt.title(f'DBSCAN Clustering ({n_clusters} clusters)')
plt.show()

# Finding optimal eps using k-distance graph
import numpy as np  # needed for np.sort below
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(X_scaled)
# Querying with the training data itself means each point is (typically)
# returned as its own nearest neighbour at distance 0 in column 0. Column 4
# is therefore the radius that encloses 5 points *counting the point
# itself*, which matches how DBSCAN's min_samples=5 counts neighbours.
distances, indices = neighbors_fit.kneighbors(X_scaled)

# Sort ascending so the curve rises monotonically; the sharp bend ("elbow")
# is a reasonable starting value for eps.
distances = np.sort(distances[:, 4])
plt.plot(distances)
plt.xlabel('Points')
plt.ylabel('5th Nearest Neighbor Distance')
plt.title('K-Distance Graph (Elbow = eps)')
plt.show()

Hierarchical Clustering

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative (bottom-up) clustering with Ward linkage, stopped at 3 clusters
hierarchical = AgglomerativeClustering(n_clusters=3, linkage='ward')
clusters = hierarchical.fit(X_scaled).labels_

# Build the full merge tree with SciPy so it can be drawn and cut freely
linked = linkage(X_scaled, 'ward')

# Draw the dendrogram, truncated to the top 5 levels to keep it readable
plt.figure(figsize=(12, 7))
dendrogram(linked, p=5, truncate_mode='level')
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index or Cluster Size')
plt.ylabel('Distance')
plt.show()

# Cut dendrogram at different heights
from scipy.cluster.hierarchy import fcluster

# Option 1: cut at a merge distance of 10; the number of clusters follows
# from the data
clusters_by_distance = fcluster(linked, 10, criterion='distance')

# Option 2: ask for at most 4 clusters and let SciPy pick the cut height
clusters_by_count = fcluster(linked, 4, criterion='maxclust')

Gaussian Mixture Models (GMM)

from sklearn.mixture import GaussianMixture

# GMM gives soft (probabilistic) assignments: every sample gets a
# probability of belonging to each of the 3 components.
gmm = GaussianMixture(n_components=3, random_state=42).fit(X_scaled)

# Hard assignment: the most likely component per sample
clusters = gmm.predict(X_scaled)

# Soft assignment: one probability per component per sample
probabilities = gmm.predict_proba(X_scaled)
print("Probability of belonging to each cluster:")
print(probabilities[:5])

# Model selection: fit 1..9 components and compare the information
# criteria (for both BIC and AIC, lower is better)
n_components_range = range(1, 10)
bic_scores = []
aic_scores = []

for n in n_components_range:
    gmm = GaussianMixture(n_components=n, random_state=42).fit(X_scaled)
    bic_scores.append(gmm.bic(X_scaled))
    aic_scores.append(gmm.aic(X_scaled))

plt.plot(n_components_range, bic_scores, label='BIC')
plt.plot(n_components_range, aic_scores, label='AIC')
plt.title('GMM Model Selection')
plt.xlabel('Number of Components')
plt.ylabel('Score')
plt.legend()
plt.show()

Principal Component Analysis (PCA)

from sklearn.decomposition import PCA

import numpy as np  # needed for np.cumsum / np.argmax below

# PCA for dimensionality reduction: project the scaled data onto its first
# two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance: the share of total variance captured by each kept
# component
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
print("Total Variance Explained:", sum(pca.explained_variance_ratio_))

# Cumulative variance plot: fit PCA with all components to see how the
# explained variance accumulates
pca_full = PCA()
pca_full.fit(X_scaled)

cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.legend()
plt.title('PCA - Explained Variance')
plt.show()

# Choose components for 95% variance. np.argmax on a boolean array returns
# the index of the first True; +1 turns that 0-based index into a count.
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Components for 95% variance: {n_components_95}")

# Visualize PCA
# NOTE: `y` is not defined in this article; it is assumed to hold one label
# (or value) per sample and is used only to colour the points. Substitute
# `clusters` if no labels are available.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.6)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA Visualization')
plt.colorbar(label='Target')
plt.show()

t-SNE (t-Distributed Stochastic Neighbor Embedding)

from sklearn.manifold import TSNE

# t-SNE for visualization
# The iteration count is left at its default of 1000. The keyword for it was
# renamed from `n_iter` to `max_iter` in newer scikit-learn releases, so
# passing either name explicitly breaks on some versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Visualize the embedding, coloured by the current cluster labels
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=clusters,
                      cmap='viridis', alpha=0.6)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE Visualization')
plt.colorbar(scatter)
plt.show()

# Compare different perplexity values
# NOTE: scikit-learn requires perplexity < number of samples, so the larger
# values here raise ValueError on small datasets.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
perplexities = [5, 30, 50, 100]

for ax, perplexity in zip(axes.flat, perplexities):
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    X_tsne = tsne.fit_transform(X_scaled)
    ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=clusters, cmap='viridis')
    ax.set_title(f'Perplexity = {perplexity}')

plt.tight_layout()
plt.show()

UMAP (Uniform Manifold Approximation and Projection)

# pip install umap-learn
import umap

# UMAP is often a better choice than t-SNE; project the scaled data to 2-D
# (seeded for reproducibility)
reducer = umap.UMAP(random_state=42, n_components=2)
X_umap = reducer.fit_transform(X_scaled)

# Columns of the embedding are the two UMAP axes
plt.scatter(*X_umap.T, c=clusters, cmap='viridis')
plt.title('UMAP Visualization')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.show()

# Compared with t-SNE, UMAP preserves more of the global structure
# and is also much faster on large datasets

Cluster Evaluation

from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)

# Internal metrics: computed from the data and the labels alone, so no
# ground truth is required
silhouette = silhouette_score(X_scaled, clusters)
calinski = calinski_harabasz_score(X_scaled, clusters)
davies = davies_bouldin_score(X_scaled, clusters)

print(
    f"Silhouette Score: {silhouette:.4f} (higher is better, max 1)",
    f"Calinski-Harabasz: {calinski:.4f} (higher is better)",
    f"Davies-Bouldin: {davies:.4f} (lower is better)",
    sep="\n",
)

# External metrics: only usable when reference labels exist
# NOTE: `y_true` is not defined in this article; it is assumed to hold one
# known label per sample.
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

ari = adjusted_rand_score(y_true, clusters)
nmi = normalized_mutual_info_score(y_true, clusters)
print(
    f"Adjusted Rand Index: {ari:.4f}",
    f"Normalized Mutual Info: {nmi:.4f}",
    sep="\n",
)

Choosing the Right Algorithm

  • K-Means: Fast, spherical clusters, need to specify K
  • DBSCAN: Arbitrary shapes, handles noise, no K needed
  • Hierarchical: No K needed up front (cut the dendrogram at any level), provides dendrogram, slower on large datasets
  • GMM: Soft clustering, elliptical clusters, probabilistic
  • PCA: Linear reduction, fast, preserves variance
  • t-SNE: Visualization, non-linear, computationally expensive
  • UMAP: Fast non-linear reduction, preserves global structure

Master Unsupervised Learning

Our Data Science program covers clustering, dimensionality reduction, and anomaly detection with real-world applications.

Explore Data Science Program

Related Articles