Scikit-learn: The Essential Machine Learning Library

What is Scikit-learn?

Scikit-learn (sklearn) is Python's most popular machine learning library. Built on NumPy, SciPy, and Matplotlib, it provides simple and efficient tools for data analysis and modeling. Whether you're building your first ML model or deploying to production, sklearn is likely your starting point.

With consistent APIs across all algorithms, excellent documentation, and a massive community, scikit-learn is the Swiss Army knife of machine learning.

The Sklearn API Pattern

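Every supervised sklearn estimator follows the same four-step pattern (transformers and clusterers keep the same instantiate-and-fit steps but expose transform or fit_predict instead of predict):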

# Generic template: `some_module`, `SomeEstimator` and `hyperparameters` are
# placeholders to be replaced with a real sklearn module, class and settings.
# X_train / y_train / X_test / y_test are assumed to exist already.
from sklearn.some_module import SomeEstimator

# 1. Instantiate - hyperparameters are passed to the constructor
model = SomeEstimator(hyperparameters)

# 2. Fit (train) - learn from the training features and labels
model.fit(X_train, y_train)

# 3. Predict - apply the fitted model to unseen samples
predictions = model.predict(X_test)

# 4. Evaluate - score() uses the estimator's default metric
#    (mean accuracy for classifiers, R^2 for regressors)
score = model.score(X_test, y_test)

This consistency makes it easy to swap algorithms and experiment quickly.

Classification

Predict discrete categories:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NOTE: X_train / y_train are not defined in this snippet; it assumes a
# training split has already been prepared. Each classifier below is
# fitted independently on that same split.

# Random Forest - robust, handles non-linear relationships
# (100 trees, each limited to a depth of 10)
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
rf.fit(X_train, y_train)

# Logistic Regression - fast, interpretable
# (C is the inverse regularization strength; max_iter raises the solver's
# iteration cap)
lr = LogisticRegression(C=1.0, max_iter=1000)
lr.fit(X_train, y_train)

# SVM - effective in high dimensions
# (RBF kernel; NOTE(review): kernel SVMs are usually fed scaled features -
# confirm X_train has been standardized first)
svm = SVC(kernel='rbf', C=1.0)
svm.fit(X_train, y_train)

# KNN - simple, instance-based
# (each prediction is a vote among the 5 nearest training samples)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

Regression

Predict continuous values:

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# NOTE: X_train / y_train are not defined in this snippet; y_train is
# assumed to hold continuous target values. Each regressor below is fitted
# independently on the same split.

# Linear Regression - baseline
# (ordinary least squares, no hyperparameters set)
linear = LinearRegression()
linear.fit(X_train, y_train)

# Ridge - L2 regularization
# (alpha is the penalty strength; larger values shrink coefficients more)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Lasso - L1 regularization, feature selection
# (the L1 penalty can drive some coefficients exactly to zero)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Random Forest Regressor
# (ensemble of 100 trees)
rf_reg = RandomForestRegressor(n_estimators=100)
rf_reg.fit(X_train, y_train)

# Gradient Boosting
# (100 boosting stages; learning_rate scales each stage's contribution)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
gb.fit(X_train, y_train)

Clustering

Find natural groupings in data:

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

# NOTE: X is not defined in this snippet; it is assumed to be an unlabeled
# feature matrix. `clusters` is reassigned by each algorithm below, so only
# the last result survives if the snippet is run top to bottom.

# K-Means - specify number of clusters
# (random_state fixes the centroid initialization for repeatable results)
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)

# DBSCAN - density-based, finds arbitrary shapes
# (eps is the neighborhood radius, min_samples the density threshold;
# points that belong to no cluster are labeled -1)
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X)

# Hierarchical Clustering
# (agglomerative merging, stopped once 3 clusters remain)
hierarchical = AgglomerativeClustering(n_clusters=3)
clusters = hierarchical.fit_predict(X)

Preprocessing

Prepare your data for modeling:

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler,
    LabelEncoder, OneHotEncoder
)
from sklearn.impute import SimpleImputer

# NOTE: X_train, X_test and X_categorical are not defined in this snippet;
# they are assumed to exist already. Transformers learn their statistics
# with fit/fit_transform on the training split and are then applied to
# other data with transform(), so nothing is learned from the test rows.

# Standardization (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # reuse the training statistics

# Normalization (0-1 range)
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X_train)

# Handle missing values
# Fitted on the training split, like the scalers above, so the fill values
# are not computed from test rows.
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
X_imputed = imputer.fit_transform(X_train)

# Encode categorical variables
# `sparse_output` replaces the old `sparse` keyword (renamed in
# scikit-learn 1.2, old name removed in 1.4); False returns a dense array.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)

Model Selection & Evaluation

from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, classification_report
)

# NOTE: X, y, model and RandomForestClassifier are not defined in this
# snippet; they are assumed to come from earlier code.

# Train-test split
# Holds out 20% of the rows; stratify=y keeps the class proportions the
# same in both splits and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Cross-validation
# Run on the training split only so the held-out test rows never take part
# in model selection.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV Score: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

# Hyperparameter tuning with GridSearchCV
# 3 x 4 x 3 = 36 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# random_state makes the search reproducible; n_jobs=-1 uses all CPU cores.
# scoring='f1' assumes a binary target - use 'f1_macro' or 'f1_weighted'
# for multiclass problems.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid,
    cv=5, scoring='f1', n_jobs=-1
)
grid_search.fit(X_train, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.3f}")

Pipelines

Chain preprocessing and modeling steps:

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# NOTE: StandardScaler, OneHotEncoder and RandomForestClassifier are not
# imported in this snippet, and X_train / y_train / X_test are not defined
# here; all are assumed to be in scope already.

# Simple pipeline
# Steps are (name, estimator) pairs run in order: the scaler transforms the
# data, then the classifier is fitted on the scaled output.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

# fit() fits every step on the training data; predict() pushes X_test
# through the already-fitted scaler before classifying.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Complex pipeline with different preprocessing per column type
# Columns are selected by name, so X_train is presumably a pandas DataFrame
# containing these five columns - confirm against the data being used.
numeric_features = ['age', 'income', 'score']
categorical_features = ['gender', 'city']

# Each entry is (name, transformer, columns). Columns not listed in any
# entry are dropped by default (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Preprocessing and model are bundled so both are fitted together.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

full_pipeline.fit(X_train, y_train)

Feature Selection

from sklearn.feature_selection import (
    SelectKBest, f_classif, RFE
)

# NOTE: X, y and RandomForestClassifier are not defined in this snippet;
# they are assumed to be in scope already. X presumably has at least 10
# features, since both selectors below keep 10 - confirm for your data.

# Select K best features
# Scores each feature against y with the ANOVA F-test (f_classif) and keeps
# the 10 highest-scoring ones.
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Recursive Feature Elimination
# Repeatedly fits the estimator and discards the weakest features until 10
# remain.
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)

# Feature importance from tree-based models
# fit() returns the fitted estimator, so the calls can be chained. This
# rebinds the name `model`. feature_importances_ holds one score per
# feature of X.
model = RandomForestClassifier().fit(X, y)
importances = model.feature_importances_

Complete Example

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Read the churn table and separate the label column from the features
df = pd.read_csv('customer_churn.csv')
y = df['churned']
X = df.drop(columns='churned')

# Hold out 20% of the rows, keeping the class balance identical in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model: standardize the features, then classify with a seeded random forest
steps = [
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
]
pipeline = Pipeline(steps)

# Estimate F1 with 5-fold cross-validation on the training split only
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
print(f"CV F1: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

# Fit on the whole training split (fit() returns the pipeline itself),
# then report per-class metrics on the held-out rows
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

Master Scikit-learn with Expert Mentorship

Our Data Science program covers scikit-learn extensively, from basics to advanced techniques. Build real ML pipelines with guidance from industry experts.

Explore Data Science Program

Scikit-learn