Model Debugging & Reliability
A model with 95% overall accuracy might still fail catastrophically on specific subgroups, be poorly calibrated (confident but wrong), or be brittle to small input perturbations. Model debugging goes beyond aggregate metrics to find and fix these hidden failure modes.
This lesson covers systematic techniques for finding model weaknesses, improving calibration, and estimating uncertainty.
The Aggregate Metric Trap
Error Analysis: A Systematic Approach
Error analysis means systematically examining the model's mistakes rather than just counting them.
Step 1: Categorize Errors
Group misclassified samples by type.

Step 2: Slice-Based Evaluation
Evaluate metrics on meaningful subsets.

Step 3: Error Clustering
Cluster misclassified samples in feature space to find systematic failure modes. If errors cluster, there is a pattern the model is missing.1import numpy as np
2from sklearn.datasets import load_breast_cancer
3from sklearn.ensemble import GradientBoostingClassifier
4from sklearn.model_selection import train_test_split
5from sklearn.metrics import (
6 classification_report, confusion_matrix
7)
8
9# --- Setup ---
10data = load_breast_cancer()
11X, y = data.data, data.target
12feature_names = data.feature_names
13
14X_train, X_test, y_train, y_test = train_test_split(
15 X, y, test_size=0.3, random_state=42
16)
17
18gbc = GradientBoostingClassifier(
19 n_estimators=100, max_depth=3, random_state=42
20)
21gbc.fit(X_train, y_train)
22
23# --- Overall metrics ---
24y_pred = gbc.predict(X_test)
25y_proba = gbc.predict_proba(X_test)[:, 1]
26print("=== Overall Performance ===")
27print(classification_report(y_test, y_pred,
28 target_names=["malignant", "benign"]))
29
30# --- Error analysis ---
31errors = y_pred != y_test
32error_indices = np.where(errors)[0]
33print(f"Total errors: {errors.sum()} / {len(y_test)}")
34
35# Confident errors (wrong but P > 0.9)
36confident_wrong = errors & (
37 (y_proba > 0.9) | (y_proba < 0.1)
38)
39print(f"Confident errors (P > 0.9): {confident_wrong.sum()}")
40
41# --- Slice-based evaluation ---
42print("\n=== Slice-Based Evaluation ===")
43
44# Slice by feature: mean radius (feature 0)
45median_radius = np.median(X_test[:, 0])
46small_tumors = X_test[:, 0] < median_radius
47large_tumors = ~small_tumors
48
49for name, mask in [("Small tumors", small_tumors),
50 ("Large tumors", large_tumors)]:
51 if mask.sum() > 0:
52 acc = (y_pred[mask] == y_test[mask]).mean()
53 n_errors = (y_pred[mask] != y_test[mask]).sum()
54 print(f" {name}: acc={acc:.4f} errors={n_errors}/{mask.sum()}")
55
56# Slice by feature: mean texture (feature 1)
57median_texture = np.median(X_test[:, 1])
58low_texture = X_test[:, 1] < median_texture
59high_texture = ~low_texture
60
61for name, mask in [("Low texture", low_texture),
62 ("High texture", high_texture)]:
63 if mask.sum() > 0:
64 acc = (y_pred[mask] == y_test[mask]).mean()
65 n_errors = (y_pred[mask] != y_test[mask]).sum()
66 print(f" {name}: acc={acc:.4f} errors={n_errors}/{mask.sum()}")
67
68# --- Analyze error patterns ---
69print("\n=== Error Pattern Analysis ===")
70error_features = X_test[error_indices]
71correct_features = X_test[~errors]
72
73print(f"{'Feature':<25} {'Error Mean':>12} {'Correct Mean':>14} {'Diff':>8}")
74print("-" * 60)
75for i, name in enumerate(feature_names[:10]):
76 err_mean = error_features[:, i].mean()
77 cor_mean = correct_features[:, i].mean()
78 diff = err_mean - cor_mean
79 flag = " ***" if abs(diff / (cor_mean + 1e-8)) > 0.3 else ""
80 print(f"{name:<25} {err_mean:>12.2f} {cor_mean:>14.2f} "
81 f"{diff:>+8.2f}{flag}")Model Calibration
A model is well-calibrated if its predicted probabilities match the actual frequencies. When the model says "80% chance of rain," it should rain 80% of the time.
Why Calibration Matters
| Scenario | Why calibration is critical |
|---|---|
| Medical diagnosis | "90% chance of cancer" must be trustworthy |
| Risk scoring | Insurance, credit risk, fraud detection |
| Decision making | Probability thresholds for actions |
| Model comparison | Comparing models by probability quality |
Measuring Calibration
Reliability Diagram: Bin predictions by confidence, plot predicted probability vs actual frequency. A perfectly calibrated model follows the diagonal.
Expected Calibration Error (ECE): Weighted average of |predicted - actual| across bins. Lower is better.
Calibration Methods
# Calibration demo setup: three-way split (train / calibration / test)
# so the calibrator is fit on data the base model never trained on.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

np.random.seed(42)

# Create dataset
X, y = make_classification(
    n_samples=5000, n_features=20, n_informative=10,
    random_state=42
)
# First split: 60% train, 40% held out for calibration + test.
X_train, X_cal_test, y_train, y_cal_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Second split: the 40% hold-out becomes 20% calibration / 20% test.
X_cal, X_test, y_cal, y_test = train_test_split(
    X_cal_test, y_cal_test, test_size=0.5, random_state=42
)

# Train an uncalibrated model
gbc = GradientBoostingClassifier(
    n_estimators=100, max_depth=3, random_state=42
)
gbc.fit(X_train, y_train)
# --- Measure calibration ---
def compute_ece(y_true, y_prob, n_bins=10):
    """Compute Expected Calibration Error.

    Bins predictions by confidence and returns the sample-weighted
    average of |actual frequency - mean predicted probability| per bin
    (lower is better; 0 means perfectly calibrated).

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of predicted P(class=1), in [0, 1].
    n_bins : number of equal-width probability bins.

    Returns
    -------
    float : the ECE over all samples.
    """
    bins = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for b in range(n_bins):
        in_bin = (y_prob >= bins[b]) & (y_prob < bins[b + 1])
        if b == n_bins - 1:
            # Make the last bin right-inclusive; otherwise predictions
            # with probability exactly 1.0 fall into no bin and the
            # most confident predictions are silently dropped.
            in_bin = (y_prob >= bins[b]) & (y_prob <= bins[b + 1])
        if in_bin.sum() > 0:
            actual_freq = y_true[in_bin].mean()
            avg_confidence = y_prob[in_bin].mean()
            ece += in_bin.sum() * abs(actual_freq - avg_confidence)
    return ece / len(y_true)

40y_proba_uncal = gbc.predict_proba(X_test)[:, 1]
41ece_uncal = compute_ece(y_test, y_proba_uncal)
42
43print("=== Uncalibrated Model ===")
44print(f"ECE: {ece_uncal:.4f}")
45
46# Reliability diagram (text-based)
47prob_true, prob_pred = calibration_curve(y_test, y_proba_uncal, n_bins=10)
48print(f"\n{'Bin':>8} {'Predicted':>10} {'Actual':>10} {'Gap':>8}")
49print("-" * 40)
50for pt, pp in zip(prob_true, prob_pred):
51 gap = abs(pt - pp)
52 print(f"{'':>8} {pp:>10.3f} {pt:>10.3f} {gap:>8.3f}")
53
54# --- Apply Platt Scaling ---
55platt_model = CalibratedClassifierCV(
56 gbc, method="sigmoid", cv="prefit"
57)
58platt_model.fit(X_cal, y_cal)
59y_proba_platt = platt_model.predict_proba(X_test)[:, 1]
60ece_platt = compute_ece(y_test, y_proba_platt)
61
62# --- Apply Isotonic Regression ---
63iso_model = CalibratedClassifierCV(
64 gbc, method="isotonic", cv="prefit"
65)
66iso_model.fit(X_cal, y_cal)
67y_proba_iso = iso_model.predict_proba(X_test)[:, 1]
68ece_iso = compute_ece(y_test, y_proba_iso)
69
70# --- Compare ---
71print("\n=== Calibration Comparison ===")
72print(f"{'Method':<20} {'ECE':>8} {'Accuracy':>10}")
73print("-" * 40)
74for name, proba, model in [
75 ("Uncalibrated", y_proba_uncal, gbc),
76 ("Platt Scaling", y_proba_platt, platt_model),
77 ("Isotonic", y_proba_iso, iso_model),
78]:
79 ece = compute_ece(y_test, proba)
80 acc = model.score(X_test, y_test)
81 print(f"{name:<20} {ece:>8.4f} {acc:>10.4f}")Uncertainty Estimation
Standard neural networks produce point predictions with no measure of confidence. Uncertainty estimation quantifies how much the model does not know.
Types of Uncertainty
Methods
MC Dropout: At inference time, keep dropout enabled and run the model multiple times. The variance of predictions estimates epistemic uncertainty. Simple to implement; just run forward passes with dropout on.
Deep Ensembles: Train multiple models with different random seeds and average their predictions. The disagreement between models measures uncertainty. More expensive but more reliable than MC Dropout.
Prediction Intervals: For regression, estimate not just the mean but a confidence interval (e.g., [low, high] for 90% confidence).
# Ensemble-based uncertainty demo: train several models with different
# seeds, use the std of their predicted probabilities as an uncertainty
# score, and show that rejecting uncertain samples raises accuracy.
import numpy as np
from sklearn.ensemble import (
    GradientBoostingClassifier, RandomForestClassifier
)
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

np.random.seed(42)

# --- Deep Ensemble (with sklearn) ---
X, y = make_classification(
    n_samples=2000, n_features=20, n_informative=10,
    random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train 5 models with different seeds
n_models = 5
models = []
for i in range(n_models):
    model = GradientBoostingClassifier(
        n_estimators=100, max_depth=3, random_state=i * 42
    )
    model.fit(X_train, y_train)
    models.append(model)

# --- Ensemble predictions ---
# Shape (n_models, n_samples, n_classes): mean across models is the
# ensemble prediction; std across models measures their disagreement.
all_proba = np.array([m.predict_proba(X_test) for m in models])
mean_proba = all_proba.mean(axis=0)
std_proba = all_proba.std(axis=0)

# --- Identify high-uncertainty predictions ---
uncertainty = std_proba[:, 1]  # std of P(class=1)
ensemble_pred = (mean_proba[:, 1] > 0.5).astype(int)

print("=== Ensemble Uncertainty Analysis ===")
print(f"Mean accuracy: {(ensemble_pred == y_test).mean():.4f}")

# High vs low uncertainty accuracy
high_unc = uncertainty > np.percentile(uncertainty, 75)
low_unc = uncertainty < np.percentile(uncertainty, 25)

print(f"\nHigh uncertainty samples (top 25%): {high_unc.sum()}")
print(f" Accuracy: {(ensemble_pred[high_unc] == y_test[high_unc]).mean():.4f}")
print(f" Mean uncertainty: {uncertainty[high_unc].mean():.4f}")

print(f"\nLow uncertainty samples (bottom 25%): {low_unc.sum()}")
print(f" Accuracy: {(ensemble_pred[low_unc] == y_test[low_unc]).mean():.4f}")
print(f" Mean uncertainty: {uncertainty[low_unc].mean():.4f}")

# --- Uncertainty-based rejection ---
# Selective prediction: only answer when uncertainty <= threshold.
print("\n=== Selective Prediction (Reject Uncertain) ===")
thresholds = [0.0, 0.02, 0.05, 0.10, 0.15]
print(f"{'Threshold':<12} {'Accepted':>10} {'Accuracy':>10} {'Rejected':>10}")
print("-" * 45)

for thresh in thresholds:
    accepted = uncertainty <= thresh
    if accepted.sum() > 0:
        acc = (ensemble_pred[accepted] == y_test[accepted]).mean()
        print(f"{thresh:<12.2f} {accepted.sum():>10} "
              f"{acc:>10.4f} {(~accepted).sum():>10}")
    else:
        # Nothing accepted at this threshold — avoid a NaN accuracy.
        print(f"{thresh:<12.2f} {'0':>10} {'N/A':>10} {len(uncertainty):>10}")

print("\nKey insight: rejecting high-uncertainty predictions")
print("significantly improves accuracy on accepted samples.")