Post

Fraud Detection with Machine Learning in Python

Learn how to build a fraud detection system with Python using Random Forest, XGBoost, and neural networks. Covers handling imbalanced datasets with SMOTE, feature engineering, evaluation metrics, and a complete credit card fraud detection pipeline.

Fraud Detection with Machine Learning in Python

The Fraud Detection Problem

Credit card fraud accounts for billions in losses annually. The challenge: fraudulent transactions are rare — typically less than 1% of all transactions. A model that predicts “not fraud” for everything gets 99% accuracy but catches zero fraud.

1
2
3
4
5
6
7
8
import pandas as pd

# Load the Kaggle credit card fraud dataset (anonymized transactions,
# binary label in the "Class" column: 1 = fraud, 0 = legitimate)
df = pd.read_csv("creditcard.csv")

# Summarize the class balance before doing anything else
n_total = len(df)
n_fraud = df["Class"].sum()
fraud_rate = df["Class"].mean()
n_legit = (df["Class"] == 0).sum()

print(f"Total transactions: {n_total:,}")
print(f"Fraudulent: {n_fraud:,} ({fraud_rate:.2%})")
print(f"Legitimate: {n_legit:,}")
1
2
3
Total transactions: 284,807
Fraudulent: 492 (0.17%)
Legitimate: 284,315

That’s a 578:1 imbalance. Standard accuracy is meaningless here. You need precision, recall, and AUC-ROC.

Installation

1
pip install pandas numpy scikit-learn xgboost imbalanced-learn matplotlib seaborn

Exploratory Analysis

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One boolean mask per class, reused for both panels
is_fraud = df["Class"] == 1
legit = df.loc[~is_fraud, "Amount"]
fraud = df.loc[is_fraud, "Amount"]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
amount_ax, time_ax = axes

# Left panel: overlaid amount histograms, one per class
amount_ax.hist(legit, bins=50, alpha=0.7, label="Legitimate", color="#2ecc71")
amount_ax.hist(fraud, bins=50, alpha=0.7, label="Fraud", color="#e74c3c")
amount_ax.set_title("Transaction Amount Distribution")
amount_ax.set_xlabel("Amount ($)")
amount_ax.legend()
amount_ax.set_xlim(0, 500)

# Right panel: amount vs. time scatter; fraud drawn second (on top)
# with larger, more opaque markers so it stays visible
time_ax.scatter(df.loc[~is_fraud, "Time"], df.loc[~is_fraud, "Amount"],
                alpha=0.1, s=1, label="Legitimate", color="#2ecc71")
time_ax.scatter(df.loc[is_fraud, "Time"], df.loc[is_fraud, "Amount"],
                alpha=0.5, s=10, label="Fraud", color="#e74c3c")
time_ax.set_title("Transactions Over Time")
time_ax.legend()

plt.tight_layout()
plt.savefig("fraud_eda.png", dpi=150)

Feature Engineering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from sklearn.preprocessing import StandardScaler

# The dataset already has PCA-transformed features (V1-V28); only the raw
# Amount and Time columns need scaling.
#
# Fix: use one StandardScaler per column. The original reused a single
# scaler instance, so the second fit_transform() overwrote the Amount
# statistics with Time's — the fitted scaler could never be used again to
# scale Amount for new transactions.
amount_scaler = StandardScaler()
time_scaler = StandardScaler()
df["Amount_scaled"] = amount_scaler.fit_transform(df[["Amount"]])
df["Time_scaled"] = time_scaler.fit_transform(df[["Time"]])

# NOTE(review): the scalers are fitted on the full dataset before the
# train/test split — mild leakage of test-set statistics. Fit on the
# training split only for a stricter evaluation.

# Engineered features: log1p tames the heavy right tail of Amount
df["Amount_log"] = np.log1p(df["Amount"])

# Hour of day (Time is in seconds from the first transaction)
df["Hour"] = (df["Time"] / 3600) % 24

# Keep everything except the label and the raw, unscaled columns
features = [c for c in df.columns if c not in ["Class", "Amount", "Time"]]

X = df[features]
y = df["Class"]

print(f"Features: {len(features)}")
print(f"Samples: {len(X)}")

Train/Test Split

1
2
3
4
5
6
7
8
from sklearn.model_selection import train_test_split

# Stratified 80/20 split keeps the fraud rate identical in both halves —
# essential when the positive class is this rare
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for label, frame, target in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{label}: {len(frame)} ({target.mean():.2%} fraud)")

Handling Imbalanced Data with SMOTE

SMOTE (Synthetic Minority Over-sampling Technique) generates synthetic fraud samples:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Two-stage rebalancing: first synthesize fraud rows until the minority
# reaches half the legitimate count, then discard legitimate rows until
# the fraud/legitimate ratio reaches 0.8
sampling_steps = [
    ("smote", SMOTE(sampling_strategy=0.5, random_state=42)),
    ("under", RandomUnderSampler(sampling_strategy=0.8, random_state=42)),
]
resampler = ImbPipeline(sampling_steps)

X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

print(f"Before SMOTE: {len(X_train)} samples ({y_train.sum()} fraud)")
print(f"After SMOTE:  {len(X_resampled)} samples ({y_resampled.sum()} fraud)")

The strategy: oversample fraud to 50% of the legitimate count, then undersample legitimate rows until the fraud-to-legitimate ratio reaches 0.8 — roughly a 56/44 legitimate-to-fraud class mix. This gives the model enough fraud examples without drowning in duplicates.

Model 1: Random Forest

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Balanced class weights add a second layer of imbalance handling on top
# of the resampled training data
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_resampled, y_resampled)

# Score the untouched test set; a 0.5 cutoff for now (tuned later)
rf_probs = rf.predict_proba(X_test)[:, 1]
rf_preds = (rf_probs > 0.5).astype(int)

print("Random Forest Results:")
print(classification_report(y_test, rf_preds, target_names=["Legitimate", "Fraud"]))
print(f"AUC-ROC: {roc_auc_score(y_test, rf_probs):.4f}")

Model 2: XGBoost

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import xgboost as xgb

# Ratio of legitimate to fraud rows; XGBoost uses it to reweight the
# positive class inside its loss function
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    eval_metric="aucpr",  # PR-AUC suits the heavy class imbalance better than error rate
    n_jobs=-1,
    random_state=42,
)
xgb_model.fit(X_resampled, y_resampled)

test_scores = xgb_model.predict_proba(X_test)
xgb_probs = test_scores[:, 1]
xgb_preds = (xgb_probs > 0.5).astype(int)

print("XGBoost Results:")
print(classification_report(y_test, xgb_preds, target_names=["Legitimate", "Fraud"]))
print(f"AUC-ROC: {roc_auc_score(y_test, xgb_probs):.4f}")

Model 3: Neural Network

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from sklearn.neural_network import MLPClassifier

# Three shrinking ReLU hidden layers; early stopping watches a 10%
# validation slice carved from the (resampled) training data
nn = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    activation="relu",
    early_stopping=True,
    validation_fraction=0.1,
    max_iter=300,
    random_state=42,
)
nn.fit(X_resampled, y_resampled)

# Probability of the positive (fraud) class, thresholded at 0.5
nn_scores = nn.predict_proba(X_test)
nn_probs = nn_scores[:, 1]
nn_preds = (nn_probs > 0.5).astype(int)

print("Neural Network Results:")
print(classification_report(y_test, nn_preds, target_names=["Legitimate", "Fraud"]))
print(f"AUC-ROC: {roc_auc_score(y_test, nn_probs):.4f}")

Evaluation: The Right Metrics

For fraud detection, focus on:

  • Recall (sensitivity): What fraction of actual fraud did we catch?
  • Precision: Of all fraud alerts, how many were real?
  • AUC-ROC: Overall model discrimination ability
  • AUC-PR: Better than ROC for imbalanced datasets
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from sklearn.metrics import (
    roc_curve, precision_recall_curve, average_precision_score, auc
)

# All three models' test-set fraud probabilities, plotted side by side
model_probs = [
    ("Random Forest", rf_probs),
    ("XGBoost", xgb_probs),
    ("Neural Net", nn_probs),
]

fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left: ROC — true-positive vs. false-positive rate across thresholds
for name, probs in model_probs:
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={auc(fpr, tpr):.4f})")

roc_ax.plot([0, 1], [0, 1], "k--", alpha=0.3)  # chance diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()

# Right: precision-recall — the more informative view under imbalance
for name, probs in model_probs:
    precision, recall, _ = precision_recall_curve(y_test, probs)
    ap = average_precision_score(y_test, probs)
    pr_ax.plot(recall, precision, label=f"{name} (AP={ap:.4f})")

pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
pr_ax.legend()

plt.tight_layout()
plt.savefig("model_evaluation.png", dpi=150)

Threshold Tuning

The default 0.5 threshold isn’t optimal for fraud detection. Tune it:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from sklearn.metrics import f1_score

# Sweep candidate cutoffs in [0.1, 0.9) and keep the one that maximizes
# F1 on the XGBoost probabilities.
# NOTE(review): this tunes the threshold on the same test set used for
# final reporting — a separate validation split would give an unbiased
# estimate of the optimized performance.
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = [f1_score(y_test, (xgb_probs > t).astype(int)) for t in thresholds]

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]
print(f"Best threshold: {best_threshold:.2f}")
print(f"Best F1 score: {best_f1:.4f}")

# Re-threshold the predictions at the tuned cutoff
final_preds = (xgb_probs > best_threshold).astype(int)
print("\nOptimized Results:")
print(classification_report(y_test, final_preds, target_names=["Legitimate", "Fraud"]))

Feature Importance

1
2
3
4
5
6
7
8
9
10
11
12
13
import matplotlib.pyplot as plt

# Plot the most informative features from the fitted XGBoost model.
importance = xgb_model.feature_importances_
indices = np.argsort(importance)[-15:]  # indices of the top <= 15, ascending

# Fix: derive the bar count from the selection instead of hard-coding 15.
# np.argsort(...)[-15:] returns min(15, n_features) entries, so the
# original range(15) would mismatch barh/yticks lengths whenever the
# model had fewer than 15 features.
k = len(indices)

plt.figure(figsize=(10, 6))
plt.barh(range(k), importance[indices], color="#3498db")
plt.yticks(range(k), [features[i] for i in indices])
plt.xlabel("Feature Importance")
plt.title("Top 15 Features for Fraud Detection")
plt.tight_layout()
plt.savefig("feature_importance.png", dpi=150)

Complete Pipeline

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import joblib

# Production pipeline: the scaler and classifier travel together so a
# single artifact can score raw feature rows
classifier = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", classifier),
])

pipeline.fit(X_resampled, y_resampled)

# Persist the model and its tuned decision threshold side by side
joblib.dump(pipeline, "fraud_detection_model.pkl")
joblib.dump(best_threshold, "optimal_threshold.pkl")

# Reload them exactly the way a scoring service would
loaded_pipeline = joblib.load("fraud_detection_model.pkl")
threshold = joblib.load("optimal_threshold.pkl")

def predict_fraud(transaction_features):
    """Return (is_fraud, prob) arrays for a batch of transaction feature rows."""
    prob = loaded_pipeline.predict_proba(transaction_features)[:, 1]
    is_fraud = prob > threshold
    return is_fraud, prob

Key Takeaways

  • Fraud detection is an imbalanced classification problem — accuracy is misleading
  • Use SMOTE to generate synthetic fraud samples, combined with undersampling
  • XGBoost with scale_pos_weight handles imbalance natively
  • Evaluate with precision, recall, F1, AUC-ROC, and AUC-PR — not accuracy
  • Tune the classification threshold for your business needs (catch more fraud vs. fewer false alerts)
  • Feature importance analysis reveals which transaction attributes matter most
  • In production, monitor for concept drift — fraud patterns evolve constantly
This post is licensed under CC BY 4.0 by the author.