# --- Setup & imports ---
import os, sys, math, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Ensure project modules are importable when running from repo root
sys.path.append(os.getcwd())

from src.features.build_features import clean_columns
from src.models.predict import predict_proba_df
from src.models.metrics import pr_auc, roc_auc, best_profit_threshold

import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix

DATA_PATH = "data/raw/Telco-Customer-Churn.csv"  # adjust if needed


def read_any(path: str) -> pd.DataFrame:
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    ``.xlsx`` / ``.xls`` files go through ``pd.read_excel``; everything else
    is treated as CSV. CSV files are first read with pandas' default
    encoding and, if that fails to decode, retried with ``utf-8-sig`` and
    then ``latin1``.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed table.

    Raises:
        UnicodeDecodeError: If the CSV cannot be decoded with the default
            encoding or any of the fallback encodings (the original error
            is re-raised).
    """
    ext = os.path.splitext(path)[1].lower()
    if ext in (".xlsx", ".xls"):
        # Let pandas pick the engine for the format: forcing openpyxl breaks
        # legacy .xls workbooks, which openpyxl cannot read.
        return pd.read_excel(path)
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        for enc in ("utf-8-sig", "latin1"):
            try:
                return pd.read_csv(path, encoding=enc)
            except UnicodeDecodeError:
                continue
        # Every fallback failed: surface the original decode error.
        raise

# Load the raw table from DATA_PATH, report its dimensions and preview
# the first three rows (the bare expression is rendered by the notebook).
raw = read_any(DATA_PATH)
print(f"Raw shape: {raw.shape}")
raw.head(3)

Raw shape: (7043, 33)

# Run the project's clean_columns helper on the raw table, then summarise
# the result: shape, a three-row preview, column names and dtypes.
df = clean_columns(raw)
print(f"Cleaned shape: {df.shape}")
display(df.head(3))

print(f"\nColumns (first 40 shown): {len(df.columns)}")
print(sorted(df.columns)[:40])
print(f"\nDtypes (first 20):\n {df.dtypes.head(20)}")

Cleaned shape: (7043, 26)

Columns (first 40 shown): 26
['Churn', 'Contract', 'Contract_norm', 'Dependents', 'DeviceProtection', 'InternetService', 'MonthlyCharges', 'MultipleLines', 'OnlineBackup', 'OnlineSecurity', 'PaperlessBilling', 'Partner', 'PaymentMethod', 'PhoneService', 'SeniorCitizen', 'StreamingMovies', 'StreamingTV', 'TechSupport', 'TotalCharges', 'charges_per_tenure', 'contract_length_months', 'gender', 'has_tech_support', 'is_electronic_check', 'tenure', 'tenure_bucket']

Dtypes (first 20):
 gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

# Report the overall churn rate with a class-count bar chart, then list
# the columns that still contain missing values (up to 20, worst first).
if "Churn" not in df.columns:
    print("No 'Churn' column found after cleaning.")
else:
    churn_rate = df["Churn"].mean()
    print(f"Churn rate: {churn_rate:.3f} ({df['Churn'].sum()} / {len(df)})")
    plt.figure()
    df["Churn"].value_counts().sort_index().plot(kind="bar")
    plt.title("Class counts: Churn=0/1")
    plt.show()

na = df.isna().sum().sort_values(ascending=False)
display(na[na > 0].head(20))

Churn rate: 0.265 (1869 / 7043)

Series([], dtype: int64)

def churn_rate_table(col):
    """Return the mean of ``Churn`` per level of ``col``, highest rate first.

    Reads the notebook-global ``df``. The result is a one-column DataFrame
    named ``churn_rate`` indexed by the levels of ``col``.

    Raises:
        ValueError: If ``df`` has no ``Churn`` column.
    """
    if "Churn" not in df.columns:
        raise ValueError("Churn target not found.")
    per_level = df.groupby(col)["Churn"].mean()
    ranked = per_level.sort_values(ascending=False)
    return ranked.to_frame("churn_rate")

# For every key categorical column that survived cleaning, show its
# churn-rate table and a horizontal bar chart of the same rates.
for col in ["Contract","InternetService","PaymentMethod","TechSupport","OnlineSecurity","PaperlessBilling","MultipleLines"]:
    if col not in df.columns:
        continue
    display(churn_rate_table(col).head(20))
    # Ascending order so the highest-churn level ends up at the top of the chart.
    rates = df.groupby(col)["Churn"].mean().sort_values(ascending=True)
    plt.figure()
    rates.plot(kind="barh")
    plt.title(f"Churn rate by {col}")
    plt.xlabel("Churn rate")
    plt.ylabel(col)
    plt.show()

# Histogram (30 bins, missing values dropped) for each numeric column present.
for col in ["MonthlyCharges","TotalCharges","tenure","charges_per_tenure"]:
    if col not in df.columns:
        continue
    plt.figure()
    df[col].dropna().hist(bins=30)
    plt.title(f"Distribution: {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()

def simple_boxplot(xcol):
    """Draw side-by-side box plots of ``xcol`` for Churn=0 and Churn=1.

    Reads the notebook-global ``df``. Missing values are dropped and
    outlier fliers are hidden. Does nothing if ``xcol`` or ``Churn`` is not
    a column of ``df``.
    """
    if xcol not in df.columns or "Churn" not in df.columns:
        return
    data0 = df.loc[df["Churn"] == 0, xcol].dropna().values
    data1 = df.loc[df["Churn"] == 1, xcol].dropna().values
    plt.figure()
    plt.boxplot([data0, data1], showfliers=False)
    # Label the two boxes (drawn at x=1 and x=2) via xticks rather than
    # boxplot's `labels=` keyword, which is deprecated since Matplotlib 3.9;
    # this form works on both older and newer releases.
    plt.xticks([1, 2], ["Churn=0", "Churn=1"])
    plt.title(f"{xcol} by Churn")
    plt.ylabel(xcol)
    plt.show()

# Box plots by churn outcome for the three core numeric columns.
for numeric_col in ("MonthlyCharges", "TotalCharges", "tenure"):
    simple_boxplot(numeric_col)

/var/folders/x3/03nbm4yx29z8f6kpfx7mqpzr0000gn/T/ipykernel_44101/4209833373.py:10: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([data0, data1], labels=["Churn=0","Churn=1"], showfliers=False)

/var/folders/x3/03nbm4yx29z8f6kpfx7mqpzr0000gn/T/ipykernel_44101/4209833373.py:10: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([data0, data1], labels=["Churn=0","Churn=1"], showfliers=False)

/var/folders/x3/03nbm4yx29z8f6kpfx7mqpzr0000gn/T/ipykernel_44101/4209833373.py:10: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([data0, data1], labels=["Churn=0","Churn=1"], showfliers=False)

# Load the saved pipeline and score the cleaned data.
MODEL_PATH = "./pd_models/model.pkl"
if not os.path.exists(MODEL_PATH):
    # Explicit raise (not assert) so the check survives `python -O`, and the
    # message names the path that was actually checked.
    raise FileNotFoundError(f"Train the model first ({MODEL_PATH} not found).")
# NOTE: joblib.load unpickles arbitrary objects; only load model files you trust.
pipe = joblib.load(MODEL_PATH)

# NOTE(review): predict_proba_df is not given `pipe`; presumably it loads the
# model itself. Confirm it reads the same artifact as MODEL_PATH.
probs = predict_proba_df(df)
print("Predictions:", probs.shape)

if "Churn" in df.columns:
    y = df["Churn"].astype(int).values
    print(f"ROC-AUC: {roc_auc(y, probs):.4f} | PR-AUC: {pr_auc(y, probs):.4f}")

    fpr, tpr, _ = roc_curve(y, probs)
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], '--')  # chance diagonal
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.show()

    p, r, _ = precision_recall_curve(y, probs)
    plt.figure()
    plt.plot(r, p)
    plt.title("PR Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.show()

Predictions: (7043,)
ROC-AUC: 0.8612 | PR-AUC: 0.6786

# Business assumptions used by the profit formula below.
MARGIN, INCENTIVE, OUTREACH = 50.0, 10.0, 1.0  # adjust for your business
if "Churn" in df.columns:
    y = df["Churn"].astype(int).values
    t_opt, profit = best_profit_threshold(y, probs, MARGIN, INCENTIVE, OUTREACH)
    print(f"Optimal threshold ~ {t_opt:.3f}, estimated profit ≈ ${profit:,.0f}")

    # Reuse the PR-curve thresholds as the grid; fall back to 0.5 if it is empty.
    p, r, thr = precision_recall_curve(y, probs)
    thresholds = thr if thr.size else np.array([0.5])

    def profit_at(th):
        """Profit when every customer scored at or above ``th`` is contacted."""
        flagged = (probs >= th).astype(int)
        hits = ((y == 1) & (flagged == 1)).sum()
        false_alarms = ((y == 0) & (flagged == 1)).sum()
        contacted = hits + false_alarms
        return hits * MARGIN - contacted * (INCENTIVE + OUTREACH)

    profs = [profit_at(t) for t in thresholds]
    plt.figure()
    plt.plot(thresholds, profs)
    plt.title("Profit vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("Profit ($)")
    plt.show()

Optimal threshold ~ 0.454, estimated profit ≈ $45,640

import shap

# Split the fitted pipeline into its preprocessing and classifier steps.
pre = pipe.named_steps['pre']
clf = pipe.named_steps['clf']

# Resolve the names of the transformed features.
try:
    feat_names = pre.get_feature_names_out().tolist()
except Exception:
    # Fallback when get_feature_names_out is unavailable. This path assumes
    # the categorical transformer is registered first and the numeric one
    # second, as the index-based lookups below do.
    try:
        ohe = pre.named_transformers_['cat']
        cat_cols = list(pre.transformers_[0][2]) if pre.transformers_ and len(pre.transformers_) > 0 else []
        num_cols = list(pre.transformers_[1][2]) if pre.transformers_ and len(pre.transformers_) > 1 else []
        cat_names = ohe.get_feature_names_out(cat_cols).tolist() if cat_cols else []
        feat_names = cat_names + num_cols
    except Exception:
        feat_names = None

X = df.drop(columns=['Churn'], errors='ignore')
Xt = pre.transform(X)
if hasattr(Xt, "toarray"):
    # Densify sparse transformer output so it can back a regular DataFrame.
    Xt = Xt.toarray()
if not feat_names or len(feat_names) != Xt.shape[1]:
    # Generic names when real ones are unknown or do not match the matrix width.
    feat_names = [f"f{i}" for i in range(Xt.shape[1])]
Xt_df = pd.DataFrame(Xt, columns=feat_names)

# Sample for speed: the global plot uses at most 1500 rows.
sample = Xt_df
if len(Xt_df) > 1500:
    idx = np.random.RandomState(42).choice(len(Xt_df), 1500, replace=False)
    sample = Xt_df.iloc[idx]

# Explainer: tree-specific for XGBoost models, generic otherwise.
if "xgboost" in str(type(clf)).lower():
    explainer = shap.TreeExplainer(clf)
else:
    explainer = shap.Explainer(clf, sample)
shap_values = explainer(sample)

shap.plots.beeswarm(shap_values, max_display=15)

# Local explanations for top 3 highest-risk customers.
# `shap_values` only covers the sampled rows, so indexing it with positions
# from the full dataset is out of bounds (or hits the wrong customer).
# Explain exactly the selected rows instead.
N = 3
all_probs = np.asarray(predict_proba_df(df))
top_idx = np.argsort(-all_probs)[:N]
top_shap_values = explainer(Xt_df.iloc[top_idx])
for pos, i in enumerate(top_idx):
    print(f"Index {i} — predicted churn prob = {all_probs[i]:.3f}")
    explanation = top_shap_values[pos]
    try:
        shap.plots.waterfall(explanation)
    except Exception:
        # Best-effort fallback plot if the waterfall plot rejects this explanation.
        shap.plots.force(explanation, matplotlib=True)

Index 886 — predicted churn prob = 0.960

Index 1582 — predicted churn prob = 0.959

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[14], line 8
      7 try:
----> 8     shap.plots.waterfall(shap_values[i])
      9 except Exception:

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/shap/_explanation.py:417, in Explanation.__getitem__(self, item)
    416     new_self = copy.copy(self)
--> 417 new_self._s = new_self._s.__getitem__(item)
    418 new_self.op_history.append(OpHistoryItem(name="__getitem__", args=(item,), prev_shape=self.shape))

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer.py:112, in Slicer.__getitem__(self, item)
    111 slicer_index = index_slicer[tracked.dim]
--> 112 sliced_o = tracked[slicer_index]
    113 sliced_dim = resolve_dim(index_tup, tracked.dim)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:69, in AtomicSlicer.__getitem__(self, item)
     68 # Slice according to object type.
---> 69 return UnifiedDataHandler.slice(self.o, index_tup, self.max_dim)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:588, in UnifiedDataHandler.slice(cls, o, index_tup, max_dim)
    586 tail_slice = cls.type_map[o_type].tail_slice
--> 588 is_element, sliced_o, cut = head_slice(o, index_tup, max_dim)
    589 out = tail_slice(sliced_o, index_tup[cut:], max_dim - cut, is_element)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:418, in ArrayHandler.head_slice(cls, o, index_tup, max_dim)
    417 is_element = any([True if isinstance(x, int) else False for x in cut_index])
--> 418 sliced_o = o[cut_index]
    420 return is_element, sliced_o, cut

IndexError: index 1582 is out of bounds for axis 0 with size 1500

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
Cell In[14], line 10
      8     shap.plots.waterfall(shap_values[i])
      9 except Exception:
---> 10     shap.plots.force(shap_values[i], matplotlib=True)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/shap/_explanation.py:417, in Explanation.__getitem__(self, item)
    415 if new_self is None:
    416     new_self = copy.copy(self)
--> 417 new_self._s = new_self._s.__getitem__(item)
    418 new_self.op_history.append(OpHistoryItem(name="__getitem__", args=(item,), prev_shape=self.shape))
    420 return new_self

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer.py:112, in Slicer.__getitem__(self, item)
    110 index_slicer = AtomicSlicer(index_tup, max_dim=1)
    111 slicer_index = index_slicer[tracked.dim]
--> 112 sliced_o = tracked[slicer_index]
    113 sliced_dim = resolve_dim(index_tup, tracked.dim)
    115 new_tracked = tracked.__class__(sliced_o, sliced_dim)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:69, in AtomicSlicer.__getitem__(self, item)
     66 index_tup = unify_slice(item, self.max_dim)
     68 # Slice according to object type.
---> 69 return UnifiedDataHandler.slice(self.o, index_tup, self.max_dim)

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:588, in UnifiedDataHandler.slice(cls, o, index_tup, max_dim)
    585 head_slice = cls.type_map[o_type].head_slice
    586 tail_slice = cls.type_map[o_type].tail_slice
--> 588 is_element, sliced_o, cut = head_slice(o, index_tup, max_dim)
    589 out = tail_slice(sliced_o, index_tup[cut:], max_dim - cut, is_element)
    590 return out

File ~/Desktop/telco-churn-mlops/.venv/lib/python3.11/site-packages/slicer/slicer_internal.py:418, in ArrayHandler.head_slice(cls, o, index_tup, max_dim)
    416 cut_index = index_tup[:cut]
    417 is_element = any([True if isinstance(x, int) else False for x in cut_index])
--> 418 sliced_o = o[cut_index]
    420 return is_element, sliced_o, cut

IndexError: index 1582 is out of bounds for axis 0 with size 1500

import numpy as np, pandas as pd, matplotlib.pyplot as plt

# Risk-decile table with cumulative gains and lift charts.
assert "Churn" in df.columns, "Churn target required for decile analysis."
y = df["Churn"].astype(int).values
p = predict_proba_df(df)

# Bin scores into (up to) 10 quantile bins. qcut codes run 0..k-1 from the
# lowest to the highest scores, so code + 1 labels the riskiest bin with the
# largest number (10 when no bin edges are dropped as duplicates).
q = pd.qcut(p, 10, labels=False, duplicates="drop")
n_bins = int(np.nanmax(q)) + 1
dec = pd.DataFrame({"prob": p, "y": y, "decile": q + 1})

# Highest-risk decile first, so the cumulative columns describe
# "contact the riskiest customers first".
tbl = dec.groupby("decile").agg(
    n=("y", "size"),
    positives=("y", "sum"),
    avg_prob=("prob", "mean")
).sort_index(ascending=False)
tbl["cum_positives"] = tbl["positives"].cumsum()
tbl["cum_pct_positives"] = tbl["cum_positives"] / tbl["positives"].sum()
tbl["cum_pct_population"] = tbl["n"].cumsum() / tbl["n"].sum()
# Decile churn rate relative to the overall rate; the epsilon guards
# against division by zero when there are no positives.
tbl["lift"] = (tbl["positives"] / tbl["n"]) / (dec["y"].mean() + 1e-12)
display(tbl)

# Cumulative gains chart
plt.figure()
plt.plot(tbl["cum_pct_population"], tbl["cum_pct_positives"], marker="o")
plt.plot([0, 1], [0, 1], "--")  # random-targeting baseline
plt.title("Cumulative Gains")
plt.xlabel("Cumulative % of population (by risk)")
plt.ylabel("Cumulative % of churners captured")
plt.show()

# Lift chart: plot against the decile labels so the x-axis matches the title.
plt.figure()
plt.plot(tbl.index, tbl["lift"], marker="o")
plt.axhline(1.0, linestyle="--")
plt.title(f"Lift by Decile ({n_bins}=highest risk)")
plt.xlabel(f"Decile rank ({n_bins} highest)")
plt.ylabel("Lift")
plt.show()

import numpy as np, matplotlib.pyplot as plt, pandas as pd

# Heatmap of mean churn by tenure bin (rows) and contract type (columns).
if "tenure" in df.columns and "Contract" in df.columns and "Churn" in df.columns:
    bins = [-1, 6, 12, 24, 48, 72, 999]
    labels = ["<=6","7-12","13-24","25-48","49-72",">72"]
    tbin = pd.cut(df["tenure"], bins=bins, labels=labels)
    # `tbin` is categorical; pass `observed` explicitly to keep the current
    # behavior and avoid the pandas FutureWarning about its changing default.
    piv = pd.pivot_table(
        df.assign(tbin=tbin),
        values="Churn",
        index="tbin",
        columns="Contract",
        aggfunc="mean",
        observed=False,
    )
    mat = piv.values.astype(float)
    plt.figure()
    plt.imshow(mat, aspect="auto")
    plt.xticks(range(mat.shape[1]), piv.columns, rotation=45, ha="right")
    plt.yticks(range(mat.shape[0]), piv.index)
    plt.colorbar(label="Churn rate")
    plt.title("Churn rate by Tenure bin × Contract")
    plt.tight_layout()
    plt.show()
else:
    print("Need columns: tenure, Contract, Churn for heatmap.")

/var/folders/x3/03nbm4yx29z8f6kpfx7mqpzr0000gn/T/ipykernel_44101/2721081917.py:7: FutureWarning: The default value of observed=False is deprecated and will change to observed=True in a future version of pandas. Specify observed=False to silence this warning and retain the current behavior
  piv = pd.pivot_table(df.assign(tbin=tbin), values="Churn", index="tbin", columns="Contract", aggfunc="mean")

# Churn rate as a function of how many add-on services a customer has.
svc_cols = [
    c
    for c in ["OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
    if c in df.columns
]
tmp = df.copy()
for c in svc_cols:
    # 1 when the value starts with "yes" (case-insensitive), else 0.
    is_yes = tmp[c].astype(str).str.lower().str.startswith("yes")
    tmp[c] = is_yes.astype(int)
tmp["addon_count"] = tmp[svc_cols].sum(axis=1)
rates = tmp.groupby("addon_count")["Churn"].mean()

plt.figure()
rates.plot(kind="bar")
plt.title("Churn rate by number of add-on services")
plt.xlabel("Add-on count (security/backup/protection/support/streaming)")
plt.ylabel("Churn rate")
plt.show()

from sklearn.inspection import PartialDependenceDisplay

# Partial dependence of the classifier on selected numeric features,
# computed in the preprocessor's transformed feature space.
X_clean = df.drop(columns=["Churn"], errors="ignore")
pre = pipe.named_steps["pre"]; clf = pipe.named_steps["clf"]
Xt = pre.transform(X_clean)

try:
    feat_names = pre.get_feature_names_out().tolist()
except Exception:
    feat_names = [f"f{i}" for i in range(Xt.shape[1])]

# Numeric original columns expected by pre (second transformer group)
try:
    num_cols = list(pre.transformers_[1][2]) if len(pre.transformers_)>1 else []
except Exception:
    num_cols = []

def find_idx(name):
    """Return the position of original column ``name`` in ``feat_names``.

    Matches the first transformed name ending in ``__<name>`` (which covers
    ``num__<name>``).

    Raises:
        ValueError: If no transformed feature name matches.
    """
    suffix = f"__{name}"
    for i, n in enumerate(feat_names):
        if n.endswith(suffix):
            return i
    raise ValueError(f"Feature '{name}' not found in transformed names.")

to_plot = [c for c in ["tenure","MonthlyCharges"] if c in num_cols]
if to_plot:
    feats = [find_idx(c) for c in to_plot]
    # from_estimator creates its own figure; opening one beforehand only
    # leaves an empty figure behind. Title the display's figure directly
    # and pass the feature names so the axes are labelled.
    disp = PartialDependenceDisplay.from_estimator(
        clf, Xt, features=feats, feature_names=feat_names
    )
    disp.figure_.suptitle("Partial Dependence (numeric features)")
    plt.show()
else:
    print("Numeric features not found among ['tenure','MonthlyCharges'] in preprocessor.")

<Figure size 640x480 with 0 Axes>

import numpy as np
# Rank features by mean absolute SHAP value and draw a dependence
# scatter for the two strongest ones.
vals = np.abs(getattr(shap_values, "values", shap_values)).mean(axis=0)
rank = np.argsort(-vals)
top_idxs = rank[:2]
for feature_pos in top_idxs:
    # Feature names come from the data the SHAP values were computed on.
    shap.plots.scatter(shap_values[:, feature_pos], show=True)

	CustomerID	Count	Country	State	City	Zip Code	Lat Long	Latitude	Longitude	Gender	...	Contract	Paperless Billing	Payment Method	Monthly Charges	Total Charges	Churn Label	Churn Value	Churn Score	CLTV	Churn Reason
0	3668-QPYBK	1	United States	California	Los Angeles	90003	33.964131, -118.272783	33.964131	-118.272783	Male	...	Month-to-month	Yes	Mailed check	53.85	108.15	Yes	1	86	3239	Competitor made better offer
1	9237-HQITU	1	United States	California	Los Angeles	90005	34.059281, -118.30742	34.059281	-118.307420	Female	...	Month-to-month	Yes	Electronic check	70.70	151.65	Yes	1	67	2701	Moved
2	9305-CDSKC	1	United States	California	Los Angeles	90006	34.048013, -118.293953	34.048013	-118.293953	Female	...	Month-to-month	Yes	Electronic check	99.65	820.5	Yes	1	86	5372	Moved

	churn_rate
Contract
Month-to-month	0.427097
One year	0.112695
Two year	0.028319

	churn_rate
InternetService
Fiber optic	0.418928
DSL	0.189591
No	0.074050

	churn_rate
PaymentMethod
Electronic check	0.452854
Mailed check	0.191067
Bank transfer (automatic)	0.167098
Credit card (automatic)	0.152431

	churn_rate
TechSupport
No	0.416355
Yes	0.151663
No internet service	0.074050

Telecom Customer Churn — Deep-Dive Analysis & Model Insights

Objectives

Initial Hypotheses

1) Load data

2) Clean & feature-engineer (same as training)

Target distribution & missing values

3) Churn rate by key categorical features

4) Numeric feature distributions & boxplots

5) Load saved model & evaluate

Profit-oriented threshold & curve

6) SHAP — Global importance & local explanations

7) EXTRA — Risk deciles, Gain & Lift

8) EXTRA — Tenure × Contract heatmap

9) EXTRA — Add-on services count vs churn

10) EXTRA — Partial Dependence (tenure & monthly charges)

11) EXTRA — SHAP dependence (top 2 features)

12) Conclusions

	churn_rate
OnlineSecurity
No	0.417667
Yes	0.146112
No internet service	0.074050

	n	positives	avg_prob	cum_positives	cum_pct_positives	cum_pct_population	lift
decile
9	705	2	0.013185	2	0.001070	0.100099	0.010690
8	704	11	0.045340	13	0.006956	0.200057	0.058880
7	704	28	0.097888	41	0.021937	0.300014	0.149877
6	704	51	0.183675	92	0.049224	0.399972	0.272989
5	705	102	0.296763	194	0.103799	0.500071	0.545205
4	704	167	0.444267	361	0.193151	0.600028	0.893907
3	704	226	0.576396	587	0.314072	0.699986	1.209718
2	704	302	0.705344	889	0.475655	0.799943	1.616526
1	704	434	0.807725	1323	0.707865	0.899901	2.323087
0	705	546	0.896720	1869	1.000000	1.000000	2.918448

	churn_rate
MultipleLines
Yes	0.286099
No	0.250442
No phone service	0.249267