# Import essential libraries
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('Telco_customer_churn.csv')

# Inspect the first few rows
df.head()

# Check data info and missing values
df.info()

# Show summary statistics for numerical columns
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 non-null   object 
 17  Online Security    7043 non-null   object 
 18  Online Backup      7043 non-null   object 
 19  Device Protection  7043 non-null   object 
 20  Tech Support       7043 non-null   object 
 21  Streaming TV       7043 non-null   object 
 22  Streaming Movies   7043 non-null   object 
 23  Contract           7043 non-null   object 
 24  Paperless Billing  7043 non-null   object 
 25  Payment Method     7043 non-null   object 
 26  Monthly Charges    7043 non-null   float64
 27  Total Charges      7043 non-null   object 
 28  Churn Label        7043 non-null   object 
 29  Churn Value        7043 non-null   int64  
 30  Churn Score        7043 non-null   int64  
 31  CLTV               7043 non-null   int64  
 32  Churn Reason       1869 non-null   object 
dtypes: float64(3), int64(6), object(24)
memory usage: 1.8+ MB

# Check for missing values
df.isnull().sum()

CustomerID              0
Count                   0
Country                 0
State                   0
City                    0
Zip Code                0
Lat Long                0
Latitude                0
Longitude               0
Gender                  0
Senior Citizen          0
Partner                 0
Dependents              0
Tenure Months           0
Phone Service           0
Multiple Lines          0
Internet Service        0
Online Security         0
Online Backup           0
Device Protection       0
Tech Support            0
Streaming TV            0
Streaming Movies        0
Contract                0
Paperless Billing       0
Payment Method          0
Monthly Charges         0
Total Charges           0
Churn Label             0
Churn Value             0
Churn Score             0
CLTV                    0
Churn Reason         5174
dtype: int64

# Review column data types
df.dtypes

CustomerID            object
Count                  int64
Country               object
State                 object
City                  object
Zip Code               int64
Lat Long              object
Latitude             float64
Longitude            float64
Gender                object
Senior Citizen        object
Partner               object
Dependents            object
Tenure Months          int64
Phone Service         object
Multiple Lines        object
Internet Service      object
Online Security       object
Online Backup         object
Device Protection     object
Tech Support          object
Streaming TV          object
Streaming Movies      object
Contract              object
Paperless Billing     object
Payment Method        object
Monthly Charges      float64
Total Charges         object
Churn Label           object
Churn Value            int64
Churn Score            int64
CLTV                   int64
Churn Reason          object
dtype: object

# Convert 'Total Charges' to numeric, if not already
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

# Identify categorical columns based on your list
cat_columns = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents',
    'Phone Service', 'Multiple Lines', 'Internet Service',
    'Online Security', 'Online Backup', 'Device Protection',
    'Tech Support', 'Streaming TV', 'Streaming Movies',
    'Contract', 'Paperless Billing', 'Payment Method',
    'Churn Label'
]

for col in cat_columns:
    df[col] = df[col].astype('category')

# Check missing 'Churn Reason'
df['Churn Reason'].isnull().sum()

np.int64(5174)

# Example: Fill missing 'Churn Reason' with 'Not Churned'
df['Churn Reason'] = df['Churn Reason'].fillna('Not Churned')

# Confirm the changes
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   CustomerID         7043 non-null   object  
 1   Count              7043 non-null   int64   
 2   Country            7043 non-null   object  
 3   State              7043 non-null   object  
 4   City               7043 non-null   object  
 5   Zip Code           7043 non-null   int64   
 6   Lat Long           7043 non-null   object  
 7   Latitude           7043 non-null   float64 
 8   Longitude          7043 non-null   float64 
 9   Gender             7043 non-null   category
 10  Senior Citizen     7043 non-null   category
 11  Partner            7043 non-null   category
 12  Dependents         7043 non-null   category
 13  Tenure Months      7043 non-null   int64   
 14  Phone Service      7043 non-null   category
 15  Multiple Lines     7043 non-null   category
 16  Internet Service   7043 non-null   category
 17  Online Security    7043 non-null   category
 18  Online Backup      7043 non-null   category
 19  Device Protection  7043 non-null   category
 20  Tech Support       7043 non-null   category
 21  Streaming TV       7043 non-null   category
 22  Streaming Movies   7043 non-null   category
 23  Contract           7043 non-null   category
 24  Paperless Billing  7043 non-null   category
 25  Payment Method     7043 non-null   category
 26  Monthly Charges    7043 non-null   float64 
 27  Total Charges      7032 non-null   float64 
 28  Churn Label        7043 non-null   category
 29  Churn Value        7043 non-null   int64   
 30  Churn Score        7043 non-null   int64   
 31  CLTV               7043 non-null   int64   
 32  Churn Reason       7043 non-null   object  
dtypes: category(17), float64(4), int64(6), object(6)
memory usage: 999.6+ KB

CustomerID            0
Count                 0
Country               0
State                 0
City                  0
Zip Code              0
Lat Long              0
Latitude              0
Longitude             0
Gender                0
Senior Citizen        0
Partner               0
Dependents            0
Tenure Months         0
Phone Service         0
Multiple Lines        0
Internet Service      0
Online Security       0
Online Backup         0
Device Protection     0
Tech Support          0
Streaming TV          0
Streaming Movies      0
Contract              0
Paperless Billing     0
Payment Method        0
Monthly Charges       0
Total Charges        11
Churn Label           0
Churn Value           0
Churn Score           0
CLTV                  0
Churn Reason          0
dtype: int64

import matplotlib.pyplot as plt
import seaborn as sns

# Overall churn distribution
sns.countplot(x='Churn Label', data=df, palette='Set2')
plt.title('Customer Churn Distribution')
plt.xlabel('Churn')
plt.ylabel('Count')
plt.show()

# Optionally, show churn rate numerically
churn_rate = df['Churn Label'].value_counts(normalize=True).rename_axis('Churn').reset_index(name='Proportion')
print(churn_rate)

C:\Users\johna\AppData\Local\Temp\ipykernel_9104\3277478164.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Churn Label', data=df, palette='Set2')

  Churn  Proportion
0    No     0.73463
1   Yes     0.26537

plt.figure(figsize=(8,5))
sns.countplot(x='Contract', hue='Churn Label', data=df, palette='Set1')
plt.title('Churn by Contract Type')
plt.xlabel('Contract Type')
plt.ylabel('Count')
plt.legend(title='Churn')
plt.show()

# Optionally, show churn rate numerically
churn_rate = df['Churn Label'].value_counts(normalize=True).rename_axis('Churn').reset_index(name='Proportion')
print(churn_rate)

  Churn  Proportion
0    No     0.73463
1   Yes     0.26537

plt.figure(figsize=(10,5))
sns.countplot(y='Payment Method', hue='Churn Label', data=df, palette='Set3')
plt.title('Churn by Payment Method')
plt.xlabel('Count')
plt.ylabel('Payment Method')
plt.legend(title='Churn')
plt.show()

plt.figure(figsize=(6,4))
sns.boxplot(x='Churn Label', y='Monthly Charges', data=df)
plt.title('Monthly Charges by Churn')
plt.xlabel('Churn')
plt.ylabel('Monthly Charges')
plt.show()

plt.figure(figsize=(6,4))
sns.histplot(data=df, x='Tenure Months', hue='Churn Label', bins=30, kde=True, element='step', stat='density')
plt.title('Tenure Distribution by Churn')
plt.xlabel('Tenure (Months)')
plt.ylabel('Density')
plt.show()

plt.figure(figsize=(5,3))
sns.countplot(x='Senior Citizen', hue='Churn Label', data=df, palette='Accent')
plt.title('Churn by Senior Citizen Status')
plt.xlabel('Senior Citizen')
plt.ylabel('Count')
plt.legend(title='Churn')
plt.show()

# Numeric
senior_churn = df.groupby('Senior Citizen', observed=True)['Churn Value'].mean() * 100

print(senior_churn)

Senior Citizen
No     23.606168
Yes    41.681261
Name: Churn Value, dtype: float64

service_features = [
    'Online Security', 'Tech Support', 'Streaming TV', 
    'Streaming Movies', 'Device Protection', 'Online Backup', 'Multiple Lines'
]

for feature in service_features:
    plt.figure(figsize=(6,3))
    sns.countplot(x=feature, hue='Churn Label', data=df, palette='coolwarm')
    plt.title(f'Churn by {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Churn')
    plt.tight_layout()
    plt.show()

# Top 10 cities by number of customers
top_cities = df['City'].value_counts().head(10).index
plt.figure(figsize=(10,5))
sns.countplot(y='City', hue='Churn Label', data=df[df['City'].isin(top_cities)], palette='Set2', order=top_cities)
plt.title('Churn by City (California)')
plt.xlabel('Count')
plt.ylabel('City')
plt.legend(title='Churn')
plt.tight_layout()
plt.show()

# Function to plot stacked bar chart for any categorical feature
def stacked_barplot(feature):
    cross = pd.crosstab(df[feature], df['Churn Label'])
    cross_norm = cross.div(cross.sum(axis=1), axis=0)
    cross_norm.plot(kind='bar', stacked=True, figsize=(7,4), colormap='tab20c')
    plt.title(f'Stacked Barplot of {feature} by Churn')
    plt.xlabel(feature)
    plt.ylabel('Proportion')
    plt.legend(title='Churn', loc='upper right')
    plt.tight_layout()
    plt.show()

# Example usage
for col in ['Internet Service', 'Streaming TV', 'Payment Method']:
    stacked_barplot(col)

plt.figure(figsize=(10,7))
numeric_cols = ['Tenure Months', 'Monthly Charges', 'Total Charges', 'Churn Value', 'Churn Score', 'CLTV']
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap (Numeric Features)')
plt.show()

sns.pairplot(df[numeric_cols + ['Churn Label']], hue='Churn Label', corner=True)
plt.suptitle('Pairplot of Key Numeric Features by Churn', y=1.02)
plt.show()

plt.figure(figsize=(8,5))
sns.scatterplot(
    data=df, 
    x='Tenure Months', 
    y='Monthly Charges', 
    hue='Churn Label', 
    alpha=0.5, 
    palette='coolwarm'
)
plt.title('Monthly Charges vs. Tenure Months by Churn')
plt.xlabel('Tenure (Months)')
plt.ylabel('Monthly Charges')
plt.legend(title='Churn')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10,7))
sns.scatterplot(
    x='Longitude', y='Latitude', 
    data=df, hue='Churn Label', 
    alpha=0.5, palette='coolwarm'
)
plt.title('Customer Locations by Churn - California state')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Churn')
plt.show()

drop_cols = [
    'CustomerID', 'Country', 'State', 'City', 'Zip Code', 'Lat Long',
    'Latitude', 'Longitude', 'Churn Label', 'Churn Reason', 'Count'
]

X = df.drop(columns=drop_cols + ['Churn Value'])
y = df['Churn Value']

print(X.columns)  # See your remaining features before next step

Index(['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months',
       'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charges', 'Total Charges', 'Churn Score',
       'CLTV'],
      dtype='object')

# Confirm the exact names of your numeric columns
numeric_features = ['Tenure Months', 'Monthly Charges', 'Total Charges']
numeric_features = [col for col in numeric_features if col in X.columns]

categorical_features = [col for col in X.columns if col not in numeric_features]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['Tenure Months', 'Monthly Charges', 'Total Charges']
Categorical features: ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method', 'Churn Score', 'CLTV']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_numeric = pd.DataFrame(
    scaler.fit_transform(X[numeric_features]),
    columns=numeric_features,
    index=X.index
)

X_categorical = pd.get_dummies(X[categorical_features], drop_first=True)

import pandas as pd

X_prepared = pd.concat([X_numeric, X_categorical], axis=1)

print(X_prepared.shape)
print(X_prepared.columns[:10])  # Show first 10 columns to check

(7043, 32)
Index(['Tenure Months', 'Monthly Charges', 'Total Charges', 'Churn Score',
       'CLTV', 'Gender_Male', 'Senior Citizen_Yes', 'Partner_Yes',
       'Dependents_Yes', 'Phone Service_Yes'],
      dtype='object')

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y
)

import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Train XGBoost Classifier
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_clf.fit(X_train, y_train)

# Predict on test set
y_pred = xgb_clf.predict(X_test)
y_proba = xgb_clf.predict_proba(X_test)[:, 1]

# Classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ROC AUC
auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", round(auc, 3))

# ROC Curve
RocCurveDisplay.from_estimator(xgb_clf, X_test, y_test)
plt.title('XGBoost ROC Curve')
plt.show()

C:\Users\johna\AppData\Local\Programs\Python\Python313\Lib\site-packages\xgboost\training.py:183: UserWarning: [00:22:54] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1035
           1       0.87      0.89      0.88       374

    accuracy                           0.93      1409
   macro avg       0.91      0.92      0.92      1409
weighted avg       0.93      0.93      0.93      1409

Confusion Matrix:
[[984  51]
 [ 43 331]]
ROC-AUC Score: 0.983

importances = xgb_clf.feature_importances_
features = X_train.columns

fi_df = pd.DataFrame({'feature': features, 'importance': importances}).sort_values('importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='importance', y='feature', data=fi_df.head(15), palette='Blues_r')
plt.title('Top 15 Feature Importances (XGBoost)')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

C:\Users\johna\AppData\Local\Temp\ipykernel_9104\160547992.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='importance', y='feature', data=fi_df.head(15), palette='Blues_r')

import shap

# Explain model predictions using SHAP
explainer = shap.TreeExplainer(xgb_clf)
shap_values = explainer.shap_values(X_test)

# Global feature importance summary plot
shap.summary_plot(shap_values, X_test, plot_type='bar')

# Detailed beeswarm plot (shows how each feature impacts churn predictions for all samples)
shap.summary_plot(shap_values, X_test)

	CustomerID	Count	Country	State	City	Zip Code	Lat Long	Latitude	Longitude	Gender	...	Contract	Paperless Billing	Payment Method	Monthly Charges	Total Charges	Churn Label	Churn Value	Churn Score	CLTV	Churn Reason
0	3668-QPYBK	1	United States	California	Los Angeles	90003	33.964131, -118.272783	33.964131	-118.272783	Male	...	Month-to-month	Yes	Mailed check	53.85	108.15	Yes	1	86	3239	Competitor made better offer
1	9237-HQITU	1	United States	California	Los Angeles	90005	34.059281, -118.30742	34.059281	-118.307420	Female	...	Month-to-month	Yes	Electronic check	70.70	151.65	Yes	1	67	2701	Moved
2	9305-CDSKC	1	United States	California	Los Angeles	90006	34.048013, -118.293953	34.048013	-118.293953	Female	...	Month-to-month	Yes	Electronic check	99.65	820.5	Yes	1	86	5372	Moved
3	7892-POOKP	1	United States	California	Los Angeles	90010	34.062125, -118.315709	34.062125	-118.315709	Female	...	Month-to-month	Yes	Electronic check	104.80	3046.05	Yes	1	84	5003	Moved
4	0280-XJGEX	1	United States	California	Los Angeles	90015	34.039224, -118.266293	34.039224	-118.266293	Male	...	Month-to-month	Yes	Bank transfer (automatic)	103.70	5036.3	Yes	1	89	5340	Competitor had better devices

	Count	Zip Code	Latitude	Longitude	Tenure Months	Monthly Charges	Churn Value	Churn Score	CLTV
count	7043.0	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000
mean	1.0	93521.964646	36.282441	-119.798880	32.371149	64.761692	0.265370	58.699418	4400.295755
std	0.0	1865.794555	2.455723	2.157889	24.559481	30.090047	0.441561	21.525131	1183.057152
min	1.0	90001.000000	32.555828	-124.301372	0.000000	18.250000	0.000000	5.000000	2003.000000
25%	1.0	92102.000000	34.030915	-121.815412	9.000000	35.500000	0.000000	40.000000	3469.000000
50%	1.0	93552.000000	36.391777	-119.730885	29.000000	70.350000	0.000000	61.000000	4527.000000
75%	1.0	95351.000000	38.224869	-118.043237	55.000000	89.850000	1.000000	75.000000	5380.500000
max	1.0	96161.000000	41.962127	-114.192901	72.000000	118.750000	1.000000	100.000000	6500.000000

Telco Customer Churn Analysis¶

Data Loading and Preparation¶

Data Cleaning and Preparation¶

Exploratory Data Analysis (EDA)¶

Correlation Heatmap Insights¶

Pairplot: Churn Patterns in Numeric Features¶

Summary of EDA Findings¶

Feature Engineering and Data Preparation¶

Predictive Modeling with XGBoost¶

XGBoost Feature Importance¶

Model Evaluation and Insights¶

1. Classification Report & Key Metrics¶

2. Confusion Matrix¶

3. ROC-AUC Score¶

4. Feature Importance Analysis¶

5. Recommendations¶

6. Next Steps¶

Model Interpretability with SHAP¶

SHAP Analysis: Explaining Model Predictions¶

Summary of Model Interpretability and Insights¶

Conclusion and Business Recommendations¶

Recommended Actions¶

Next Steps¶