# Train, cross-validate and persist an XGBoost churn classifier.
#
# Steps: load the Telco churn CSV, clean it, engineer a few features,
# one-hot encode the categorical columns, keep the 15 most important
# features, evaluate a MinMaxScaler + SMOTE + XGBoost pipeline with 5-fold
# stratified cross-validation, then refit on all rows and save the fitted
# pipeline together with the selected feature names.

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

sns.set(style='whitegrid')

# Load the dataset.
telco = pd.read_csv('Telco-Customer-Churn.csv')
pd.set_option('display.max_columns', None)

# Make TotalCharges numeric (unparseable entries become NaN), then drop
# every row that has a missing value.
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')
telco = telco.dropna()

# Drop the first column (the customer ID). Take an explicit copy so the
# column assignments below write to df2 itself rather than to a slice of
# telco.
df2 = telco.iloc[:, 1:].copy()

# Convert the binary columns to True/False.
bool_map = {'Yes': True, 'No': False}
binary_columns = ['Churn', 'Partner', 'Dependents', 'PhoneService',
                  'PaperlessBilling']
for col in binary_columns:
    df2[col] = df2[col].map(bool_map)
# Assign the result back instead of calling replace(..., inplace=True) on
# the selected column: the in-place call operates on a temporary object and
# is not guaranteed to update df2 (it never does under Copy-on-Write).
df2['SeniorCitizen'] = df2['SeniorCitizen'].map({1: True, 0: False})
df2['gender'] = df2['gender'].map({'Female': True, 'Male': False})

# --- Engineered features ---------------------------------------------------
# Average charge per month; a tenure of 0 is treated as 1 to avoid dividing
# by zero.
df2['avg_charge_per_month'] = (
    df2['TotalCharges'] / df2['tenure'].replace(0, 1)
)

# Ratio of the total charge to (monthly charge * tenure); falls back to 1
# when either factor is not positive. Vectorized equivalent of a row-wise
# apply.
expected_total = df2['MonthlyCharges'] * df2['tenure']
has_billing = (df2['MonthlyCharges'] > 0) & (df2['tenure'] > 0)
df2['charge_ratio'] = np.where(
    has_billing, df2['TotalCharges'] / expected_total, 1.0
)

# Tenure category. The open upper edge keeps the bin edges strictly
# increasing even if the longest tenure is <= 24, and include_lowest puts a
# tenure of exactly 0 into the first bin instead of leaving it unbinned.
df2['tenure_bin'] = pd.cut(
    df2['tenure'],
    bins=[0, 12, 24, np.inf],
    labels=['0-12', '12-24', '24+'],
    include_lowest=True,
)

# One-hot encoding of the multi-valued categorical columns.
multi_cat_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod', 'tenure_bin',
]
df_dummies = pd.get_dummies(df2, columns=multi_cat_cols, drop_first=False)

# Long-term contract flag: one-year or two-year contract. `.get` falls back
# to False when a dummy column is absent from the data.
df_dummies['is_long_term_contract'] = (
    df_dummies.get('Contract_One year', False)
    | df_dummies.get('Contract_Two year', False)
)

# Target and predictors.
y = df_dummies['Churn'].values
X = df_dummies.drop(columns=['Churn'])

# Fit a throwaway XGBoost model to rank features and keep the top 15.
# NOTE(review): the ranking is computed on the full dataset, i.e. on rows
# that later serve as validation folds, so the cross-validated scores below
# are likely somewhat optimistic. Moving the selection into the pipeline
# would remove that leakage but would change what the saved artifacts
# expect as input, so it is left as is here.
temp_model = XGBClassifier(random_state=42)
temp_model.fit(X, y)
feature_importances = pd.Series(
    temp_model.feature_importances_, index=X.columns
).sort_values(ascending=False)
top_15_features = feature_importances.head(15).index.tolist()
X_selected = X[top_15_features]

# Pipeline: scaling + SMOTE oversampling + XGBoost. Using the imblearn
# Pipeline so that SMOTE is applied only when fitting.
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("smote", SMOTE(sampling_strategy=1.0, random_state=42)),
    ("xgb", XGBClassifier(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        subsample=1.0,
        colsample_bytree=0.7,
        scale_pos_weight=1,
        eval_metric='logloss',
        random_state=42,
    )),
])

# 5-fold stratified cross-validation. The out-of-fold probabilities are
# computed once; the hard labels are derived with the same `> 0.5` rule the
# binary XGBClassifier uses in predict(), which avoids fitting every fold a
# second time.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba = cross_val_predict(
    pipe, X_selected, y, cv=cv, method="predict_proba"
)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

# Performance metrics. Accuracy and ROC-AUC do not depend on the averaging
# mode, so they are computed once and reported in both sections.
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_proba)

print("XGBoost Cross-Validation Sonuçları (Binary):")
print("--------------------------------------------")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision_score(y, y_pred):.4f}")
print(f"Recall: {recall_score(y, y_pred):.4f}")
print(f"F1 Score: {f1_score(y, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("")
print("XGBoost Cross-Validation Sonuçları (Macro):")
print("--------------------------------------------")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision_score(y, y_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y, y_pred, average='macro'):.4f}")
print(f"F1 Score: {f1_score(y, y_pred, average='macro'):.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Train on the full dataset and save the fitted pipeline plus the feature
# list it expects.
pipe.fit(X_selected, y)
joblib.dump(pipe, 'churn_model.pkl')
joblib.dump(top_15_features, "model_features.pkl")