Credit risk pipeline#

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
# Load the CSV file (already loaded and cleaned in a previous step)
credit_risk_data = pd.read_csv("credit_risk_data.csv")
credit_risk_data = credit_risk_data.drop(columns=["ID"])

# Split the data into features (X) and target (y)
X = credit_risk_data.drop(columns=["Estado del Préstamo"])
y = credit_risk_data["Estado del Préstamo"]

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=34)
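
The target is fairly imbalanced (roughly a 3:1 ratio of class 1 to class 0 in the reports below), so a stratified split is a reasonable option to keep the class proportions consistent between the training and test sets. A minimal variant, not used for the results that follow:

# Optional variant: stratify on y so that train/test keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=34, stratify=y
)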

Creating the models:#

Each model is wrapped in a Pipeline() that chains a StandardScaler with the classifier, so the scaler is fitted only on the training data inside each cross-validation fold and the same transformation is applied at prediction time.

models = {
    "Logistic Regression": Pipeline(
        [
            ("scaler", StandardScaler()),
            ("classifier", LogisticRegression(max_iter=1000)),
        ]
    ),
    "SVM": Pipeline(
        [
            ("scaler", StandardScaler()),
            ("classifier", SVC(kernel="rbf", probability=True)),
        ]
    ),
    "Decision Tree": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                DecisionTreeClassifier(
                    max_depth=3, min_samples_split=5, min_samples_leaf=2
                ),
            ),
        ]
    ),
    "Bagging": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                BaggingClassifier(
                    estimator=DecisionTreeClassifier(),  # 'estimator' replaces the deprecated 'base_estimator' (scikit-learn >= 1.2)
                    n_estimators=100,
                    max_samples=0.8,
                    bootstrap=True,
                    n_jobs=-1,
                    random_state=42,
                ),
            ),
        ]
    ),
    "Random Forest": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=100,
                    max_depth=3,
                    max_features="sqrt",
                    random_state=42,
                    n_jobs=-1,
                ),
            ),
        ]
    ),
    "AdaBoost": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                AdaBoostClassifier(
                    estimator=DecisionTreeClassifier(max_depth=1),
                    n_estimators=100,
                    learning_rate=0.1,
                    random_state=42,
                ),
            ),
        ]
    ),
    "Gradient Boosting": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                GradientBoostingClassifier(
                    n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42
                ),
            ),
        ]
    ),
    "XGBoost": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                xgb.XGBClassifier(
                    n_estimators=100,
                    max_depth=3,
                    learning_rate=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42,
                ),
            ),
        ]
    ),
    "Stacking": Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "classifier",
                StackingClassifier(
                    estimators=[
                        ("svc", SVC(probability=True)),
                        (
                            "rf",
                            RandomForestClassifier(n_estimators=100, random_state=42),
                        ),
                        ("dt", DecisionTreeClassifier(random_state=42)),
                        ("log_reg", LogisticRegression(max_iter=1000)),
                    ],
                    final_estimator=LogisticRegression(),
                    cv=5,
                ),
            ),
        ]
    ),
}
# Evaluate each model
accuracies = {}
for name, pipeline in models.items():
    # Cross-validation to obtain the mean accuracy
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="accuracy")
    mean_cv_score = cv_scores.mean()

    # Train the model and predict on the test set
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Compute the accuracy on the test set
    test_accuracy = accuracy_score(y_test, y_pred)

    # Store the metrics
    accuracies[name] = {
        "CV Accuracy": mean_cv_score,
        "Test Accuracy": test_accuracy,
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
        "Classification Report": classification_report(y_test, y_pred),
    }

# Display the results
for model_name, metrics in accuracies.items():
    print(f"Model: {model_name}")
    print(f"Cross-Validation Accuracy: {metrics['CV Accuracy']:.2f}")
    print(f"Test Accuracy: {metrics['Test Accuracy']:.2f}")
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("Classification Report:")
    print(metrics["Classification Report"])
    print("\n" + "-" * 40 + "\n")
Model: Logistic Regression
Cross-Validation Accuracy: 0.88
Test Accuracy: 0.87
Confusion Matrix:
[[ 46  27]
 [ 12 215]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.63      0.70        73
           1       0.89      0.95      0.92       227

    accuracy                           0.87       300
   macro avg       0.84      0.79      0.81       300
weighted avg       0.87      0.87      0.86       300


----------------------------------------

Model: SVM
Cross-Validation Accuracy: 0.94
Test Accuracy: 0.93
Confusion Matrix:
[[ 57  16]
 [  6 221]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.78      0.84        73
           1       0.93      0.97      0.95       227

    accuracy                           0.93       300
   macro avg       0.92      0.88      0.90       300
weighted avg       0.93      0.93      0.92       300


----------------------------------------

Model: Decision Tree
Cross-Validation Accuracy: 0.92
Test Accuracy: 0.92
Confusion Matrix:
[[ 55  18]
 [  7 220]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.75      0.81        73
           1       0.92      0.97      0.95       227

    accuracy                           0.92       300
   macro avg       0.91      0.86      0.88       300
weighted avg       0.92      0.92      0.91       300


----------------------------------------

Model: Bagging
Cross-Validation Accuracy: 0.98
Test Accuracy: 0.98
Confusion Matrix:
[[ 70   3]
 [  3 224]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        73
           1       0.99      0.99      0.99       227

    accuracy                           0.98       300
   macro avg       0.97      0.97      0.97       300
weighted avg       0.98      0.98      0.98       300


----------------------------------------

Model: Random Forest
Cross-Validation Accuracy: 0.88
Test Accuracy: 0.86
Confusion Matrix:
[[ 30  43]
 [  0 227]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.41      0.58        73
           1       0.84      1.00      0.91       227

    accuracy                           0.86       300
   macro avg       0.92      0.71      0.75       300
weighted avg       0.88      0.86      0.83       300


----------------------------------------

Model: AdaBoost
Cross-Validation Accuracy: 0.98
Test Accuracy: 0.97
Confusion Matrix:
[[ 67   6]
 [  2 225]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94        73
           1       0.97      0.99      0.98       227

    accuracy                           0.97       300
   macro avg       0.97      0.95      0.96       300
weighted avg       0.97      0.97      0.97       300


----------------------------------------

Model: Gradient Boosting
Cross-Validation Accuracy: 0.98
Test Accuracy: 0.99
Confusion Matrix:
[[ 71   2]
 [  2 225]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        73
           1       0.99      0.99      0.99       227

    accuracy                           0.99       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.99      0.99      0.99       300


----------------------------------------

Model: XGBoost
Cross-Validation Accuracy: 0.99
Test Accuracy: 0.99
Confusion Matrix:
[[ 71   2]
 [  1 226]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        73
           1       0.99      1.00      0.99       227

    accuracy                           0.99       300
   macro avg       0.99      0.98      0.99       300
weighted avg       0.99      0.99      0.99       300


----------------------------------------

Model: Stacking
Cross-Validation Accuracy: 0.98
Test Accuracy: 0.99
Confusion Matrix:
[[ 71   2]
 [  2 225]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        73
           1       0.99      0.99      0.99       227

    accuracy                           0.99       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.99      0.99      0.99       300


----------------------------------------

Cross-validation:

cross_val_score performs 10-fold cross-validation (cv=10) on the training set, giving the mean accuracy (mean_cv_score) over multiple partitions of the data.

Cross-validation is useful for assessing each model's stability and generalization, ensuring it does not overfit to a single split of the data.
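
As a minimal sketch reusing the models dictionary and training split defined above: for classifiers, cv=10 already uses stratified folds, but an explicit StratifiedKFold also allows shuffling with a fixed random_state and makes the per-fold spread visible.

from sklearn.model_selection import StratifiedKFold, cross_val_score

# Explicit 10-fold stratified splitter: each fold keeps the class ratio,
# which matters because class 1 is far more frequent than class 0.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for name, pipeline in models.items():
    scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring="accuracy")
    # The standard deviation alongside the mean shows how stable the
    # accuracy estimate is across folds.
    print(f"{name}: {scores.mean():.2f} +/- {scores.std():.2f}")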