Example pipeline: companies in reorganization#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
path = "BD empresas en re organización.xlsx"
xls = pd.ExcelFile(path)
df = pd.read_excel(path, sheet_name=xls.sheet_names[0])
df.head()
|   | Razón Social | Margen EBIT | Carga financiera | Margen neto | CxC | CxP | Solvencia | Apalancamiento | En Reorganización |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AACER SAS | 0.071690 | 0.000000 | 0.042876 | 0.104095 | 0.153192 | 1.877078 | 1.642505 | 0 |
| 1 | ABARROTES EL ROMPOY SAS | 0.017816 | 0.000000 | 0.010767 | 0.018414 | 0.000000 | 0.000000 | 0.865044 | 0 |
| 2 | ABASTECIMIENTOS INDUSTRIALES SAS | 0.144646 | 0.054226 | 0.059784 | 0.227215 | 0.025591 | 1.077412 | 1.272299 | 0 |
| 3 | ACME LEON PLASTICOS SAS | 0.004465 | 0.000000 | -0.013995 | 0.073186 | 0.127866 | 0.000000 | 1.391645 | 0 |
| 4 | ADVANCED PRODUCTS COLOMBIA SAS | 0.141829 | 0.050810 | 0.053776 | 0.398755 | 0.147678 | 0.675073 | 2.118774 | 0 |
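Before modeling, it may be worth confirming that the ratios are complete and on sensible scales. A minimal sketch of that check on the loaded `df`:
# Data-quality check: missing values per column and summary statistics for the ratios
print(df.isna().sum())
print(df.describe().T[["mean", "std", "min", "max"]].round(3))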
# Absolute count
conteo_clases = df['En Reorganización'].value_counts()
# Percentage
porcentaje_clases = df['En Reorganización'].value_counts(normalize=True) * 100
# Show count and percentage
print("Number of companies per class:")
print(conteo_clases)
print("\nPercentage of companies per class:")
print(porcentaje_clases.round(2))
Number of companies per class:
En Reorganización
1    342
0    287
Name: count, dtype: int64

Percentage of companies per class:
En Reorganización
1    54.37
0    45.63
Name: proportion, dtype: float64
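The classes are roughly balanced (about 54% vs. 46%), so accuracy remains a reasonable headline metric. A quick bar chart of the distribution, sketched with the matplotlib.ticker import from above:
# Class distribution as percentages; porcentaje_clases is already on a 0-100 scale
ax = porcentaje_clases.sort_index().plot(kind="bar", color=["steelblue", "indianred"])
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel("En Reorganización")
ax.set_ylabel("Share of companies")
plt.tight_layout()
plt.show()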
Variable selection and data preparation:#
# ------------------------
# Variable selection
# ------------------------
variables_seleccionadas = ['Margen EBIT',
                           'Carga financiera',
                           'Margen neto',
                           'CxC',
                           'CxP',
                           'Solvencia',
                           'Apalancamiento']
# Target variable
target = 'En Reorganización'
# ------------------------
# Prepare the data
# ------------------------
X = df[variables_seleccionadas]
y = df[target]
# Scaling is done inside each Pipeline below, so cross-validation fits the
# scaler on training folds only and avoids data leakage:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# Split into training (70%) and test (30%) sets, stratified by class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=35, stratify=y)
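A quick sanity check, using the split above, that `stratify=y` preserved the class proportions in both partitions:
# Class shares in train and test; both should mirror the overall 54/46 split
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))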
Pipeline:#
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.ensemble import (
BaggingClassifier,
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
StackingClassifier,
)
import xgboost as xgb
models = {
"Logistic Regression": Pipeline([
("scaler", StandardScaler()),
("classifier", LogisticRegression(max_iter=1000)),
]),
"SVM": Pipeline([
("scaler", StandardScaler()),
("classifier", SVC(kernel="rbf", probability=True)),
]),
"Decision Tree": Pipeline([
("scaler", StandardScaler()),
("classifier", DecisionTreeClassifier(
max_depth=3, min_samples_split=5, min_samples_leaf=2
)),
]),
    # -------------------- Bagging settings --------------------
"Bagging": Pipeline([
("scaler", StandardScaler()),
("classifier", BaggingClassifier(
estimator=DecisionTreeClassifier(
max_depth=5,
min_samples_split=10,
min_samples_leaf=5,
random_state=42
),
n_estimators=50,
max_samples=0.6,
max_features=0.8,
bootstrap=True,
random_state=42,
)),
]),
"Random Forest": Pipeline([
("scaler", StandardScaler()),
("classifier", RandomForestClassifier(
n_estimators=100,
max_depth=3,
max_features="sqrt",
random_state=42,
)),
]),
"AdaBoost": Pipeline([
("scaler", StandardScaler()),
("classifier", AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=100,
learning_rate=0.1,
random_state=42,
)),
]),
    # -------------------- Gradient Boosting settings --------------------
"Gradient Boosting": Pipeline([
("scaler", StandardScaler()),
("classifier", GradientBoostingClassifier(
            n_estimators=200,     # more stages, each with a smaller contribution
            learning_rate=0.05,   # lower learning rate
            max_depth=3,          # keep the trees shallow
            subsample=0.8,        # internal bagging to reduce variance
            min_samples_leaf=5,   # avoid leaves with very few samples
            random_state=42
)),
]),
    # -------------------- XGBoost settings --------------------
"XGBoost": Pipeline([
("scaler", StandardScaler()),
("classifier", xgb.XGBClassifier(
            n_estimators=200,      # more rounds, each with a smaller contribution
            learning_rate=0.05,    # lower learning rate
            max_depth=3,           # limit tree complexity
            subsample=0.8,         # random row subsampling
            colsample_bytree=0.8,  # random column subsampling
            reg_alpha=0.1,         # L1 regularization
            reg_lambda=1.0,        # L2 regularization
            random_state=42,
            eval_metric="auc"
)),
]),
    # -------------------- Stacking settings --------------------
"Stacking": Pipeline([
("scaler", StandardScaler()),
("classifier", StackingClassifier(
estimators=[
("svc", SVC(kernel="linear", C=0.5, probability=True)),
("rf", RandomForestClassifier(
n_estimators=50,
max_depth=3,
max_features="sqrt",
random_state=42
)),
("dt", DecisionTreeClassifier(
max_depth=3,
min_samples_leaf=5,
random_state=42
)),
("log_reg", LogisticRegression(
penalty='l2',
C=0.1,
max_iter=1000,
solver='lbfgs'
)),
],
final_estimator=LogisticRegression(
penalty='l2',
C=0.05,
max_iter=1000,
solver='lbfgs'
),
cv=8,
)),
]),
}
# Evaluate each model
accuracies = {}
for name, pipeline in models.items():
    # Cross-validation for the mean accuracy
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="accuracy")
    mean_cv_score = cv_scores.mean()
    # Fit the model and predict on the test set
    pipeline.fit(X_train, y_train)
    y_pred_train = pipeline.predict(X_train)
    y_pred = pipeline.predict(X_test)
    # Accuracy on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    # Store the metrics
    accuracies[name] = {
        "CV Accuracy": mean_cv_score,
        "Test Accuracy": test_accuracy,
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
        "Classification Report - test": classification_report(y_test, y_pred),
        "Classification Report - train": classification_report(y_train, y_pred_train),
    }
# Display the results
for model_name, metrics in accuracies.items():
    print(f"Model: {model_name}")
    print(f"Cross-Validation Accuracy: {metrics['CV Accuracy']:.2f}")
    print(f"Test Accuracy: {metrics['Test Accuracy']:.2f}")
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("Classification Report - train:")
    print(metrics["Classification Report - train"])
    print("Classification Report - test:")
    print(metrics["Classification Report - test"])
    print("\n" + "-" * 40 + "\n")
Model: Logistic Regression
Cross-Validation Accuracy: 0.72
Test Accuracy: 0.69
Confusion Matrix:
[[62 24]
 [35 68]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.69      0.76      0.72       201
           1       0.78      0.72      0.75       239

    accuracy                           0.73       440
   macro avg       0.73      0.74      0.73       440
weighted avg       0.74      0.73      0.73       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.64      0.72      0.68        86
           1       0.74      0.66      0.70       103

    accuracy                           0.69       189
   macro avg       0.69      0.69      0.69       189
weighted avg       0.69      0.69      0.69       189
----------------------------------------
Model: SVM
Cross-Validation Accuracy: 0.78
Test Accuracy: 0.77
Confusion Matrix:
[[71 15]
 [29 74]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.72      0.87      0.79       201
           1       0.87      0.72      0.79       239

    accuracy                           0.79       440
   macro avg       0.80      0.80      0.79       440
weighted avg       0.80      0.79      0.79       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.71      0.83      0.76        86
           1       0.83      0.72      0.77       103

    accuracy                           0.77       189
   macro avg       0.77      0.77      0.77       189
weighted avg       0.78      0.77      0.77       189
----------------------------------------
Model: Decision Tree
Cross-Validation Accuracy: 0.79
Test Accuracy: 0.78
Confusion Matrix:
[[83  3]
 [39 64]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.70      0.99      0.82       201
           1       0.98      0.64      0.78       239

    accuracy                           0.80       440
   macro avg       0.84      0.81      0.80       440
weighted avg       0.85      0.80      0.80       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.68      0.97      0.80        86
           1       0.96      0.62      0.75       103

    accuracy                           0.78       189
   macro avg       0.82      0.79      0.78       189
weighted avg       0.83      0.78      0.77       189
----------------------------------------
Model: Bagging
Cross-Validation Accuracy: 0.81
Test Accuracy: 0.80
Confusion Matrix:
[[75 11]
 [26 77]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.78      0.96      0.86       201
           1       0.95      0.77      0.85       239

    accuracy                           0.86       440
   macro avg       0.87      0.86      0.86       440
weighted avg       0.87      0.86      0.86       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        86
           1       0.88      0.75      0.81       103

    accuracy                           0.80       189
   macro avg       0.81      0.81      0.80       189
weighted avg       0.81      0.80      0.80       189
----------------------------------------
Model: Random Forest
Cross-Validation Accuracy: 0.80
Test Accuracy: 0.80
Confusion Matrix:
[[78  8]
 [30 73]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.74      0.96      0.83       201
           1       0.96      0.71      0.82       239

    accuracy                           0.82       440
   macro avg       0.85      0.84      0.82       440
weighted avg       0.86      0.82      0.82       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.72      0.91      0.80        86
           1       0.90      0.71      0.79       103

    accuracy                           0.80       189
   macro avg       0.81      0.81      0.80       189
weighted avg       0.82      0.80      0.80       189
----------------------------------------
Model: AdaBoost
Cross-Validation Accuracy: 0.80
Test Accuracy: 0.80
Confusion Matrix:
[[83  3]
 [35 68]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.73      0.99      0.84       201
           1       0.99      0.69      0.81       239

    accuracy                           0.82       440
   macro avg       0.86      0.84      0.82       440
weighted avg       0.87      0.82      0.82       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        86
           1       0.96      0.66      0.78       103

    accuracy                           0.80       189
   macro avg       0.83      0.81      0.80       189
weighted avg       0.84      0.80      0.80       189
----------------------------------------
Model: Gradient Boosting
Cross-Validation Accuracy: 0.82
Test Accuracy: 0.84
Confusion Matrix:
[[74 12]
 [18 85]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       201
           1       0.99      0.96      0.97       239

    accuracy                           0.97       440
   macro avg       0.97      0.97      0.97       440
weighted avg       0.97      0.97      0.97       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83        86
           1       0.88      0.83      0.85       103

    accuracy                           0.84       189
   macro avg       0.84      0.84      0.84       189
weighted avg       0.84      0.84      0.84       189
----------------------------------------
Model: XGBoost
Cross-Validation Accuracy: 0.83
Test Accuracy: 0.85
Confusion Matrix:
[[76 10]
 [19 84]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.88      0.98      0.92       201
           1       0.98      0.89      0.93       239

    accuracy                           0.93       440
   macro avg       0.93      0.93      0.93       440
weighted avg       0.93      0.93      0.93       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        86
           1       0.89      0.82      0.85       103

    accuracy                           0.85       189
   macro avg       0.85      0.85      0.85       189
weighted avg       0.85      0.85      0.85       189
----------------------------------------
Model: Stacking
Cross-Validation Accuracy: 0.79
Test Accuracy: 0.79
Confusion Matrix:
[[80  6]
 [33 70]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.72      0.95      0.82       201
           1       0.94      0.69      0.80       239

    accuracy                           0.81       440
   macro avg       0.83      0.82      0.81       440
weighted avg       0.84      0.81      0.81       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.71      0.93      0.80        86
           1       0.92      0.68      0.78       103

    accuracy                           0.79       189
   macro avg       0.81      0.80      0.79       189
weighted avg       0.82      0.79      0.79       189
----------------------------------------
| Model | Train Accuracy | Test Accuracy | Gap | Overfitting? |
|---|---|---|---|---|
| Logistic Regression | 0.73 | 0.69 | 0.04 | No |
| SVM | 0.79 | 0.77 | 0.02 | No |
| Decision Tree | 0.80 | 0.78 | 0.02 | No |
| Bagging | 0.86 | 0.80 | 0.06 | Yes |
| Random Forest | 0.82 | 0.80 | 0.02 | No |
| AdaBoost | 0.82 | 0.80 | 0.02 | No |
| Gradient Boosting | 0.97 | 0.84 | 0.13 | Yes |
| XGBoost | 0.93 | 0.85 | 0.08 | Yes |
| Stacking | 0.81 | 0.79 | 0.02 | No |
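The table above can also be rebuilt programmatically, avoiding hand-copied numbers. A minimal sketch, assuming the `models` and `accuracies` objects from the evaluation loop (each pipeline is already fitted there):
# Rebuild the train/test comparison table from the fitted pipelines
rows = []
for name, pipe in models.items():
    train_acc = accuracy_score(y_train, pipe.predict(X_train))
    test_acc = accuracies[name]["Test Accuracy"]
    rows.append({"Model": name,
                 "Train Accuracy": round(train_acc, 2),
                 "Test Accuracy": round(test_acc, 2),
                 "Gap": round(train_acc - test_acc, 2)})
print(pd.DataFrame(rows).to_string(index=False))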
Extract the best model:#
XGBoost = models["XGBoost"]
XGBoost.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()), ('classifier', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='auc', feature_types=None, feature_weights=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=200, n_jobs=None, num_parallel_tree=None, ...))])
y_pred_train = XGBoost.predict(X_train)
y_prob_train = XGBoost.predict_proba(X_train)[:, 1]
y_pred = XGBoost.predict(X_test)
y_prob = XGBoost.predict_proba(X_test)[:, 1]
# ------------------------
# Model evaluation
# ------------------------
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_df_train = pd.DataFrame(cm_train, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
plt.figure(figsize=(5.2, 4.2))
# Plot the labeled DataFrame (not the raw array) so the axes show the class labels
sns.heatmap(cm_df_train, annot=True, fmt="d", cbar=True, linewidths=.5, cmap="coolwarm")
plt.title("Confusion matrix - train")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.tight_layout()
plt.show()
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
plt.figure(figsize=(5.2, 4.2))
sns.heatmap(cm_df, annot=True, fmt="d", cbar=True, linewidths=.5, cmap="coolwarm")
plt.title("Confusion matrix - test")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.tight_layout()
plt.show()
[Figure: Confusion matrix - train]
[Figure: Confusion matrix - test]
print("\n=== Classification Report - train ===")
print(classification_report(y_train, y_pred_train))
print("\n=== Classification Report - test ===")
print(classification_report(y_test, y_pred))
=== Classification Report - train ===
              precision    recall  f1-score   support

           0       0.88      0.98      0.92       201
           1       0.98      0.89      0.93       239

    accuracy                           0.93       440
   macro avg       0.93      0.93      0.93       440
weighted avg       0.93      0.93      0.93       440

=== Classification Report - test ===
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        86
           1       0.89      0.82      0.85       103

    accuracy                           0.85       189
   macro avg       0.85      0.85      0.85       189
weighted avg       0.85      0.85      0.85       189
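As a cross-check, the class-1 test metrics follow directly from the confusion matrix above; a minimal sketch of the arithmetic:
# Derive class-1 precision and recall from the test confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"Precision (class 1): {tp / (tp + fp):.2f}")  # 84 / (84 + 10) = 0.89
print(f"Recall (class 1): {tp / (tp + fn):.2f}")     # 84 / (84 + 19) = 0.82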
# ============================
# ROC AUC Score
# ============================
auc_train = roc_auc_score(y_train, y_prob_train)
auc_test = roc_auc_score(y_test, y_prob)
print(f"ROC AUC - Train: {auc_train:.3f}")
print(f"ROC AUC - Test : {auc_test:.3f}")
# ============================
# ROC curve (train and test)
# ============================
fpr_train, tpr_train, _ = roc_curve(y_train, y_prob_train)
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.2f})', color='blue')
plt.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.2f})', color='orange')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Train and Test")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()
ROC AUC - Train: 0.991
ROC AUC - Test : 0.910
[Figure: ROC Curve - Train and Test]
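The gap between the train AUC (0.991) and test AUC (0.910) is consistent with the mild overfitting flagged in the comparison table. As a final step, the fitted pipeline can be inspected to see which ratios drive the predictions; a minimal sketch using XGBoost's built-in feature_importances_:
# Rank the financial ratios by XGBoost's built-in importance scores
# (assumes the XGBoost pipeline fitted above)
importances = pd.Series(
    XGBoost.named_steps["classifier"].feature_importances_,
    index=variables_seleccionadas,
).sort_values(ascending=False)
print(importances.round(3))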