Example pipeline: companies in reorganization#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
path = "BD empresas en re organización.xlsx"
xls = pd.ExcelFile(path)
df = pd.read_excel(path, sheet_name=xls.sheet_names[0])
df.head()
|   | Razón Social | Margen EBIT | Carga financiera | Margen neto | CxC | CxP | Solvencia | Apalancamiento | En Reorganización |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AACER SAS | 0.071690 | 0.000000 | 0.042876 | 0.104095 | 0.153192 | 1.877078 | 1.642505 | 0 |
| 1 | ABARROTES EL ROMPOY SAS | 0.017816 | 0.000000 | 0.010767 | 0.018414 | 0.000000 | 0.000000 | 0.865044 | 0 |
| 2 | ABASTECIMIENTOS INDUSTRIALES SAS | 0.144646 | 0.054226 | 0.059784 | 0.227215 | 0.025591 | 1.077412 | 1.272299 | 0 |
| 3 | ACME LEON PLASTICOS SAS | 0.004465 | 0.000000 | -0.013995 | 0.073186 | 0.127866 | 0.000000 | 1.391645 | 0 |
| 4 | ADVANCED PRODUCTS COLOMBIA SAS | 0.141829 | 0.050810 | 0.053776 | 0.398755 | 0.147678 | 0.675073 | 2.118774 | 0 |
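Before modeling, it may be worth confirming that the ratios are complete and on sensible scales. A minimal sketch of that check on the loaded `df`:
# Data-quality check: missing values per column and summary statistics for the ratios
print(df.isna().sum())
print(df.describe().T[["mean", "std", "min", "max"]].round(3))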
# Absolute count
conteo_clases = df['En Reorganización'].value_counts()
# Percentage
porcentaje_clases = df['En Reorganización'].value_counts(normalize=True) * 100
# Show count and percentage
print("Number of companies per class:")
print(conteo_clases)
print("\nPercentage of companies per class:")
print(porcentaje_clases.round(2))
Number of companies per class:
En Reorganización
1    342
0    287
Name: count, dtype: int64

Percentage of companies per class:
En Reorganización
1    54.37
0    45.63
Name: proportion, dtype: float64
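The classes are roughly balanced (about 54% vs. 46%), so accuracy remains a reasonable headline metric. A quick bar chart of the distribution, sketched with the matplotlib.ticker import from above:
# Class distribution as percentages; porcentaje_clases is already on a 0-100 scale
ax = porcentaje_clases.sort_index().plot(kind="bar", color=["steelblue", "indianred"])
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel("En Reorganización")
ax.set_ylabel("Share of companies")
plt.tight_layout()
plt.show()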
Variable selection and data preparation:#
# ------------------------
# Variable selection
# ------------------------
variables_seleccionadas = ['Margen EBIT',
                           'Carga financiera',
                           'Margen neto',
                           'CxC',
                           'CxP',
                           'Solvencia',
                           'Apalancamiento']
# Target variable
target = 'En Reorganización'
# ------------------------
# Prepare the data
# ------------------------
X = df[variables_seleccionadas]
y = df[target]
# Scaling is done inside each Pipeline below, so cross-validation fits the
# scaler on training folds only and avoids data leakage:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# Split into training (70%) and test (30%) sets, stratified by class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=35, stratify=y)
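A quick sanity check, using the split above, that `stratify=y` preserved the class proportions in both partitions:
# Class shares in train and test; both should mirror the overall 54/46 split
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))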
Pipeline:#
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.ensemble import (
BaggingClassifier,
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
StackingClassifier,
)
import xgboost as xgb
models = {
"Logistic Regression": Pipeline([
("scaler", StandardScaler()),
("classifier", LogisticRegression(max_iter=1000)),
]),
"SVM": Pipeline([
("scaler", StandardScaler()),
("classifier", SVC(kernel="rbf", probability=True)),
]),
"Decision Tree": Pipeline([
("scaler", StandardScaler()),
("classifier", DecisionTreeClassifier(
max_depth=3, min_samples_split=5, min_samples_leaf=2
)),
]),
    # -------------------- Bagging settings --------------------
"Bagging": Pipeline([
("scaler", StandardScaler()),
("classifier", BaggingClassifier(
estimator=DecisionTreeClassifier(
max_depth=5,
min_samples_split=10,
min_samples_leaf=5,
random_state=42
),
n_estimators=50,
max_samples=0.6,
max_features=0.8,
bootstrap=True,
random_state=42,
)),
]),
"Random Forest": Pipeline([
("scaler", StandardScaler()),
("classifier", RandomForestClassifier(
n_estimators=100,
max_depth=3,
max_features="sqrt",
random_state=42,
)),
]),
"AdaBoost": Pipeline([
("scaler", StandardScaler()),
("classifier", AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=100,
learning_rate=0.1,
random_state=42,
)),
]),
    # -------------------- Gradient Boosting settings --------------------
"Gradient Boosting": Pipeline([
("scaler", StandardScaler()),
("classifier", GradientBoostingClassifier(
            n_estimators=200,     # more stages, each with a smaller contribution
            learning_rate=0.05,   # lower learning rate
            max_depth=3,          # keep the trees shallow
            subsample=0.8,        # internal bagging to reduce variance
            min_samples_leaf=5,   # avoid leaves with very few samples
            random_state=42
)),
]),
    # -------------------- XGBoost settings --------------------
"XGBoost": Pipeline([
("scaler", StandardScaler()),
("classifier", xgb.XGBClassifier(
            n_estimators=200,      # more rounds, each with a smaller contribution
            learning_rate=0.05,    # lower learning rate
            max_depth=3,           # limit tree complexity
            subsample=0.8,         # random row subsampling
            colsample_bytree=0.8,  # random column subsampling
            reg_alpha=0.1,         # L1 regularization
            reg_lambda=1.0,        # L2 regularization
            random_state=42,
            eval_metric="auc"
)),
]),
    # -------------------- Stacking settings --------------------
"Stacking": Pipeline([
("scaler", StandardScaler()),
("classifier", StackingClassifier(
estimators=[
("svc", SVC(kernel="linear", C=0.5, probability=True)),
("rf", RandomForestClassifier(
n_estimators=50,
max_depth=3,
max_features="sqrt",
random_state=42
)),
("dt", DecisionTreeClassifier(
max_depth=3,
min_samples_leaf=5,
random_state=42
)),
("log_reg", LogisticRegression(
penalty='l2',
C=0.1,
max_iter=1000,
solver='lbfgs'
)),
],
final_estimator=LogisticRegression(
penalty='l2',
C=0.05,
max_iter=1000,
solver='lbfgs'
),
cv=8,
)),
]),
}
# Evaluate each model
accuracies = {}
for name, pipeline in models.items():
    # Cross-validation for the mean accuracy
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="accuracy")
    mean_cv_score = cv_scores.mean()
    # Fit the model and predict on the test set
    pipeline.fit(X_train, y_train)
    y_pred_train = pipeline.predict(X_train)
    y_pred = pipeline.predict(X_test)
    # Accuracy on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    # Store the metrics
    accuracies[name] = {
        "CV Accuracy": mean_cv_score,
        "Test Accuracy": test_accuracy,
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
        "Classification Report - test": classification_report(y_test, y_pred),
        "Classification Report - train": classification_report(y_train, y_pred_train),
    }
# Display the results
for model_name, metrics in accuracies.items():
    print(f"Model: {model_name}")
    print(f"Cross-Validation Accuracy: {metrics['CV Accuracy']:.2f}")
    print(f"Test Accuracy: {metrics['Test Accuracy']:.2f}")
    print("Confusion Matrix:")
    print(metrics["Confusion Matrix"])
    print("Classification Report - train:")
    print(metrics["Classification Report - train"])
    print("Classification Report - test:")
    print(metrics["Classification Report - test"])
    print("\n" + "-" * 40 + "\n")
Model: Logistic Regression
Cross-Validation Accuracy: 0.72
Test Accuracy: 0.69
Confusion Matrix:
[[62 24]
 [35 68]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.69      0.76      0.72       201
           1       0.78      0.72      0.75       239

    accuracy                           0.73       440
   macro avg       0.73      0.74      0.73       440
weighted avg       0.74      0.73      0.73       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.64      0.72      0.68        86
           1       0.74      0.66      0.70       103

    accuracy                           0.69       189
   macro avg       0.69      0.69      0.69       189
weighted avg       0.69      0.69      0.69       189
----------------------------------------
Model: SVM
Cross-Validation Accuracy: 0.78
Test Accuracy: 0.77
Confusion Matrix:
[[71 15]
 [29 74]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.72      0.87      0.79       201
           1       0.87      0.72      0.79       239

    accuracy                           0.79       440
   macro avg       0.80      0.80      0.79       440
weighted avg       0.80      0.79      0.79       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.71      0.83      0.76        86
           1       0.83      0.72      0.77       103

    accuracy                           0.77       189
   macro avg       0.77      0.77      0.77       189
weighted avg       0.78      0.77      0.77       189
----------------------------------------
Model: Decision Tree
Cross-Validation Accuracy: 0.79
Test Accuracy: 0.78
Confusion Matrix:
[[83  3]
 [39 64]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.70      0.99      0.82       201
           1       0.98      0.64      0.78       239

    accuracy                           0.80       440
   macro avg       0.84      0.81      0.80       440
weighted avg       0.85      0.80      0.80       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.68      0.97      0.80        86
           1       0.96      0.62      0.75       103

    accuracy                           0.78       189
   macro avg       0.82      0.79      0.78       189
weighted avg       0.83      0.78      0.77       189
----------------------------------------
Model: Bagging
Cross-Validation Accuracy: 0.81
Test Accuracy: 0.80
Confusion Matrix:
[[75 11]
 [26 77]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.78      0.96      0.86       201
           1       0.95      0.77      0.85       239

    accuracy                           0.86       440
   macro avg       0.87      0.86      0.86       440
weighted avg       0.87      0.86      0.86       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        86
           1       0.88      0.75      0.81       103

    accuracy                           0.80       189
   macro avg       0.81      0.81      0.80       189
weighted avg       0.81      0.80      0.80       189
----------------------------------------
Model: Random Forest
Cross-Validation Accuracy: 0.80
Test Accuracy: 0.80
Confusion Matrix:
[[78  8]
 [30 73]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.74      0.96      0.83       201
           1       0.96      0.71      0.82       239

    accuracy                           0.82       440
   macro avg       0.85      0.84      0.82       440
weighted avg       0.86      0.82      0.82       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.72      0.91      0.80        86
           1       0.90      0.71      0.79       103

    accuracy                           0.80       189
   macro avg       0.81      0.81      0.80       189
weighted avg       0.82      0.80      0.80       189
----------------------------------------
Model: AdaBoost
Cross-Validation Accuracy: 0.80
Test Accuracy: 0.80
Confusion Matrix:
[[83  3]
 [35 68]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.73      0.99      0.84       201
           1       0.99      0.69      0.81       239

    accuracy                           0.82       440
   macro avg       0.86      0.84      0.82       440
weighted avg       0.87      0.82      0.82       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        86
           1       0.96      0.66      0.78       103

    accuracy                           0.80       189
   macro avg       0.83      0.81      0.80       189
weighted avg       0.84      0.80      0.80       189
----------------------------------------
Model: Gradient Boosting
Cross-Validation Accuracy: 0.82
Test Accuracy: 0.84
Confusion Matrix:
[[74 12]
 [18 85]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       201
           1       0.99      0.96      0.97       239

    accuracy                           0.97       440
   macro avg       0.97      0.97      0.97       440
weighted avg       0.97      0.97      0.97       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83        86
           1       0.88      0.83      0.85       103

    accuracy                           0.84       189
   macro avg       0.84      0.84      0.84       189
weighted avg       0.84      0.84      0.84       189
----------------------------------------
Model: XGBoost
Cross-Validation Accuracy: 0.83
Test Accuracy: 0.85
Confusion Matrix:
[[76 10]
 [19 84]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.88      0.98      0.92       201
           1       0.98      0.89      0.93       239

    accuracy                           0.93       440
   macro avg       0.93      0.93      0.93       440
weighted avg       0.93      0.93      0.93       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        86
           1       0.89      0.82      0.85       103

    accuracy                           0.85       189
   macro avg       0.85      0.85      0.85       189
weighted avg       0.85      0.85      0.85       189
----------------------------------------
Model: Stacking
Cross-Validation Accuracy: 0.79
Test Accuracy: 0.79
Confusion Matrix:
[[80  6]
 [33 70]]
Classification Report - train:
              precision    recall  f1-score   support

           0       0.72      0.95      0.82       201
           1       0.94      0.69      0.80       239

    accuracy                           0.81       440
   macro avg       0.83      0.82      0.81       440
weighted avg       0.84      0.81      0.81       440

Classification Report - test:
              precision    recall  f1-score   support

           0       0.71      0.93      0.80        86
           1       0.92      0.68      0.78       103

    accuracy                           0.79       189
   macro avg       0.81      0.80      0.79       189
weighted avg       0.82      0.79      0.79       189
----------------------------------------
| Model | Train Accuracy | Test Accuracy | Gap | Overfitting? |
|---|---|---|---|---|
| Logistic Regression | 0.73 | 0.69 | 0.04 | No |
| SVM | 0.79 | 0.77 | 0.02 | No |
| Decision Tree | 0.80 | 0.78 | 0.02 | No |
| Bagging | 0.86 | 0.80 | 0.06 | Yes |
| Random Forest | 0.82 | 0.80 | 0.02 | No |
| AdaBoost | 0.82 | 0.80 | 0.02 | No |
| Gradient Boosting | 0.97 | 0.84 | 0.13 | Yes |
| XGBoost | 0.93 | 0.85 | 0.08 | Yes |
| Stacking | 0.81 | 0.79 | 0.02 | No |
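The table above can also be rebuilt programmatically, avoiding hand-copied numbers. A minimal sketch, assuming the `models` and `accuracies` objects from the evaluation loop (each pipeline is already fitted there):
# Rebuild the train/test comparison table from the fitted pipelines
rows = []
for name, pipe in models.items():
    train_acc = accuracy_score(y_train, pipe.predict(X_train))
    test_acc = accuracies[name]["Test Accuracy"]
    rows.append({"Model": name,
                 "Train Accuracy": round(train_acc, 2),
                 "Test Accuracy": round(test_acc, 2),
                 "Gap": round(train_acc - test_acc, 2)})
print(pd.DataFrame(rows).to_string(index=False))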
Extract the best model:#
XGBoost = models["XGBoost"]
XGBoost.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()), ('classifier', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='auc', feature_types=None, feature_weights=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=200, n_jobs=None, num_parallel_tree=None, ...))])
y_pred_train = XGBoost.predict(X_train)
y_prob_train = XGBoost.predict_proba(X_train)[:, 1]
y_pred = XGBoost.predict(X_test)
y_prob = XGBoost.predict_proba(X_test)[:, 1]
# ------------------------
# Model evaluation
# ------------------------
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_df_train = pd.DataFrame(cm_train, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
plt.figure(figsize=(5.2, 4.2))
# Plot the labeled DataFrame (not the raw array) so the axes show the class labels
sns.heatmap(cm_df_train, annot=True, fmt="d", cbar=True, linewidths=.5, cmap="coolwarm")
plt.title("Confusion matrix - train")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.tight_layout()
plt.show()
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
plt.figure(figsize=(5.2, 4.2))
sns.heatmap(cm_df, annot=True, fmt="d", cbar=True, linewidths=.5, cmap="coolwarm")
plt.title("Confusion matrix - test")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.tight_layout()
plt.show()
[Figure: Confusion matrix - train]
[Figure: Confusion matrix - test]
print("\n=== Classification Report - train ===")
print(classification_report(y_train, y_pred_train))
print("\n=== Classification Report - test ===")
print(classification_report(y_test, y_pred))
=== Classification Report - train ===
              precision    recall  f1-score   support

           0       0.88      0.98      0.92       201
           1       0.98      0.89      0.93       239

    accuracy                           0.93       440
   macro avg       0.93      0.93      0.93       440
weighted avg       0.93      0.93      0.93       440

=== Classification Report - test ===
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        86
           1       0.89      0.82      0.85       103

    accuracy                           0.85       189
   macro avg       0.85      0.85      0.85       189
weighted avg       0.85      0.85      0.85       189
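As a cross-check, the class-1 test metrics follow directly from the confusion matrix above; a minimal sketch of the arithmetic:
# Derive class-1 precision and recall from the test confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"Precision (class 1): {tp / (tp + fp):.2f}")  # 84 / (84 + 10) = 0.89
print(f"Recall (class 1): {tp / (tp + fn):.2f}")     # 84 / (84 + 19) = 0.82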
# ============================
# ROC AUC Score
# ============================
auc_train = roc_auc_score(y_train, y_prob_train)
auc_test = roc_auc_score(y_test, y_prob)
print(f"ROC AUC - Train: {auc_train:.3f}")
print(f"ROC AUC - Test : {auc_test:.3f}")
# ============================
# ROC curve (train and test)
# ============================
fpr_train, tpr_train, _ = roc_curve(y_train, y_prob_train)
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.2f})', color='blue')
plt.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.2f})', color='orange')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Train and Test")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()
ROC AUC - Train: 0.991
ROC AUC - Test : 0.910
[Figure: ROC Curve - Train and Test]
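The gap between the train AUC (0.991) and test AUC (0.910) is consistent with the mild overfitting flagged in the comparison table. As a final step, the fitted pipeline can be inspected to see which ratios drive the predictions; a minimal sketch using XGBoost's built-in feature_importances_:
# Rank the financial ratios by XGBoost's built-in importance scores
# (assumes the XGBoost pipeline fitted above)
importances = pd.Series(
    XGBoost.named_steps["classifier"].feature_importances_,
    index=variables_seleccionadas,
).sort_values(ascending=False)
print(importances.round(3))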