Optimización de Hiperparámetros - Empresas en Reorganización#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras import optimizers
from keras.models import load_model
# Read the workbook once and load its first sheet into a DataFrame.
path = "BD empresas en re organización.xlsx"
xls = pd.ExcelFile(path)
primera_hoja = xls.sheet_names[0]
df = pd.read_excel(path, sheet_name=primera_hoja)
df.head()
| Razón Social | Margen EBIT | Carga financiera | Margen neto | CxC | CxP | Solvencia | Apalancamiento | En Reorganización | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AACER SAS | 0.071690 | 0.000000 | 0.042876 | 0.104095 | 0.153192 | 1.877078 | 1.642505 | 0 |
| 1 | ABARROTES EL ROMPOY SAS | 0.017816 | 0.000000 | 0.010767 | 0.018414 | 0.000000 | 0.000000 | 0.865044 | 0 |
| 2 | ABASTECIMIENTOS INDUSTRIALES SAS | 0.144646 | 0.054226 | 0.059784 | 0.227215 | 0.025591 | 1.077412 | 1.272299 | 0 |
| 3 | ACME LEON PLASTICOS SAS | 0.004465 | 0.000000 | -0.013995 | 0.073186 | 0.127866 | 0.000000 | 1.391645 | 0 |
| 4 | ADVANCED PRODUCTS COLOMBIA SAS | 0.141829 | 0.050810 | 0.053776 | 0.398755 | 0.147678 | 0.675073 | 2.118774 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Razón Social 629 non-null object
1 Margen EBIT 629 non-null float64
2 Carga financiera 629 non-null float64
3 Margen neto 629 non-null float64
4 CxC 629 non-null float64
5 CxP 629 non-null float64
6 Solvencia 629 non-null float64
7 Apalancamiento 629 non-null float64
8 En Reorganización 629 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 44.4+ KB
# ------------------------
# Selección de variables
# ------------------------
# Financial-ratio columns used as model inputs.
variables_seleccionadas = [
    'Margen EBIT',
    'Carga financiera',
    'Margen neto',
    'CxC',
    'CxP',
    'Solvencia',
    'Apalancamiento',
]
# Variable objetivo (binary label column).
target = 'En Reorganización'
# ------------------------
# Preparar datos
# ------------------------
X = df[variables_seleccionadas]
y = df[target]
# Zero-mean / unit-variance scaling of the predictors.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Stratified 70/30 train-test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=35, stratify=y
)
Optimización de Hiperparámetros#
# Random search: train `cantidad_modelos` networks, each with a randomly
# sampled hyperparameter configuration, and report train/test metrics so
# the analyst can pick the best one.
cantidad_modelos = 10
for i in range(cantidad_modelos):
    # Sample one configuration per model.
    units = np.random.choice([5, 8, 10, 12, 15, 18, 20, 22, 24], 1).item()
    n_hidden = np.random.choice([1, 2], 1)[0]
    activation = np.random.choice(['relu', 'tanh', 'selu', 'elu'], 1)[0]
    learning_rate = np.random.choice([0.001, 0.01, 0.1], 1)[0]
    batch_size = np.random.choice([16, 32, 64], 1)[0]
    # NOTE: renamed from `optimizer` so the sampled name is not shadowed
    # by the optimizer object created below.
    optimizer_name = np.random.choice(['Adam', 'RMSprop'], 1)[0]
    print(f'Modelo: {i+1}, Units: {units}, Hidden: {n_hidden}, Activation: {activation}, Learning Rate: {learning_rate}, Optimizer: {optimizer_name}, Batch Size: {batch_size}')
    epochs = 200
    # Definir el modelo
    best_model = Sequential()
    best_model.add(Input(shape=(X.shape[1],)))
    # Loop para las capas ocultas:
    for _ in range(n_hidden):
        best_model.add(Dense(units, activation=activation))
        best_model.add(Dropout(0.2))
    # Capa de salida:
    # BUG FIX: the output layer must use a sigmoid so the model emits
    # probabilities in [0, 1]. The original `Dense(1)` produced raw linear
    # outputs, which is inconsistent with binary_crossentropy (default
    # from_logits=False) and with the 0.5 decision threshold below.
    best_model.add(Dense(1, activation='sigmoid'))
    # Optimizador:
    if optimizer_name == 'Adam':
        optimizer = optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer = optimizers.RMSprop(learning_rate=learning_rate)
    # Compilar el modelo:
    best_model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer=optimizer)
    # Entrenar el modelo:
    history = best_model.fit(X_train, y_train, epochs=epochs,
                             validation_data=(X_test, y_test),
                             batch_size=batch_size,
                             verbose=0)
    # Evaluar el modelo con accuracy:
    y_prob_train = best_model.predict(X_train)
    y_prob = best_model.predict(X_test)
    # Umbral de decisión de 0.5 sobre las probabilidades:
    y_pred_train = np.where(y_prob_train > 0.5, 1, 0)
    y_pred = np.where(y_prob > 0.5, 1, 0)
    accuracy_train = accuracy_score(y_train, y_pred_train.flatten())
    accuracy_test = accuracy_score(y_test, y_pred.flatten())
    recall_train = recall_score(y_train, y_pred_train.flatten())
    recall_test = recall_score(y_test, y_pred.flatten())
    precision_train = precision_score(y_train, y_pred_train.flatten())
    precision_test = precision_score(y_test, y_pred.flatten())
    print(f'Accuracy train: {accuracy_train}, Accuracy test: {accuracy_test}')
    print(f'Recall train: {recall_train}, Recall test: {recall_test}')
    print(f'Precision train: {precision_train}, Precision test: {precision_test}')
    # Graficar Loss train y Loss test:
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
    # Guardar el modelo:
    best_model.save(f"best_model_{i+1}.keras")
Modelo: 1, Units: 5, Hidden: 1, Activation: selu, Learning Rate: 0.001, Optimizer: RMSprop, Batch Size: 16
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy train: 0.7477272727272727, Accuracy test: 0.708994708994709
Recall train: 0.7447698744769874, Recall test: 0.7087378640776699
Precision train: 0.7807017543859649, Precision test: 0.7448979591836735
Modelo: 2, Units: 20, Hidden: 2, Activation: elu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 32
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy train: 0.75, Accuracy test: 0.7195767195767195
Recall train: 0.7405857740585774, Recall test: 0.7378640776699029
Precision train: 0.7866666666666666, Precision test: 0.7450980392156863
Modelo: 3, Units: 12, Hidden: 2, Activation: relu, Learning Rate: 0.01, Optimizer: Adam, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Accuracy train: 0.8340909090909091, Accuracy test: 0.7777777777777778
Recall train: 0.698744769874477, Recall test: 0.6699029126213593
Precision train: 0.9940476190476191, Precision test: 0.8961038961038961
Modelo: 4, Units: 8, Hidden: 1, Activation: relu, Learning Rate: 0.1, Optimizer: RMSprop, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Accuracy train: 0.8386363636363636, Accuracy test: 0.798941798941799
Recall train: 0.7364016736401674, Recall test: 0.7572815533980582
Precision train: 0.9565217391304348, Precision test: 0.8571428571428571
Modelo: 5, Units: 8, Hidden: 2, Activation: relu, Learning Rate: 0.1, Optimizer: Adam, Batch Size: 32
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy train: 0.7454545454545455, Accuracy test: 0.7619047619047619
Recall train: 0.5564853556485355, Recall test: 0.5922330097087378
Precision train: 0.9568345323741008, Precision test: 0.953125
Modelo: 6, Units: 18, Hidden: 1, Activation: relu, Learning Rate: 0.01, Optimizer: Adam, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Accuracy train: 0.825, Accuracy test: 0.7724867724867724
Recall train: 0.7238493723849372, Recall test: 0.7087378640776699
Precision train: 0.9402173913043478, Precision test: 0.8488372093023255
Modelo: 7, Units: 18, Hidden: 2, Activation: selu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Accuracy train: 0.7090909090909091, Accuracy test: 0.7037037037037037
Recall train: 0.7740585774058577, Recall test: 0.8155339805825242
Precision train: 0.7142857142857143, Precision test: 0.6942148760330579
Modelo: 8, Units: 12, Hidden: 2, Activation: elu, Learning Rate: 0.01, Optimizer: RMSprop, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy train: 0.8113636363636364, Accuracy test: 0.7671957671957672
Recall train: 0.7196652719665272, Recall test: 0.7087378640776699
Precision train: 0.9148936170212766, Precision test: 0.8390804597701149
Modelo: 9, Units: 15, Hidden: 1, Activation: tanh, Learning Rate: 0.001, Optimizer: RMSprop, Batch Size: 64
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Accuracy train: 0.7295454545454545, Accuracy test: 0.6878306878306878
Recall train: 0.7824267782426778, Recall test: 0.7475728155339806
Precision train: 0.7362204724409449, Precision test: 0.7
Modelo: 10, Units: 10, Hidden: 2, Activation: selu, Learning Rate: 0.1, Optimizer: Adam, Batch Size: 16
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Accuracy train: 0.7, Accuracy test: 0.7142857142857143
Recall train: 0.7615062761506276, Recall test: 0.8349514563106796
Precision train: 0.708171206225681, Precision test: 0.6991869918699187
Mejor modelo#
Modelo: 2, Units: 20, Hidden: 2, Activation: elu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 32
Se selecciona el Modelo 2; sin embargo, esta elección no es única, ya que depende del criterio del analista y de la métrica de desempeño que se desee priorizar según el contexto del problema.
# Reload the selected model (Model 2) from disk.
model = load_model("best_model_2.keras")
# Probabilidades:
y_prob_train = model.predict(X_train)
y_prob = model.predict(X_test)
# Definición de las clases con umbral (0.5):
y_pred_train = np.where(y_prob_train > 0.5, 1, 0)
y_pred = np.where(y_prob > 0.5, 1, 0)
# ------------------------
# Evaluación del modelo
# ------------------------
# =========================================================
# 1. Matrices de confusión
# =========================================================
etiquetas_reales = ["Real: No Reorg.", "Real: Reorg."]
etiquetas_pred = ["Pred: No Reorg.", "Pred: Reorg."]
cm_df_train = pd.DataFrame(
    confusion_matrix(y_train, y_pred_train, labels=[0, 1]),
    index=etiquetas_reales,
    columns=etiquetas_pred,
)
cm_df_test = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=etiquetas_reales,
    columns=etiquetas_pred,
)
# =========================================================
# 2. Estilo visual
# =========================================================
# BUG FIX: the original read `mpl.colormaps["viridis"]`, but `mpl`
# (matplotlib) is never imported, so this line raised NameError.
# Use the equivalent pyplot accessor, already imported as `plt`.
cmap = plt.get_cmap("viridis")
BG_FIG = "#f7f7f7"    # figure background
BG_AX = "#ffffff"     # axes background
GRID_COL = "#d9d9d9"  # cell-border / spine colour
TEXT_COL = "#1f1f1f"  # primary text colour
SUB_COL = "#4d4d4d"   # secondary (axis-label) text colour
TITLE_FS = 20
SUBTITLE_FS = 12
LABEL_FS = 12
TICK_FS = 11
ANNOT_FS = 16
sns.set_theme(style="white")
# =========================================================
# 3. Figura con dos paneles
# =========================================================
fig, axes = plt.subplots(1, 2, figsize=(12, 5.5), facecolor=BG_FIG)
fig.suptitle(
    "Matrices de confusión",
    fontsize=TITLE_FS,
    fontweight="bold",
    color=TEXT_COL,
    y=0.98
)
# =========================================================
# 4. Función para dibujar cada heatmap
# =========================================================
def plot_conf_matrix(ax, cm_df, title):
    """Render one confusion-matrix DataFrame as an annotated heatmap on *ax*,
    applying the shared colour/font styling defined at module level."""
    ax.set_facecolor(BG_AX)
    heatmap_opts = dict(
        annot=True,
        fmt="d",
        cmap=cmap,
        cbar=True,
        linewidths=0.8,
        linecolor=GRID_COL,
        square=True,
        annot_kws=dict(fontsize=ANNOT_FS, fontweight="bold", color=TEXT_COL),
        cbar_kws=dict(shrink=0.85),
    )
    hm = sns.heatmap(cm_df, ax=ax, **heatmap_opts)
    ax.set_title(title, fontsize=15, fontweight="bold", color=TEXT_COL, pad=10)
    ax.set_xlabel("Clase predicha", fontsize=LABEL_FS, fontweight="bold", color=SUB_COL)
    ax.set_ylabel("Clase real", fontsize=LABEL_FS, fontweight="bold", color=SUB_COL)
    for eje in ('x', 'y'):
        ax.tick_params(axis=eje, labelsize=TICK_FS, colors=TEXT_COL, rotation=0)
    for lbl in ax.get_xticklabels() + ax.get_yticklabels():
        lbl.set_fontweight("bold")
    # Estilo del colorbar: bold tick labels to match the axes.
    cbar = hm.collections[0].colorbar
    cbar.ax.tick_params(labelsize=10, colors=TEXT_COL)
    for t in cbar.ax.get_yticklabels():
        t.set_fontweight("bold")
    # Thin, light frame around each panel.
    for spine in ax.spines.values():
        spine.set_edgecolor(GRID_COL)
        spine.set_linewidth(0.8)
# =========================================================
# 5. Dibujar train y test
# =========================================================
for panel, (matriz, etiqueta) in zip(axes, [(cm_df_train, "Train"), (cm_df_test, "Test")]):
    plot_conf_matrix(panel, matriz, etiqueta)
plt.tight_layout(rect=[0.03, 0.08, 0.98, 0.92])
plt.show()
# Per-class precision / recall / f1 reports for both splits.
print("\n=== Reporte de Clasificación - train ===")
print(classification_report(y_train, y_pred_train))
print("\n=== Reporte de Clasificación - test ===")
print(classification_report(y_test, y_pred))
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
=== Reporte de Clasificación - train ===
precision recall f1-score support
0 0.71 0.76 0.74 201
1 0.79 0.74 0.76 239
accuracy 0.75 440
macro avg 0.75 0.75 0.75 440
weighted avg 0.75 0.75 0.75 440
=== Reporte de Clasificación - test ===
precision recall f1-score support
0 0.69 0.70 0.69 86
1 0.75 0.74 0.74 103
accuracy 0.72 189
macro avg 0.72 0.72 0.72 189
weighted avg 0.72 0.72 0.72 189