Optimización de Hiperparámetros - Empresas en Reorganización

Optimización de Hiperparámetros - Empresas en Reorganización#

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import optimizers
from keras.layers import Dense, Input, Dropout
from keras.models import Sequential
from keras.models import load_model
# Workbook holding the reorganization dataset.
path = "BD empresas en re organización.xlsx"

# Open the workbook once so its sheet names can be inspected.
xls = pd.ExcelFile(path)

# Load the first sheet into the working DataFrame.
first_sheet = xls.sheet_names[0]
df = pd.read_excel(path, sheet_name=first_sheet)

# Quick visual check of the first rows.
df.head()
Razón Social Margen EBIT Carga financiera Margen neto CxC CxP Solvencia Apalancamiento En Reorganización
0 AACER SAS 0.071690 0.000000 0.042876 0.104095 0.153192 1.877078 1.642505 0
1 ABARROTES EL ROMPOY SAS 0.017816 0.000000 0.010767 0.018414 0.000000 0.000000 0.865044 0
2 ABASTECIMIENTOS INDUSTRIALES SAS 0.144646 0.054226 0.059784 0.227215 0.025591 1.077412 1.272299 0
3 ACME LEON PLASTICOS SAS 0.004465 0.000000 -0.013995 0.073186 0.127866 0.000000 1.391645 0
4 ADVANCED PRODUCTS COLOMBIA SAS 0.141829 0.050810 0.053776 0.398755 0.147678 0.675073 2.118774 0
# Column overview: names, non-null counts and dtypes (629 rows, 9 columns).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Razón Social       629 non-null    object
 1   Margen EBIT        629 non-null    float64
 2   Carga financiera   629 non-null    float64
 3   Margen neto        629 non-null    float64
 4   CxC                629 non-null    float64
 5   CxP                629 non-null    float64
 6   Solvencia          629 non-null    float64
 7   Apalancamiento     629 non-null    float64
 8   En Reorganización  629 non-null    int64
dtypes: float64(7), int64(1), object(1)
memory usage: 44.4+ KB
# ------------------------------------------------------------------
# Feature selection: the financial ratios used as model inputs.
# ------------------------------------------------------------------
variables_seleccionadas = [
    'Margen EBIT',
    'Carga financiera',
    'Margen neto',
    'CxC',
    'CxP',
    'Solvencia',
    'Apalancamiento',
]

# Binary target: 1 = company under reorganization.
target = 'En Reorganización'

# ------------------------------------------------------------------
# Build the design matrix and the label vector.
# ------------------------------------------------------------------
X = df[variables_seleccionadas]
y = df[target]

# Zero-mean / unit-variance scaling of every feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=35, stratify=y
)

Optimización de Hiperparámetros#

# ------------------------------------------------------------------
# Random search: train `cantidad_modelos` networks with randomly
# sampled hyperparameters and report train/test metrics for each.
# ------------------------------------------------------------------
cantidad_modelos = 10

for i in range(cantidad_modelos):

  # ---- Sample one hyperparameter configuration (plain Python scalars) ----
  units = np.random.choice([5, 8, 10, 12, 15, 18, 20, 22, 24]).item()
  n_hidden = np.random.choice([1, 2]).item()
  activation = str(np.random.choice(['relu', 'tanh', 'selu', 'elu']))
  learning_rate = float(np.random.choice([0.001, 0.01, 0.1]))
  batch_size = int(np.random.choice([16, 32, 64]))
  optimizer_name = str(np.random.choice(['Adam', 'RMSprop']))

  print(f'Modelo: {i+1}, Units: {units}, Hidden: {n_hidden}, Activation: {activation}, Learning Rate: {learning_rate}, Optimizer: {optimizer_name}, Batch Size: {batch_size}')

  epochs = 200

  # ---- Build the network ----
  best_model = Sequential()
  best_model.add(Input(shape=(X.shape[1],)))

  # Hidden layers, each followed by dropout for regularization.
  for _ in range(n_hidden):
    best_model.add(Dense(units, activation=activation))
    best_model.add(Dropout(0.2))

  # Output layer.
  # BUG FIX: the original used a linear output (`Dense(1)`) with
  # binary_crossentropy, which expects probabilities in [0, 1]; the
  # sigmoid makes the loss and the 0.5 threshold below meaningful.
  best_model.add(Dense(1, activation='sigmoid'))

  # ---- Optimizer (kept in a separate name; the original shadowed the
  # sampled string with the optimizer object) ----
  if optimizer_name == 'Adam':
    opt = optimizers.Adam(learning_rate=learning_rate)
  else:
    opt = optimizers.RMSprop(learning_rate=learning_rate)

  # ---- Compile and train ----
  best_model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer=opt)

  history = best_model.fit(X_train, y_train, epochs=epochs,
                           validation_data=(X_test, y_test),
                           batch_size=batch_size,
                           verbose=0)

  # ---- Evaluate: probabilities -> hard labels at a 0.5 threshold ----
  y_prob_train = best_model.predict(X_train)
  y_prob = best_model.predict(X_test)

  y_pred_train = np.where(y_prob_train > 0.5, 1, 0)
  y_pred = np.where(y_prob > 0.5, 1, 0)

  accuracy_train = accuracy_score(y_train, y_pred_train.flatten())
  accuracy_test = accuracy_score(y_test, y_pred.flatten())

  recall_train = recall_score(y_train, y_pred_train.flatten())
  recall_test = recall_score(y_test, y_pred.flatten())

  precision_train = precision_score(y_train, y_pred_train.flatten())
  precision_test = precision_score(y_test, y_pred.flatten())

  print(f'Accuracy train: {accuracy_train}, Accuracy test: {accuracy_test}')
  print(f'Recall train: {recall_train}, Recall test: {recall_test}')
  print(f'Precision train: {precision_train}, Precision test: {precision_test}')

  # ---- Learning curves: train vs validation loss ----
  plt.plot(history.history['loss'])
  plt.plot(history.history['val_loss'])
  plt.title('Model loss')
  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Test'], loc='upper left')
  plt.show()

  # ---- Persist this candidate so the best one can be reloaded later ----
  best_model.save(f"best_model_{i+1}.keras")
Modelo: 1, Units: 5, Hidden: 1, Activation: selu, Learning Rate: 0.001, Optimizer: RMSprop, Batch Size: 16
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Accuracy train: 0.7477272727272727, Accuracy test: 0.708994708994709
Recall train: 0.7447698744769874, Recall test: 0.7087378640776699
Precision train: 0.7807017543859649, Precision test: 0.7448979591836735
../../../_images/output_6_12.png
Modelo: 2, Units: 20, Hidden: 2, Activation: elu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 32
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Accuracy train: 0.75, Accuracy test: 0.7195767195767195
Recall train: 0.7405857740585774, Recall test: 0.7378640776699029
Precision train: 0.7866666666666666, Precision test: 0.7450980392156863
../../../_images/output_6_3.png
Modelo: 3, Units: 12, Hidden: 2, Activation: relu, Learning Rate: 0.01, Optimizer: Adam, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Accuracy train: 0.8340909090909091, Accuracy test: 0.7777777777777778
Recall train: 0.698744769874477, Recall test: 0.6699029126213593
Precision train: 0.9940476190476191, Precision test: 0.8961038961038961
../../../_images/output_6_5.png
Modelo: 4, Units: 8, Hidden: 1, Activation: relu, Learning Rate: 0.1, Optimizer: RMSprop, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
Accuracy train: 0.8386363636363636, Accuracy test: 0.798941798941799
Recall train: 0.7364016736401674, Recall test: 0.7572815533980582
Precision train: 0.9565217391304348, Precision test: 0.8571428571428571
../../../_images/output_6_7.png
Modelo: 5, Units: 8, Hidden: 2, Activation: relu, Learning Rate: 0.1, Optimizer: Adam, Batch Size: 32
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Accuracy train: 0.7454545454545455, Accuracy test: 0.7619047619047619
Recall train: 0.5564853556485355, Recall test: 0.5922330097087378
Precision train: 0.9568345323741008, Precision test: 0.953125
../../../_images/output_6_9.png
Modelo: 6, Units: 18, Hidden: 1, Activation: relu, Learning Rate: 0.01, Optimizer: Adam, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Accuracy train: 0.825, Accuracy test: 0.7724867724867724
Recall train: 0.7238493723849372, Recall test: 0.7087378640776699
Precision train: 0.9402173913043478, Precision test: 0.8488372093023255
../../../_images/output_6_111.png
Modelo: 7, Units: 18, Hidden: 2, Activation: selu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Accuracy train: 0.7090909090909091, Accuracy test: 0.7037037037037037
Recall train: 0.7740585774058577, Recall test: 0.8155339805825242
Precision train: 0.7142857142857143, Precision test: 0.6942148760330579
../../../_images/output_6_13.png
Modelo: 8, Units: 12, Hidden: 2, Activation: elu, Learning Rate: 0.01, Optimizer: RMSprop, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Accuracy train: 0.8113636363636364, Accuracy test: 0.7671957671957672
Recall train: 0.7196652719665272, Recall test: 0.7087378640776699
Precision train: 0.9148936170212766, Precision test: 0.8390804597701149
../../../_images/output_6_15.png
Modelo: 9, Units: 15, Hidden: 1, Activation: tanh, Learning Rate: 0.001, Optimizer: RMSprop, Batch Size: 64
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Accuracy train: 0.7295454545454545, Accuracy test: 0.6878306878306878
Recall train: 0.7824267782426778, Recall test: 0.7475728155339806
Precision train: 0.7362204724409449, Precision test: 0.7
../../../_images/output_6_17.png
Modelo: 10, Units: 10, Hidden: 2, Activation: selu, Learning Rate: 0.1, Optimizer: Adam, Batch Size: 16
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Accuracy train: 0.7, Accuracy test: 0.7142857142857143
Recall train: 0.7615062761506276, Recall test: 0.8349514563106796
Precision train: 0.708171206225681, Precision test: 0.6991869918699187
../../../_images/output_6_19.png

Mejor modelo#

Modelo: 2, Units: 20, Hidden: 2, Activation: elu, Learning Rate: 0.001, Optimizer: Adam, Batch Size: 32

Se selecciona el Modelo 2; sin embargo, esta elección no es única, ya que depende del criterio del analista y de la métrica de desempeño que se desee priorizar según el contexto del problema.

# Reload the selected candidate from disk.
model = load_model("best_model_2.keras")

# Predicted probabilities for both partitions.
y_prob_train = model.predict(X_train)
y_prob = model.predict(X_test)

# Hard class labels at the 0.5 decision threshold.
y_pred_train = np.where(y_prob_train > 0.5, 1, 0)
y_pred = np.where(y_prob > 0.5, 1, 0)

# ------------------------
# Model evaluation
# ------------------------

# =========================================================
# 1. Confusion matrices (rows = real class, cols = predicted)
# =========================================================
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Shared row/column labels for both labelled matrices.
row_names = ["Real: No Reorg.", "Real: Reorg."]
col_names = ["Pred: No Reorg.", "Pred: Reorg."]

cm_df_train = pd.DataFrame(cm_train, index=row_names, columns=col_names)
cm_df_test = pd.DataFrame(cm_test, index=row_names, columns=col_names)

# =========================================================
# 2. Visual style
# =========================================================
# BUG FIX: `mpl` was used here without ever being imported (the file only
# imported `matplotlib.pyplot as plt`), raising a NameError. The import
# block now provides `import matplotlib as mpl`.
cmap = mpl.colormaps["viridis"]

BG_FIG   = "#f7f7f7"   # figure background
BG_AX    = "#ffffff"   # axes background
GRID_COL = "#d9d9d9"   # cell borders / spines
TEXT_COL = "#1f1f1f"   # primary text
SUB_COL  = "#4d4d4d"   # secondary text

TITLE_FS    = 20
SUBTITLE_FS = 12
LABEL_FS    = 12
TICK_FS     = 11
ANNOT_FS    = 16

sns.set_theme(style="white")

# =========================================================
# 3. Figure with two side-by-side panels (train / test)
# =========================================================
fig, axes = plt.subplots(1, 2, figsize=(12, 5.5), facecolor=BG_FIG)

fig.suptitle(
    "Matrices de confusión",
    fontsize=TITLE_FS,
    fontweight="bold",
    color=TEXT_COL,
    y=0.98
)

# =========================================================
# 4. Función para dibujar cada heatmap
# =========================================================
def plot_conf_matrix(ax, cm_df, title):
    """Render one confusion-matrix heatmap onto *ax*.

    ax     : target matplotlib Axes.
    cm_df  : labelled DataFrame with the confusion counts.
    title  : panel title (e.g. "Train" / "Test").
    """
    ax.set_facecolor(BG_AX)

    heat = sns.heatmap(
        cm_df,
        ax=ax,
        annot=True,
        fmt="d",
        cmap=cmap,
        cbar=True,
        cbar_kws={"shrink": 0.85},
        linewidths=0.8,
        linecolor=GRID_COL,
        square=True,
        annot_kws={"fontsize": ANNOT_FS, "fontweight": "bold", "color": TEXT_COL},
    )

    # Panel title and axis labels.
    ax.set_title(title, fontsize=15, fontweight="bold", color=TEXT_COL, pad=10)
    ax.set_xlabel("Clase predicha", fontsize=LABEL_FS, fontweight="bold", color=SUB_COL)
    ax.set_ylabel("Clase real", fontsize=LABEL_FS, fontweight="bold", color=SUB_COL)

    # Tick labels: horizontal and bold on both axes.
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=TICK_FS, colors=TEXT_COL, rotation=0)
    for lbl in ax.get_xticklabels() + ax.get_yticklabels():
        lbl.set_fontweight("bold")

    # Colorbar tick styling.
    cbar = heat.collections[0].colorbar
    cbar.ax.tick_params(labelsize=10, colors=TEXT_COL)
    for tick in cbar.ax.get_yticklabels():
        tick.set_fontweight("bold")

    # Thin, light spines around the panel.
    for spine in ax.spines.values():
        spine.set_edgecolor(GRID_COL)
        spine.set_linewidth(0.8)

# =========================================================
# 5. Draw both panels, then print the classification reports
# =========================================================
for panel, matrix, name in ((axes[0], cm_df_train, "Train"),
                            (axes[1], cm_df_test, "Test")):
    plot_conf_matrix(panel, matrix, name)

plt.tight_layout(rect=[0.03, 0.08, 0.98, 0.92])
plt.show()

print("\n=== Reporte de Clasificación - train ===")
print(classification_report(y_train, y_pred_train))

print("\n=== Reporte de Clasificación - test ===")
print(classification_report(y_test, y_pred))
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
../../../_images/output_10_16.png
=== Reporte de Clasificación - train ===
              precision    recall  f1-score   support

           0       0.71      0.76      0.74       201
           1       0.79      0.74      0.76       239

    accuracy                           0.75       440
   macro avg       0.75      0.75      0.75       440
weighted avg       0.75      0.75      0.75       440


=== Reporte de Clasificación - test ===
              precision    recall  f1-score   support

           0       0.69      0.70      0.69        86
           1       0.75      0.74      0.74       103

    accuracy                           0.72       189
   macro avg       0.72      0.72      0.72       189
weighted avg       0.72      0.72      0.72       189