XGBoost - empresas en Reorganización
------------------------------------

.. code:: ipython3

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import seaborn as sns
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import (
        classification_report, confusion_matrix,
        accuracy_score, f1_score, roc_auc_score,
        RocCurveDisplay, ConfusionMatrixDisplay
    )
    import xgboost.callback as xgb_callback
    from xgboost import XGBClassifier, plot_importance
    
    import warnings
    warnings.filterwarnings("ignore")

.. code:: ipython3

    # Excel workbook with the financial ratios and the reorganization flag.
    path = "BD empresas en re organización.xlsx"
    
    # Open the workbook a single time and let the context manager close the
    # file handle; the first sheet is parsed from that same open workbook
    # instead of re-reading the file from disk.
    with pd.ExcelFile(path) as xls:
        df = xls.parse(xls.sheet_names[0])
    
    # Preview the first rows (rendered as the cell output).
    df.head()




.. raw:: html

    
      <div id="df-145abaf6-08a5-4b67-90a7-d3ac44e41ffd" class="colab-df-container">
        <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }
    
        .dataframe tbody tr th {
            vertical-align: top;
        }
    
        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th></th>
          <th>Razón Social</th>
          <th>Margen EBIT</th>
          <th>Carga financiera</th>
          <th>Margen neto</th>
          <th>CxC</th>
          <th>CxP</th>
          <th>Solvencia</th>
          <th>Apalancamiento</th>
          <th>En Reorganización</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>0</th>
          <td>AACER SAS</td>
          <td>0.071690</td>
          <td>0.000000</td>
          <td>0.042876</td>
          <td>0.104095</td>
          <td>0.153192</td>
          <td>1.877078</td>
          <td>1.642505</td>
          <td>0</td>
        </tr>
        <tr>
          <th>1</th>
          <td>ABARROTES EL ROMPOY SAS</td>
          <td>0.017816</td>
          <td>0.000000</td>
          <td>0.010767</td>
          <td>0.018414</td>
          <td>0.000000</td>
          <td>0.000000</td>
          <td>0.865044</td>
          <td>0</td>
        </tr>
        <tr>
          <th>2</th>
          <td>ABASTECIMIENTOS INDUSTRIALES SAS</td>
          <td>0.144646</td>
          <td>0.054226</td>
          <td>0.059784</td>
          <td>0.227215</td>
          <td>0.025591</td>
          <td>1.077412</td>
          <td>1.272299</td>
          <td>0</td>
        </tr>
        <tr>
          <th>3</th>
          <td>ACME LEON PLASTICOS SAS</td>
          <td>0.004465</td>
          <td>0.000000</td>
          <td>-0.013995</td>
          <td>0.073186</td>
          <td>0.127866</td>
          <td>0.000000</td>
          <td>1.391645</td>
          <td>0</td>
        </tr>
        <tr>
          <th>4</th>
          <td>ADVANCED PRODUCTS COLOMBIA SAS</td>
          <td>0.141829</td>
          <td>0.050810</td>
          <td>0.053776</td>
          <td>0.398755</td>
          <td>0.147678</td>
          <td>0.675073</td>
          <td>2.118774</td>
          <td>0</td>
        </tr>
      </tbody>
    </table>
    </div>
        <div class="colab-df-buttons">
    
      <div class="colab-df-container">
        <button class="colab-df-convert" onclick="convertToInteractive('df-145abaf6-08a5-4b67-90a7-d3ac44e41ffd')"
                title="Convert this dataframe to an interactive table."
                style="display:none;">
    
      <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
        <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
      </svg>
        </button>
    
      <style>
        .colab-df-container {
          display:flex;
          gap: 12px;
        }
    
        .colab-df-convert {
          background-color: #E8F0FE;
          border: none;
          border-radius: 50%;
          cursor: pointer;
          display: none;
          fill: #1967D2;
          height: 32px;
          padding: 0 0 0 0;
          width: 32px;
        }
    
        .colab-df-convert:hover {
          background-color: #E2EBFA;
          box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
          fill: #174EA6;
        }
    
        .colab-df-buttons div {
          margin-bottom: 4px;
        }
    
        [theme=dark] .colab-df-convert {
          background-color: #3B4455;
          fill: #D2E3FC;
        }
    
        [theme=dark] .colab-df-convert:hover {
          background-color: #434B5C;
          box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
          filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
          fill: #FFFFFF;
        }
      </style>
    
        <script>
          const buttonEl =
            document.querySelector('#df-145abaf6-08a5-4b67-90a7-d3ac44e41ffd button.colab-df-convert');
          buttonEl.style.display =
            google.colab.kernel.accessAllowed ? 'block' : 'none';
    
          async function convertToInteractive(key) {
            const element = document.querySelector('#df-145abaf6-08a5-4b67-90a7-d3ac44e41ffd');
            const dataTable =
              await google.colab.kernel.invokeFunction('convertToInteractive',
                                                        [key], {});
            if (!dataTable) return;
    
            const docLinkHtml = 'Like what you see? Visit the ' +
              '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
              + ' to learn more about interactive tables.';
            element.innerHTML = '';
            dataTable['output_type'] = 'display_data';
            await google.colab.output.renderOutput(dataTable, element);
            const docLink = document.createElement('div');
            docLink.innerHTML = docLinkHtml;
            element.appendChild(docLink);
          }
        </script>
      </div>
    
    
        </div>
      </div>
    



.. code:: ipython3

    # Print the column names, non-null counts and dtypes of the loaded frame.
    df.info()


.. parsed-literal::

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 629 entries, 0 to 628
    Data columns (total 9 columns):
     #   Column             Non-Null Count  Dtype  
    ---  ------             --------------  -----  
     0   Razón Social       629 non-null    object 
     1   Margen EBIT        629 non-null    float64
     2   Carga financiera   629 non-null    float64
     3   Margen neto        629 non-null    float64
     4   CxC                629 non-null    float64
     5   CxP                629 non-null    float64
     6   Solvencia          629 non-null    float64
     7   Apalancamiento     629 non-null    float64
     8   En Reorganización  629 non-null    int64  
    dtypes: float64(7), int64(1), object(1)
    memory usage: 44.4+ KB
    

Ajuste con todas las variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code:: ipython3

    # ------------------------
    # Feature selection
    # ------------------------
    variables_seleccionadas = [
        'Margen EBIT',
        'Carga financiera',
        'Margen neto',
        'CxC',
        'CxP',
        'Solvencia',
        'Apalancamiento',
    ]
    
    # Target variable
    target = 'En Reorganización'
    
    # ------------------------
    # Prepare the data
    # ------------------------
    X = df[variables_seleccionadas]
    y = df[target]
    
    # Split FIRST (70% train / 30% test, stratified on the target) so that no
    # statistic of the test rows is used while fitting the preprocessing step.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=35, stratify=y
    )
    
    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows. Fitting on the full data set would leak
    # test-set means/variances into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    
    # Full matrix scaled with the train-fitted scaler; kept so that any later
    # code that refers to X_scaled keeps working.
    X_scaled = scaler.transform(X)

XGBoost
~~~~~~~

**Definición del Modelo:**

.. code:: ipython3

    # Hyperparameters grouped by role; unpacked into the classifier below.
    xgb_params = {
        # --- Learning rate and number of boosting rounds ---
        "learning_rate": 0.1,          # η: shrinkage applied to each tree
        "n_estimators": 500,           # upper bound on trees (early stopping decides the real count)
        "early_stopping_rounds": 50,   # early-stopping patience, in rounds
    
        # --- Tree structure ---
        "max_depth": 5,                # maximum depth of each tree
        "min_child_weight": 3,         # minimum sum of hessians in a child node
        "gamma": 0.1,                  # minimum gain required to accept a split
    
        # --- Direct regularization ---
        "reg_lambda": 1,               # λ: L2 penalty on leaf weights
        "reg_alpha": 0.1,              # α: L1 penalty on leaf weights (sparsity)
    
        # --- Row/column sampling (stochastic boosting) ---
        "subsample": 0.8,              # fraction of rows per tree
        "colsample_bytree": 0.8,       # fraction of columns per tree
    
        # --- General configuration ---
        "objective": 'binary:logistic',
        "eval_metric": 'logloss',
        "random_state": 36,
    }
    
    model = XGBClassifier(**xgb_params)

**Entrenamiento con Early Stopping:**

**¿Por qué se hace un segundo split?**

Early stopping necesita **dos conjuntos con roles distintos**:

-  **Datos de entrenamiento** → el modelo aprende de ellos (ajusta
   pesos, construye árboles).

-  **Datos de validación** → el modelo **no aprende** de ellos, solo los
   usa como “alarma” para saber cuándo parar.

Si monitoreas la pérdida sobre los mismos datos con los que entrenas, la
pérdida casi siempre seguirá bajando (porque el modelo puede memorizar),
y early stopping nunca se activará.

Es por esto que ``X_train`` **se divide otra vez**.

.. code:: ipython3

    # Hold out 20% of the training set (stratified) purely for monitoring
    # the early-stopping metric.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=35,
    )
    
    # The validation fold is passed only as an evaluation set; the model is
    # fitted on (X_tr, y_tr).
    conjunto_validacion = [(X_val, y_val)]
    model.fit(X_tr, y_tr, eval_set=conjunto_validacion, verbose=False)




.. raw:: html

    <style>#sk-container-id-1 {
      /* Definition of color scheme common for light and dark mode */
      --sklearn-color-text: #000;
      --sklearn-color-text-muted: #666;
      --sklearn-color-line: gray;
      /* Definition of color scheme for unfitted estimators */
      --sklearn-color-unfitted-level-0: #fff5e6;
      --sklearn-color-unfitted-level-1: #f6e4d2;
      --sklearn-color-unfitted-level-2: #ffe0b3;
      --sklearn-color-unfitted-level-3: chocolate;
      /* Definition of color scheme for fitted estimators */
      --sklearn-color-fitted-level-0: #f0f8ff;
      --sklearn-color-fitted-level-1: #d4ebff;
      --sklearn-color-fitted-level-2: #b3dbfd;
      --sklearn-color-fitted-level-3: cornflowerblue;
    
      /* Specific color for light theme */
      --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));
      --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));
      --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));
      --sklearn-color-icon: #696969;
    
      @media (prefers-color-scheme: dark) {
        /* Redefinition of color scheme for dark theme */
        --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));
        --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));
        --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));
        --sklearn-color-icon: #878787;
      }
    }
    
    #sk-container-id-1 {
      color: var(--sklearn-color-text);
    }
    
    #sk-container-id-1 pre {
      padding: 0;
    }
    
    #sk-container-id-1 input.sk-hidden--visually {
      border: 0;
      clip: rect(1px 1px 1px 1px);
      clip: rect(1px, 1px, 1px, 1px);
      height: 1px;
      margin: -1px;
      overflow: hidden;
      padding: 0;
      position: absolute;
      width: 1px;
    }
    
    #sk-container-id-1 div.sk-dashed-wrapped {
      border: 1px dashed var(--sklearn-color-line);
      margin: 0 0.4em 0.5em 0.4em;
      box-sizing: border-box;
      padding-bottom: 0.4em;
      background-color: var(--sklearn-color-background);
    }
    
    #sk-container-id-1 div.sk-container {
      /* jupyter's `normalize.less` sets `[hidden] { display: none; }`
         but bootstrap.min.css set `[hidden] { display: none !important; }`
         so we also need the `!important` here to be able to override the
         default hidden behavior on the sphinx rendered scikit-learn.org.
         See: https://github.com/scikit-learn/scikit-learn/issues/21755 */
      display: inline-block !important;
      position: relative;
    }
    
    #sk-container-id-1 div.sk-text-repr-fallback {
      display: none;
    }
    
    div.sk-parallel-item,
    div.sk-serial,
    div.sk-item {
      /* draw centered vertical line to link estimators */
      background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));
      background-size: 2px 100%;
      background-repeat: no-repeat;
      background-position: center center;
    }
    
    /* Parallel-specific style estimator block */
    
    #sk-container-id-1 div.sk-parallel-item::after {
      content: "";
      width: 100%;
      border-bottom: 2px solid var(--sklearn-color-text-on-default-background);
      flex-grow: 1;
    }
    
    #sk-container-id-1 div.sk-parallel {
      display: flex;
      align-items: stretch;
      justify-content: center;
      background-color: var(--sklearn-color-background);
      position: relative;
    }
    
    #sk-container-id-1 div.sk-parallel-item {
      display: flex;
      flex-direction: column;
    }
    
    #sk-container-id-1 div.sk-parallel-item:first-child::after {
      align-self: flex-end;
      width: 50%;
    }
    
    #sk-container-id-1 div.sk-parallel-item:last-child::after {
      align-self: flex-start;
      width: 50%;
    }
    
    #sk-container-id-1 div.sk-parallel-item:only-child::after {
      width: 0;
    }
    
    /* Serial-specific style estimator block */
    
    #sk-container-id-1 div.sk-serial {
      display: flex;
      flex-direction: column;
      align-items: center;
      background-color: var(--sklearn-color-background);
      padding-right: 1em;
      padding-left: 1em;
    }
    
    
    /* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is
    clickable and can be expanded/collapsed.
    - Pipeline and ColumnTransformer use this feature and define the default style
    - Estimators will overwrite some part of the style using the `sk-estimator` class
    */
    
    /* Pipeline and ColumnTransformer style (default) */
    
    #sk-container-id-1 div.sk-toggleable {
      /* Default theme specific background. It is overwritten whether we have a
      specific estimator or a Pipeline/ColumnTransformer */
      background-color: var(--sklearn-color-background);
    }
    
    /* Toggleable label */
    #sk-container-id-1 label.sk-toggleable__label {
      cursor: pointer;
      display: flex;
      width: 100%;
      margin-bottom: 0;
      padding: 0.5em;
      box-sizing: border-box;
      text-align: center;
      align-items: start;
      justify-content: space-between;
      gap: 0.5em;
    }
    
    #sk-container-id-1 label.sk-toggleable__label .caption {
      font-size: 0.6rem;
      font-weight: lighter;
      color: var(--sklearn-color-text-muted);
    }
    
    #sk-container-id-1 label.sk-toggleable__label-arrow:before {
      /* Arrow on the left of the label */
      content: "▸";
      float: left;
      margin-right: 0.25em;
      color: var(--sklearn-color-icon);
    }
    
    #sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {
      color: var(--sklearn-color-text);
    }
    
    /* Toggleable content - dropdown */
    
    #sk-container-id-1 div.sk-toggleable__content {
      max-height: 0;
      max-width: 0;
      overflow: hidden;
      text-align: left;
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-0);
    }
    
    #sk-container-id-1 div.sk-toggleable__content.fitted {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-0);
    }
    
    #sk-container-id-1 div.sk-toggleable__content pre {
      margin: 0.2em;
      border-radius: 0.25em;
      color: var(--sklearn-color-text);
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-0);
    }
    
    #sk-container-id-1 div.sk-toggleable__content.fitted pre {
      /* unfitted */
      background-color: var(--sklearn-color-fitted-level-0);
    }
    
    #sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {
      /* Expand drop-down */
      max-height: 200px;
      max-width: 100%;
      overflow: auto;
    }
    
    #sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {
      content: "▾";
    }
    
    /* Pipeline/ColumnTransformer-specific style */
    
    #sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {
      color: var(--sklearn-color-text);
      background-color: var(--sklearn-color-unfitted-level-2);
    }
    
    #sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {
      background-color: var(--sklearn-color-fitted-level-2);
    }
    
    /* Estimator-specific style */
    
    /* Colorize estimator box */
    #sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-2);
    }
    
    #sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-2);
    }
    
    #sk-container-id-1 div.sk-label label.sk-toggleable__label,
    #sk-container-id-1 div.sk-label label {
      /* The background is the default theme color */
      color: var(--sklearn-color-text-on-default-background);
    }
    
    /* On hover, darken the color of the background */
    #sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {
      color: var(--sklearn-color-text);
      background-color: var(--sklearn-color-unfitted-level-2);
    }
    
    /* Label box, darken color on hover, fitted */
    #sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {
      color: var(--sklearn-color-text);
      background-color: var(--sklearn-color-fitted-level-2);
    }
    
    /* Estimator label */
    
    #sk-container-id-1 div.sk-label label {
      font-family: monospace;
      font-weight: bold;
      display: inline-block;
      line-height: 1.2em;
    }
    
    #sk-container-id-1 div.sk-label-container {
      text-align: center;
    }
    
    /* Estimator-specific */
    #sk-container-id-1 div.sk-estimator {
      font-family: monospace;
      border: 1px dotted var(--sklearn-color-border-box);
      border-radius: 0.25em;
      box-sizing: border-box;
      margin-bottom: 0.5em;
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-0);
    }
    
    #sk-container-id-1 div.sk-estimator.fitted {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-0);
    }
    
    /* on hover */
    #sk-container-id-1 div.sk-estimator:hover {
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-2);
    }
    
    #sk-container-id-1 div.sk-estimator.fitted:hover {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-2);
    }
    
    /* Specification for estimator info (e.g. "i" and "?") */
    
    /* Common style for "i" and "?" */
    
    .sk-estimator-doc-link,
    a:link.sk-estimator-doc-link,
    a:visited.sk-estimator-doc-link {
      float: right;
      font-size: smaller;
      line-height: 1em;
      font-family: monospace;
      background-color: var(--sklearn-color-background);
      border-radius: 1em;
      height: 1em;
      width: 1em;
      text-decoration: none !important;
      margin-left: 0.5em;
      text-align: center;
      /* unfitted */
      border: var(--sklearn-color-unfitted-level-1) 1pt solid;
      color: var(--sklearn-color-unfitted-level-1);
    }
    
    .sk-estimator-doc-link.fitted,
    a:link.sk-estimator-doc-link.fitted,
    a:visited.sk-estimator-doc-link.fitted {
      /* fitted */
      border: var(--sklearn-color-fitted-level-1) 1pt solid;
      color: var(--sklearn-color-fitted-level-1);
    }
    
    /* On hover */
    div.sk-estimator:hover .sk-estimator-doc-link:hover,
    .sk-estimator-doc-link:hover,
    div.sk-label-container:hover .sk-estimator-doc-link:hover,
    .sk-estimator-doc-link:hover {
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-3);
      color: var(--sklearn-color-background);
      text-decoration: none;
    }
    
    div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,
    .sk-estimator-doc-link.fitted:hover,
    div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,
    .sk-estimator-doc-link.fitted:hover {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-3);
      color: var(--sklearn-color-background);
      text-decoration: none;
    }
    
    /* Span, style for the box shown on hovering the info icon */
    .sk-estimator-doc-link span {
      display: none;
      z-index: 9999;
      position: relative;
      font-weight: normal;
      right: .2ex;
      padding: .5ex;
      margin: .5ex;
      width: min-content;
      min-width: 20ex;
      max-width: 50ex;
      color: var(--sklearn-color-text);
      box-shadow: 2pt 2pt 4pt #999;
      /* unfitted */
      background: var(--sklearn-color-unfitted-level-0);
      border: .5pt solid var(--sklearn-color-unfitted-level-3);
    }
    
    .sk-estimator-doc-link.fitted span {
      /* fitted */
      background: var(--sklearn-color-fitted-level-0);
      border: var(--sklearn-color-fitted-level-3);
    }
    
    .sk-estimator-doc-link:hover span {
      display: block;
    }
    
    /* "?"-specific style due to the `<a>` HTML tag */
    
    #sk-container-id-1 a.estimator_doc_link {
      float: right;
      font-size: 1rem;
      line-height: 1em;
      font-family: monospace;
      background-color: var(--sklearn-color-background);
      border-radius: 1rem;
      height: 1rem;
      width: 1rem;
      text-decoration: none;
      /* unfitted */
      color: var(--sklearn-color-unfitted-level-1);
      border: var(--sklearn-color-unfitted-level-1) 1pt solid;
    }
    
    #sk-container-id-1 a.estimator_doc_link.fitted {
      /* fitted */
      border: var(--sklearn-color-fitted-level-1) 1pt solid;
      color: var(--sklearn-color-fitted-level-1);
    }
    
    /* On hover */
    #sk-container-id-1 a.estimator_doc_link:hover {
      /* unfitted */
      background-color: var(--sklearn-color-unfitted-level-3);
      color: var(--sklearn-color-background);
      text-decoration: none;
    }
    
    #sk-container-id-1 a.estimator_doc_link.fitted:hover {
      /* fitted */
      background-color: var(--sklearn-color-fitted-level-3);
    }
    </style><div id="sk-container-id-1" class="sk-top-container"><div class="sk-text-repr-fallback"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None,
                  colsample_bytree=0.8, device=None, early_stopping_rounds=50,
                  enable_categorical=False, eval_metric=&#x27;logloss&#x27;,
                  feature_types=None, feature_weights=None, gamma=0.1,
                  grow_policy=None, importance_type=None,
                  interaction_constraints=None, learning_rate=0.1, max_bin=None,
                  max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=5, max_leaves=None,
                  min_child_weight=3, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=500, n_jobs=None,
                  num_parallel_tree=None, ...)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item"><div class="sk-estimator fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-1" type="checkbox" checked><label for="sk-estimator-id-1" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>XGBClassifier</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://xgboost.readthedocs.io/en/release_3.2.0/python/python_api.html#xgboost.XGBClassifier">?<span>Documentation for XGBClassifier</span></a><span class="sk-estimator-doc-link fitted">i<span>Fitted</span></span></div></label><div class="sk-toggleable__content fitted"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None,
                  colsample_bytree=0.8, device=None, early_stopping_rounds=50,
                  enable_categorical=False, eval_metric=&#x27;logloss&#x27;,
                  feature_types=None, feature_weights=None, gamma=0.1,
                  grow_policy=None, importance_type=None,
                  interaction_constraints=None, learning_rate=0.1, max_bin=None,
                  max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=5, max_leaves=None,
                  min_child_weight=3, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=500, n_jobs=None,
                  num_parallel_tree=None, ...)</pre></div> </div></div></div></div>



Resultados
~~~~~~~~~~

.. code:: ipython3

    # Early-stopping summary: requested ceiling, best round and its score.
    # NOTE(review): XGBoost documents best_iteration as a 0-based round index,
    # so the label "Árboles construidos" may understate the tree count by one
    # (and the booster may hold further rounds past the best one) - confirm.
    arboles_pedidos = model.n_estimators
    mejor_iteracion = model.best_iteration
    mejor_logloss = model.best_score
    
    print(f"Árboles solicitados (n_estimators):  {arboles_pedidos}")
    print(f"Árboles construidos (best_iteration): {mejor_iteracion}")
    print(f"Mejor logloss en validación:          {mejor_logloss:.4f}")


.. parsed-literal::

    Árboles solicitados (n_estimators):  500
    Árboles construidos (best_iteration): 21
    Mejor logloss en validación:          0.4752
    

.. code:: ipython3

    # Validation log-loss recorded at every boosting round during fit.
    resultados = model.evals_result()
    logloss_validacion = resultados['validation_0']['logloss']
    mejor_iter = model.best_iteration
    
    # Learning curve with a vertical marker at the best round.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(logloss_validacion, label='Validación', color='steelblue')
    ax.axvline(
        x=mejor_iter,
        color='red',
        linestyle='--',
        label=f'Mejor iteración ({mejor_iter})',
    )
    ax.set_xlabel('Número de árboles')
    ax.set_ylabel('Log Loss')
    ax.set_title('Curva de Aprendizaje')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()



.. image:: output_14_0.png


.. code:: ipython3

    # Feature-importance table (gain, weight, cover) sorted by gain.
    # Each score dictionary is computed once instead of once per feature.
    booster = model.get_booster()
    puntajes_gain = booster.get_score(importance_type='gain')
    puntajes_weight = booster.get_score(importance_type='weight')
    puntajes_cover = booster.get_score(importance_type='cover')
    
    # Scores are looked up under positional keys 'f0', 'f1', ... in the same
    # order as variables_seleccionadas; a feature missing from a score
    # dictionary gets 0.
    claves = [f'f{i}' for i, _ in enumerate(variables_seleccionadas)]
    
    importancia = pd.DataFrame({
        'Feature': variables_seleccionadas,
        'Gain': [puntajes_gain.get(clave, 0) for clave in claves],
        'Weight': [puntajes_weight.get(clave, 0) for clave in claves],
        'Cover': [puntajes_cover.get(clave, 0) for clave in claves],
    })
    
    importancia = importancia.sort_values('Gain', ascending=False).reset_index(drop=True)
    print(importancia.to_string(index=False))


.. parsed-literal::

             Feature     Gain  Weight     Cover
      Apalancamiento 4.085507    86.0 27.509918
                 CxC 3.897206    87.0 24.689163
                 CxP 3.717442    88.0 23.958036
         Margen neto 3.180511    83.0 25.785898
           Solvencia 3.067891    73.0 20.580566
    Carga financiera 1.666710    44.0 19.381701
         Margen EBIT 1.559393    56.0 17.179787
    

La tabla muestra qué tan importante fue cada variable para el modelo,
medida de tres formas distintas. Las tres cuentan historias
complementarias.

**Gain** es la más informativa. Cada vez que un árbol usa una variable
para hacer un split, ese split produce una reducción en la función de
pérdida (la “ganancia” que discutimos con gamma). El Gain de la tabla es
el promedio de esas ganancias a lo largo de todos los árboles.
Apalancamiento tiene el Gain más alto (4.09), lo que significa que cada
vez que el modelo usó Apalancamiento para partir un nodo, en promedio
mejoró la predicción más que con cualquier otra variable. Carga
financiera y Margen EBIT tienen los Gain más bajos (1.67 y 1.56), lo que
indica que cuando se usaron, su contribución a reducir el error fue
menor.

**Weight** es simplemente cuántas veces apareció cada variable como
criterio de split en todos los árboles. CxP fue la más usada (88 veces),
Carga financiera la menos usada (44 veces). Pero esta métrica puede ser
engañosa: una variable ruidosa podría aparecer muchas veces haciendo
splits pequeños e inútiles, inflando su Weight sin realmente contribuir.
Por eso CxP tiene el Weight más alto pero no el Gain más alto: se usó
mucho pero cada uso individual aportó un poco menos que Apalancamiento.

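**Cover** es la suma promedio de los hessianos (segundas derivadas de la
pérdida) de las observaciones que pasaron por los nodos donde se usó esa
variable; es la misma cantidad que limita ``min_child_weight``. Con
``binary:logistic`` el hessiano de cada observación es p(1 − p), como
máximo 0.25, así que Cover no es un conteo directo de muestras sino un
conteo ponderado por la incertidumbre de la predicción. Apalancamiento
tiene el Cover más alto (27.5), lo que significa que cuando el modelo la
usó, afectó a más muestras (ponderadas) en promedio. Margen EBIT tiene el
más bajo (17.2): sus splits tendieron a ocurrir en nodos más pequeños,
afectando a menos observaciones. Si Apalancamiento tiene Cover de 27.5,
significa que en promedio, cada vez que el modelo usó Apalancamiento para
partir un nodo, la suma de hessianos en ese nodo fue de unos 27.5, lo que
equivale a por lo menos unas 110 muestras (27.5 / 0.25). Los splits con
Apalancamiento ocurrieron en nodos “grandes”, cerca de la raíz, donde
todavía había muchas muestras juntas. La decisión afectó a muchas
observaciones de un solo golpe.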

Evaluación del ajuste
~~~~~~~~~~~~~~~~~~~~~

.. code:: ipython3

    # Predicted probability of the positive class (column 1 of predict_proba).
    # model.predict would return hard 0/1 labels, not probabilities.
    y_prob_train = model.predict_proba(X_train)[:, 1]
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Class labels obtained by thresholding the probabilities at 0.5.
    y_pred_train = np.where(y_prob_train > 0.5, 1, 0)
    y_pred = np.where(y_prob > 0.5, 1, 0)
    
    # ------------------------
    # Model evaluation
    # ------------------------
    
    # =========================================================
    # 1. Confusion matrices
    # =========================================================
    cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
    cm_test = confusion_matrix(y_test, y_pred, labels=[0, 1])
    
    # Rows are the true classes, columns the predicted classes.
    cm_df_train = pd.DataFrame(
        cm_train,
        index=["Real: No Reorg.", "Real: Reorg."],
        columns=["Pred: No Reorg.", "Pred: Reorg."]
    )
    
    cm_df_test = pd.DataFrame(
        cm_test,
        index=["Real: No Reorg.", "Real: Reorg."],
        columns=["Pred: No Reorg.", "Pred: Reorg."]
    )
    
    # =========================================================
    # 2. Visual style
    # =========================================================
    cmap = mpl.colormaps["viridis"]
    
    # Colour palette: figure/axes backgrounds, grid lines, main and secondary text.
    BG_FIG, BG_AX = "#f7f7f7", "#ffffff"
    GRID_COL = "#d9d9d9"
    TEXT_COL, SUB_COL = "#1f1f1f", "#4d4d4d"
    
    # Font sizes.
    TITLE_FS, SUBTITLE_FS = 20, 12
    LABEL_FS, TICK_FS, ANNOT_FS = 12, 11, 16
    
    sns.set_theme(style="white")
    
    # =========================================================
    # 3. Two-panel figure
    # =========================================================
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5.5), facecolor=BG_FIG)
    
    titulo_figura = dict(fontsize=TITLE_FS, fontweight="bold", color=TEXT_COL, y=0.98)
    fig.suptitle("Matrices de confusión", **titulo_figura)
    
    # =========================================================
    # 4. Helper that draws each heatmap
    # =========================================================
    def plot_conf_matrix(ax, cm_df, title):
        """Draw one annotated confusion-matrix heatmap on ``ax``.
    
        Args:
            ax: Matplotlib axes that receives the heatmap.
            cm_df: Confusion matrix as a DataFrame; cells are annotated with
                integer formatting.
            title: Title shown above the panel.
    
        Reads the module-level style settings ``cmap``, ``BG_AX``,
        ``GRID_COL``, ``TEXT_COL``, ``SUB_COL``, ``LABEL_FS``, ``TICK_FS``
        and ``ANNOT_FS``.
        """
        bold = "bold"
        ax.set_facecolor(BG_AX)
    
        annotation_style = {
            "fontsize": ANNOT_FS,
            "fontweight": bold,
            "color": TEXT_COL,
        }
        heat = sns.heatmap(
            cm_df,
            annot=True,
            fmt="d",
            cmap=cmap,
            cbar=True,
            linewidths=0.8,
            linecolor=GRID_COL,
            square=True,
            annot_kws=annotation_style,
            cbar_kws={"shrink": 0.85},
            ax=ax,
        )
    
        ax.set_title(title, fontsize=15, fontweight=bold, color=TEXT_COL, pad=10)
    
        # Both axis labels share the same text styling.
        axis_label_style = dict(fontsize=LABEL_FS, fontweight=bold, color=SUB_COL)
        ax.set_xlabel("Clase predicha", **axis_label_style)
        ax.set_ylabel("Clase real", **axis_label_style)
    
        for axis_name in ("x", "y"):
            ax.tick_params(axis=axis_name, labelsize=TICK_FS, colors=TEXT_COL, rotation=0)
    
        for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
            tick_label.set_fontweight(bold)
    
        # Colorbar styling
        colorbar = heat.collections[0].colorbar
        colorbar.ax.tick_params(labelsize=10, colors=TEXT_COL)
        for tick_label in colorbar.ax.get_yticklabels():
            tick_label.set_fontweight(bold)
    
        # Thin border around the panel in the grid colour.
        for border in ax.spines.values():
            border.set_edgecolor(GRID_COL)
            border.set_linewidth(0.8)
    
    # =========================================================
    # 5. Draw the train and test panels
    # =========================================================
    paneles = (
        (axes[0], cm_df_train, "Train"),
        (axes[1], cm_df_test, "Test"),
    )
    for eje, matriz, titulo in paneles:
        plot_conf_matrix(eje, matriz, titulo)
    
    plt.tight_layout(rect=[0.03, 0.08, 0.98, 0.92])
    plt.show()
    
    # Per-class precision/recall/F1 for each partition, train first.
    reportes = (
        ("\n=== Reporte de Clasificación - train ===", y_train, y_pred_train),
        ("\n=== Reporte de Clasificación - test ===", y_test, y_pred),
    )
    for encabezado, reales, predichos in reportes:
        print(encabezado)
        print(classification_report(reales, predichos))



.. image:: output_18_0.png


.. parsed-literal::

    
    === Reporte de Clasificación - train ===
                  precision    recall  f1-score   support
    
               0       0.83      0.91      0.87       201
               1       0.91      0.85      0.88       239
    
        accuracy                           0.87       440
       macro avg       0.87      0.88      0.87       440
    weighted avg       0.88      0.87      0.87       440
    
    
    === Reporte de Clasificación - test ===
                  precision    recall  f1-score   support
    
               0       0.76      0.84      0.80        86
               1       0.85      0.78      0.81       103
    
        accuracy                           0.80       189
       macro avg       0.80      0.81      0.80       189
    weighted avg       0.81      0.80      0.80       189
    
    
