Ejemplo gradiente descendente#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Importar datos:

df = pd.read_csv("DatosCafe.csv", sep=";", decimal=",")
print(df.head())

    Fecha  PrecioInterno  PrecioInternacional  Producción  Exportaciones  0  ene-00       371375.0               130.12         658          517.0
1  feb-00       354297.0               124.72         740          642.0
2  mar-00       360016.0               119.51         592          404.0
3  abr-00       347538.0               112.67        1055          731.0
4  may-00       353750.0               110.31        1114          615.0

       TRM     EUR
0  1923.57  1916.0
1  1950.64  1878.5
2  1956.25  1875.0
3  1986.77  1832.0
4  2055.69  1971.5

Modelo:

\[\hat{y} = +w_1\times Producción + w_2\times EUR +b\]

y = df[["Exportaciones"]]
X = df[["Producción", "EUR"]]

print(y[0:5])

   Exportaciones
        517.0
        642.0
        404.0
        731.0
        615.0

print(X[0:5])

   Producción     EUR
       658  1916.0
       740  1878.5
       592  1875.0
      1055  1832.0
      1114  1971.5

m = len(X)
m

Normalización de los datos:

\[x_i = \frac{x_i-\overline{x}}{\sigma_x}\]

mean = X.mean(axis=0)
X -= mean
std = X.std(axis=0)
X /= std
print(X[0:5])

   Producción       EUR
 -1.169135 -1.681368
 -0.861422 -1.739874
 -1.416807 -1.745334
  0.320649 -1.812420
  0.542052 -1.594781

X_b = np.c_[np.ones((m, 1)), X["Producción"], X["EUR"]]
X_b[0:5]

array([[ 1.        , -1.16913522, -1.68136846],
       [ 1.        , -0.86142167, -1.73987376],
       [ 1.        , -1.4168071 , -1.74533426],
       [ 1.        ,  0.32064866, -1.81242033],
       [ 1.        ,  0.54205231, -1.59478062]])

Solución analítica de mínimos cuadrados:

\[\hat{W} = \left(X^TX\right)^{-1}X^Ty\]

W_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
W_best

array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])

\[\hat{y} = 162,8\times Producción + 43,2\times EUR + 891,6\]

MSE_best = sum(np.array((X_b.dot(W_best) - y) ** 2)) / m
MSE_best

array([14712.15294164])

Batch Gradient Descent:

\[\eta = 0.01\]

\[iteraciones = 1000\]

eta = 0.01  # learning rate
n_iterations = 1000

W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_iterations])

for iteration in range(n_iterations):
    output = X_b.dot(W)
    gradients = 2 / m * X_b.T.dot(output - y)
    W = W - eta * gradients
    Ws[:, iteration] = W.T

from mpl_toolkits import mplot3d

fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()

MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")

Text(0, 0.5, '$MSE$')

min(MSE)

14712.15294185085

MSE.index(min(MSE))

Ws[:, MSE.index(min(MSE))]

array([891.61310456, 162.80104591,  43.1525438 ])

W_best

array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])

Stochastic Gradient Descent:

\[epoch = 50\]

\[iteraciones = 100\]

n_iterations = 100
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters


def learning_schedule(t):
    return t0 / (t + t1)


W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_epochs])

etas = []
for epoch in range(n_epochs):
    for iteration in range(n_iterations):
        random_index = np.random.randint(m)
        xi = X_b[random_index : random_index + 1]
        yi = y[random_index : random_index + 1]
        output = xi.dot(W)
        gradients = 2 * xi.T.dot(output - yi)
        eta = learning_schedule(epoch * n_iterations + iteration)
        W = W - eta * gradients
    Ws[:, epoch] = W.T
    etas.append(eta)

fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()

MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")

Text(0, 0.5, '$MSE$')

min(MSE)

14717.292656273383

MSE.index(min(MSE))

Ws[:, MSE.index(min(MSE))]

array([893.05677315, 164.10330922,  43.90785387])

W_best

array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])

Mini-batch Gradient Descent

\[batch = 12\]

\[epoch = 50\]

\[iteraciones = 1000\]

batch_size = 12
n_iterations = 1000
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters


def learning_schedule(t):
    return t0 / (t + t1)


W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_epochs])

etas = []
for epoch in range(n_epochs):
    for iteration in range(n_iterations):
        random_index = np.random.randint(m - batch_size)
        xi = X_b[random_index : random_index + batch_size]
        yi = y[random_index : random_index + batch_size]
        output = xi.dot(W)
        gradients = 2 * xi.T.dot(output - yi)
        eta = learning_schedule(epoch * n_iterations + iteration)
        W = W - eta * gradients
    Ws[:, epoch] = W.T
    etas.append(eta)

fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()

MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")

Text(0, 0.5, '$MSE$')

min(MSE)

14714.314445750006

MSE.index(min(MSE))

Ws[:, MSE.index(min(MSE))]

array([893.0543586 , 163.11032403,  43.09734125])

W_best

array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])