Ejemplo gradiente descendente

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Importar datos:

df = pd.read_csv("DatosCafe.csv", sep=";", decimal=",")
print(df.head())
    Fecha  PrecioInterno  PrecioInternacional  Producción  Exportaciones  0  ene-00       371375.0               130.12         658          517.0
1  feb-00       354297.0               124.72         740          642.0
2  mar-00       360016.0               119.51         592          404.0
3  abr-00       347538.0               112.67        1055          731.0
4  may-00       353750.0               110.31        1114          615.0

       TRM     EUR
0  1923.57  1916.0
1  1950.64  1878.5
2  1956.25  1875.0
3  1986.77  1832.0
4  2055.69  1971.5

Modelo:

\[\hat{y} = +w_1\times Producción + w_2\times EUR +b\]
y = df[["Exportaciones"]]
X = df[["Producción", "EUR"]]
print(y[0:5])
   Exportaciones
0          517.0
1          642.0
2          404.0
3          731.0
4          615.0
print(X[0:5])
   Producción     EUR
0         658  1916.0
1         740  1878.5
2         592  1875.0
3        1055  1832.0
4        1114  1971.5
m = len(X)
m
264

Normalización de los datos:

\[x_i = \frac{x_i-\overline{x}}{\sigma_x}\]
mean = X.mean(axis=0)
X -= mean
std = X.std(axis=0)
X /= std
print(X[0:5])
   Producción       EUR
0   -1.169135 -1.681368
1   -0.861422 -1.739874
2   -1.416807 -1.745334
3    0.320649 -1.812420
4    0.542052 -1.594781
X_b = np.c_[np.ones((m, 1)), X["Producción"], X["EUR"]]
X_b[0:5]
array([[ 1.        , -1.16913522, -1.68136846],
       [ 1.        , -0.86142167, -1.73987376],
       [ 1.        , -1.4168071 , -1.74533426],
       [ 1.        ,  0.32064866, -1.81242033],
       [ 1.        ,  0.54205231, -1.59478062]])

Solución analítica de mínimos cuadrados:

\[\hat{W} = \left(X^TX\right)^{-1}X^Ty\]
W_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
W_best
array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])
\[\hat{y} = 162,8\times Producción + 43,2\times EUR + 891,6\]
MSE_best = sum(np.array((X_b.dot(W_best) - y) ** 2)) / m
MSE_best
array([14712.15294164])

Batch Gradient Descent:

\[\eta = 0.01\]
\[iteraciones = 1000\]
eta = 0.01  # learning rate
n_iterations = 1000

W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_iterations])

for iteration in range(n_iterations):
    output = X_b.dot(W)
    gradients = 2 / m * X_b.T.dot(output - y)
    W = W - eta * gradients
    Ws[:, iteration] = W.T
from mpl_toolkits import mplot3d
fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()
../../../_images/output_23_0.png
MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")
Text(0, 0.5, '$MSE$')
../../../_images/output_24_1.png
min(MSE)
14712.15294185085
MSE.index(min(MSE))
999
Ws[:, MSE.index(min(MSE))]
array([891.61310456, 162.80104591,  43.1525438 ])
W_best
array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])

Stochastic Gradient Descent:

\[epoch = 50\]
\[iteraciones = 100\]
n_iterations = 100
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters


def learning_schedule(t):
    return t0 / (t + t1)


W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_epochs])

etas = []
for epoch in range(n_epochs):
    for iteration in range(n_iterations):
        random_index = np.random.randint(m)
        xi = X_b[random_index : random_index + 1]
        yi = y[random_index : random_index + 1]
        output = xi.dot(W)
        gradients = 2 * xi.T.dot(output - yi)
        eta = learning_schedule(epoch * n_iterations + iteration)
        W = W - eta * gradients
    Ws[:, epoch] = W.T
    etas.append(eta)
fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()
../../../_images/output_32_0.png
MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")
Text(0, 0.5, '$MSE$')
../../../_images/output_33_1.png
min(MSE)
14717.292656273383
MSE.index(min(MSE))
18
Ws[:, MSE.index(min(MSE))]
array([893.05677315, 164.10330922,  43.90785387])
W_best
array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])

Mini-batch Gradient Descent

\[batch = 12\]
\[epoch = 50\]
\[iteraciones = 1000\]
batch_size = 12
n_iterations = 1000
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters


def learning_schedule(t):
    return t0 / (t + t1)


W = np.random.randn(3, 1)  # random initialization
Ws = np.zeros([3, n_epochs])

etas = []
for epoch in range(n_epochs):
    for iteration in range(n_iterations):
        random_index = np.random.randint(m - batch_size)
        xi = X_b[random_index : random_index + batch_size]
        yi = y[random_index : random_index + batch_size]
        output = xi.dot(W)
        gradients = 2 * xi.T.dot(output - yi)
        eta = learning_schedule(epoch * n_iterations + iteration)
        W = W - eta * gradients
    Ws[:, epoch] = W.T
    etas.append(eta)
fig = plt.figure(figsize=(8, 6))
axes = plt.axes(projection="3d")
axes.scatter3D(
    Ws[0, :], Ws[1, :], Ws[2, :], cmap=plt.cm.RdYlGn,
)
axes.scatter3D(W_best[0], W_best[1], W_best[2], marker="*", color="darkgreen", s=200)
axes.set_xlabel("W0")
axes.set_ylabel("W1")
axes.set_zlabel("W3")
plt.tight_layout()
plt.show()
../../../_images/output_41_0.png
MSE = []
for i in range(len(Ws.T)):
    MSE.append(sum((X_b.dot(Ws[:, i]) - df["Exportaciones"]) ** 2) / m)

plt.plot(range(len(Ws.T)), MSE)
plt.title("Evolución Loss Funtion por cada epoch")
plt.xlabel("$Epoch$")
plt.ylabel("$MSE$")
Text(0, 0.5, '$MSE$')
../../../_images/output_42_1.png
min(MSE)
14714.314445750006
MSE.index(min(MSE))
47
Ws[:, MSE.index(min(MSE))]
array([893.0543586 , 163.11032403,  43.09734125])
W_best
array([[891.61310606],
       [162.80146402],
       [ 43.15212569]])