Entrenamiento ingenuo de modelos en sklearn#

Última modificación: Junio 3, 2022

https://www.mlflow.org/docs/latest/quickstart.html

Carga de datos#

[1]:

def load_data(url=None):
    """Load a wine-quality CSV and split it into features and target.

    Parameters
    ----------
    url : str or path-like, optional
        Location of a semicolon-separated CSV file that contains a
        ``quality`` column. When omitted, the UCI red-wine quality file is
        downloaded, which is what the function always did before this
        parameter existed.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a DataFrame holding every column except
        ``quality`` and ``y`` is the ``quality`` column.
    """
    import pandas as pd

    if url is None:
        url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    df = pd.read_csv(url, sep=";")

    y = df["quality"]
    # drop() returns a new frame, so the loaded data is left untouched.
    x = df.drop(columns="quality")

    return x, y

Particionamiento de los datos#

[2]:

def make_train_test_split(x, y):
    """Split features and target into training and test portions.

    A quarter of the rows is held out for testing and the shuffle is seeded
    with a fixed ``random_state`` so repeated calls yield the same partition.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(x, y, test_size=0.25, random_state=123456)
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test

Cálculo de métricas de evaluación#

[3]:

def eval_metrics(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    y_true
        Observed target values.
    y_pred
        Predicted target values.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and the
        R2 score, in that order.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

Reporte de métricas de evaluación#

[4]:

def report(estimator, mse, mae, r2):
    """Print the estimator followed by its three evaluation metrics.

    The first line is the estimator's string form with a trailing colon;
    the MSE, MAE and R2 values follow, one per line, indented by two spaces.
    """
    lines = [
        str(estimator) + ":",
        f"  MSE: {mse}",
        f"  MAE: {mae}",
        f"  R2: {r2}",
    ]
    print("\n".join(lines))

Almacenamiento del modelo#

[5]:

def save_best_estimator(estimator):
    """Pickle *estimator* to ``models/estimator.pickle``.

    The ``models`` directory is created relative to the current working
    directory when it does not exist yet. Any previously saved file is
    overwritten.

    Parameters
    ----------
    estimator
        Any picklable object; in this workflow, a fitted model.
    """
    import os
    import pickle

    # exist_ok=True replaces the check-then-create pattern, which could fail
    # if the directory appeared between the existence test and makedirs().
    os.makedirs("models", exist_ok=True)
    with open("models/estimator.pickle", "wb") as file:
        pickle.dump(estimator, file)

Carga del modelo#

[6]:

def load_best_estimator():
    """Load the estimator stored in ``models/estimator.pickle``.

    Returns
    -------
    object or None
        The unpickled estimator, or ``None`` when no estimator file exists.
        This covers both a missing ``models`` directory and a ``models``
        directory that does not contain ``estimator.pickle``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so this must only be used on
    files written by this workflow itself.
    """
    import pickle

    # Probe the file itself rather than the directory: the directory may
    # exist without an estimator having been saved into it.
    try:
        file = open("models/estimator.pickle", "rb")
    except FileNotFoundError:
        return None

    with file:
        estimator = pickle.load(file)

    return estimator

Entrenamiento#

[7]:

def train_estimator(alpha=0.5, l1_ratio=0.5, verbose=1):
    """Fit an ElasticNet model and keep it if it beats the stored one.

    The data is loaded and split, a model is fitted on the training part and
    evaluated on the test part. The fitted model is then compared, using
    ``score`` on the test part, with the estimator currently stored on disk;
    whichever scores higher is written back to disk.

    Parameters
    ----------
    alpha : float
        Passed to ``ElasticNet`` as ``alpha``.
    l1_ratio : float
        Passed to ``ElasticNet`` as ``l1_ratio``.
    verbose : int
        When greater than zero, the metrics of the new model are printed.
    """
    from sklearn.linear_model import ElasticNet

    features, target = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(features, target)

    estimator = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=12345)
    estimator.fit(x_train, y_train)

    predictions = estimator.predict(x_test)
    mse, mae, r2 = eval_metrics(y_test, y_pred=predictions)
    if verbose > 0:
        report(estimator, mse, mae, r2)

    stored = load_best_estimator()
    if stored is None:
        # Nothing saved yet: the new model wins by default.
        winner = estimator
    else:
        candidate_score = estimator.score(x_test, y_test)
        stored_score = stored.score(x_test, y_test)
        # The stored model is kept on ties.
        winner = estimator if candidate_score > stored_score else stored

    save_best_estimator(winner)

Búsqueda manual de los mejores hiperparámetros#

[8]:

# List the files currently stored under models/, one per line.
!ls -1 models/*

models/estimator.pickle

[9]:

# Manual trial with alpha=0.5, l1_ratio=0.5 (the same values as the function defaults).
train_estimator(0.5, 0.5)

ElasticNet(alpha=0.5, random_state=12345):
  MSE: 0.6349429447805036
  MAE: 0.6453803508338732
  R2: 0.0890018368226928

[10]:

# Manual trial with alpha=0.2, l1_ratio=0.2.
train_estimator(0.2, 0.2)

ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=12345):
  MSE: 0.5170837474931838
  MAE: 0.5701436798648394
  R2: 0.2581028767270219

[11]:

# Manual trial with alpha=0.1, l1_ratio=0.1.
train_estimator(0.1, 0.1)

ElasticNet(alpha=0.1, l1_ratio=0.1, random_state=12345):
  MSE: 0.489021012335199
  MAE: 0.551252749110561
  R2: 0.29836649473051535

[12]:

# Manual trial with alpha=0.1, l1_ratio=0.05.
train_estimator(0.1, 0.05)

ElasticNet(alpha=0.1, l1_ratio=0.05, random_state=12345):
  MSE: 0.48683363717622585
  MAE: 0.5493759222336462
  R2: 0.30150487868829456

[13]:

# Manual trial with alpha=0.3, l1_ratio=0.2.
train_estimator(0.3, 0.2)

ElasticNet(alpha=0.3, l1_ratio=0.2, random_state=12345):
  MSE: 0.5322180010211477
  MAE: 0.5793993870194708
  R2: 0.23638867818623654

Chequeo#

[14]:

def check_estimator():
    """Evaluate the estimator stored on disk and print its metrics.

    The data is reloaded and split in the same way as during training, the
    stored estimator is loaded, and its MSE, MAE and R2 on the test part are
    printed.
    """
    features, target = load_data()
    # Only the held-out part is used here; the training parts are discarded.
    _, features_test, _, target_test = make_train_test_split(features, target)

    stored = load_best_estimator()
    predictions = stored.predict(features_test)
    report(stored, *eval_metrics(target_test, y_pred=predictions))


#
# Must match the best model found in the previous cells
#
check_estimator()

ElasticNet(alpha=0.0001, l1_ratio=0.0001, random_state=12345):
  MSE: 0.4555137059864613
  MAE: 0.5292599144523824
  R2: 0.3464418293533321

Problemas de esta aproximación#

El usuario tiene que decidir en cada iteración qué valores tantear.
No se almacenan los resultados de cada corrida.
La complejidad aumenta exponencialmente para el usuario con el aumento de parámetros elegibles.
No tiene en cuenta aspectos como la validación cruzada.

Búsqueda por código#

[15]:

def make_hyperparameters_search(alphas, l1_ratios):
    """Train one model for every combination of the given hyperparameters.

    Each ``(alpha, l1_ratio)`` pair is passed to ``train_estimator`` with
    printing disabled (``verbose=0``). Pairs are visited with ``alphas`` as
    the outer loop and ``l1_ratios`` as the inner loop.
    """
    grid = ((a, r) for a in alphas for r in l1_ratios)
    for alpha_value, ratio_value in grid:
        train_estimator(alpha=alpha_value, l1_ratio=ratio_value, verbose=0)

[16]:

import numpy as np

# 10 evenly spaced values between 0.0001 and 0.5 for each hyperparameter,
# i.e. a 10 x 10 grid of combinations.
alphas = np.linspace(0.0001, 0.5, 10)
l1_ratios = np.linspace(0.0001, 0.5, 10)
# Train one model per (alpha, l1_ratio) pair, then report the model left on disk.
make_hyperparameters_search(alphas, l1_ratios)
check_estimator()

ElasticNet(alpha=0.0001, l1_ratio=0.0001, random_state=12345):
  MSE: 0.4555137059864613
  MAE: 0.5292599144523824
  R2: 0.3464418293533321

Esta aproximación es útil solo cuando hay pocos hiperparámetros.
Tampoco se almacenan parámetros de las corridas, solo el mejor modelo.

[17]:

%%bash
# Delete the outputs, mlruns and models directories (no error if they are absent).
rm -rf outputs mlruns models