Tracking con Scikit-Learn usando el sistema local de archivos#

  • Última modificación: Mayo 14, 2022

Código base#

[1]:
def load_data(url=None):
    """Load the red-wine quality dataset and split it into features and target.

    Parameters
    ----------
    url : str or path-like, optional
        Location of a semicolon-separated CSV file that contains a
        ``quality`` column. When omitted, the UCI wine-quality (red) file
        is downloaded.

    Returns
    -------
    x : pandas.DataFrame
        Every column of the file except ``quality``.
    y : pandas.Series
        The ``quality`` column.

    Raises
    ------
    KeyError
        If the file has no ``quality`` column.
    """
    import pandas as pd

    if url is None:
        url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

    df = pd.read_csv(url, sep=";")

    y = df["quality"]
    # drop() returns a new frame, so the loaded frame is left untouched.
    x = df.drop(columns=["quality"])

    return x, y


def make_train_test_split(x, y):
    """Partition features and target into train and test subsets.

    A quarter of the rows is held out for testing and the random state is
    fixed, so repeated calls on the same data use the same split settings.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    split_options = {"test_size": 0.25, "random_state": 123456}
    x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

    return x_train, x_test, y_train, y_test


def eval_metrics(y_true, y_pred):
    """Score predictions against the observed values.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and the
        R2 score, in that order.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Evaluated left to right, i.e. in the same order as the returned tuple.
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


def report(estimator, mse, mae, r2):
    """Print the estimator followed by its three metrics, one per line.

    The first line is the estimator's string form with a trailing colon;
    each metric line is indented by two spaces.
    """
    header = str(estimator) + ":"
    labelled_metrics = (("MSE", mse), ("MAE", mae), ("R2", r2))
    metric_lines = [f"  {label}: {value}" for label, value in labelled_metrics]

    print("\n".join([header, *metric_lines]))

MLflow Tracking#

[2]:
def make_experiment(experiment_name, alphas, l1_ratios, n_splits=5, verbose=1):
    """Grid-search an ElasticNet on the wine data and track the run with MLflow.

    The data is loaded with ``load_data`` and split with
    ``make_train_test_split``. A ``GridSearchCV`` over ``alphas`` x
    ``l1_ratios`` is fitted inside a single MLflow run with scikit-learn
    autologging enabled. The refitted best estimator is then scored on the
    test split and the resulting ``mse``, ``mae`` and ``r2`` values are
    logged to the same run.

    Parameters
    ----------
    experiment_name : str
        MLflow experiment under which the run is stored.
    alphas : sequence of float
        Candidate values for the ElasticNet ``alpha`` parameter.
    l1_ratios : sequence of float
        Candidate values for the ElasticNet ``l1_ratio`` parameter.
    n_splits : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    verbose : int, default 1
        When greater than zero, the best estimator and its test metrics are
        printed. It does not affect ``GridSearchCV``, which always runs
        with ``verbose=0``.

    Returns
    -------
    None
    """
    from pathlib import Path

    from sklearn.linear_model import ElasticNet
    from sklearn.model_selection import GridSearchCV

    import mlflow
    import mlflow.sklearn

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    param_grid = {
        "alpha": alphas,
        "l1_ratio": l1_ratios,
    }

    estimator = GridSearchCV(
        estimator=ElasticNet(
            random_state=12345,
        ),
        param_grid=param_grid,
        cv=n_splits,
        refit=True,
        verbose=0,
        return_train_score=False,
    )

    #
    # Set the tracking directory. The URI is derived from the absolute path
    # of the "corridas" folder in the current working directory, so the
    # folder that gets created and the folder MLflow writes to are always
    # the same one, wherever the notebook is run.
    #
    tracking_dir = Path("corridas").resolve()
    tracking_dir.mkdir(parents=True, exist_ok=True)
    mlflow.set_tracking_uri(tracking_dir.as_uri())
    print("Tracking directory:", mlflow.get_tracking_uri())

    #
    # Autotracking
    #
    mlflow.sklearn.autolog(
        log_input_examples=False,
        log_model_signatures=True,
        log_models=True,
        disable=False,
        exclusive=False,
        disable_for_unsupported_versions=False,
        silent=False,
        max_tuning_runs=10,
        log_post_training_metrics=True,
        serialization_format="cloudpickle",
        registered_model_name=None,
    )

    #
    # Store the runs in the requested experiment
    #
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:

        print("Active run_id: {}".format(run.info.run_id))

        estimator.fit(x_train, y_train)

        #
        # Report the best model found in this run
        #
        best_estimator = estimator.best_estimator_
        mse, mae, r2 = eval_metrics(y_test, y_pred=best_estimator.predict(x_test))
        if verbose > 0:
            report(best_estimator, mse, mae, r2)

        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

[3]:
import numpy as np

#
# First exploratory pass: a coarse 10 x 10 grid over alpha and l1_ratio.
#
coarse_alphas = np.linspace(0.0001, 0.5, 10)
coarse_l1_ratios = np.linspace(0.0001, 0.5, 10)

make_experiment(
    "red-wine",
    alphas=coarse_alphas,
    l1_ratios=coarse_l1_ratios,
    n_splits=5,
    verbose=1,
)
Tracking directory: file:///workspace/mlflow/corridas
2022/06/04 03:05:56 INFO mlflow.tracking.fluent: Experiment with name 'red-wine' does not exist. Creating a new experiment.
2022/06/04 03:05:57 WARNING mlflow.utils: Truncated the value of the key `param_grid`. Truncated value: `{'alpha': array([1.00000000e-04, 5.56444444e-02, 1.11188889e-01, 1.66733333e-01,
       2.22277778e-01, 2.77822222e-01, 3.33366667e-01, 3.88911111e-01,
       4.44455556e-01, 5.00000000e-01]), 'l1_ratio': array([1.00000000e-04, 5.56444444e-02, 1.1...`
Active run_id: f030202219cb413fa80755e476d5dd0b
2022/06/04 03:06:04 INFO mlflow.sklearn.utils: Logging the 10 best runs, 90 runs will be omitted.
ElasticNet(alpha=0.0001, l1_ratio=0.16673333333333332, random_state=12345):
  MSE: 0.4555342217989927
  MAE: 0.5292810437627216
  R2: 0.34641239384627454
[4]:
#
# Second exploratory pass: a finer 10 x 10 grid over narrower ranges of
# alpha and l1_ratio.
#
fine_alphas = np.linspace(0.0000001, 0.0002, 10)
fine_l1_ratios = np.linspace(0.1, 0.2, 10)

make_experiment(
    "red-wine",
    alphas=fine_alphas,
    l1_ratios=fine_l1_ratios,
    n_splits=5,
    verbose=1,
)
Tracking directory: file:///workspace/mlflow/corridas
2022/06/04 03:06:06 WARNING mlflow.utils: Truncated the value of the key `param_grid`. Truncated value: `{'alpha': array([1.00000000e-07, 2.23111111e-05, 4.45222222e-05, 6.67333333e-05,
       8.89444444e-05, 1.11155556e-04, 1.33366667e-04, 1.55577778e-04,
       1.77788889e-04, 2.00000000e-04]), 'l1_ratio': array([0.1       , 0.11111111, 0.12222222,...`
Active run_id: 37eb322f2b1f4c60af70114ad4381f43
2022/06/04 03:06:12 INFO mlflow.sklearn.utils: Logging the 10 best runs, 90 runs will be omitted.
ElasticNet(alpha=0.0002, l1_ratio=0.2, random_state=12345):
  MSE: 0.4555784329298052
  MAE: 0.5291677556910062
  R2: 0.3463489609673155

MLflow ui#

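Para visualizar la interfaz use el siguiente comando desde el directorio del notebook (las corridas de este ejemplo se almacenan en `corridas` y no en el directorio por defecto `mlruns`; si no aparecen en la interfaz, agregue `--backend-store-uri ./corridas`):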

mlflow ui

Nota: En docker usar:

mlflow ui --host 0.0.0.0

con:

http://127.0.0.1:5001

assets/mlflow-tracking-1-sklearn-part-0.png

Detalles de la corrida

assets/mlflow-tracking-1-sklearn-part-1.png assets/mlflow-tracking-1-sklearn-part-2.png assets/mlflow-tracking-1-sklearn-part-3.png

Chequeo#

[5]:
# def check_estimator():
#
#     import mlflow
#
#     x, y = load_data()
#     x_train, x_test, y_train, y_test = make_train_test_split(x, y)
#
#     # NOTA: este parámetro es copiado directamente de la interfase de MLflow
#     estimator_path = "runs:/41f159a7bf5b48348354b126242d4c23/model"
#     estimator = mlflow.pyfunc.load_model(estimator_path)
#     mse, mae, r2 = eval_metrics(y_test, y_pred=estimator.predict(x_test))
#     report(estimator, mse, mae, r2)
#
#
# #
# # Debe coincidir con el mejor modelo encontrado en la celdas anteriores
# #
# check_estimator()
[6]:
# -----------------------------------------------------------------------------
# No se borran las corridas para comparar resultados con otras librerías
# -----------------------------------------------------------------------------
# %%bash
# rm -rf outputs mlruns models