Tracking with Scikit-Learn using the local filesystem#
Last modified: May 14, 2022
Base code#
[1]:
def load_data():

    import pandas as pd

    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    df = pd.read_csv(url, sep=";")

    y = df["quality"]
    x = df.copy()
    x.pop("quality")

    return x, y


def make_train_test_split(x, y):

    from sklearn.model_selection import train_test_split

    (x_train, x_test, y_train, y_test) = train_test_split(
        x,
        y,
        test_size=0.25,
        random_state=123456,
    )
    return x_train, x_test, y_train, y_test


def eval_metrics(y_true, y_pred):

    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return mse, mae, r2


def report(estimator, mse, mae, r2):

    print(estimator, ":", sep="")
    print(f" MSE: {mse}")
    print(f" MAE: {mae}")
    print(f" R2: {r2}")
MLflow Tracking#
[2]:
def make_experiment(experiment_name, alphas, l1_ratios, n_splits=5, verbose=1):

    import os

    from sklearn.linear_model import ElasticNet
    from sklearn.model_selection import GridSearchCV

    import mlflow
    import mlflow.sklearn

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    param_grid = {
        "alpha": alphas,
        "l1_ratio": l1_ratios,
    }

    estimator = GridSearchCV(
        estimator=ElasticNet(
            random_state=12345,
        ),
        param_grid=param_grid,
        cv=n_splits,
        refit=True,
        verbose=0,
        return_train_score=False,
    )

    #
    # Set the tracking directory. In this example it is the absolute path to
    # the current directory.
    #
    if not os.path.exists("corridas"):
        os.makedirs("corridas")

    mlflow.set_tracking_uri("file:///workspace/mlflow/corridas")
    print("Tracking directory:", mlflow.get_tracking_uri())

    #
    # Autotracking
    #
    mlflow.sklearn.autolog(
        log_input_examples=False,
        log_model_signatures=True,
        log_models=True,
        disable=False,
        exclusive=False,
        disable_for_unsupported_versions=False,
        silent=False,
        max_tuning_runs=10,
        log_post_training_metrics=True,
        serialization_format="cloudpickle",
        registered_model_name=None,
    )

    #
    # Store the runs under the specified experiment
    #
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:

        run = mlflow.active_run()
        print("Active run_id: {}".format(run.info.run_id))

        estimator.fit(x_train, y_train)

        #
        # Report the best model found in the run
        #
        estimator = estimator.best_estimator_
        mse, mae, r2 = eval_metrics(y_test, y_pred=estimator.predict(x_test))
        if verbose > 0:
            report(estimator, mse, mae, r2)

        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
[3]:
import numpy as np

#
# Run the first (coarse) search
#
make_experiment(
    experiment_name="red-wine",
    alphas=np.linspace(0.0001, 0.5, 10),
    l1_ratios=np.linspace(0.0001, 0.5, 10),
    n_splits=5,
    verbose=1,
)
Tracking directory: file:///workspace/mlflow/corridas
2022/06/04 03:05:56 INFO mlflow.tracking.fluent: Experiment with name 'red-wine' does not exist. Creating a new experiment.
2022/06/04 03:05:57 WARNING mlflow.utils: Truncated the value of the key `param_grid`. Truncated value: `{'alpha': array([1.00000000e-04, 5.56444444e-02, 1.11188889e-01, 1.66733333e-01,
2.22277778e-01, 2.77822222e-01, 3.33366667e-01, 3.88911111e-01,
4.44455556e-01, 5.00000000e-01]), 'l1_ratio': array([1.00000000e-04, 5.56444444e-02, 1.1...`
Active run_id: f030202219cb413fa80755e476d5dd0b
2022/06/04 03:06:04 INFO mlflow.sklearn.utils: Logging the 10 best runs, 90 runs will be omitted.
ElasticNet(alpha=0.0001, l1_ratio=0.16673333333333332, random_state=12345):
MSE: 0.4555342217989927
MAE: 0.5292810437627216
R2: 0.34641239384627454
[4]:
#
# Run the second (refined) search
#
make_experiment(
    experiment_name="red-wine",
    alphas=np.linspace(0.0000001, 0.0002, 10),
    l1_ratios=np.linspace(0.1, 0.2, 10),
    n_splits=5,
    verbose=1,
)
Tracking directory: file:///workspace/mlflow/corridas
2022/06/04 03:06:06 WARNING mlflow.utils: Truncated the value of the key `param_grid`. Truncated value: `{'alpha': array([1.00000000e-07, 2.23111111e-05, 4.45222222e-05, 6.67333333e-05,
8.89444444e-05, 1.11155556e-04, 1.33366667e-04, 1.55577778e-04,
1.77788889e-04, 2.00000000e-04]), 'l1_ratio': array([0.1 , 0.11111111, 0.12222222,...`
Active run_id: 37eb322f2b1f4c60af70114ad4381f43
2022/06/04 03:06:12 INFO mlflow.sklearn.utils: Logging the 10 best runs, 90 runs will be omitted.
ElasticNet(alpha=0.0002, l1_ratio=0.2, random_state=12345):
MSE: 0.4555784329298052
MAE: 0.5291677556910062
R2: 0.3463489609673155
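Because the tracking URI points to the local filesystem, each experiment and run is stored as plain directories and files under the configured root. The following sketch, added here only for illustration, assumes the corridas directory created above and MLflow's standard file-store layout (roughly <experiment_id>/<run_id>/ with meta.yaml, metrics, params, tags, and artifacts inside each run); it prints the top levels of that tree:

import os

#
# Walk the local tracking store and print only the experiment- and run-level
# directories (deeper levels are skipped to keep the listing short).
#
root_dir = "corridas"  # assumed relative location of the file-based store
for current, dirs, files in os.walk(root_dir):
    depth = current[len(root_dir):].count(os.sep)
    if depth <= 2:
        print(current)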
MLflow ui#
To open the web interface, use:
mlflow ui
Note: when running inside Docker, use:
mlflow ui --host 0.0.0.0
[Screenshot: run details in the MLflow UI]
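By default, mlflow ui reads the ./mlruns directory of the working directory from which it is launched. Because this example stores its runs in a custom location, the interface can also be pointed explicitly at that store with the --backend-store-uri option, using, for example, the tracking URI configured above:

mlflow ui --backend-store-uri file:///workspace/mlflow/corridas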
Check#
[5]:
# def check_estimator():
#
#     import mlflow
#
#     x, y = load_data()
#     x_train, x_test, y_train, y_test = make_train_test_split(x, y)
#
#     # NOTE: this parameter is copied directly from the MLflow UI
#     estimator_path = "runs:/41f159a7bf5b48348354b126242d4c23/model"
#     estimator = mlflow.pyfunc.load_model(estimator_path)
#     mse, mae, r2 = eval_metrics(y_test, y_pred=estimator.predict(x_test))
#     report(estimator, mse, mae, r2)
#
#
# #
# # It must match the best model found in the previous cells
# #
# check_estimator()
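As an alternative to copying the run_id by hand from the UI, the best run could also be located programmatically. The following sketch is an illustrative addition, not part of the original notebook; it assumes the tracking URI and the experiment name "red-wine" used above, and that the test-set mse metric was logged on the parent runs:

import mlflow
from mlflow.tracking import MlflowClient

# Point the client at the local file-based tracking store used above
mlflow.set_tracking_uri("file:///workspace/mlflow/corridas")
client = MlflowClient()

experiment = client.get_experiment_by_name("red-wine")
best_run = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.mse > 0",  # keep only runs that logged the test mse
    order_by=["metrics.mse ASC"],
    max_results=1,
)[0]

# Load the model logged under that run and report it
estimator = mlflow.pyfunc.load_model(f"runs:/{best_run.info.run_id}/model")
print(best_run.info.run_id, best_run.data.metrics["mse"])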
[6]:
# -----------------------------------------------------------------------------
# The runs are not deleted, so that the results can be compared with other libraries
# -----------------------------------------------------------------------------
# %%bash
# rm -rf outputs mlruns models