Escritura de funciones#
Última modificación: Mayo 14, 2022 | YouTube
El código debe ser corto#
[ ]:
#
# ¿Cuánto tiempo se requiere para entender el siguiente código, que
# extrae los conjuntos train y test de un dataset para entrenar
# un modelo de ML?
#
def get_datasets(
    use_cut_points=True,
    use_quantiles=True,
    extract_titles=True,
    generate_dummies=True,
    dummies_columns=None,
):
    """Load the Titanic train/test CSV files and apply optional feature steps.

    The files are read from ``katas/data/titanic/`` under the directory three
    levels above this file.

    Args:
        use_cut_points: Bin ``Age`` with fixed cut points. Only applied when
            ``use_quantiles`` is false.
        use_quantiles: Bin ``Age`` into four quantile buckets. Only applied
            when ``use_cut_points`` is false.
        extract_titles: Add a ``Title`` column parsed from ``Name``.
        generate_dummies: One-hot encode the columns in ``dummies_columns``.
        dummies_columns: Column names to one-hot encode. Nothing is encoded
            when this is ``None`` or empty.

    Returns:
        A ``(train, test)`` tuple of DataFrames.

    Note:
        When ``use_cut_points`` and ``use_quantiles`` are both true (the
        defaults) or both false, neither binning branch runs and no
        ``Age_categories`` column is created.
    """
    application_path = pathlib.Path(__file__).parent.parent.parent.absolute()
    train = pd.read_csv(application_path / "katas/data/titanic/train.csv")
    test = pd.read_csv(application_path / "katas/data/titanic/test.csv")

    if use_cut_points and not use_quantiles:
        cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
        label_names = [
            "Missing",
            "Infant",
            "Child",
            "Teenager",
            "Young Adult",
            "Adult",
            "Senior",
        ]
        # Missing ages become -0.5 so they fall into the first (-1, 0] bin.
        train["Age"] = train["Age"].fillna(-0.5)
        train["Age_categories"] = pd.cut(train["Age"], cut_points, labels=label_names)
        test["Age"] = test["Age"].fillna(-0.5)
        test["Age_categories"] = pd.cut(test["Age"], cut_points, labels=label_names)
    elif use_quantiles and not use_cut_points:
        # Each dataset is filled with its own median before quantile binning.
        train["Age"] = train.Age.fillna(train.Age.median())
        test["Age"] = test.Age.fillna(test.Age.median())
        train["Age_categories"] = pd.qcut(train.Age, q=4, labels=False)
        test["Age_categories"] = pd.qcut(test.Age, q=4, labels=False)

    if extract_titles:
        # Raw strings keep "\." a regex escape instead of an invalid string escape.
        train["Title"] = train.Name.apply(
            lambda x: re.search(r" ([A-Z][a-z]+)\.", x).group(1)
        )
        test["Title"] = test.Name.apply(
            lambda x: re.search(r" ([A-Z][a-z]+)\.", x).group(1)
        )
        train["Title"] = train["Title"].replace(
            {"Mlle": "Miss", "Mme": "Mrs", "Ms": "Miss"}
        )
        # NOTE(review): this call passes a list with no replacement value and
        # is applied to train only. How pandas treats the missing value depends
        # on the pandas version -- confirm the intended replacement label and
        # whether test needs the same step.
        train["Title"] = train["Title"].replace(
            [
                "Don",
                "Dona",
                "Rev",
                "Dr",
                "Major",
                "Lady",
                "Sir",
                "Col",
                "Capt",
                "Countess",
                "Jonkheer",
            ]
        )
        test["Title"] = test["Title"].replace(
            {"Mlle": "Miss", "Mme": "Mrs", "Ms": "Miss"}
        )

    if generate_dummies and dummies_columns:
        for column_name in dummies_columns:
            # Age_categories only exists when exactly one binning flag is set,
            # so skip it when both flags agree (both true or both false).
            if column_name == "Age_categories" and bool(use_cut_points) == bool(
                use_quantiles
            ):
                continue
            if column_name == "Title" and not extract_titles:
                continue
            else:
                dummies = pd.get_dummies(train[column_name], prefix=column_name)
                train = pd.concat([train, dummies], axis=1)
                dummies = pd.get_dummies(test[column_name], prefix=column_name)
                test = pd.concat([test, dummies], axis=1)
    return train, test
[ ]:
#
# Existen múltiples variables booleanas en el código que generan
# múltiples trayectorias. Se puede refactorizar para mejorar la
# legibilidad.
#
def transform_df(
    input_df,
    use_cut_points,
    use_quantiles,
    extract_titles,
    generate_dummies,
    dummies_columns,
):
    """Run the feature steps, in order, on a copy of ``input_df``.

    The steps are: fill the age column, append age categories, append
    titles, and append one-hot encoded columns. Each step receives the
    result of the previous one; the final result is returned.
    """
    transformed = fill_age_column(input_df.copy(), use_cut_points, use_quantiles)
    transformed = append_age_categories(transformed, use_cut_points, use_quantiles)
    transformed = append_titles(transformed, extract_titles)
    return append_one_hot_encoded_columns(
        transformed,
        use_cut_points,
        use_quantiles,
        extract_titles,
        generate_dummies,
        dummies_columns,
    )
Las funciones deben tener un solo objetivo#
Functions should do one thing.
They should do it well.
They should do it only.
Ejemplos de una cosa (thing):
Interactuar con los archivos del sistema para leer datos.
Aplicar una transformación a una columna.
Aplicar la misma transformación en varios datasets.
Aplicar un mismo transformador a varias columnas.
Llamar funciones (como el ejemplo de arriba).
Nivel simple de abstracción#
Se puede verificar al hacer la analogía a los capítulos y secciones de un libro.
Cantidad de argumentos en las funciones#
Cero es ideal. Tenga un máximo de seis.
Nunca use argumentos de salida#
[ ]:
def push_model(model, serving_endpoint):
    """Post the model's JSON to ``serving_endpoint`` and flag it as uploaded.

    The ``model`` argument itself is mutated (``uploaded`` is set to True)
    and then returned.
    """
    serving_endpoint.post(model.to_json())
    model.uploaded = True
    return model
Argumentos binarios#
Preferiblemente no use argumentos binarios.
Nunca use argumentos binarios que modifican el código de forma anidada.
Un argumento binario con if/else es una indicación de que la función está haciendo más de una cosa.
[ ]:
# ---< Mal >-------------------------------------------------------------------
def feature_transform_complex(input_df, with_mean=True, with_std=False):
    """Scale ``input_df`` according to the two boolean flags.

    Args:
        input_df: Data to scale.
        with_mean: Request mean-based scaling.
        with_std: Request standard-deviation-based scaling.

    Returns:
        The result of the scaling helper selected by the flags, or
        ``input_df`` itself when both flags are false.
    """
    if with_mean and with_std:
        df = scale_with_mean_and_std(input_df)
    elif with_mean:
        df = scale_with_mean(input_df)
    elif with_std:
        df = scale_with_std(input_df)
    else:
        # No scaling requested: hand the input back untouched.
        df = input_df
    return df
[ ]:
# ---< Ok >--------------------------------------------------------------------
# Note que esta función actúa como un orquestador.
def feature_transform_simple(
    input_df,
    with_one_hot_encoding=True,
    with_remove_stop_words=False,
):
    """Apply the optional, independent transformations selected by the flags.

    Args:
        input_df: Data to transform.
        with_one_hot_encoding: One-hot encode the ``"city"`` column.
        with_remove_stop_words: Remove stop words from ``"text_comment"``.

    Returns:
        The transformed data, or ``input_df`` itself when both flags are
        false.
    """
    df = input_df
    if with_one_hot_encoding:
        df = gen_one_hot_encoding(df, column="city")
    if with_remove_stop_words:
        df = remove_stop_words(df, column="text_comment")
    return df
Funciones con muchos argumentos#
[ ]:
# ---< Mal >-------------------------------------------------------------------
def feature_transform(
    df,
    with_mean=True,
    with_std=False,
    copy=True,
    with_max_clipping=True,
    with_min_clipping=True,
    with_one_hot_encoding=True,
    with_generate_embeddings=True,
):
    """Placeholder showing a signature with too many flags; does nothing."""
[ ]:
# ---< Mejor >-----------------------------------------------------------------
from dataclasses import dataclass


@dataclass
class FeatureConfig:
    """Options for ``feature_transform`` grouped into a single object.

    Every field has a default, so ``FeatureConfig()`` gives the default
    configuration and individual options can be overridden by keyword,
    e.g. ``FeatureConfig(with_std=True)``.
    """

    with_mean: bool = True
    with_std: bool = False
    copy: bool = True
    with_max_clipping: bool = True
    with_min_clipping: bool = True
    with_one_hot_encoding: bool = True
    with_generate_embeddings: bool = True
def feature_transform(df, config: FeatureConfig):
    """Placeholder: takes the data plus one config object; does nothing."""
Llamada de funciones#
Las funciones en Python tienen argumentos posicionales, keywords, *args y **kwargs.
Siempre use keywords
build_model(size, "high")
build_model(model_capacity=size, model_threshold="high")
Efectos colaterales en tuberías de ingeniería de características#
Opción 1: Copie todo
[ ]:
def fill_age_nan(input_df):
    """Return a copy of ``input_df`` whose missing ``Age`` values are the median age."""
    result = input_df.copy()
    age = result["Age"]
    result["Age"] = age.fillna(age.median())
    return result
def append_is_adult_column(input_df):
    """Return a copy of ``input_df`` with an ``is_adult`` column (0 if Age <= 21, else 1)."""
    result = input_df.copy()
    # `not age <= 21` keeps the original rule exactly, including for NaN ages.
    result["is_adult"] = result["Age"].apply(lambda age: int(not age <= 21))
    return result
# Each step returns a new DataFrame, so the result is rebound to df.
df = fill_age_nan(df)
df = append_is_adult_column(df)
Opción 2: Retorne una nueva columna
[ ]:
def get_median_age(input_df):
    """Return the median of the ``Age`` column without modifying ``input_df``."""
    return input_df["Age"].median()
def get_is_adult_column(input_df):
    """Return a 0/1 Series derived from the ``Age`` column of ``input_df``.

    A row gets 0 when its age is 21 or less and 1 otherwise, the same rule
    used by the ``append_is_adult_column`` variants in this file.
    ``input_df`` is not modified.
    """
    # Read from the argument, not from a module-level `df`.
    is_adult = input_df["Age"].apply(lambda x: 0 if x <= 21 else 1)
    return is_adult
# The helpers only compute values; the assignments to df happen here at the call site.
median_age = get_median_age(df)
df["Age"] = df["Age"].fillna(median_age)
df["is_adult"] = get_is_adult_column(df)
Opción 3: Append solo dentro de las funciones
[ ]:
def fill_age_nan(df):
    """Add an ``Age_no_nan`` column to ``df`` in place; ``Age`` itself is left as is."""
    age = df["Age"]
    df["Age_no_nan"] = age.fillna(age.median())
def append_is_adult_column(df):
    """Add an ``is_adult`` column to ``df`` in place (0 if Age_no_nan <= 21, else 1)."""
    # `not age <= 21` keeps the original rule exactly, including for NaN values.
    df["is_adult"] = df["Age_no_nan"].apply(lambda age: int(not age <= 21))
# Both calls add a column to df in place and return nothing.
# append_is_adult_column reads the Age_no_nan column that fill_age_nan creates.
fill_age_nan(df)
append_is_adult_column(df)
Haga el acoplamiento temporal explícito#
[ ]:
def append_age_based_features(df):
    """Run the age-based feature helpers on ``df``, then restore its ``Age`` column.

    ``Age`` is copied before the helpers run and written back afterwards,
    so whatever the helpers do to ``Age`` does not persist in ``df``.
    The helpers are called for their effect on ``df``; nothing is returned.
    """
    original_age = df["Age"].copy()  # <<<----- snapshot Age before any helper runs
    fill_age_nan(df)
    append_is_adult_column(df)
    append_is_senior_column(df)
    append_is_millenial_column(df)
    df["Age"] = original_age  # <<<----- put the original Age values back
Evite mezclar comandos y consultas#
[ ]:
# Mezcla de comandos y consultas
def fill_age_nan(df):
    """Fill missing ``Age`` values in ``df`` in place and also return ``df``."""
    age = df["Age"]
    df["Age"] = age.fillna(age.median())
    return df
# Query version
def get_age_with_no_nan(df):
    """Return the ``Age`` column with missing values set to the median; ``df`` is untouched."""
    age = df["Age"]
    median_age = age.median()
    return age.fillna(median_age)
# Command version
def fill_age_nan_inplace(df):
    """Fill missing ``Age`` values in ``df`` in place; returns nothing."""
    age = df["Age"]
    df["Age"] = age.fillna(age.median())
Use excepciones#
[ ]:
# ---< Mal >-------------------------------------------------------------------
def reset_experiment_components():
    """Reset metrics, trainable layers and gradients, checking a status code after each step.

    A step only runs if the previous one returned the success code; the
    first step that does not return it has its failure message printed.
    """
    ok = 1
    if reset_accuracy_metrics() != ok:
        print("Unable to reset accuracy metrics")
    else:
        if model.reset_trainable_layers() != ok:
            print("model layers reset failed")
        else:
            if model.reset_gradients() != ok:
                print("model gradients reset failed")
            else:
                print("metrics and model reset successfully")
[ ]:
# ---< Mejor >-----------------------------------------------------------------
def reset_experiment_components():
    """Reset metrics, trainable layers and gradients, reporting failures via exceptions.

    The success message is printed only when all three resets complete
    without raising one of the handled exceptions.
    """
    try:
        reset_accuracy_metrics()
        model.reset_trainable_layers()
        model.reset_gradients()
    except MetricsException:
        print("Unable to reset accuracy metrics")
    except ModelLayerResetException:
        print("model layers reset failed")
    except ModelGradientResetException:
        print("model gradients reset failed")
    else:
        print("metrics and model reset successfully")
Separe el flujo normal de los casos excepcionales#
[ ]:
# ---< Bien >------------------------------------------------------------------
def reset_experiment_components():
    """Call ``reset_components`` and print a message for each handled failure type.

    This function contains only the error handling; the reset steps
    themselves are in ``reset_components``.
    """
    try:
        reset_components()
    except MetricsException:
        print("Unable to reset accuracy metrics")
    except ModelLayerResetException:
        print("model layers reset failed")
    except ModelGradientResetException:
        print("model gradients reset failed")
def reset_components():
    """Run the three reset steps in order, then print the success message.

    Contains no error handling: any exception from a step propagates to
    the caller, and the success message is not printed in that case.
    """
    reset_accuracy_metrics()
    model.reset_trainable_layers()
    model.reset_gradients()
    print("metrics and model reset successfully")
Derive clases personalizadas para sus excepciones#
[ ]:
class Error(Exception):
    """Common base class for the custom exceptions defined below."""
# NOTE(review): this name shadows Python's built-in ImportError for any code
# that sees this definition; consider a more specific name.
class ImportError(Error):
    """Raised when an import cannot find the target package."""
class NotFittedError(Error):
    """Raised when an estimator is called before it has been fit."""
class ConvergenceError(Error):
    """Raised when training hits its last iteration without the loss converging."""