IterativeImputer
Imputador multivariado que estima cada característica a partir de todas las demás.
En cada paso, una columna es designada como la salida `y` y las demás columnas como las entradas `X`. Entonces, se ajusta un regresor sobre `(X, y)`.
El regresor obtenido para cada característica es usado para imputar los valores faltantes.
El proceso se realiza de forma iterativa durante `max_iter` rondas de imputación.
[1]:
import numpy as np
# Toy data sets with two columns each; np.nan marks a missing entry.

# Training data: rows 0-2 are fully observed, row 3 lacks column 0 and
# row 4 lacks column 1.
X_train = np.array(
    [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]],
)

# Test data: every row has exactly one missing entry.
X_test = np.array(
    [[np.nan, 2], [6, np.nan], [np.nan, 6]],
)
[2]:
from sklearn.linear_model import LinearRegression
# Manual walk-through of iterative imputation on the training data.
#
# Training matrix before any imputation (np.nan marks a missing entry):
#
#     [1,      2     ]
#     [3,      6     ]
#     [4,      8     ]
#     [np.nan, 3     ]
#     [7,      np.nan]
#
# Each missing entry is first replaced by the mean of the observed values
# in its own column (columns are numbered from 0 here):
#
#     column 0: (1 + 3 + 4 + 7) / 4 = 3.75
#     column 1: (2 + 6 + 8 + 3) / 4 = 4.75
#
X_train = np.array([[1, 2], [3, 6], [4, 8], [3.75, 3], [7, 4.75]])

# Repeatedly re-estimate the two filled-in entries, each from the other
# column, for a fixed number of rounds.
for i in range(100):
    m = LinearRegression()

    # Column 1 regressed on column 0; overwrite the filled-in entry of
    # column 1 (row 4) with the prediction for that row's column-0 value.
    m.fit(X_train[:, :1], X_train[:, 1])
    X_train[4, 1] = m.predict(X_train[4:5, :1])[0]

    # Column 0 regressed on column 1; overwrite the filled-in entry of
    # column 0 (row 3) with the prediction for that row's column-1 value.
    m.fit(X_train[:, 1:], X_train[:, 0])
    X_train[3, 0] = m.predict(X_train[3:4, 1:])[0]

# Notebook display of the completed training matrix.
X_train
[2]:
array([[ 1. , 2. ],
[ 3. , 6. ],
[ 4. , 8. ],
[ 1.5 , 3. ],
[ 7. , 13.99999999]])
[3]:
# Impute the test set using regressors fitted on the completed X_train.
#
# X_test before imputation (np.nan marks a missing entry):
#
#     [np.nan, 2     ]
#     [6,      np.nan]
#     [np.nan, 6     ]
#
m = LinearRegression()

# Column 1 regressed on column 0: fill row 1, whose observed column-0
# value is 6.
m.fit(X_train[:, :1], X_train[:, 1])
X_test[1, 1] = m.predict([[6]])[0]

# Column 0 regressed on column 1: fill rows 0 and 2, whose observed
# column-1 values are 2 and 6 respectively.
m.fit(X_train[:, 1:], X_train[:, 0])
X_test[0, 0] = m.predict([[2]])[0]
X_test[2, 0] = m.predict([[6]])[0]

# Notebook display of the completed test matrix.
X_test
[3]:
array([[ 1. , 2. ],
[ 6. , 11.99999999],
[ 3. , 6. ]])
[4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
# Same imputation as the manual walk-through, now delegated to
# scikit-learn's IterativeImputer. Every option is spelled out with a short
# note on what it controls.
iterativeImputer = IterativeImputer(
    # Estimator used at each step of the round-robin imputation.
    estimator=LinearRegression(),
    #
    # Placeholder that marks a missing value.
    missing_values=np.nan,
    #
    # Whether to sample each imputation from the (Gaussian) predictive
    # posterior of the fitted estimator.
    sample_posterior=False,
    #
    # Maximum number of imputation rounds to perform before returning the
    # imputations computed during the final round. One round is a single
    # imputation of each feature that has missing values.
    max_iter=10,
    #
    # Number of other features used to estimate the missing values of each
    # feature column. Nearness between features is measured by the absolute
    # correlation coefficient of each feature pair (after the initial
    # imputation).
    n_nearest_features=None,
    #
    # Strategy used to initialize the missing values:
    #   "mean" | "median" | "most_frequent" | "constant"
    initial_strategy="mean",
    #
    # Order in which the features are imputed:
    #   "ascending":  from features with fewest missing values to most.
    #   "descending": from features with most missing values to fewest.
    #   "roman":      left to right.
    #   "arabic":     right to left.
    #   "random":     a random order for each round.
    imputation_order="ascending",
    #
    # If True, features with missing values during transform which did not
    # have any missing values during fit are imputed with the initial
    # imputation method only.
    skip_complete=False,
    #
    # Minimum possible imputed value.
    min_value=-np.inf,
    #
    # Maximum possible imputed value.
    max_value=np.inf,
    #
    # Seed of the pseudo random number generator to use.
    random_state=None,
)

# Fit on the training data, then impute the test data; the transformed
# array is the value the notebook displays.
iterativeImputer.fit(X_train).transform(X_test)
[4]:
array([[ 1. , 2. ],
[ 6. , 11.99999999],
[ 3. , 6. ]])