IterativeImputer

  • Imputador multivariado que estima cada característica a partir de todas las demás.

  • En cada paso, una columna se designa como la salida y, y las demás columnas como las entradas X. Luego se ajusta un regresor sobre (X, y).

  • El regresor obtenido para cada característica es usado para imputar los valores faltantes.

  • El proceso se repite de forma iterativa hasta un máximo de max_iter rondas de imputación.

[1]:
import numpy as np

# Training matrix with two features. In the three complete rows the second
# feature is exactly twice the first; rows 3 and 4 each have one missing
# entry, marked with np.nan.
X_train = np.array(
    [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]],
    dtype=float,
)

# Test matrix: every row has exactly one missing entry to be imputed.
X_test = np.array(
    [[np.nan, 2], [6, np.nan], [np.nan, 6]],
    dtype=float,
)
[2]:
from sklearn.linear_model import LinearRegression

# Manual version of the iterative imputation, using 0-based column indices.
#
# Step 0 - initial imputation: each missing entry of the raw training matrix
#
#     [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
#
# is replaced by the mean of the observed values in its column:
#
#     column 0: (1 + 3 + 4 + 7) / 4 = 3.75   -> goes into row 3
#     column 1: (2 + 6 + 8 + 3) / 4 = 4.75   -> goes into row 4
#
X_train = np.array(
    [[1, 2], [3, 6], [4, 8], [3.75, 3], [7, 4.75]],
)

# Step 1 - refinement: 100 rounds; in each round every originally missing
# entry is re-predicted from the other column with a linear regression.
#
# NOTE: this simplified loop fits on every row, including the row whose
# target is currently an imputed value. scikit-learn's documentation describes
# fitting only on rows where the target is known, so intermediate values may
# differ from what IterativeImputer computes.
for round_number in range(100):
    m = LinearRegression()

    # Impute column 1: regress column 1 on column 0, then overwrite the
    # originally missing entry (row 4) with the prediction for that row.
    m.fit(X_train[:, 0:1], X_train[:, 1])
    X_train[4, 1] = m.predict(X_train[4:5, 0:1])[0]

    # Impute column 0: regress column 0 on column 1, then overwrite the
    # originally missing entry (row 3) with the prediction for that row.
    m.fit(X_train[:, 1:2], X_train[:, 0])
    X_train[3, 0] = m.predict(X_train[3:4, 1:2])[0]

X_train
[2]:
array([[ 1.        ,  2.        ],
       [ 3.        ,  6.        ],
       [ 4.        ,  8.        ],
       [ 1.5       ,  3.        ],
       [ 7.        , 13.99999999]])
[3]:
# Impute the missing entries of X_test with linear regressions fitted on
# X_train (which the previous cell has already completed in place).
#
# X_test as originally defined:
#
#     [
#         [np.nan, 2],
#         [6, np.nan],
#         [np.nan, 6],
#     ]
#
# The predictor values are read from X_test itself instead of being retyped as
# literals, so this cell stays correct if the test data is edited.

m = LinearRegression()

# Column 1 from column 0: fills the missing entry in row 1.
m.fit(X_train[:, 0].reshape(-1, 1), X_train[:, 1])
X_test[1, 1] = m.predict(X_test[1:2, 0:1])[0]

# Column 0 from column 1: fills the missing entries in rows 0 and 2.
m.fit(X_train[:, 1].reshape(-1, 1), X_train[:, 0])
X_test[0, 0] = m.predict(X_test[0:1, 1:2])[0]
X_test[2, 0] = m.predict(X_test[2:3, 1:2])[0]

X_test
[3]:
array([[ 1.        ,  2.        ],
       [ 6.        , 11.99999999],
       [ 3.        ,  6.        ]])
[4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# BUG FIX: the earlier cells overwrite X_train and X_test in place with the
# manually imputed values, so at this point neither array contains np.nan any
# more. Passing them to IterativeImputer would leave it with nothing to impute
# and transform() would simply hand X_test back. The raw matrices (with their
# missing entries) are rebuilt here, under new names so the manually completed
# arrays stay available for comparison.
X_train_missing = np.array(
    [
        [1, 2],
        [3, 6],
        [4, 8],
        [np.nan, 3],
        [7, np.nan],
    ]
)

X_test_missing = np.array(
    [
        [np.nan, 2],
        [6, np.nan],
        [np.nan, 6],
    ]
)

iterativeImputer = IterativeImputer(
    # -------------------------------------------------------------------------
    # The estimator to use at each step of the round-robin imputation
    estimator=LinearRegression(),
    # -------------------------------------------------------------------------
    # The placeholder for the missing values.
    missing_values=np.nan,
    # -------------------------------------------------------------------------
    # Whether to sample from the (Gaussian) predictive posterior of the fitted
    # estimator for each imputation.
    sample_posterior=False,
    # -------------------------------------------------------------------------
    # Maximum number of imputation rounds to perform before returning the
    # imputations computed during the final round.  A round is a single
    # imputation of each feature with missing values.
    max_iter=10,
    # -------------------------------------------------------------------------
    # Number of other features to use to estimate the missing values of each
    # feature column. Nearness between features is measured using the absolute
    # correlation coefficient between each feature pair (after initial
    # imputation).
    n_nearest_features=None,
    # -------------------------------------------------------------------------
    # Which strategy to use to initialize the missing values.
    # - "mean"
    # - "median"
    # - "most_frequent"
    # - "constant"
    initial_strategy="mean",
    # -------------------------------------------------------------------------
    # The order in which the features will be imputed. Possible values:
    # - "ascending": From features with fewest missing values to most.
    # - "descending": From features with most missing values to fewest.
    # - "roman": Left to right.
    # - "arabic": Right to left.
    # - "random": A random order for each round.
    imputation_order="ascending",
    # -------------------------------------------------------------------------
    # If True then features with missing values during transform which did not
    # have any missing values during fit will be imputed with the initial
    # imputation method only.
    skip_complete=False,
    # -------------------------------------------------------------------------
    # Minimum possible imputed value.
    min_value=-np.inf,
    # -------------------------------------------------------------------------
    # Maximum possible imputed value.
    max_value=np.inf,
    # -------------------------------------------------------------------------
    # The seed of the pseudo random number generator to use.
    random_state=None,
)

# Fit on the matrix that still has missing entries so the imputer learns the
# per-feature regressors through its own round-robin procedure.
iterativeImputer.fit(X_train_missing)

# NOTE(review): the result is expected to be close to the manual imputation
# above, but the digits may differ because the imputer runs at most max_iter
# rounds rather than the 100 rounds of the manual loop - confirm by re-running.
iterativeImputer.transform(X_test_missing)
[4]:
array([[ 1.        ,  2.        ],
       [ 6.        , 11.99999999],
       [ 3.        ,  6.        ]])