Transformación de datos heterogéneos con ColumnTransformer#

  • Permite transformar las columnas individualmente o por grupos.

  • Las columnas pueden ser especificadas por nombre o por tipo.

  • Se pueden indicar transformaciones por defecto.

Dataset de prueba#

import pandas as pd

# Build a small test DataFrame that mixes a categorical column (city),
# a free-text column (title) and two numeric rating columns.
X = pd.DataFrame(
    {
        "city": ["London", "London", "Paris", "Sallisaw"],
        "title": [
            "His Last Bow",
            "How Watson Learned the Trick",
            "A Moveable Feast",
            "The Grapes of Wrath",
        ],
        "expert_rating": [5, 3, 4, 5],
        "user_rating": [4, 5, 4, 3],
    }
)

city title expert_rating user_rating
0 London His Last Bow 5 4
1 London How Watson Learned the Trick 3 5
2 Paris A Moveable Feast 4 4
3 Sallisaw The Grapes of Wrath 5 3


from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `city` (integer output) and turn `title` into a bag of
# words; the rating columns are not listed, so they are dropped.
column_trans = ColumnTransformer(
    # -------------------------------------------------------------------------
    # List of (name, transformer, columns) tuples specifying the transformer
    # objects to be applied to subsets of the data.
    transformers=[
        ("categories", OneHotEncoder(dtype="int"), ["city"]),
        # CountVectorizer expects a 1-D sequence of strings, hence the column
        # is given as a scalar name rather than a list.
        ("title_bow", CountVectorizer(), "title"),
    ],
    # -------------------------------------------------------------------------
    # By default, only the specified columns in transformers are transformed
    # and combined in the output, and the non-specified columns are dropped.
    # (default of 'drop'). By specifying remainder='passthrough', all remaining
    # columns that were not specified in transformers will be automatically
    # passed through.
    remainder="drop",
)

# Fit before asking for feature names or transforming.
column_trans.fit(X)
ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'),
                                 ['city']),
                                ('title_bow', CountVectorizer(), 'title')])
# Nombres de las columnas transformadas
array(['categories__city_London', 'categories__city_Paris',
       'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
       'title_bow__grapes', 'title_bow__his', 'title_bow__how',
       'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
       'title_bow__of', 'title_bow__the', 'title_bow__trick',
       'title_bow__watson', 'title_bow__wrath'], dtype=object)
# X transformado
array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]])
# Visualización como un dataframe
categories__city_London categories__city_Paris categories__city_Sallisaw title_bow__bow title_bow__feast title_bow__grapes title_bow__his title_bow__how title_bow__last title_bow__learned title_bow__moveable title_bow__of title_bow__the title_bow__trick title_bow__watson title_bow__wrath
0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0
2 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0
3 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1

Selección de columnas basadas en su tipo#

import numpy as np
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler

# Select columns by dtype instead of by explicit name: numeric columns are
# standardised, and object-dtype columns whose name matches "city" are
# one-hot encoded.
ct = ColumnTransformer(
    [
        ("scale", StandardScaler(), make_column_selector(dtype_include=np.number)),
        (
            "onehot",
            OneHotEncoder(),
            make_column_selector(pattern="city", dtype_include=object),
        ),
    ]
)
ct.fit_transform(X)
array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],
       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],
       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])
# Show the transformed matrix as a DataFrame labelled with the feature names
# generated by the fitted transformer.
pd.DataFrame(
    data=ct.transform(X),
    columns=ct.get_feature_names_out(),
)
scale__expert_rating scale__user_rating onehot__city_London onehot__city_Paris onehot__city_Sallisaw
0 0.904534 0.000000 1.0 0.0 0.0
1 -1.507557 1.414214 1.0 0.0 0.0
2 -0.301511 0.000000 0.0 1.0 0.0
3 0.904534 -1.414214 0.0 0.0 1.0

Uso de “passthrough”#

# Copy the columns that are not transformed into the output matrix unchanged
# by setting remainder="passthrough".
column_trans = ColumnTransformer(
    [
        ("city_category", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder="passthrough",
)
column_trans.fit_transform(X)

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]])

Aplicación de un transformador por defecto#

from sklearn.preprocessing import MinMaxScaler

# Apply a default transformer to every column not named explicitly: the two
# rating columns are rescaled by MinMaxScaler instead of being dropped.
column_trans = ColumnTransformer(
    [
        ("city_category", OneHotEncoder(), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder=MinMaxScaler(),
)

# Remainder columns are appended last, so the final two columns are the
# scaled ratings.
column_trans.fit_transform(X)[:, -2:]
array([[1. , 0.5],
       [0. , 1. ],
       [0.5, 0.5],
       [1. , 0. ]])

Creación de un transformador con make_column_transformer#

from sklearn.compose import make_column_transformer

# make_column_transformer takes (transformer, columns) pairs and derives each
# step name from the transformer's class name, so no names are given here.
column_trans = make_column_transformer(
    (OneHotEncoder(), ["city"]),
    (CountVectorizer(), "title"),
    # NOTE(review): this argument was lost in extraction; it is inferred from
    # the indentation of the repr output shown below — confirm.
    remainder=MinMaxScaler(),
)
column_trans

ColumnTransformer(remainder=MinMaxScaler(),
                  transformers=[('onehotencoder', OneHotEncoder(), ['city']),
                                ('countvectorizer', CountVectorizer(),
                                 'title')])
