Transformación de datos heterogéneos con ColumnTransformer#

Permite transformar las columnas individualmente o por grupos.

Las columnas pueden ser especificadas por nombre o por tipo.

Se pueden indicar transformaciones por defecto.

Dataset de prueba#

[1]:

import pandas as pd

#
# Build a small demo DataFrame, one tuple per book. The values in each
# tuple follow the order given in `columns`.
#
X = pd.DataFrame(
    [
        ("London", "His Last Bow", 5, 4),
        ("London", "How Watson Learned the Trick", 3, 5),
        ("Paris", "A Moveable Feast", 4, 4),
        ("Sallisaw", "The Grapes of Wrath", 5, 3),
    ],
    columns=["city", "title", "expert_rating", "user_rating"],
)

X

[1]:

	city	title	expert_rating	user_rating
0	London	His Last Bow	5	4
1	London	How Watson Learned the Trick	3	5
2	Paris	A Moveable Feast	4	4
3	Sallisaw	The Grapes of Wrath	5	3

ColumnTransformer#

[2]:

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# Encode `city` as one-hot integer indicators and `title` as bag-of-words
# counts. Every other column is discarded.
# NOTE: `city` is selected with a list and `title` with a plain string; see
# the ColumnTransformer docs on how the two selector forms differ before
# changing either one.
column_trans = ColumnTransformer(
    [
        # One (name, transformer, columns) tuple per subset of columns.
        ("categories", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    # "drop" is the default: columns not listed above are left out of the
    # output. remainder="passthrough" would pass them through instead.
    remainder="drop",
)

column_trans.fit(X)

[2]:

ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'),
                                 ['city']),
                                ('title_bow', CountVectorizer(), 'title')])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

[3]:

#
# Names of the transformed output columns. Each name is prefixed with the
# name of the transformer that produced it (`categories__`, `title_bow__`).
#
column_trans.get_feature_names_out()

[3]:

array(['categories__city_London', 'categories__city_Paris',
       'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
       'title_bow__grapes', 'title_bow__his', 'title_bow__how',
       'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
       'title_bow__of', 'title_bow__the', 'title_bow__trick',
       'title_bow__watson', 'title_bow__wrath'], dtype=object)

[4]:

#
# Transformed X, converted to a dense array so the values can be displayed
#
column_trans.transform(X).toarray()

[4]:

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]])

[5]:

#
# Show the transformed matrix as a DataFrame whose columns are labelled
# with the generated feature names.
#
pd.DataFrame(
    data=column_trans.transform(X).toarray(),
    columns=column_trans.get_feature_names_out(),
)

[5]:

	categories__city_London	categories__city_Paris	categories__city_Sallisaw	title_bow__bow	title_bow__feast	title_bow__grapes	title_bow__his	title_bow__how	title_bow__last	title_bow__learned	title_bow__moveable	title_bow__of	title_bow__the	title_bow__trick	title_bow__watson	title_bow__wrath
0	1	0	0	1	0	0	1	0	1	0	0	0	0	0	0	0
1	1	0	0	0	0	0	0	1	0	1	0	0	1	1	1	0
2	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0
3	0	0	1	0	0	1	0	0	0	0	0	1	1	0	0	1

Selección de columnas basadas en su tipo#

[6]:

import numpy as np
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler

# Select columns by dtype instead of listing them by name:
#   - numeric columns are standardized;
#   - object columns whose name matches the pattern "city" are one-hot
#     encoded.
# `title` matches neither selector, so it does not appear in the output.
ct = ColumnTransformer(
    transformers=[
        (
            "scale",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
        (
            "onehot",
            OneHotEncoder(),
            make_column_selector(pattern="city", dtype_include=object),
        ),
    ],
)
ct.fit_transform(X)

[6]:

array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],
       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],
       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])

[7]:

# Show the transformed data as a DataFrame labelled with the generated
# feature names.
pd.DataFrame(ct.transform(X), columns=ct.get_feature_names_out())

[7]:

	scale__expert_rating	scale__user_rating	onehot__city_London	onehot__city_Paris	onehot__city_Sallisaw
0	0.904534	0.000000	1.0	0.0	0.0
1	-1.507557	1.414214	1.0	0.0	0.0
2	-0.301511	0.000000	0.0	1.0	0.0
3	0.904534	-1.414214	0.0	0.0	1.0

Uso de “passthrough”#

[8]:

#
# With remainder="passthrough", the columns that no transformer claims
# (expert_rating and user_rating here) are copied unchanged to the end of
# the output matrix.
#
column_trans = ColumnTransformer(
    transformers=[
        ("city_category", OneHotEncoder(dtype="int"), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder="passthrough",
)

column_trans.fit_transform(X)

[8]:

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]])

Aplicación de un transformador por defecto#

[9]:

from sklearn.preprocessing import MinMaxScaler

# `remainder` can also be an estimator: the columns not claimed by any
# transformer are rescaled with MinMaxScaler instead of being dropped.
column_trans = ColumnTransformer(
    transformers=[
        ("city_category", OneHotEncoder(), ["city"]),
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder=MinMaxScaler(),
)

# Show only the last two output columns, i.e. the rescaled remainder
# (expert_rating and user_rating).
column_trans.fit_transform(X)[:, -2:]

[9]:

array([[1. , 0.5],
       [0. , 1. ],
       [0.5, 0.5],
       [1. , 0. ]])

Creación de un transformador con make_column_transformer#

[10]:

from sklearn.compose import make_column_transformer

# make_column_transformer takes (transformer, columns) pairs without explicit
# names. The steps are named automatically, as the repr shows
# ('onehotencoder', 'countvectorizer').
column_trans = make_column_transformer(
    (OneHotEncoder(), ["city"]),
    (CountVectorizer(), "title"),
    remainder=MinMaxScaler(),
)

# Display the (unfitted) transformer.
column_trans

[10]:

ColumnTransformer(remainder=MinMaxScaler(),
                  transformers=[('onehotencoder', OneHotEncoder(), ['city']),
                                ('countvectorizer', CountVectorizer(),
                                 'title')])

	categories__city_London	categories__city_Paris	categories__city_Sallisaw	title_bow__bow	title_bow__feast	title_bow__grapes	title_bow__his	title_bow__how	title_bow__last	title_bow__learned	title_bow__moveable	title_bow__of	title_bow__the	title_bow__trick	title_bow__watson	title_bow__wrath
0	1	0	0	1	0	0	1	0	1	0	0	0	0	0	0	0
1	1	0	0	0	0	0	0	1	0	1	0	0	1	1	1	0
2	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0
3	0	0	1	0	0	1	0	0	0	0	0	1	1	0	0	1

	categories__city_London	categories__city_Paris	categories__city_Sallisaw	title_bow__bow	title_bow__feast	title_bow__grapes	title_bow__his	title_bow__how	title_bow__last	title_bow__learned	title_bow__moveable	title_bow__of	title_bow__the	title_bow__trick	title_bow__watson	title_bow__wrath
0	1	0	0	1	0	0	1	0	1	0	0	0	0	0	0	0
1	1	0	0	0	0	0	0	1	0	1	0	0	1	1	1	0
2	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0
3	0	0	1	0	0	1	0	0	0	0	0	1	1	0	0	1

	categories__city_London	categories__city_Paris	categories__city_Sallisaw	title_bow__bow	title_bow__feast	title_bow__grapes	title_bow__his	title_bow__how	title_bow__last	title_bow__learned	title_bow__moveable	title_bow__of	title_bow__the	title_bow__trick	title_bow__watson	title_bow__wrath
0	1	0	0	1	0	0	1	0	1	0	0	0	0	0	0	0
1	1	0	0	0	0	0	0	1	0	1	0	0	1	1	1	0
2	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0
3	0	0	1	0	0	1	0	0	0	0	0	1	1	0	0	1