OneHotEncoder

[1]:
import pandas as pd

# Toy dataset: two observations, each described by three categorical
# (string-valued) features.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
]

# Same rows wrapped in a DataFrame so every feature gets a column label.
column_names = ["sex", "from", "uses"]
df = pd.DataFrame(data=X, columns=column_names)
df
[1]:
sex from uses
0 male from US uses Safari
1 female from Europe uses Firefox
[2]:
#
# Should not be used
# NOTE(review): presumably because get_dummies keeps no fitted state that
# could re-apply the same columns to new data — confirm against pandas docs.
#
pd.get_dummies(df)
[2]:
sex_female sex_male from_from Europe from_from US uses_uses Firefox uses_uses Safari
0 0 1 0 1 0 1
1 1 0 1 0 1 0
[3]:
#
# With sklearn
#
import numpy as np
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder(
    # -------------------------------------------------------------------------
    # categories: the unique values expected in each feature.
    # - "auto": infer the categories from the data seen during fit.
    # - list: categories[i] lists the values expected in the i-th column.
    # Within a single feature, strings and numbers must not be mixed, and
    # numeric categories have to be supplied in sorted order.
    categories="auto",
    # -------------------------------------------------------------------------
    # drop: strategy for removing one category per feature.
    # - None: keep every category (the default).
    # - "first": remove the first category of each feature; a feature with a
    #   single category disappears entirely.
    # - "if_binary": remove the first category only for features that have
    #   exactly two categories; features with 1 or 3+ categories are untouched.
    # - array: drop[i] names the category to remove from feature X[:, i].
    drop=None,
    # -------------------------------------------------------------------------
    # dtype: element type of the encoded output.
    dtype=np.float64,
    # -------------------------------------------------------------------------
    # handle_unknown: what to do when transform meets a category that was not
    # seen during fit.
    # - "error": raise (the default).
    # - "ignore": do not raise.
    handle_unknown="error",
)

# Learn the categories from the raw rows, then encode the same rows taken from
# the DataFrame. The encoder output (sparse by default) is converted to a
# dense NumPy array with .toarray() so it can be displayed.
oneHotEncoder.fit(X).transform(df.values).toarray()
[3]:
array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])
[4]:
# Categories found during fit: one array per input column, in column order.
oneHotEncoder.categories_
[4]:
[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]