OneHotEncoder
[1]:
import pandas as pd
# Two sample records; every field is a categorical string value.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
]
# Wrap the raw records in a DataFrame with one named column per field.
df = pd.DataFrame(
    data=X,
    columns=["sex", "from", "uses"],
)
# Bare expression: displays the DataFrame as the notebook cell output.
df
[1]:
| | sex | from | uses |
|---|---|---|---|
| 0 | male | from US | uses Safari |
| 1 | female | from Europe | uses Firefox |
[2]:
#
# Should not be used
# (shown only for comparison with the sklearn encoder)
#
pd.get_dummies(data=df)
[2]:
| | sex_female | sex_male | from_from Europe | from_from US | uses_uses Firefox | uses_uses Safari |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1 | 1 | 0 | 1 | 0 | 1 | 0 |
[3]:
#
# With sklearn
#
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Build the encoder with each option spelled out, then fit it on the raw
# records in X. fit() returns the estimator itself, so the fitted encoder is
# what gets bound to the name.
oneHotEncoder = OneHotEncoder(
    # -------------------------------------------------------------------------
    # categories: the unique values expected for each feature.
    # - 'auto': infer the categories from the training data.
    # - list: categories[i] lists the categories expected in column i.
    # Within one feature, strings and numeric values should not be mixed, and
    # numeric categories should be given in sorted order.
    categories="auto",
    # -------------------------------------------------------------------------
    # drop: strategy for removing one category per feature.
    # - None: keep every category (the default).
    # - 'first': remove the first category of each feature; a feature with a
    #   single category is dropped entirely.
    # - 'if_binary': remove the first category only for features with exactly
    #   two categories; features with 1 or more than 2 are left intact.
    # - array: drop[i] names the category to remove from feature X[:, i].
    drop=None,
    # -------------------------------------------------------------------------
    # dtype: desired dtype of the encoded output.
    dtype=np.float64,
    # -------------------------------------------------------------------------
    # handle_unknown: what to do when transform meets a category that was not
    # seen during fit.
    # - 'error': raise an error (the default).
    # - 'ignore': ignore the unknown category.
    handle_unknown="error",
).fit(X)

# Encode the DataFrame rows and convert the result to a dense array so the
# notebook can display it.
oneHotEncoder.transform(df.values).toarray()
[3]:
array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])
[4]:
# Categories determined from the training data, one array per input feature
# (in the same order as the columns passed to fit).
oneHotEncoder.categories_
[4]:
[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]