OneHotEncoder

[1]:

import pandas as pd

# Toy dataset: two observations, each described by three categorical
# (string-valued) features. `X` holds the raw nested list and `df` the same
# rows with column labels attached.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
]

df = pd.DataFrame(
    data=X,
    columns=["sex", "from", "uses"],
)
# Last expression of the cell: show the labelled frame.
df

[1]:

	sex	from	uses
0	male	from US	uses Safari
1	female	from Europe	uses Firefox

[2]:

#
# Should not be used
#
# NOTE(review): presumably discouraged because get_dummies builds its columns
# from whatever frame it receives and keeps no fitted state to reuse on new
# data -- confirm the intended reason.
pd.get_dummies(data=df)

[2]:

	sex_female	sex_male	from_from Europe	from_from US	uses_uses Firefox	uses_uses Safari
0	0	1	0	1	0	1
1	1	0	1	0	1	0

[3]:

#
# With sklearn
#
import numpy as np
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder(
    # -------------------------------------------------------------------------
    # categories: the unique values expected for each feature.
    # - 'auto': infer the categories from the training data.
    # - list: categories[i] lists the categories expected in the i-th column.
    # Within one feature the supplied categories should not mix strings with
    # numeric values, and numeric categories should be given in sorted order.
    categories="auto",
    # -------------------------------------------------------------------------
    # drop: strategy for removing one category per feature.
    # - None: keep every category (the default).
    # - 'first': remove the first category of each feature; a feature with a
    #   single category is therefore dropped entirely.
    # - 'if_binary': remove the first category only for features that have
    #   exactly two categories; features with 1 or more than 2 are untouched.
    # - array: drop[i] names the category of feature X[:, i] to remove.
    drop=None,
    # -------------------------------------------------------------------------
    # dtype: desired dtype of the encoded output.
    dtype=np.float64,
    # -------------------------------------------------------------------------
    # handle_unknown: what to do when transform meets a category that was not
    # seen during fit (raising is the default).
    # - 'error'
    # - 'ignore'
    handle_unknown="error",
)

# Learn the categories from the raw nested list, then encode the same rows
# taken from the DataFrame; `.toarray()` turns the transform result into the
# plain array displayed as the cell output.
oneHotEncoder.fit(X).transform(df.to_numpy()).toarray()

[3]:

array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])

[4]:

# Categories found for each input column when the encoder was fitted:
# one array per feature, in column order.
oneHotEncoder.categories_

[4]:

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]