OneHotEncoder
[1]:
import pandas as pd
# Toy dataset: each inner list is one sample, each position one categorical
# feature. Kept as a plain list of lists so it can also be fed to encoders
# directly.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
]

# Same data wrapped in a DataFrame with named columns.
df = pd.DataFrame(
    data=X,
    columns=["sex", "from", "uses"],
)
df
[1]:
sex | from | uses | |
---|---|---|---|
0 | male | from US | uses Safari |
1 | female | from Europe | uses Firefox |
[2]:
#
# Should not be used (shown only for comparison).
# NOTE(review): presumably discouraged because get_dummies builds its columns
# from whatever data it is given rather than from a fitted set of categories
# -- confirm the intended rationale.
#
pd.get_dummies(df)
[2]:
sex_female | sex_male | from_from Europe | from_from US | uses_uses Firefox | uses_uses Safari | |
---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 0 | 1 |
1 | 1 | 0 | 1 | 0 | 1 | 0 |
[3]:
#
# With scikit-learn
#
import numpy as np
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder(
    # ---------------------------------------------------------------------
    # categories: the unique values expected for each feature.
    #   "auto" -> infer them from the data passed to fit().
    #   list   -> categories[i] lists the values expected in column i.
    # Within one feature, strings and numbers must not be mixed, and numeric
    # categories must be given in sorted order.
    categories="auto",
    # ---------------------------------------------------------------------
    # drop: strategy for removing one category per feature.
    #   None        -> keep every category (default).
    #   "first"     -> remove the first category of each feature; a feature
    #                  with a single category disappears completely.
    #   "if_binary" -> remove the first category only for two-category
    #                  features; all other features are left untouched.
    #   array       -> drop[i] names the category to remove from X[:, i].
    drop=None,
    # ---------------------------------------------------------------------
    # dtype: element type of the encoded output.
    dtype=np.float64,
    # ---------------------------------------------------------------------
    # handle_unknown: what to do when transform() meets a category that was
    # not seen during fit().
    #   "error"  -> raise (default).
    #   "ignore" -> do not raise.
    handle_unknown="error",
)

# fit() returns the encoder itself, so learning the categories and encoding
# the DataFrame values can be chained; toarray() turns the encoded result
# into a dense array for display.
oneHotEncoder.fit(X).transform(df.values).toarray()
[3]:
array([[0., 1., 0., 1., 0., 1.],
[1., 0., 1., 0., 1., 0.]])
[4]:
# Inspect the categories held by the fitted encoder: one array per input
# column, in column order.
oneHotEncoder.categories_
[4]:
[array(['female', 'male'], dtype=object),
array(['from Europe', 'from US'], dtype=object),
array(['uses Firefox', 'uses Safari'], dtype=object)]