KBinsDiscretizer
[1]:
import numpy as np

# Toy dataset: 11 samples x 3 features, stored as float64.
# Feature 0 is simply the sample index 0..10; features 1 and 2 are
# small hand-picked integer-valued columns.
X = np.column_stack(
    [
        np.arange(11, dtype=float),
        np.array([4, 5, 6, 3, 7, 3, 7, 6, 3, 2, 1], dtype=float),
        np.array([12, 15, 14, 11, 11, 13, 11, 14, 18, 13, 14], dtype=float),
    ]
)
X
[1]:
array([[ 0., 4., 12.],
[ 1., 5., 15.],
[ 2., 6., 14.],
[ 3., 3., 11.],
[ 4., 7., 11.],
[ 5., 3., 13.],
[ 6., 7., 11.],
[ 7., 6., 14.],
[ 8., 3., 18.],
[ 9., 2., 13.],
[10., 1., 14.]])
[2]:
from sklearn.preprocessing import KBinsDiscretizer

# Bin each of the 3 features of X into discrete intervals:
# feature 0 gets 3 bins, features 1 and 2 get 2 bins each.
kBinsDiscretizer = KBinsDiscretizer(
    # -------------------------------------------------------------------------
    # Bins per feature. Any entry < 2 raises ValueError.
    n_bins=[3, 2, 2],
    # -------------------------------------------------------------------------
    # How the transformed result is encoded:
    # 'ordinal' yields the integer bin index per feature, while
    # 'onehot' / 'onehot-dense' would yield a one-hot encoding
    # (sparse and dense, respectively).
    encode="ordinal",
    # -------------------------------------------------------------------------
    # How bin edges are chosen:
    # 'quantile' places the same number of samples in each bin;
    # 'uniform' would give equal-width bins, and 'kmeans' would group
    # values by nearest 1D k-means cluster center.
    strategy="quantile",
    # -------------------------------------------------------------------------
    # Output dtype (np.float32 or np.float64); left at the default.
    # dtype=None,
)
# fit_transform(X) is equivalent to fit(X) followed by transform(X):
# it learns bin_edges_ / n_bins_ and returns the binned array.
kBinsDiscretizer.fit_transform(X)
[2]:
array([[0., 1., 0.],
[0., 1., 1.],
[0., 1., 1.],
[0., 0., 0.],
[1., 1., 0.],
[1., 0., 1.],
[1., 1., 0.],
[2., 1., 1.],
[2., 0., 1.],
[2., 0., 1.],
[2., 0., 1.]])
[3]:
# The learned bin edges, one array per feature; the per-feature arrays have
# different lengths (n_bins_[i] + 1 edges), hence the object-dtype container.
kBinsDiscretizer.bin_edges_
[3]:
array([array([ 0. , 3.33333333, 6.66666667, 10. ]),
array([1., 4., 7.]), array([11., 13., 18.])], dtype=object)
[4]:
# The number of bins actually used per feature — here identical to the
# n_bins=[3, 2, 2] passed to the constructor (see output below).
kBinsDiscretizer.n_bins_
[4]:
array([3, 2, 2])