sklearn.tree.DecisionTreeRegressor
Last modified: 2023-03-11 | YouTube
Decision tree for regression.
[1]:
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
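The diabetes dataset provides 442 samples with 10 standardized features. A quick shape check (a small sketch, not part of the original notebook):
# Sketch: confirm the dimensions of the feature matrix and the target vector.
X.shape, y.shape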
[2]:
from sklearn.tree import DecisionTreeRegressor
decisionTreeRegressor = DecisionTreeRegressor(
# --------------------------------------------------------------------------
# The function to measure the quality of a split.
# * 'squared_error': mean squared error, which is equal to variance
#   reduction as feature selection criterion and minimizes the L2 loss
#   using the mean of each terminal node.
# * 'friedman_mse': mean squared error with Friedman's improvement score
#   for potential splits.
# * 'absolute_error': mean absolute error, which minimizes the L1 loss
#   using the median of each terminal node.
# * 'poisson': reduction in Poisson deviance to find splits.
criterion="squared_error",
# --------------------------------------------------------------------------
#
# The remaining parameters are discussed in DecisionTreeClassifier.
#
# --------------------------------------------------------------------------
# The strategy used to choose the split at each node. Supported strategies
# are “best” to choose the best split and “random” to choose the best
# random split.
splitter="best",
# --------------------------------------------------------------------------
# The maximum depth of the tree. If None, then nodes are expanded until all
# leaves are pure or until all leaves contain less than min_samples_split
# samples.
max_depth=None,
# --------------------------------------------------------------------------
# The minimum number of samples required to split an internal node:
# * If int, then consider min_samples_split as the minimum number.
# * If float, then min_samples_split is a fraction and
# ceil(min_samples_split * n_samples) are the minimum number of samples
# for each split.
min_samples_split=2,
# --------------------------------------------------------------------------
# The minimum number of samples required to be at a leaf node. A split
# point at any depth will only be considered if it leaves at least
# min_samples_leaf training samples in each of the left and right branches.
# This may have the effect of smoothing the model, especially in
# regression.
# * If int, then consider min_samples_leaf as the minimum number.
# * If float, then min_samples_leaf is a fraction and
# ceil(min_samples_leaf * n_samples) are the minimum number of samples
# for each node.
min_samples_leaf=1,
# --------------------------------------------------------------------------
# The minimum weighted fraction of the sum total of weights (of all the
# input samples) required to be at a leaf node. Samples have equal weight
# when sample_weight is not provided.
min_weight_fraction_leaf=0.0,
# --------------------------------------------------------------------------
# The number of features to consider when looking for the best split:
# * If int, then consider max_features features at each split.
# * If float, then max_features is a fraction and
# max(1, int(max_features * n_features_in_)) features are considered at
# each split.
# * If “sqrt”, then max_features=sqrt(n_features).
# * If “log2”, then max_features=log2(n_features).
# * If None, then max_features=n_features.
max_features=None,
# --------------------------------------------------------------------------
# Controls the randomness of the estimator. The features are always
# randomly permuted at each split, even if splitter is set to "best". When
# max_features < n_features, the algorithm will select max_features at
# random at each split before finding the best split among them. But the
# best found split may vary across different runs, even if
# max_features=n_features. That is the case, if the improvement of the
# criterion is identical for several splits and one split has to be
# selected at random. To obtain a deterministic behaviour during fitting,
# random_state has to be fixed to an integer.
random_state=None,
# --------------------------------------------------------------------------
# Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are
# defined as relative reduction in impurity. If None then unlimited number
# of leaf nodes.
max_leaf_nodes=None,
# --------------------------------------------------------------------------
# A node will be split if this split induces a decrease of the impurity
# greater than or equal to this value.
#
# The weighted impurity decrease equation is the following:
#
# N_t / N * (impurity - N_t_R / N_t * right_impurity
# - N_t_L / N_t * left_impurity)
#
# where N is the total number of samples, N_t is the number of samples at
# the current node, N_t_L is the number of samples in the left child, and
# N_t_R is the number of samples in the right child.
#
# N, N_t, N_t_R and N_t_L all refer to the weighted sum, if sample_weight
# is passed.
min_impurity_decrease=0.0,
# --------------------------------------------------------------------------
# Complexity parameter used for Minimal Cost-Complexity Pruning. The
# subtree with the largest cost complexity that is smaller than ccp_alpha
# will be chosen. By default, no pruning is performed.
ccp_alpha=0.0,
)
decisionTreeRegressor.fit(X, y)
decisionTreeRegressor.score(X, y)
[2]:
1.0
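The score method returns the coefficient of determination (R²) on the data passed to it; because the tree was grown without any depth or pruning constraints, it fits the training set perfectly and reports 1.0. A minimal sketch of estimating out-of-sample performance instead, using an illustrative max_depth that is not part of the original notebook:
from sklearn.model_selection import cross_val_score

# Sketch: 5-fold cross-validated R^2 for a depth-limited tree. The 1.0 above is
# the training R^2 of the unpruned tree, not a generalization estimate.
cross_val_score(
    DecisionTreeRegressor(max_depth=3, random_state=0),
    X,
    y,
    cv=5,
    scoring="r2",
).mean()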
[3]:
decisionTreeRegressor.predict(X)
[3]:
array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,
69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,
68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,
87., 65., 102., 265., 276., 252., 90., 100., 55., 61., 92.,
259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,
128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,
150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,
200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,
42., 111., 98., 164., 48., 96., 90., 162., 150., 279., 92.,
83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,
104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,
107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,
60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,
197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,
59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,
237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,
143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,
142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,
77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,
78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,
154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,
71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,
150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,
145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,
94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,
60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,
31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,
114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,
191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,
244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,
263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,
77., 109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,
58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,
140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,
219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,
43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 72.,
140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,
84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,
94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,
220., 57.])
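The ccp_alpha parameter documented above enables Minimal Cost-Complexity Pruning. The candidate alpha values for this training set can be inspected with cost_complexity_pruning_path; a minimal sketch, assuming the standard scikit-learn pruning API:
# Sketch: effective alphas and total leaf impurities along the pruning path.
# Larger ccp_alpha values prune more nodes; ccp_alpha=0.0 keeps the full tree.
path = DecisionTreeRegressor(random_state=0).cost_complexity_pruning_path(X, y)
path.ccp_alphas[:5], path.impurities[:5]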
[4]:
decisionTreeRegressor.feature_importances_
[4]:
array([0.04278675, 0.00902901, 0.23099855, 0.08696601, 0.08256838,
0.04880831, 0.06316078, 0.01869221, 0.34793949, 0.06905048])
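feature_importances_ follows the column order of X. A small sketch that pairs each importance with its feature name, assuming the feature_names attribute returned by load_diabetes (not used above):
# Sketch: rank the diabetes features by impurity-based importance.
features = load_diabetes()
sorted(
    zip(features.feature_names, decisionTreeRegressor.feature_importances_),
    key=lambda item: item[1],
    reverse=True,
)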