sklearn.tree.ExtraTreeClassifier

  • Last modified: 2023-03-11

  • It is an extremely randomized classification tree.

  • It differs from the classic decision tree in the way it is built.

  • When the samples at a node are split into two groups, a random split is drawn for each of the max_features selected features and the best of those splits is chosen (see the sketch after this list).

  • When max_features is set to 1, a completely random tree is built.
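
The following sketch is a minimal, self-contained illustration of that split rule, not scikit-learn's actual implementation; the helper names gini and random_split are made up for this example. For each candidate feature a single cut point is drawn uniformly at random between the feature's minimum and maximum, and the candidate with the lowest weighted Gini impurity is kept.

import numpy as np

def gini(labels):
    # Gini impurity of a set of class labels.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def random_split(X, y, max_features, rng):
    # Draw one random threshold per candidate feature and keep the best one.
    n_samples, n_features = X.shape
    features = rng.choice(n_features, size=max_features, replace=False)
    best = None
    for f in features:
        threshold = rng.uniform(X[:, f].min(), X[:, f].max())
        left = X[:, f] <= threshold
        if left.all() or not left.any():
            continue  # degenerate split, skip it
        # Weighted impurity of the two children (lower is better).
        score = (left.sum() * gini(y[left]) + (~left).sum() * gini(y[~left])) / n_samples
        if best is None or score < best[0]:
            best = (score, f, threshold)
    return best  # (weighted impurity, feature index, threshold)

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(20, 4))               # small synthetic dataset
y_demo = (X_demo[:, 2] > 0).astype(int)
print(random_split(X_demo, y_demo, max_features=2, rng=rng))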

[1]:
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
[2]:
from sklearn.tree import ExtraTreeClassifier

extraTreeClassifier = ExtraTreeClassifier(
    # --------------------------------------------------------------------------
    #
    #    Takes exactly the same parameters as DecisionTreeClassifier; only the
    #    defaults differ (splitter="random" and max_features="sqrt").
    #
    # --------------------------------------------------------------------------
    # The function to measure the quality of a split. Supported criteria are
    # “gini” for the Gini impurity and “log_loss” and “entropy” both for the
    # Shannon information gain
    criterion="gini",
    # --------------------------------------------------------------------------
    # The strategy used to choose the split at each node. Supported strategies
    # are “best” to choose the best split and “random” to choose the best
    # random split. For ExtraTreeClassifier the default is “random”, which is
    # what makes the splits extremely randomized.
    splitter="random",
    # --------------------------------------------------------------------------
    # The maximum depth of the tree. If None, then nodes are expanded until all
    # leaves are pure or until all leaves contain less than min_samples_split
    # samples.
    max_depth=None,
    # --------------------------------------------------------------------------
    # The minimum number of samples required to split an internal node:
    # * If int, then consider min_samples_split as the minimum number.
    # * If float, then min_samples_split is a fraction and
    #   ceil(min_samples_split * n_samples) are the minimum number of samples
    #   for each split.
    min_samples_split=2,
    # --------------------------------------------------------------------------
    # The minimum number of samples required to be at a leaf node. A split
    # point at any depth will only be considered if it leaves at least
    # min_samples_leaf training samples in each of the left and right branches.
    # This may have the effect of smoothing the model, especially in
    # regression.
    # * If int, then consider min_samples_leaf as the minimum number.
    # * If float, then min_samples_leaf is a fraction and
    #   ceil(min_samples_leaf * n_samples) are the minimum number of samples
    #   for each node.
    min_samples_leaf=1,
    # --------------------------------------------------------------------------
    # The minimum weighted fraction of the sum total of weights (of all the
    # input samples) required to be at a leaf node. Samples have equal weight
    # when sample_weight is not provided.
    min_weight_fraction_leaf=0.0,
    # --------------------------------------------------------------------------
    # The number of features to consider when looking for the best split:
    # * If int, then consider max_features features at each split.
    # * If float, then max_features is a fraction and
    #   max(1, int(max_features * n_features_in_)) features are considered at
    #   each split.
    # * If “sqrt”, then max_features=sqrt(n_features).
    # * If “log2”, then max_features=log2(n_features).
    # * If None, then max_features=n_features.
    max_features=None,
    # --------------------------------------------------------------------------
    # Controls the randomness of the estimator. The features are always
    # randomly permuted at each split, even if splitter is set to "best". When
    # max_features < n_features, the algorithm will select max_features at
    # random at each split before finding the best split among them. But the
    # best found split may vary across different runs, even if
    # max_features=n_features. That is the case, if the improvement of the
    # criterion is identical for several splits and one split has to be
    # selected at random. To obtain a deterministic behaviour during fitting,
    # random_state has to be fixed to an integer.
    random_state=None,
    # --------------------------------------------------------------------------
    # Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are
    # defined as relative reduction in impurity. If None then unlimited number
    # of leaf nodes.
    max_leaf_nodes=None,
    # --------------------------------------------------------------------------
    # A node will be split if this split induces a decrease of the impurity
    # greater than or equal to this value.
    #
    # The weighted impurity decrease equation is the following:
    #
    #   N_t / N * (impurity - N_t_R / N_t * right_impurity
    #                       - N_t_L / N_t * left_impurity)
    #
    # where N is the total number of samples, N_t is the number of samples at
    # the current node, N_t_L is the number of samples in the left child, and
    # N_t_R is the number of samples in the right child.
    #
    # N, N_t, N_t_R and N_t_L all refer to the weighted sum, if sample_weight
    # is passed.
    min_impurity_decrease=0.0,
    # --------------------------------------------------------------------------
    # Weights associated with classes in the form {class_label: weight}. If
    # None, all classes are supposed to have weight one.
    class_weight=None,
    # --------------------------------------------------------------------------
    # Complexity parameter used for Minimal Cost-Complexity Pruning. The
    # subtree with the largest cost complexity that is smaller than ccp_alpha
    # will be chosen. By default, no pruning is performed.
    ccp_alpha=0.0,
)

extraTreeClassifier.fit(X, y)
extraTreeClassifier.score(X, y)
[2]:
1.0
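
The 1.0 above is the accuracy on the same data the tree was trained on; a fully grown tree will always fit its training set perfectly, so it says little about generalization. The sketch below (assuming nothing beyond scikit-learn itself) evaluates on a held-out split and also wraps the tree in a BaggingClassifier, since extra trees are normally used inside ensemble methods; the exact scores depend on the split and the scikit-learn version.

from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

single_tree = ExtraTreeClassifier(random_state=0).fit(X_train, y_train)
ensemble = BaggingClassifier(
    ExtraTreeClassifier(random_state=0), random_state=0
).fit(X_train, y_train)

print(single_tree.score(X_test, y_test))  # a single extra tree
print(ensemble.score(X_test, y_test))     # a bagged ensemble of extra trees
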
[3]:
extraTreeClassifier.predict(X)
[3]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
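
The fitted tree also exposes class-membership probabilities through predict_proba. On its own training data a fully grown tree returns one-hot rows, since every leaf is pure; for instance:

extraTreeClassifier.predict_proba(X[:3])
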
[4]:
extraTreeClassifier.classes_
[4]:
array([0, 1, 2])
[5]:
extraTreeClassifier.n_classes_
[5]:
3
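
A few other inspection points of the fitted tree; the exact values depend on the random splits drawn during fit:

print(extraTreeClassifier.get_depth())           # depth of the grown tree
print(extraTreeClassifier.get_n_leaves())        # number of leaves
print(extraTreeClassifier.feature_importances_)  # impurity-based importances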