LogisticRegressionCV
Implements logistic regression with cross-validation over the regularization parameter C.
[1]:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
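Before fitting, a quick sanity check on the loaded data can be useful. This sketch is not part of the original notebook output; it only prints the shape and class counts of the breast cancer dataset.

import numpy as np

# Number of samples and features, and how many samples fall in each class.
print(X.shape)
print(np.bincount(y))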
[2]:
from sklearn.linear_model import LogisticRegressionCV
logisticRegressionCV = LogisticRegressionCV(
# --------------------------------------------------------------------------
# Each of the values in Cs describes the inverse of regularization strength.
# If Cs is an int, a grid of Cs values is chosen on a logarithmic scale
# between 1e-4 and 1e4. As in support vector machines, smaller values
# specify stronger regularization. The grid actually searched and the value
# selected by cross-validation are inspected after fitting, below.
Cs=[1e-3, 1e-2, 1e-1, 1],
# --------------------------------------------------------------------------
# Specifies if a constant (a.k.a. bias or intercept) should be added to the
# decision function.
fit_intercept=True,
# --------------------------------------------------------------------------
# The default cross-validation generator used is Stratified K-Folds. If an
# integer is provided, then it is the number of folds used.
cv=None,
# --------------------------------------------------------------------------
# Specify the norm of the penalty:
# * 'l2': adds an L2 penalty term (the default choice).
# * 'l1': adds an L1 penalty term.
# * 'elasticnet': adds both L1 and L2 penalty terms.
penalty="l2",
# --------------------------------------------------------------------------
# A string (see model evaluation documentation) or a scorer callable
# object / function with signature scorer(estimator, X, y).
scoring=None,
# --------------------------------------------------------------------------
# Algorithm to use in the optimization problem. Default is 'lbfgs'. To
# choose a solver, you might want to consider the following aspects:
# * For small datasets, 'liblinear' is a good choice, whereas 'sag' and
# 'saga' are faster for large ones.
# * For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
# handle multinomial loss.
# * 'liblinear' is limited to one-versus-rest schemes.
# * 'newton-cholesky' is a good choice for n_samples >> n_features,
# especially with one-hot encoded categorical features with rare
# categories. Note that it is limited to binary classification and the
# one-versus-rest reduction for multiclass classification. Be aware that
# the memory usage of this solver has a quadratic dependency on n_features
# because it explicitly computes the Hessian matrix.
solver="lbfgs",
# --------------------------------------------------------------------------
# Tolerance for stopping criteria.
tol=0.0001,
# --------------------------------------------------------------------------
# Maximum number of iterations taken for the solvers to converge.
max_iter=1000,
# --------------------------------------------------------------------------
# Weights associated with classes in the form {class_label: weight}. If not
# given, all classes are supposed to have weight one.
# * 'balanced' uses the values of y to automatically adjust weights
# inversely proportional to class frequencies in the input data as
# n_samples / (n_classes * np.bincount(y)).
class_weight=None,
# --------------------------------------------------------------------------
# If set to True, the scores are averaged across all folds, and the coefs
# and the C that correspond to the best score are taken, and a final refit
# is done using these parameters.
#
# Otherwise the coefs, intercepts and C that correspond to the best scores
# across folds are averaged.
refit=True,
# --------------------------------------------------------------------------
# Used when solver == 'sag', 'saga' or 'liblinear' to shuffle the data.
random_state=None,
# --------------------------------------------------------------------------
# {'ovr', 'multinomial', 'auto'}
# If the option chosen is 'ovr', then a binary problem is fit for each
# label. For 'multinomial' the loss minimised is the multinomial loss fit
# across the entire probability distribution, even when the data is binary.
multi_class="auto",
# --------------------------------------------------------------------------
# The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if
# penalty='elasticnet'. Setting l1_ratio=0 is equivalent to using
# penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'.
# For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
l1_ratios=None,
)
logisticRegressionCV.fit(X, y)
logisticRegressionCV.predict(X)
[2]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
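After fitting, the grid of C values that was searched and the value selected by cross-validation are exposed as fitted attributes (Cs_, C_, scores_). A minimal inspection sketch, with outputs not shown here:

# Grid of C values that was evaluated (the explicit list passed as Cs above).
print(logisticRegressionCV.Cs_)

# Value of C selected by cross-validation (typically shape (1,) for a binary
# problem).
print(logisticRegressionCV.C_)

# Cross-validated scores: a dict keyed by class; for a binary problem the
# only key is classes_[1]. Each value has shape (n_folds, n_Cs).
scores = logisticRegressionCV.scores_[logisticRegressionCV.classes_[1]]
print(scores.mean(axis=0))  # mean score per candidate C across folds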
[3]:
logisticRegressionCV.intercept_
[3]:
array([29.63204311])
[4]:
logisticRegressionCV.coef_
[4]:
array([[ 0.98073972, 0.19049121, -0.29300361, 0.02408592, -0.16773977,
-0.22202195, -0.51221874, -0.27695865, -0.25210201, -0.03132659,
-0.07860476, 1.30855956, 0.13383261, -0.11150831, -0.02331991,
0.05411632, -0.04088007, -0.03580027, -0.03659877, 0.01168161,
0.09922278, -0.44981211, -0.10590221, -0.01316356, -0.33656522,
-0.71536982, -1.38542672, -0.57016644, -0.70482241, -0.09764186]])
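The coefficients and intercept define the linear decision function of the model. A minimal sketch, assuming the binary sigmoid formulation scikit-learn uses for two-class problems, showing that applying the logistic sigmoid to the decision function reproduces the positive-class probabilities:

import numpy as np
from scipy.special import expit

# Linear decision function: one value per sample, X @ w + b.
decision = X @ logisticRegressionCV.coef_[0] + logisticRegressionCV.intercept_[0]

# For a binary problem, P(y = 1 | x) is the sigmoid of the decision function,
# so this should match predict_proba(X)[:, 1].
manual_proba = expit(decision)
print(np.allclose(manual_proba, logisticRegressionCV.predict_proba(X)[:, 1]))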
[5]:
logisticRegressionCV.predict_proba(X)[0:10]
[5]:
array([[1.00000000e+00, 2.75164335e-14],
[9.99995346e-01, 4.65357777e-06],
[9.99999464e-01, 5.35566632e-07],
[6.86987914e-01, 3.13012086e-01],
[9.99745484e-01, 2.54516312e-04],
[7.52946748e-01, 2.47053252e-01],
[9.99994218e-01, 5.78244161e-06],
[9.89586771e-01, 1.04132294e-02],
[9.43836648e-01, 5.61633519e-02],
[9.98662627e-01, 1.33737312e-03]])
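Each row of predict_proba holds the probabilities for the classes in classes_ order and sums to 1, and predict returns the most probable class. A small sketch (not from the original notebook) confirming that relationship:

import numpy as np

proba = logisticRegressionCV.predict_proba(X)

# Columns follow the order of classes_ and each row sums to 1.
print(logisticRegressionCV.classes_)
print(np.allclose(proba.sum(axis=1), 1.0))

# predict() returns the most probable class for each sample.
labels = logisticRegressionCV.classes_[np.argmax(proba, axis=1)]
print(np.array_equal(labels, logisticRegressionCV.predict(X)))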
[6]:
logisticRegressionCV.score(X, y)
[6]:
0.9595782073813708
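The accuracy above is computed on the same data used for fitting, so it is an optimistic estimate. A hedged sketch of an out-of-sample evaluation using a simple hold-out split; the split size and random_state are arbitrary choices, not from the original notebook:

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)

# Same estimator settings as above, fitted only on the training split.
model = LogisticRegressionCV(Cs=[1e-3, 1e-2, 1e-1, 1], max_iter=1000)
model.fit(X_train, y_train)

# Accuracy on data the model did not see during fitting.
print(model.score(X_test, y_test))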