Test de permutación de un score con permutation_test_score — 7:34#

  • Última modificación: 2023-02-27 | YouTube

  • Evalúa la significancia de un score computado con validación cruzada usando permutaciones.

  • Se permuta la variable de salida para generar datos aleatorios y computar un valor p empírico en contra de la hipótesis nula de que las variables de entrada y la salida son independientes.

Datos#

[1]:
from sklearn.datasets import load_iris

# Load the Iris dataset and unpack its feature matrix and target vector.
iris = load_iris()
X, y = iris.data, iris.target

Experimento 1#

[2]:
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.svm import SVC

# Linear SVM evaluated with a shuffled, stratified 2-fold cross-validation.
clf = SVC(kernel="linear", random_state=7)
cv = StratifiedKFold(2, shuffle=True, random_state=0)

# Returns, in order: the score on the original labels, the scores obtained
# for each permutation of y, and the empirical p-value.
score, permutation_scores, pvalue = permutation_test_score(
    # -------------------------------------------------------------------------
    # The object to use to fit the data.
    estimator=clf,
    # -------------------------------------------------------------------------
    # The data to fit.
    X=X,
    # -------------------------------------------------------------------------
    # The target variable to try to predict in the case of supervised learning.
    y=y,
    # -------------------------------------------------------------------------
    # Labels to constrain permutation within groups, i.e. y values are permuted
    # among samples with the same group identifier. When not specified,
    # y values are permuted among all samples.
    groups=None,
    # -------------------------------------------------------------------------
    # Metric
    scoring="accuracy",
    # -------------------------------------------------------------------------
    # Determines the cross-validation splitting strategy.
    cv=cv,
    # -------------------------------------------------------------------------
    # Number of times to permute y.
    n_permutations=1000,
    # -------------------------------------------------------------------------
    # Pass an int for reproducible output for permutation of y values among
    # samples.
    random_state=0,
    # -------------------------------------------------------------------------
    # The verbosity level.
    verbose=0,
    # -------------------------------------------------------------------------
    # NOTE: `fit_params` (parameters forwarded to the estimator's fit method)
    # is deliberately not passed. Its default is None, and the keyword is
    # deprecated in recent scikit-learn releases in favor of `params`, so
    # omitting it keeps the same behavior while staying forward-compatible.
)

score
[2]:
0.9666666666666667
[3]:
# Display the scores obtained for the first ten permutations of y.
permutation_scores[:10]
[3]:
array([0.33333333, 0.26      , 0.35333333, 0.37333333, 0.33333333,
       0.36666667, 0.34      , 0.38      , 0.36666667, 0.34666667])
[4]:
# Display the empirical p-value returned by permutation_test_score.
pvalue
[4]:
0.000999000999000999
[5]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# density=True normalizes the histogram to unit area, so the y-axis shows a
# probability density rather than a probability.
ax.hist(permutation_scores, bins=20, density=True)
# Dashed red line: score obtained with the original (non-permuted) labels.
ax.axvline(score, ls="--", color="r")
score_label = f"Score on original\ndata: {score:.2f}\n(p-value: {pvalue:.3f})"
ax.text(0.7, 10, score_label, fontsize=12)
ax.set_xlabel("Accuracy score")
ax.set_ylabel("Probability density")
plt.show()
../_images/04_calculo_de_metricas_05_test_de_permutacion_9_0.png

Experimento 2#

[6]:
import numpy as np

# Build a feature matrix of pure Gaussian noise with one row per sample of X,
# so that the inputs carry no information about y.
rng = np.random.RandomState(seed=0)
n_uncorrelated_features = 20
X_rand = rng.normal(size=(len(X), n_uncorrelated_features))
[7]:
# Repeat the permutation test with the same classifier and CV splitter, but
# using the random features instead of the real ones.
score, permutation_scores, pvalue = permutation_test_score(
    estimator=clf,
    X=X_rand,
    y=y,
    scoring="accuracy",
    cv=cv,
    n_permutations=1000,
    # Explicit seed for the permutations of y. It matches the library default
    # (0), so results are unchanged; it is spelled out to mirror Experiment 1.
    random_state=0,
)

score
[7]:
0.30000000000000004
[8]:
# Display the scores obtained for the first ten permutations of y.
permutation_scores[:10]
[8]:
array([0.39333333, 0.28666667, 0.34      , 0.28      , 0.32      ,
       0.28      , 0.33333333, 0.31333333, 0.34      , 0.29333333])
[9]:
# Display the empirical p-value returned by permutation_test_score.
pvalue
[9]:
0.7772227772227772
[10]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# density=True normalizes the histogram to unit area, so the y-axis shows a
# probability density rather than a probability.
ax.hist(permutation_scores, bins=20, density=True)
# Dashed red line: score obtained with the original (non-permuted) labels.
ax.axvline(score, ls="--", color="r")
score_label = f"Score on original\ndata: {score:.2f}\n(p-value: {pvalue:.3f})"
# Anchor the annotation in axes-fraction coordinates (top-right corner).
# Fixed data coordinates such as (0.7, 10) fall outside the plotted range
# here, because the scores on random features stay near chance level.
ax.text(
    0.98,
    0.95,
    score_label,
    fontsize=12,
    ha="right",
    va="top",
    transform=ax.transAxes,
)
ax.set_xlabel("Accuracy score")
ax.set_ylabel("Probability density")
plt.show()
../_images/04_calculo_de_metricas_05_test_de_permutacion_15_0.png