La función make_blobs — 7:11 min

  • 7:11 min | Última modificación: Septiembre 27, 2021 | YouTube

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs

La función make_blobs de scikit-learn se usa para generar clusters de datos n-dimensionales, los cuales son usados para probar algoritmos de clustering y clasificación. En esta lección se describe el uso de esta función.

[1]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Generate three clusters of 2-D points and plot each one in its own color.
X, y = make_blobs(
    # -------------------------------------------------------------------------
    # If int, it is the total number of points equally divided among clusters.
    # If array-like, each element of the sequence indicates the number of
    # samples per cluster.
    n_samples=150,
    # -------------------------------------------------------------------------
    # The number of features for each sample.
    n_features=2,
    # -------------------------------------------------------------------------
    # The number of centers to generate, or the fixed center locations.
    centers=3,
    # -------------------------------------------------------------------------
    # The standard deviation of the clusters.
    cluster_std=0.8,
    # -------------------------------------------------------------------------
    # The bounding box for each cluster center when centers are generated at
    # random.
    center_box=(-10, 10),
    # -------------------------------------------------------------------------
    # Do not shuffle the samples.
    shuffle=False,
    # -------------------------------------------------------------------------
    # Determines random number generation for dataset creation.
    random_state=12345,
)


plt.figure(figsize=(7, 7))

# Select each cluster through the labels returned in `y` instead of
# hard-coded slice boundaries, so the plot stays correct if `n_samples`,
# `centers` or `shuffle` change.
for label, color in enumerate(("tab:red", "tab:blue", "tab:green")):
    members = y == label
    plt.scatter(
        X[members, 0],
        X[members, 1],
        color=color,
        edgecolors="k",
        s=120,
        alpha=0.9,
    )

# Keep only the left and bottom axis lines, drawn in gray.
ax = plt.gca()
for side in ("left", "bottom"):
    ax.spines[side].set_color("gray")
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

plt.axis("equal")
plt.show()
../_images/53_datasets_20_make_blobs_3_0.png
[5]:
# Generate three clusters of different sizes around explicitly given centers
# and plot each one in its own color.
X, y = make_blobs(
    # One entry per cluster: the number of samples drawn for that cluster.
    n_samples=[50, 30, 20],
    # The number of features for each sample.
    n_features=2,
    # Fixed center locations, one (x, y) pair per cluster.
    centers=[
        [-8, -4],
        [3, 0],
        [-3, 7],
    ],
    # The standard deviation of the clusters.
    cluster_std=2.5,
    # NOTE(review): the bounding box applies to randomly generated centers;
    # with fixed centers it presumably has no effect. Kept to mirror the
    # original call.
    center_box=(-10, 10),
    # Do not shuffle the samples.
    shuffle=False,
    # Determines random number generation for dataset creation.
    random_state=12345,
)


plt.figure(figsize=(9, 8))

# Select each cluster through the labels returned in `y` instead of
# hard-coded slice boundaries that would have to mirror `n_samples`.
for label, color in enumerate(("tab:red", "tab:blue", "tab:green")):
    members = y == label
    plt.scatter(
        X[members, 0],
        X[members, 1],
        color=color,
        edgecolors="k",
        s=120,
        alpha=0.9,
    )

# Keep only the left and bottom axis lines, drawn in gray.
ax = plt.gca()
for side in ("left", "bottom"):
    ax.spines[side].set_color("gray")
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

plt.axis("equal")
plt.show()
../_images/53_datasets_20_make_blobs_4_0.png