Analysis of hourly demand patterns#

The file https://github.com/jdvelasq/datalabs/blob/master/datasets/demanda_comercial.csv contains the hourly commercial demand of the SIN (Sistema Interconectado Nacional). Using the information provided, determine how many types of day there are.

Data loading#

[1]:
import pandas as pd


owner = "jdvelasq"
repo = "datalabs"
file = "datasets/demanda_comercial.csv"

file_url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{file}"


data = pd.read_csv(file_url, sep=";", decimal=",", thousands=".")
data.head()
[1]:
Fecha H01 H02 H03 H04 H05 H06 H07 H08 H09 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
0 2017-01-01 6090271.96 5887031.28 5708693.89 5538133.02 5389774.53 5288199.94 4851217.85 4876162.18 5073813.27 ... 5745568.87 5647807.40 5609297.84 5690105.56 6761038.26 7077861.59 6979468.90 6695312.12 6289107.72 5794545.98
1 2017-01-02 5557603.42 5361012.17 5239119.86 5163721.70 5249539.75 5438852.95 5566947.57 6055220.80 6588005.66 ... 7610559.73 7584026.97 7421194.49 7317194.26 8182697.67 8482127.69 8251009.75 7836278.95 7221234.85 6660691.64
2 2017-01-03 6160676.02 5924880.04 5764416.93 5685832.28 5778222.13 5985840.29 6083907.36 6545100.21 7092804.54 ... 8148614.93 8117168.00 7930900.03 7783762.28 8653467.38 8835882.88 8562448.23 8095841.79 7443299.51 6840297.15
3 2017-01-04 6321851.20 6092135.24 5905390.85 5827867.82 5925730.02 6182279.89 6276759.96 6737660.53 7256819.19 ... 8164513.51 8132006.92 7962749.39 7841533.59 8700506.07 8860255.66 8611085.76 8147451.75 7471240.82 6838250.32
4 2017-01-05 6395397.96 6162409.22 6001153.67 5918673.26 6005924.21 6225027.53 6322923.78 6739120.44 7307847.42 ... 8144209.65 8136193.07 8022918.36 7927405.07 8718181.89 8832234.87 8579992.47 8063978.64 7406615.66 6829864.57

5 rows × 25 columns
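
Since the file is read with ';' as separator, ',' as decimal mark and '.' as thousands separator, it is worth confirming that the hourly columns were parsed as numbers; a minimal sketch over the data frame loaded above:

#
# Sanity check: the 24 hourly columns should all be floats
#
print(data.shape)
print(data.dtypes.value_counts())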

Data preparation#

[2]:
#
# Drop rows with missing values
#
data = data.dropna()
[3]:
#
# Drop duplicate rows
#
data = data.drop_duplicates()
[4]:
#
# Use the Fecha (date) column as the index
#
data = data.set_index('Fecha')
[5]:
#
# Divide each row by its maximum so that each daily profile becomes
# dimensionless (scaled to a peak value of 1)
#
data = data.apply(lambda row: row/row.max(), axis=1)
data.head(10)
[5]:
H01 H02 H03 H04 H05 H06 H07 H08 H09 H10 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
Fecha
2017-01-01 0.860468 0.831753 0.806556 0.782459 0.761498 0.747147 0.685407 0.688932 0.716857 0.757250 ... 0.811766 0.797954 0.792513 0.803930 0.955237 1.0 0.986099 0.945951 0.888560 0.818686
2017-01-02 0.655213 0.632036 0.617666 0.608777 0.618894 0.641213 0.656315 0.713880 0.776693 0.822799 ... 0.897247 0.894118 0.874921 0.862660 0.964699 1.0 0.972752 0.923858 0.851347 0.785262
2017-01-03 0.697234 0.670548 0.652387 0.643493 0.653950 0.677447 0.688545 0.740741 0.802727 0.844419 ... 0.922219 0.918660 0.897579 0.880926 0.979355 1.0 0.969054 0.916246 0.842395 0.774150
2017-01-04 0.713507 0.687580 0.666503 0.657754 0.668799 0.697754 0.708417 0.760436 0.819030 0.858744 ... 0.921476 0.917807 0.898704 0.885023 0.981970 1.0 0.971878 0.919550 0.843231 0.771789
2017-01-05 0.724097 0.697718 0.679460 0.670122 0.680001 0.704808 0.715892 0.763014 0.827406 0.862729 ... 0.922101 0.921193 0.908368 0.897554 0.987087 1.0 0.971441 0.913017 0.838589 0.773288
2017-01-06 0.721367 0.693060 0.677716 0.669919 0.680114 0.711488 0.727354 0.777911 0.832607 0.874234 ... 0.931452 0.923880 0.903917 0.884226 0.980638 1.0 0.971224 0.919679 0.849822 0.782385
2017-01-07 0.771179 0.737493 0.719772 0.707964 0.712711 0.727125 0.727165 0.767520 0.831088 0.875044 ... 0.891179 0.878254 0.862782 0.869156 0.975394 1.0 0.974567 0.927254 0.858286 0.797428
2017-01-08 0.815849 0.785902 0.760155 0.743666 0.738028 0.739256 0.711478 0.734445 0.774577 0.805300 ... 0.842050 0.828468 0.824091 0.844483 0.973957 1.0 0.975346 0.926929 0.864727 0.804509
2017-01-09 0.741549 0.717449 0.697649 0.688828 0.693286 0.697630 0.676364 0.701891 0.747615 0.789990 ... 0.834565 0.820170 0.815800 0.841162 0.965378 1.0 0.966967 0.914443 0.851577 0.792191
2017-01-10 0.667308 0.639970 0.628090 0.626889 0.646465 0.683864 0.705480 0.759278 0.826977 0.865367 ... 0.931860 0.928386 0.917774 0.902276 0.986291 1.0 0.959489 0.900394 0.822406 0.746639

10 rows × 24 columns
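
The same row-wise scaling can be written without apply, and it is easy to verify that every daily profile now peaks at exactly 1.0; a minimal sketch:

#
# Vectorized equivalent of the normalization above (idempotent here,
# since the rows are already scaled) and a check of the result
#
normalized = data.div(data.max(axis=1), axis=0)
assert normalized.max(axis=1).eq(1.0).all()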

[6]:
#
# Example daily patterns for three different dates
#
import matplotlib.pyplot as plt

plt.figure(figsize=(7,4))
plt.plot(data.loc['2017-01-01',:], '.-', color='tab:blue')
plt.plot(data.loc['2017-06-05',:], '.-', color='tab:orange')
plt.plot(data.loc['2017-07-20',:], '.-', color='tab:green')
plt.xticks(rotation=90)
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_9_0.png

Determining the number of distinct daily profiles#

[7]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

#
# Silhouette score for 2 to 9 clusters. KMeans uses a random
# initialization, so the scores may vary slightly between runs.
#
max_clusters = 10
scores = []

for n in range(2, max_clusters):

    kmeans = KMeans(n_clusters=n, n_init='auto')
    kmeans.fit(data)
    labels = kmeans.labels_
    scores.append(silhouette_score(data, labels, metric="euclidean"))


plt.figure(figsize=(5, 4))
plt.plot(range(2, max_clusters), scores, marker='o', color='tab:blue', alpha=0.9)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.gca().spines["left"].set_color("gray")
plt.gca().spines["bottom"].set_color("gray")
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.grid()
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_11_0.png
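
As a complementary check to the silhouette score, the within-cluster inertia (elbow method) can be inspected over the same range of cluster counts; a minimal sketch:

#
# Within-cluster sum of squares (inertia) for 2 to 9 clusters
#
inertias = []
for n in range(2, 10):
    km = KMeans(n_clusters=n, n_init='auto').fit(data)
    inertias.append(km.inertia_)

plt.figure(figsize=(5, 4))
plt.plot(range(2, 10), inertias, marker='o', color='tab:blue')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
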
[8]:
#
# The silhouette score is highest for two clusters, so two mean daily
# patterns are fitted. Without a fixed random_state the labels 0 and 1
# may swap between runs.
#
kmeans = KMeans(n_clusters=2, n_init='auto')
kmeans.fit(data)

plt.figure(figsize=(7,4))
plt.plot(kmeans.cluster_centers_[0], '.-', color='tab:blue')
plt.plot(kmeans.cluster_centers_[1], '.-', color='tab:orange')
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_12_0.png
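
To tell the two mean profiles apart, the cluster centres can be re-plotted with a legend and the hour labels on the x-axis; a minimal sketch:

#
# Replot the two cluster centres with labels
#
plt.figure(figsize=(7, 4))
for i, center in enumerate(kmeans.cluster_centers_):
    plt.plot(center, '.-', label=f'Cluster {i}')
plt.xticks(range(24), data.columns, rotation=90)
plt.legend()
plt.show()
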
[9]:
#
# Cluster label and day of week (0 = Monday, ..., 6 = Sunday) for each date
#
data = data.assign(cluster=kmeans.labels_)
data = data.assign(day=pd.to_datetime(data.index).day_of_week)
data[['cluster', 'day']]
[9]:
cluster day
Fecha
2017-01-01 0 6
2017-01-02 0 0
2017-01-03 1 1
2017-01-04 1 2
2017-01-05 1 3
... ... ...
2022-08-27 1 5
2022-08-28 0 6
2022-08-29 1 0
2022-08-30 1 1
2022-08-31 1 2

2069 rows × 2 columns
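
For easier reading, the numeric day-of-week codes can be replaced by day names; a minimal sketch:

#
# Day names corresponding to the numeric codes (0 = Monday, ..., 6 = Sunday)
#
data[['cluster', 'day']].assign(day_name=pd.to_datetime(data.index).day_name())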

[10]:
#
# Day-of-week composition of cluster 0
#
data_per_cluster = data.loc[data.cluster == 0, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[10]:
day
0     67
1     13
2     12
3     11
4     17
5     28
6    296
Name: count, dtype: int64
[11]:
#
# Day-of-week composition of cluster 1
#
data_per_cluster = data.loc[data.cluster == 1, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[11]:
day
0    229
1    283
2    284
3    284
4    278
5    267
Name: count, dtype: int64