Análisis de patrones horarios de demanda#
El archivo https://github.com/jdvelasq/datalabs/blob/master/datasets/demanda_comercial.csv contiene la demanda comercial horaria del SIN. A partir de la información suministrada, determine cuantos tipos de dia hay.
Carga de datos#
[1]:
import pandas as pd
owner = "jdvelasq"
repo = "datalabs"
file = "datasets/demanda_comercial.csv"
file_url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{file}"
data = pd.read_csv(file_url, sep=";", decimal=",", thousands=".")
data.head()
[1]:
Fecha | H01 | H02 | H03 | H04 | H05 | H06 | H07 | H08 | H09 | ... | H15 | H16 | H17 | H18 | H19 | H20 | H21 | H22 | H23 | H24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 | 6090271.96 | 5887031.28 | 5708693.89 | 5538133.02 | 5389774.53 | 5288199.94 | 4851217.85 | 4876162.18 | 5073813.27 | ... | 5745568.87 | 5647807.40 | 5609297.84 | 5690105.56 | 6761038.26 | 7077861.59 | 6979468.90 | 6695312.12 | 6289107.72 | 5794545.98 |
1 | 2017-01-02 | 5557603.42 | 5361012.17 | 5239119.86 | 5163721.70 | 5249539.75 | 5438852.95 | 5566947.57 | 6055220.80 | 6588005.66 | ... | 7610559.73 | 7584026.97 | 7421194.49 | 7317194.26 | 8182697.67 | 8482127.69 | 8251009.75 | 7836278.95 | 7221234.85 | 6660691.64 |
2 | 2017-01-03 | 6160676.02 | 5924880.04 | 5764416.93 | 5685832.28 | 5778222.13 | 5985840.29 | 6083907.36 | 6545100.21 | 7092804.54 | ... | 8148614.93 | 8117168.00 | 7930900.03 | 7783762.28 | 8653467.38 | 8835882.88 | 8562448.23 | 8095841.79 | 7443299.51 | 6840297.15 |
3 | 2017-01-04 | 6321851.20 | 6092135.24 | 5905390.85 | 5827867.82 | 5925730.02 | 6182279.89 | 6276759.96 | 6737660.53 | 7256819.19 | ... | 8164513.51 | 8132006.92 | 7962749.39 | 7841533.59 | 8700506.07 | 8860255.66 | 8611085.76 | 8147451.75 | 7471240.82 | 6838250.32 |
4 | 2017-01-05 | 6395397.96 | 6162409.22 | 6001153.67 | 5918673.26 | 6005924.21 | 6225027.53 | 6322923.78 | 6739120.44 | 7307847.42 | ... | 8144209.65 | 8136193.07 | 8022918.36 | 7927405.07 | 8718181.89 | 8832234.87 | 8579992.47 | 8063978.64 | 7406615.66 | 6829864.57 |
5 rows × 25 columns
Preparación de datos#
[2]:
#
# Borrado de registros con nulos
#
data = data.dropna()
[3]:
#
# Borrado de duplicados
#
data = data.drop_duplicates()
[4]:
#
# Fecha como indice
#
data = data.set_index('Fecha')
[5]:
#
# Se divide cada fila por el máximo de la fila para
# hacer adimensionales los datos
#
data = data.apply(lambda row: row/row.max(), axis=1)
data.head(10)
[5]:
H01 | H02 | H03 | H04 | H05 | H06 | H07 | H08 | H09 | H10 | ... | H15 | H16 | H17 | H18 | H19 | H20 | H21 | H22 | H23 | H24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Fecha | |||||||||||||||||||||
2017-01-01 | 0.860468 | 0.831753 | 0.806556 | 0.782459 | 0.761498 | 0.747147 | 0.685407 | 0.688932 | 0.716857 | 0.757250 | ... | 0.811766 | 0.797954 | 0.792513 | 0.803930 | 0.955237 | 1.0 | 0.986099 | 0.945951 | 0.888560 | 0.818686 |
2017-01-02 | 0.655213 | 0.632036 | 0.617666 | 0.608777 | 0.618894 | 0.641213 | 0.656315 | 0.713880 | 0.776693 | 0.822799 | ... | 0.897247 | 0.894118 | 0.874921 | 0.862660 | 0.964699 | 1.0 | 0.972752 | 0.923858 | 0.851347 | 0.785262 |
2017-01-03 | 0.697234 | 0.670548 | 0.652387 | 0.643493 | 0.653950 | 0.677447 | 0.688545 | 0.740741 | 0.802727 | 0.844419 | ... | 0.922219 | 0.918660 | 0.897579 | 0.880926 | 0.979355 | 1.0 | 0.969054 | 0.916246 | 0.842395 | 0.774150 |
2017-01-04 | 0.713507 | 0.687580 | 0.666503 | 0.657754 | 0.668799 | 0.697754 | 0.708417 | 0.760436 | 0.819030 | 0.858744 | ... | 0.921476 | 0.917807 | 0.898704 | 0.885023 | 0.981970 | 1.0 | 0.971878 | 0.919550 | 0.843231 | 0.771789 |
2017-01-05 | 0.724097 | 0.697718 | 0.679460 | 0.670122 | 0.680001 | 0.704808 | 0.715892 | 0.763014 | 0.827406 | 0.862729 | ... | 0.922101 | 0.921193 | 0.908368 | 0.897554 | 0.987087 | 1.0 | 0.971441 | 0.913017 | 0.838589 | 0.773288 |
2017-01-06 | 0.721367 | 0.693060 | 0.677716 | 0.669919 | 0.680114 | 0.711488 | 0.727354 | 0.777911 | 0.832607 | 0.874234 | ... | 0.931452 | 0.923880 | 0.903917 | 0.884226 | 0.980638 | 1.0 | 0.971224 | 0.919679 | 0.849822 | 0.782385 |
2017-01-07 | 0.771179 | 0.737493 | 0.719772 | 0.707964 | 0.712711 | 0.727125 | 0.727165 | 0.767520 | 0.831088 | 0.875044 | ... | 0.891179 | 0.878254 | 0.862782 | 0.869156 | 0.975394 | 1.0 | 0.974567 | 0.927254 | 0.858286 | 0.797428 |
2017-01-08 | 0.815849 | 0.785902 | 0.760155 | 0.743666 | 0.738028 | 0.739256 | 0.711478 | 0.734445 | 0.774577 | 0.805300 | ... | 0.842050 | 0.828468 | 0.824091 | 0.844483 | 0.973957 | 1.0 | 0.975346 | 0.926929 | 0.864727 | 0.804509 |
2017-01-09 | 0.741549 | 0.717449 | 0.697649 | 0.688828 | 0.693286 | 0.697630 | 0.676364 | 0.701891 | 0.747615 | 0.789990 | ... | 0.834565 | 0.820170 | 0.815800 | 0.841162 | 0.965378 | 1.0 | 0.966967 | 0.914443 | 0.851577 | 0.792191 |
2017-01-10 | 0.667308 | 0.639970 | 0.628090 | 0.626889 | 0.646465 | 0.683864 | 0.705480 | 0.759278 | 0.826977 | 0.865367 | ... | 0.931860 | 0.928386 | 0.917774 | 0.902276 | 0.986291 | 1.0 | 0.959489 | 0.900394 | 0.822406 | 0.746639 |
10 rows × 24 columns
[6]:
#
# Patrones de ejemplo
#
import matplotlib.pyplot as plt
plt.figure(figsize=(7,4))
plt.plot(data.loc['2017-01-01',:], '.-', color='tab:blue')
plt.plot(data.loc['2017-06-05',:], '.-', color='tab:orange')
plt.plot(data.loc['2017-07-20',:], '.-', color='tab:green')
plt.xticks(rotation=90)
plt.show()
Deteriminación de la cantidad de perfiles diferentes#
[7]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
n_clusters = 10
scores = []
for n in range(2, n_clusters):
kmeans = KMeans(n_clusters=n, n_init='auto')
kmeans.fit(data)
labels = kmeans.labels_
scores.append(silhouette_score(data, labels, metric="euclidean"))
plt.figure(figsize=(5,4))
plt.plot(range(2, n_clusters), scores, marker='o', color='tab:blue', alpha=0.9)
plt.xlabel('número de clusters')
plt.gca().spines["left"].set_color("gray")
plt.gca().spines["bottom"].set_color("gray")
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.grid()
plt.show()
[8]:
#
# Dos patrones óptimos
#
kmeans = KMeans(n_clusters=2, n_init='auto')
kmeans.fit(data)
plt.figure(figsize=(7,4))
plt.plot(kmeans.cluster_centers_[0], '.-', color='tab:blue')
plt.plot(kmeans.cluster_centers_[1], '.-', color='tab:orange')
plt.show()
[9]:
data = data.assign(cluster=kmeans.labels_)
data = data.assign(day=pd.to_datetime(data.index).day_of_week)
data[['cluster', 'day']]
[9]:
cluster | day | |
---|---|---|
Fecha | ||
2017-01-01 | 0 | 6 |
2017-01-02 | 0 | 0 |
2017-01-03 | 1 | 1 |
2017-01-04 | 1 | 2 |
2017-01-05 | 1 | 3 |
... | ... | ... |
2022-08-27 | 1 | 5 |
2022-08-28 | 0 | 6 |
2022-08-29 | 1 | 0 |
2022-08-30 | 1 | 1 |
2022-08-31 | 1 | 2 |
2069 rows × 2 columns
[10]:
data_per_cluster = data.loc[data.cluster == 0, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[10]:
day
0 67
1 13
2 12
3 11
4 17
5 28
6 296
Name: count, dtype: int64
[11]:
data_per_cluster = data.loc[data.cluster == 1, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[11]:
day
0 229
1 283
2 284
3 284
4 278
5 267
Name: count, dtype: int64