Analysis of hourly demand patterns#

The file https://github.com/jdvelasq/datalabs/blob/master/datasets/demanda_comercial.csv contains the hourly commercial demand of the SIN (Sistema Interconectado Nacional). Using the information provided, determine how many types of day there are.

Data loading#

[1]:
import pandas as pd


owner = "jdvelasq"
repo = "datalabs"
file = "datasets/demanda_comercial.csv"

file_url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{file}"


data = pd.read_csv(file_url, sep=";", decimal=",", thousands=".")
data.head()
[1]:
Fecha H01 H02 H03 H04 H05 H06 H07 H08 H09 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
0 2017-01-01 6090271.96 5887031.28 5708693.89 5538133.02 5389774.53 5288199.94 4851217.85 4876162.18 5073813.27 ... 5745568.87 5647807.40 5609297.84 5690105.56 6761038.26 7077861.59 6979468.90 6695312.12 6289107.72 5794545.98
1 2017-01-02 5557603.42 5361012.17 5239119.86 5163721.70 5249539.75 5438852.95 5566947.57 6055220.80 6588005.66 ... 7610559.73 7584026.97 7421194.49 7317194.26 8182697.67 8482127.69 8251009.75 7836278.95 7221234.85 6660691.64
2 2017-01-03 6160676.02 5924880.04 5764416.93 5685832.28 5778222.13 5985840.29 6083907.36 6545100.21 7092804.54 ... 8148614.93 8117168.00 7930900.03 7783762.28 8653467.38 8835882.88 8562448.23 8095841.79 7443299.51 6840297.15
3 2017-01-04 6321851.20 6092135.24 5905390.85 5827867.82 5925730.02 6182279.89 6276759.96 6737660.53 7256819.19 ... 8164513.51 8132006.92 7962749.39 7841533.59 8700506.07 8860255.66 8611085.76 8147451.75 7471240.82 6838250.32
4 2017-01-05 6395397.96 6162409.22 6001153.67 5918673.26 6005924.21 6225027.53 6322923.78 6739120.44 7307847.42 ... 8144209.65 8136193.07 8022918.36 7927405.07 8718181.89 8832234.87 8579992.47 8063978.64 7406615.66 6829864.57

5 rows × 25 columns
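
Since the file is read with ';' as separator, ',' as decimal mark and '.' as thousands separator, it is worth confirming that the hourly columns were parsed as numbers; a minimal sketch over the data frame loaded above:

#
# Sanity check: the 24 hourly columns should all be floats
#
print(data.shape)
print(data.dtypes.value_counts())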

Data preparation#

[2]:
#
# Drop rows with missing values
#
data = data.dropna()
[3]:
#
# Drop duplicate rows
#
data = data.drop_duplicates()
[4]:
#
# Use the Fecha (date) column as the index
#
data = data.set_index('Fecha')
[5]:
#
# Divide each row by its maximum so that each daily profile becomes
# dimensionless (scaled to a peak value of 1)
#
data = data.apply(lambda row: row/row.max(), axis=1)
data.head(10)
[5]:
H01 H02 H03 H04 H05 H06 H07 H08 H09 H10 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
Fecha
2017-01-01 0.860468 0.831753 0.806556 0.782459 0.761498 0.747147 0.685407 0.688932 0.716857 0.757250 ... 0.811766 0.797954 0.792513 0.803930 0.955237 1.0 0.986099 0.945951 0.888560 0.818686
2017-01-02 0.655213 0.632036 0.617666 0.608777 0.618894 0.641213 0.656315 0.713880 0.776693 0.822799 ... 0.897247 0.894118 0.874921 0.862660 0.964699 1.0 0.972752 0.923858 0.851347 0.785262
2017-01-03 0.697234 0.670548 0.652387 0.643493 0.653950 0.677447 0.688545 0.740741 0.802727 0.844419 ... 0.922219 0.918660 0.897579 0.880926 0.979355 1.0 0.969054 0.916246 0.842395 0.774150
2017-01-04 0.713507 0.687580 0.666503 0.657754 0.668799 0.697754 0.708417 0.760436 0.819030 0.858744 ... 0.921476 0.917807 0.898704 0.885023 0.981970 1.0 0.971878 0.919550 0.843231 0.771789
2017-01-05 0.724097 0.697718 0.679460 0.670122 0.680001 0.704808 0.715892 0.763014 0.827406 0.862729 ... 0.922101 0.921193 0.908368 0.897554 0.987087 1.0 0.971441 0.913017 0.838589 0.773288
2017-01-06 0.721367 0.693060 0.677716 0.669919 0.680114 0.711488 0.727354 0.777911 0.832607 0.874234 ... 0.931452 0.923880 0.903917 0.884226 0.980638 1.0 0.971224 0.919679 0.849822 0.782385
2017-01-07 0.771179 0.737493 0.719772 0.707964 0.712711 0.727125 0.727165 0.767520 0.831088 0.875044 ... 0.891179 0.878254 0.862782 0.869156 0.975394 1.0 0.974567 0.927254 0.858286 0.797428
2017-01-08 0.815849 0.785902 0.760155 0.743666 0.738028 0.739256 0.711478 0.734445 0.774577 0.805300 ... 0.842050 0.828468 0.824091 0.844483 0.973957 1.0 0.975346 0.926929 0.864727 0.804509
2017-01-09 0.741549 0.717449 0.697649 0.688828 0.693286 0.697630 0.676364 0.701891 0.747615 0.789990 ... 0.834565 0.820170 0.815800 0.841162 0.965378 1.0 0.966967 0.914443 0.851577 0.792191
2017-01-10 0.667308 0.639970 0.628090 0.626889 0.646465 0.683864 0.705480 0.759278 0.826977 0.865367 ... 0.931860 0.928386 0.917774 0.902276 0.986291 1.0 0.959489 0.900394 0.822406 0.746639

10 rows × 24 columns
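
The same row-wise scaling can be written without apply, and it is easy to verify that every daily profile now peaks at exactly 1.0; a minimal sketch:

#
# Vectorized equivalent of the normalization above (idempotent here,
# since the rows are already scaled) and a check of the result
#
normalized = data.div(data.max(axis=1), axis=0)
assert normalized.max(axis=1).eq(1.0).all()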

[6]:
#
# Example daily patterns for three different dates
#
import matplotlib.pyplot as plt

plt.figure(figsize=(7,4))
plt.plot(data.loc['2017-01-01',:], '.-', color='tab:blue')
plt.plot(data.loc['2017-06-05',:], '.-', color='tab:orange')
plt.plot(data.loc['2017-07-20',:], '.-', color='tab:green')
plt.xticks(rotation=90)
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_9_0.png

Determining the number of distinct daily profiles#

[7]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

#
# Silhouette score for 2 to 9 clusters. KMeans uses a random
# initialization, so the scores may vary slightly between runs.
#
max_clusters = 10
scores = []

for n in range(2, max_clusters):

    kmeans = KMeans(n_clusters=n, n_init='auto')
    kmeans.fit(data)
    labels = kmeans.labels_
    scores.append(silhouette_score(data, labels, metric="euclidean"))


plt.figure(figsize=(5, 4))
plt.plot(range(2, max_clusters), scores, marker='o', color='tab:blue', alpha=0.9)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.gca().spines["left"].set_color("gray")
plt.gca().spines["bottom"].set_color("gray")
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.grid()
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_11_0.png
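
As a complementary check to the silhouette score, the within-cluster inertia (elbow method) can be inspected over the same range of cluster counts; a minimal sketch:

#
# Within-cluster sum of squares (inertia) for 2 to 9 clusters
#
inertias = []
for n in range(2, 10):
    km = KMeans(n_clusters=n, n_init='auto').fit(data)
    inertias.append(km.inertia_)

plt.figure(figsize=(5, 4))
plt.plot(range(2, 10), inertias, marker='o', color='tab:blue')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
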
[8]:
#
# The silhouette score is highest for two clusters, so two mean daily
# patterns are fitted. Without a fixed random_state the labels 0 and 1
# may swap between runs.
#
kmeans = KMeans(n_clusters=2, n_init='auto')
kmeans.fit(data)

plt.figure(figsize=(7,4))
plt.plot(kmeans.cluster_centers_[0], '.-', color='tab:blue')
plt.plot(kmeans.cluster_centers_[1], '.-', color='tab:orange')
plt.show()
../_images/46_clustering_06_Tutorial.Clasificacion_demanda_comercial_12_0.png
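
To tell the two mean profiles apart, the cluster centres can be re-plotted with a legend and the hour labels on the x-axis; a minimal sketch:

#
# Replot the two cluster centres with labels
#
plt.figure(figsize=(7, 4))
for i, center in enumerate(kmeans.cluster_centers_):
    plt.plot(center, '.-', label=f'Cluster {i}')
plt.xticks(range(24), data.columns, rotation=90)
plt.legend()
plt.show()
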
[9]:
#
# Cluster label and day of week (0 = Monday, ..., 6 = Sunday) for each date
#
data = data.assign(cluster=kmeans.labels_)
data = data.assign(day=pd.to_datetime(data.index).day_of_week)
data[['cluster', 'day']]
[9]:
cluster day
Fecha
2017-01-01 0 6
2017-01-02 0 0
2017-01-03 1 1
2017-01-04 1 2
2017-01-05 1 3
... ... ...
2022-08-27 1 5
2022-08-28 0 6
2022-08-29 1 0
2022-08-30 1 1
2022-08-31 1 2

2069 rows × 2 columns
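
For easier reading, the numeric day-of-week codes can be replaced by day names; a minimal sketch:

#
# Day names corresponding to the numeric codes (0 = Monday, ..., 6 = Sunday)
#
data[['cluster', 'day']].assign(day_name=pd.to_datetime(data.index).day_name())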

[10]:
#
# Day-of-week composition of cluster 0
#
data_per_cluster = data.loc[data.cluster == 0, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[10]:
day
0     67
1     13
2     12
3     11
4     17
5     28
6    296
Name: count, dtype: int64
[11]:
#
# Day-of-week composition of cluster 1
#
data_per_cluster = data.loc[data.cluster == 1, 'day']
days = data_per_cluster.value_counts()
days = days.sort_index()
days
[11]:
day
0    229
1    283
2    284
3    284
4    278
5    267
Name: count, dtype: int64