Procesamiento de datos en paralelo con map()
Última modificación: Mayo 14, 2022
Adaptado del libro “Mastering Large Datasets with Python”.
Ejemplo de un cálculo con un ciclo for
[1]:
import re
import os
[2]:
# Compiled once up front: matches a single decimal digit.
regex = re.compile(r"\d")

# Sample inputs: one phone number written in four different notations.
phone_numbers = [
    "(123) 456-7890",
    "1234567890",
    "123.456.7890",
    "+1 123 456-7890",
]

# Multiplier applied to the sample list to build the benchmark workload.
REPEAT = 1_000_000

# Accumulator filled by the for-loop benchmark.
new_numbers = []
[3]:
%%timeit
# Reset the shared accumulator so every timed run starts from an empty list;
# otherwise the module-level list keeps growing on each %%timeit repetition.
new_numbers.clear()
for phone_number in phone_numbers * REPEAT:
    # Extract every digit from the raw string, dropping all punctuation.
    digits = regex.findall(phone_number)
    # Slice from the right so that extra leading digits (e.g. "+1") are ignored.
    area_code = "".join(digits[-10:-7])
    first_3 = "".join(digits[-7:-4])
    last_4 = "".join(digits[-4:])
    new_numbers.append(f"({area_code}) {first_3}-{last_4}")
7.15 s ± 61.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Solución con map()
[4]:
class PhoneFormatter:
    """Rewrite phone numbers in the ``(AAA) PPP-LLLL`` layout.

    Only the digits of the input are considered; any other character
    (spaces, dots, dashes, parentheses, ``+``) is ignored.
    """

    def __init__(self):
        # Pattern that matches one decimal digit at a time.
        self.regex = re.compile(r"\d")

    def pretty_format(self, phone_number):
        """Return *phone_number* formatted as ``(AAA) PPP-LLLL``.

        The groups are taken from the right-hand end of the digit
        sequence, so digits preceding the last ten are dropped. When
        fewer than ten digits are present, the leftmost groups come out
        shorter or empty; no error is raised.
        """
        found = self.regex.findall(phone_number)
        # (start, stop) slice bounds for area code, prefix and line number.
        bounds = ((-10, -7), (-7, -4), (-4, len(found)))
        area, prefix, line = (
            "".join(found[start:stop]) for start, stop in bounds
        )
        return f"({area}) {prefix}-{line}"
# Sample inputs: one phone number written in four different notations.
phone_numbers = [
"(123) 456-7890",
"1234567890",
"123.456.7890",
"+1 123 456-7890",
]
# Single formatter instance whose bound method is passed to map() / Pool.map().
formatter = PhoneFormatter()
[5]:
# map() is lazy; list() consumes it so the formatted numbers are displayed.
list(map(formatter.pretty_format, phone_numbers))
[5]:
['(123) 456-7890', '(123) 456-7890', '(123) 456-7890', '(123) 456-7890']
[6]:
%%timeit
# Single-process run: built-in map() over the sample list repeated REPEAT times.
list(map(formatter.pretty_format, phone_numbers * REPEAT))
7.92 s ± 175 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Multiprocessing
[7]:
#
# Number of processors
#
os.cpu_count()
[7]:
16
[8]:
from multiprocessing import Pool
[9]:
%%timeit
# Run the formatting workload through a pool with 1 worker process.
with Pool(processes=1) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
8.72 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[10]:
%%timeit
# Run the formatting workload through a pool with 2 worker processes.
with Pool(processes=2) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
4.68 s ± 35.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[11]:
%%timeit
# Run the formatting workload through a pool with 3 worker processes.
with Pool(processes=3) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
3.39 s ± 65.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[12]:
%%timeit
# Run the formatting workload through a pool with 4 worker processes.
with Pool(processes=4) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.73 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[13]:
%%timeit
# Run the formatting workload through a pool with 5 worker processes.
with Pool(processes=5) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.37 s ± 13.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[14]:
%%timeit
# Run the formatting workload through a pool with 6 worker processes.
with Pool(processes=6) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.18 s ± 60.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[15]:
%%timeit
# Run the formatting workload through a pool with 7 worker processes.
with Pool(processes=7) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.08 s ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[16]:
%%timeit
# Run the formatting workload through a pool with 8 worker processes.
with Pool(processes=8) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.24 s ± 353 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[17]:
%%timeit
# Run the formatting workload through a pool with 9 worker processes.
with Pool(processes=9) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.11 s ± 253 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[18]:
%%timeit
# Run the formatting workload through a pool with 10 worker processes.
with Pool(processes=10) as pool:
    pool.map(formatter.pretty_format, phone_numbers * REPEAT)
2.05 s ± 38.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)