Procesamiento de datos en paralelo con map()

  • Última modificación: 14 de mayo de 2022

Adaptado del libro “Mastering Large Datasets with Python”.

Ejemplo de un cálculo con un ciclo for

[1]:
import re
import os
[2]:
# Number of times the sample list is repeated to build the benchmark workload.
REPEAT = 1000000

# Sample inputs: the same phone number written in four different notations
# (the last one carries a leading "+1" prefix).
phone_numbers = [
    "(123) 456-7890",
    "1234567890",
    "123.456.7890",
    "+1 123 456-7890",
]

# Destination list that the explicit loop appends its formatted results to.
new_numbers = []

# Matches one digit; findall() with this pattern yields the digits of a string.
regex = re.compile(r"\d")
[3]:
%%timeit

# Normalize every number of the repeated workload with an explicit loop.
# NOTE: new_numbers is defined outside this timed cell, so it keeps growing
# across the repetitions performed by %%timeit.
template = "({}) {}-{}"

for raw_number in phone_numbers * REPEAT:

    # Keep only the digits. Positions are counted from the right, so anything
    # before the last ten digits (e.g. a country code) is left out.
    only_digits = "".join(regex.findall(raw_number))

    new_numbers.append(
        template.format(
            only_digits[-10:-7],  # area code
            only_digits[-7:-4],  # first three digits of the local number
            only_digits[-4:],  # last four digits
        )
    )
7.15 s ± 61.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Solución con map()

[4]:
class PhoneFormatter:
    """Rewrite phone numbers into the ``(AAA) PPP-LLLL`` layout.

    The digit-matching pattern is compiled once per instance and stored on
    ``self.regex`` so that every call to ``pretty_format`` reuses it.
    """

    def __init__(self):
        self.regex = re.compile(r"\d")

    def pretty_format(self, phone_number):
        """Return ``phone_number`` formatted as ``(area) prefix-line``.

        Every non-digit character is discarded. The pieces are picked by
        position counted from the right: the last four digits, the three
        digits before them, and the three digits before those. Digits
        further to the left (such as a country code) are dropped. When
        fewer than ten digits are present, the missing pieces come out
        shorter or empty; no error is raised.
        """
        only_digits = "".join(self.regex.findall(phone_number))
        pieces = (
            only_digits[-10:-7],  # area code
            only_digits[-7:-4],  # first three digits of the local number
            only_digits[-4:],  # last four digits
        )
        return "({}) {}-{}".format(*pieces)


# Same sample inputs as in the loop example: four notations of one number.
phone_numbers = [
    "(123) 456-7890",
    "1234567890",
    "123.456.7890",
    "+1 123 456-7890",
]

# Single formatter instance; its bound pretty_format method is what gets
# passed to map() and Pool.map() in the following cells.
formatter = PhoneFormatter()
[5]:
# Apply the formatter to each sample; list() consumes the lazy map object so
# the results are displayed.
list(map(formatter.pretty_format, phone_numbers))
[5]:
['(123) 456-7890', '(123) 456-7890', '(123) 456-7890', '(123) 456-7890']
[6]:
%%timeit

# Same map() call over the repeated workload, timed for comparison with the
# explicit for loop.
list(map(formatter.pretty_format, phone_numbers * REPEAT))
7.92 s ± 175 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Multiprocessing

[7]:
#
# Number of CPUs reported by the operating system
#
os.cpu_count()
[7]:
16
[8]:
from multiprocessing import Pool
[9]:
%%timeit

# Pool with a single worker process: baseline for the multiprocessing timings.
with Pool(processes=1) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
8.72 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[10]:
%%timeit

# Same workload mapped over a pool of 2 worker processes.
with Pool(processes=2) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
4.68 s ± 35.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[11]:
%%timeit

# Same workload mapped over a pool of 3 worker processes.
with Pool(processes=3) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
3.39 s ± 65.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[12]:
%%timeit

# Same workload mapped over a pool of 4 worker processes.
with Pool(processes=4) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.73 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[13]:
%%timeit

# Same workload mapped over a pool of 5 worker processes.
with Pool(processes=5) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.37 s ± 13.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[14]:
%%timeit

# Same workload mapped over a pool of 6 worker processes.
with Pool(processes=6) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.18 s ± 60.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[15]:
%%timeit

# Same workload mapped over a pool of 7 worker processes.
with Pool(processes=7) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.08 s ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[16]:
%%timeit

# Same workload mapped over a pool of 8 worker processes.
with Pool(processes=8) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.24 s ± 353 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[17]:
%%timeit

# Same workload mapped over a pool of 9 worker processes.
with Pool(processes=9) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.11 s ± 253 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[18]:
%%timeit

# Same workload mapped over a pool of 10 worker processes.
with Pool(processes=10) as p:
    p.map(formatter.pretty_format, phone_numbers * REPEAT)
2.05 s ± 38.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)