Aceleración de map()#

  • Última modificación: Mayo 14, 2022

Adaptado del libro “Mastering Large Datasets with Python”.

Preparación de datos#

[1]:
import os
import re
[2]:
# Number of times the sample list is replicated to build the benchmark workload.
REPEAT = 1_000_000

# The same ten-digit number written in several common formats
# (the last one carries a leading country code).
phone_numbers = [
    "(123) 456-7890",
    "1234567890",
    "123.456.7890",
    "+1 123 456-7890",
]

# Pattern matching a single decimal digit.
regex = re.compile(r"\d")

new_numbers = []
[3]:
class PhoneFormatter:
    """Normalize phone numbers to the ``(AAA) PPP-LLLL`` layout."""

    def __init__(self):
        # Compiled once per instance; matches a single decimal digit.
        self.regex = re.compile(r"\d")

    def pretty_format(self, phone_number):
        """Return ``phone_number`` rewritten as ``(AAA) PPP-LLLL``.

        Only the digits of the input are used, and they are read from the
        right-hand side: the last four digits, the three before them, and the
        three before those. Any digits further to the left (for example a
        country code) are ignored. Inputs with fewer than ten digits are not
        rejected; the missing groups simply come out shorter or empty.

        Args:
            phone_number: Text containing the phone number in any format.

        Returns:
            The formatted phone number as a string.
        """
        # Each match is exactly one character, so slicing the joined string
        # is equivalent to slicing the list of matches and joining afterwards.
        only_digits = "".join(self.regex.findall(phone_number))

        area = only_digits[-10:-7]
        prefix = only_digits[-7:-4]
        line = only_digits[-4:]

        return f"({area}) {prefix}-{line}"


# `phone_numbers` keeps the value assigned in the data-preparation cell; the
# identical list that used to be re-declared here was redundant and has been
# removed so the sample data has a single definition.

# Module-level formatter instance referenced by the mapping functions below.
formatter = PhoneFormatter()

Definición de funciones#

[4]:
def lazzy_map(phone_numbers):
    """Format the replicated phone numbers sequentially with built-in ``map``.

    Args:
        phone_numbers: List of phone-number strings; it is repeated ``REPEAT``
            times to build the workload.

    Returns:
        A list with one formatted string per element of the workload.
    """
    workload = phone_numbers * REPEAT
    formatted = map(formatter.pretty_format, workload)
    # map() is lazy; list() forces every element to be computed.
    return list(formatted)
[5]:
from multiprocessing import Pool


def parallel_map(phone_numbers):
    """Format the replicated phone numbers using a pool of worker processes.

    ``Pool.map`` blocks until every result is available, so the full list is
    ready before the pool is shut down on leaving the ``with`` block.

    Args:
        phone_numbers: List of phone-number strings; it is repeated ``REPEAT``
            times to build the workload.

    Returns:
        A list of formatted strings, in the same order as the workload.
    """
    with Pool() as workers:
        return workers.map(formatter.pretty_format, phone_numbers * REPEAT)

Medición de tiempos#

[6]:
%%timeit

# Baseline: sequential built-in map() over the replicated list.
_ = lazzy_map(phone_numbers)
7.24 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[7]:
%%timeit

# Same workload, distributed across worker processes with Pool.map().
_ = parallel_map(phone_numbers)
1.45 s ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Función imap()#

[8]:
def parallel_imap(phone_numbers, chunksize=1):
    """Lazily format the replicated phone numbers with ``Pool.imap``.

    This is a generator: nothing is executed until the first item is
    requested, and results are yielded in input order as they become
    available.

    The pool must stay alive while the results are being consumed. Returning
    the ``imap`` iterator from inside ``with Pool()`` does not work, because
    leaving the ``with`` block terminates the workers before any result has
    been read. Yielding from inside the block keeps the pool open until the
    caller has exhausted (or discarded) the generator.

    Args:
        phone_numbers: List of phone-number strings; it is repeated ``REPEAT``
            times to build the workload.
        chunksize: Number of items sent to a worker per task. The default of
            1 matches ``Pool.imap``; larger values reduce inter-process
            overhead on long workloads.

    Yields:
        Formatted phone-number strings, in the same order as the workload.
    """
    with Pool() as pool:
        yield from pool.imap(
            formatter.pretty_format,
            phone_numbers * REPEAT,
            chunksize,
        )
[9]:
%%timeit

# NOTE(review): the returned iterator is never consumed here, so this timing
# does not include formatting the phone numbers and is not comparable with
# the map() / Pool.map() timings above.
_ = parallel_imap(phone_numbers)
112 ms ± 13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Función imap_unordered()#

[10]:
def parallel_imap_unordered(phone_numbers, chunksize=1):
    """Lazily format the replicated phone numbers with ``Pool.imap_unordered``.

    This is a generator: nothing is executed until the first item is
    requested, and results are yielded as soon as any worker finishes them,
    so their order is arbitrary.

    The pool must stay alive while the results are being consumed. Returning
    the iterator from inside ``with Pool()`` does not work, because leaving
    the ``with`` block terminates the workers before any result has been
    read. Yielding from inside the block keeps the pool open until the caller
    has exhausted (or discarded) the generator.

    Args:
        phone_numbers: List of phone-number strings; it is repeated ``REPEAT``
            times to build the workload.
        chunksize: Number of items sent to a worker per task. The default of
            1 matches ``Pool.imap_unordered``; larger values reduce
            inter-process overhead on long workloads.

    Yields:
        Formatted phone-number strings, in completion order.
    """
    with Pool() as pool:
        yield from pool.imap_unordered(
            formatter.pretty_format,
            phone_numbers * REPEAT,
            chunksize,
        )
[11]:
%%timeit

# NOTE(review): the returned iterator is never consumed here, so this timing
# does not include formatting the phone numbers and is not comparable with
# the map() / Pool.map() timings above.
_ = parallel_imap_unordered(phone_numbers)
106 ms ± 3.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Función starmap()#

[12]:
# Two equal-length integer lists that are paired element by element
# with zip() in the cells below.
x = [7, 3, 1, 19, 11, 3]
y = [3, 4, 6, 10, 14, 1]
[13]:
%%timeit

# Baseline: maximum of each (x, y) pair using a list comprehension.
[max(k) for k in zip(x, y)]
862 ns ± 6.55 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[14]:
from itertools import starmap
[15]:
%%timeit

# Same result with starmap, which unpacks each (x, y) pair into max's arguments.
list(starmap(max, zip(x, y)))
663 ns ± 7.59 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)