# Tipos de datos complejos

60 min | Última modificación: Noviembre 07, 2019

## Cell magic `%%pig`

[1]:

from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn

# Timeout passed to pexpect while waiting for a prompt after each sent line.
TIMEOUT = 60
# Executable spawned as the interactive session.
PROG = "pig"
# Patterns that mark the end of a response. Presumably ">> " is the
# continuation prompt and "grunt> " the regular one -- TODO confirm.
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
# Output lines containing any of these substrings are not printed.
DISCARD = ["INFO  org.apache", "WARN  org.apache"]
# Command sent to the session by the %quit line magic.
QUIT = "quit"


@magics_class
class Magic(Magics):
    """IPython magics that drive a single interactive Pig session.

    The session is spawned once, when the magics object is created, and is
    reused by every ``%%pig`` cell until ``%quit`` is run.
    """

    def __init__(self, shell):
        """Spawn ``PROG`` and block until its first prompt appears.

        Args:
            shell: The IPython shell instance, forwarded to ``Magics``.
        """
        super().__init__(shell)
        # Use the shared TIMEOUT constant rather than repeating the literal,
        # so the start-up wait and the per-line wait cannot drift apart.
        self.app = spawn(PROG, timeout=TIMEOUT)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        """Send the cell to the Pig session line by line and print the output.

        Args:
            line: Text that follows ``%%pig`` on the magic line (unused).
            cell: Cell body; each line is stripped and blank lines are
                dropped before being sent.

        Returns:
            None. Output is written with ``print``.
        """
        cell_lines = [cell_line.strip() for cell_line in cell.split("\n")]
        cell_lines = [cell_line for cell_line in cell_lines if cell_line != ""]
        # Set of the sent lines, used to recognise the echoed input.
        echoed = set(cell_lines)
        for cell_line in cell_lines:
            self.app.sendline(cell_line)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            # `before` holds everything received up to the matched prompt.
            output = self.app.before.decode().replace("\r\n", "\n")
            for output_line in output.split("\n"):
                output_line = output_line.strip()
                # Skip the echo of the submitted lines.
                if output_line in echoed:
                    continue
                # Skip lines that contain any DISCARD substring.
                if any(word in output_line for word in DISCARD):
                    continue
                print(output_line)
        return None

    @line_magic
    def quit(self, line):
        """Ask the Pig session to exit and release the child process.

        Args:
            line: Text that follows ``%quit`` (unused).
        """
        # A second %quit (or a session that already died) must not try to
        # write to a child that is no longer there.
        if self.app.isalive():
            self.app.sendline(QUIT)
        # Close the pty; pexpect terminates the child if it has not exited
        # by itself, so no process is left behind.
        self.app.close()


def load_ipython_extension(ip):
    """Create the Pig magics for the shell ``ip`` and register them on it."""
    pig_magics = Magic(ip)
    ip.register_magics(pig_magics)


# Register the magics right away so %%pig and %quit can be used in the cells below.
load_ipython_extension(ip=get_ipython())

## Datos simples

Los siguientes son los tipos de datos simples soportados por Pig:

int      long      float       double      chararray
boolean  datetime  biginteger  bigdecimal  bytearray

## Datos complejos

Apache Pig trabaja con la siguiente jerarquía de relaciones (http://pig.apache.org/docs/r0.17.0/basic.html#relations):

Una tuple es un conjunto ordenado de campos: (field1, field2, …).
Una bag es una colección de tuplas: {(…), (…), …}.
Un map es un conjunto de parejas clave-valor: [key#value, …].

### TUPLE

[2]:

%%writefile /tmp/data.tsv
A   10      (1, 2)
B   20      (3, 4)
C   30      (5, 6)
D   40      (7, 8)

Overwriting /tmp/data.tsv

[3]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[4]:

%%pig
--
-- Los campos del archivo están separados por
-- tabuladores.
--
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;

(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))

[5]:

%%pig
--
-- Los campos de la tupla pueden ser accesados
-- por nombre o por posición.
--
r = FOREACH u GENERATE f3.p, f3.$1 ;
DUMP r;

(1,2)
(3,4)
(5,6)
(7,8)

[6]:

%%pig
--
-- Aquí se accede a los campos de la tupla por
-- posición, ya que no tienen nombre.
--
u = LOAD 'data.tsv' AS (f1:CHARARRAY, f2:INT, f3:TUPLE(INT, INT));
r = FOREACH u GENERATE $2.$0, $2.$1;
DUMP r;

(1,2)
(3,4)
(5,6)
(7,8)

[7]:

%%writefile /tmp/data.tsv
A   (1,  2) (3,  4)
B   (5,  6) (7,  8)
C   (9, 10) (11, 12)

Overwriting /tmp/data.tsv

[8]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[9]:

%%pig
--
-- Se seleccionan los campos por nombre
--
u = LOAD 'data.tsv'
    AS (f1: CHARARRAY,
        t1: TUPLE(t1a: INT, t1b: INT),
        t2: TUPLE(t2a: INT, t2b: INT));

DUMP u;

(A,(1,2),(3,4))
(B,(5,6),(7,8))
(C,(9,10),(11,12))

[10]:

%%pig
r = FOREACH u GENERATE f1, t2.t2b;
DUMP r;

(A,4)
(B,8)
(C,12)

### BAG

[11]:

%%writefile /tmp/data.tsv
A   10      {(1,2),(3,4)}
B   20      {(5,6),(7,8)}
C   30      {(9,10),(11,12)}
D   40      {(13,14),(15,16)}

Overwriting /tmp/data.tsv

[12]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[13]:

%%pig
--
-- Se selecciona el `bag` por nombre
--
u = LOAD 'data.tsv' AS (
        f1:CHARARRAY,
        f2:INT,
        f3:BAG{t:TUPLE(p:INT, q:INT)}
    );
DUMP u;

(A,10,{(1,2),(3,4)})
(B,20,{(5,6),(7,8)})
(C,30,{(9,10),(11,12)})
(D,40,{(13,14),(15,16)})

[14]:

%%pig
r = FOREACH u GENERATE f3;
DUMP r;

({(1,2),(3,4)})
({(5,6),(7,8)})
({(9,10),(11,12)})
({(13,14),(15,16)})

[15]:

%%pig
r = FOREACH u GENERATE f3.p;
DUMP r;

({(1),(3)})
({(5),(7)})
({(9),(11)})
({(13),(15)})

### MAP

[16]:

%%writefile /tmp/data.tsv
A   10      [a#1,b#2]
B   20      [a#3,c#4]
C   30      [b#5,c#6]
D   40      [b#7,c#8]

Overwriting /tmp/data.tsv

[17]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[18]:

%%pig
--
-- f3 is loaded as an untyped map; values are looked up
-- by key with the # operator. A key that is missing from
-- a row's map shows up as an empty field in the output.
--
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:MAP[]);
r = FOREACH u GENERATE f3#'a', f3#'c';
DUMP r;

(1,)
(3,4)
(,6)
(,8)

## Manipulación de datos complejos: FLATTEN

[19]:

%%writefile /tmp/data.tsv
A   10      (1, 2)
B   20      (3, 4)
C   30      (5, 6)
D   40      (7, 8)

Overwriting /tmp/data.tsv

[20]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[21]:

%%pig
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;

(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))

[22]:

%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;

(A,1,2)
(B,3,4)
(C,5,6)
(D,7,8)

[23]:

%%writefile /tmp/data.tsv
A   10      {(1),(2)}
B   20      {(3),(4)}
C   30      {(5),(6)}
D   40      {(7),(8)}

Overwriting /tmp/data.tsv

[24]:

!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv

Deleted data.tsv

[25]:

%%pig
u = LOAD 'data.tsv' AS (
        f1:CHARARRAY,
        f2:INT,
        f3:BAG{t: TUPLE(p:INT)}
    );
DUMP u;

(A,10,{(1),(2)})
(B,20,{(3),(4)})
(C,30,{(5),(6)})
(D,40,{(7),(8)})

[26]:

%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;

(A,1)
(A,2)
(B,3)
(B,4)
(C,5)
(C,6)
(D,7)
(D,8)

[27]:

%%pig
r = FOREACH u GENERATE FLATTEN(f3);
DUMP r;

(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)

[28]:

%%pig
--
-- se pueden colocar varios comandos dentro de
-- un FOREACH
--
r1 = FOREACH u {
        GENERATE FLATTEN(f3);
};
DUMP r1;

(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)

[29]:

%%pig
r1 = FOREACH u GENERATE (DOUBLE) $1;
DUMP r1;

(10.0)
(20.0)
(30.0)
(40.0)

[30]:

%quit