# Tipos de datos complejos

  • 60 min | Última modificación: Noviembre 07, 2019

## Cell magic %%pig

[1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn

# Timeout (seconds, per pexpect) used when waiting for a Pig prompt.
TIMEOUT = 60
# Command line spawned through pexpect to start the interactive Pig shell.
PROG = "pig"
# Patterns that signal Pig is ready for more input: ">> " (presumably the
# multi-line continuation prompt -- confirm) and the regular "grunt> " prompt.
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
# Output lines containing any of these substrings are not printed.
DISCARD = ["INFO  org.apache", "WARN  org.apache"]
# Text sent to the Pig process by the %quit line magic.
QUIT = "quit"


@magics_class
class Magic(Magics):
    """IPython magics that drive an interactive Pig session through pexpect.

    One ``PROG`` process is spawned when the class is instantiated; every
    ``%%pig`` cell is fed to that same process line by line, and ``%quit``
    sends it the ``QUIT`` command.
    """

    def __init__(self, shell):
        """Spawn the Pig process and block until its first prompt appears.

        Args:
            shell: IPython shell instance, forwarded to ``Magics.__init__``.
        """
        super().__init__(shell)
        # Use the shared TIMEOUT constant instead of a duplicated literal so
        # the spawn default and the per-line expect() timeout stay in sync.
        self.app = spawn(PROG, timeout=TIMEOUT)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        """Send the cell body to Pig one line at a time and print the output.

        Args:
            line: Text following ``%%pig`` on the magic line (unused).
            cell: Cell body; blank lines are dropped and every remaining line
                is stripped before being sent.

        Returns:
            None. Pig's output is printed as a side effect.
        """
        cell_lines = [cell_line.strip() for cell_line in cell.split("\n")]
        cell_lines = [cell_line for cell_line in cell_lines if cell_line != ""]
        for cell_line in cell_lines:
            self.app.sendline(cell_line)
            # Wait for either prompt pattern before reading what Pig wrote.
            self.app.expect(PROMPT, timeout=TIMEOUT)
            output = self.app.before.decode()
            output = output.replace("\r\n", "\n")
            output = output.split("\n")
            output = [output_line.strip() for output_line in output]
            for output_line in output:
                # Skip lines identical to something we sent (terminal echo)...
                if output_line not in cell_lines:
                    # ...and lines that contain any DISCARD substring.
                    if not any(word in output_line for word in DISCARD):
                        print(output_line)

    @line_magic
    def quit(self, line):
        """Send the ``QUIT`` command to the Pig process.

        Args:
            line: Text following ``%quit`` (unused).
        """
        self.app.sendline(QUIT)


def load_ipython_extension(ip):
    """Instantiate ``Magic`` for the given shell and register it on that shell.

    Args:
        ip: IPython shell object; passed to ``Magic`` and used to register
            the resulting magics instance.
    """
    pig_magics = Magic(ip)
    ip.register_magics(pig_magics)


# Register the magics on the currently running IPython shell right away.
load_ipython_extension(ip=get_ipython())

## Datos simples

Los siguientes son los tipos de datos soportados por Pig:

int      long      float       double      chararray
boolean  datetime  biginteger  bigdecimal  bytearray

## Datos complejos

Apache Pig trabaja con la siguiente jerarquía de relaciones (http://pig.apache.org/docs/r0.17.0/basic.html#relations):

  • Una tuple es un conjunto ordenado de campos: (field1, field2, …).

  • Una bag es un conjunto de tuplas: {(…), (…), …}

  • Un map es un conjunto de parejas clave-valor: [key#value, …]

### TUPLE

[2]:
%%writefile /tmp/data.tsv
A   10      (1, 2)
B   20      (3, 4)
C   30      (5, 6)
D   40      (7, 8)
Overwriting /tmp/data.tsv
[3]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[4]:
%%pig
--
-- Los campos del archivo están separados por
-- tabuladores.
--
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;
(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))
[5]:
%%pig
--
-- Los campos de la tupla pueden ser accesados
-- por nombre o por posición.
--
r = FOREACH u GENERATE f3.p, f3.$1 ;
DUMP r;
(1,2)
(3,4)
(5,6)
(7,8)
[6]:
%%pig
--
-- Aquí se accesan los campos de la tupla por
-- posición ya que no tienen nombre.
--
u = LOAD 'data.tsv' AS (f1:CHARARRAY, f2:INT, f3:TUPLE(INT, INT));
r = FOREACH u GENERATE $2.$0, $2.$1;
DUMP r;
(1,2)
(3,4)
(5,6)
(7,8)
[7]:
%%writefile /tmp/data.tsv
A   (1,  2) (3,  4)
B   (5,  6) (7,  8)
C   (9, 10) (11, 12)
Overwriting /tmp/data.tsv
[8]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[9]:
%%pig
--
-- Se seleccionan los campos por nombre
--
u = LOAD 'data.tsv'
    AS (f1: CHARARRAY,
        t1: TUPLE(t1a: INT, t1b: INT),
        t2: TUPLE(t2a: INT, t2b: INT));

DUMP u;
(A,(1,2),(3,4))
(B,(5,6),(7,8))
(C,(9,10),(11,12))
[10]:
%%pig
r = FOREACH u GENERATE f1, t2.t2b;
DUMP r;
(A,4)
(B,8)
(C,12)

### BAG

[11]:
%%writefile /tmp/data.tsv
A   10      {(1,2),(3,4)}
B   20      {(5,6),(7,8)}
C   30      {(9,10),(11,12)}
D   40      {(13,14),(15,16)}
Overwriting /tmp/data.tsv
[12]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[13]:
%%pig
--
-- Se selecciona el `bag` por nombre
--
u = LOAD 'data.tsv' AS (
        f1:CHARARRAY,
        f2:INT,
        f3:BAG{t:TUPLE(p:INT, q:INT)}
    );
DUMP u;
(A,10,{(1,2),(3,4)})
(B,20,{(5,6),(7,8)})
(C,30,{(9,10),(11,12)})
(D,40,{(13,14),(15,16)})
[14]:
%%pig
r = FOREACH u GENERATE f3;
DUMP r;
({(1,2),(3,4)})
({(5,6),(7,8)})
({(9,10),(11,12)})
({(13,14),(15,16)})
[15]:
%%pig
r = FOREACH u GENERATE f3.p;
DUMP r;
({(1),(3)})
({(5),(7)})
({(9),(11)})
({(13),(15)})

### MAP

[16]:
%%writefile /tmp/data.tsv
A   10      [a#1,b#2]
B   20      [a#3,c#4]
C   30      [b#5,c#6]
D   40      [b#7,c#8]
Overwriting /tmp/data.tsv
[17]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[18]:
%%pig
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:MAP[]);
r = FOREACH u GENERATE f3#'a', f3#'c';
DUMP r
(1,)
(3,4)
(,6)
(,8)

## Manipulación de datos complejos: FLATTEN

[19]:
%%writefile /tmp/data.tsv
A   10      (1, 2)
B   20      (3, 4)
C   30      (5, 6)
D   40      (7, 8)
Overwriting /tmp/data.tsv
[20]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[21]:
%%pig
u = LOAD 'data.tsv'
    AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;
(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))
[22]:
%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;
(A,1,2)
(B,3,4)
(C,5,6)
(D,7,8)
[23]:
%%writefile /tmp/data.tsv
A   10      {(1),(2)}
B   20      {(3),(4)}
C   30      {(5),(6)}
D   40      {(7),(8)}
Overwriting /tmp/data.tsv
[24]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[25]:
%%pig
u = LOAD 'data.tsv' AS (
        f1:CHARARRAY,
        f2:INT,
        f3:BAG{t: TUPLE(p:INT)}
    );
DUMP u;
(A,10,{(1),(2)})
(B,20,{(3),(4)})
(C,30,{(5),(6)})
(D,40,{(7),(8)})
[26]:
%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;
(A,1)
(A,2)
(B,3)
(B,4)
(C,5)
(C,6)
(D,7)
(D,8)
[27]:
%%pig
r = FOREACH u GENERATE FLATTEN(f3);
DUMP r;
(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)
[28]:
%%pig
--
-- se pueden colocar varios comandos dentro de
-- un FOREACH
--
r1 = FOREACH u {
        GENERATE FLATTEN(f3);
};
DUMP r1;
(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)
[29]:
%%pig
r1 = FOREACH u GENERATE (DOUBLE) $1;
DUMP r1;
(10.0)
(20.0)
(30.0)
(40.0)
[30]:
%quit