Tipos de datos complejos#
60 min | Última modificación: Noviembre 07, 2019
Cell magic %%pig
#
[1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn
# Seconds to wait for a Pig prompt before pexpect raises TIMEOUT.
TIMEOUT = 60
# Executable spawned for the interactive session.
PROG = "pig"
# Prompt patterns pexpect matches on (secondary prompt and grunt shell).
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
# Output lines containing any of these substrings are log noise and suppressed.
DISCARD = ["INFO org.apache", "WARN org.apache"]
# Command sent to Pig to terminate the session.
QUIT = "quit"
@magics_class
class Magic(Magics):
    """IPython magics that proxy notebook cells to an interactive Pig process.

    %%pig sends each cell line to the Pig (grunt) shell and prints the
    output, filtering out echoed input and INFO/WARN log noise.
    %quit terminates the Pig session.
    """

    def __init__(self, shell):
        """Spawn the Pig process and wait for its first prompt.

        Parameters
        ----------
        shell : InteractiveShell
            The running IPython shell, passed through to Magics.
        """
        super().__init__(shell)
        # Fix: use the module-level TIMEOUT constant instead of the
        # hard-coded 60 the original had, so the timeout stays in one place.
        self.app = spawn(PROG, timeout=TIMEOUT)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        """Send each non-empty line of `cell` to Pig and print its output.

        Lines that merely echo the input, and lines containing any
        DISCARD marker (Pig's INFO/WARN logging), are suppressed.
        """
        cell_lines = [cell_line.strip() for cell_line in cell.split("\n")]
        cell_lines = [cell_line for cell_line in cell_lines if cell_line != ""]
        for cell_line in cell_lines:
            self.app.sendline(cell_line)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            # pexpect buffers raw bytes in `before`; normalize CRLF so the
            # transcript splits cleanly into lines.
            output = self.app.before.decode()
            output = output.replace("\r\n", "\n")
            output = output.split("\n")
            output = [output_line.strip() for output_line in output]
            for output_line in output:
                # Skip echoed input lines and discarded log lines.
                if output_line not in cell_lines:
                    if not any(word in output_line for word in DISCARD):
                        print(output_line)

    @line_magic
    def quit(self, line):
        """Ask the Pig process to exit; no prompt is expected afterwards."""
        self.app.sendline(QUIT)
def load_ipython_extension(ip):
    # Standard IPython extension hook; here it is invoked manually below
    # rather than via %load_ext.
    ip.register_magics(Magic(ip))
# Register the %%pig / %quit magics on the current interactive shell.
load_ipython_extension(ip=get_ipython())
Datos simples#
Los siguientes son los tipos de datos soportados por Pig:
int long float double chararray
boolean datetime biginteger bigdecimal bytearray
Datos complejos#
Apache Pig trabaja con la siguiente jerarquía de relaciones (http://pig.apache.org/docs/r0.17.0/basic.html#relations):
Una tuple es un conjunto ordenado de campos: (field1, field2, ….).
Una bag es un conjunto de tuplas: {(…), (…), …}
Un map es un conjunto de parejas [key#value, ….]
TUPLE#
[2]:
%%writefile /tmp/data.tsv
A 10 (1, 2)
B 20 (3, 4)
C 30 (5, 6)
D 40 (7, 8)
Overwriting /tmp/data.tsv
[3]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[4]:
%%pig
--
-- Los campos del archivo están separados por
-- tabuladores.
--
u = LOAD 'data.tsv'
AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;
(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))
[5]:
%%pig
--
-- Los campos de la tupla pueden ser accesados
-- por nombre o por posición.
--
r = FOREACH u GENERATE f3.p, f3.$1 ;
DUMP r;
(1,2)
(3,4)
(5,6)
(7,8)
[6]:
%%pig
--
-- Aqui se accesan los campos de la tupla por
-- posicion ya que no tienen nombre.
--
u = LOAD 'data.tsv' AS (f1:CHARARRAY, f2:INT, f3:TUPLE(INT, INT));
r = FOREACH u GENERATE $2.$0, $2.$1;
DUMP r;
(1,2)
(3,4)
(5,6)
(7,8)
[7]:
%%writefile /tmp/data.tsv
A (1, 2) (3, 4)
B (5, 6) (7, 8)
C (9, 10) (11, 12)
Overwriting /tmp/data.tsv
[8]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[9]:
%%pig
--
-- Se seleccionan los campos por nombre
--
u = LOAD 'data.tsv'
AS (f1: CHARARRAY,
t1: TUPLE(t1a: INT, t1b: INT),
t2: TUPLE(t2a: INT, t2b: INT));
DUMP u;
(A,(1,2),(3,4))
(B,(5,6),(7,8))
(C,(9,10),(11,12))
[10]:
%%pig
r = FOREACH u GENERATE f1, t2.t2b;
DUMP r;
(A,4)
(B,8)
(C,12)
BAG#
[11]:
%%writefile /tmp/data.tsv
A 10 {(1,2),(3,4)}
B 20 {(5,6),(7,8)}
C 30 {(9,10),(11,12)}
D 40 {(13,14),(15,16)}
Overwriting /tmp/data.tsv
[12]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[13]:
%%pig
--
-- Se selecciona el `bag` por nombre
--
u = LOAD 'data.tsv' AS (
f1:CHARARRAY,
f2:INT,
f3:BAG{t:TUPLE(p:INT, q:INT)}
);
DUMP u;
(A,10,{(1,2),(3,4)})
(B,20,{(5,6),(7,8)})
(C,30,{(9,10),(11,12)})
(D,40,{(13,14),(15,16)})
[14]:
%%pig
r = FOREACH u GENERATE f3;
DUMP r;
({(1,2),(3,4)})
({(5,6),(7,8)})
({(9,10),(11,12)})
({(13,14),(15,16)})
[15]:
%%pig
r = FOREACH u GENERATE f3.p;
DUMP r;
({(1),(3)})
({(5),(7)})
({(9),(11)})
({(13),(15)})
MAP#
[16]:
%%writefile /tmp/data.tsv
A 10 [a#1,b#2]
B 20 [a#3,c#4]
C 30 [b#5,c#6]
D 40 [b#7,c#8]
Overwriting /tmp/data.tsv
[17]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[18]:
%%pig
u = LOAD 'data.tsv'
AS (f1:CHARARRAY, f2:INT, f3:MAP[]);
r = FOREACH u GENERATE f3#'a', f3#'c';
DUMP r
(1,)
(3,4)
(,6)
(,8)
Manipulación de datos complejos: FLATTEN#
[19]:
%%writefile /tmp/data.tsv
A 10 (1, 2)
B 20 (3, 4)
C 30 (5, 6)
D 40 (7, 8)
Overwriting /tmp/data.tsv
[20]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[21]:
%%pig
u = LOAD 'data.tsv'
AS (f1:CHARARRAY, f2:INT, f3:TUPLE(p:INT, q:INT));
DUMP u;
(A,10,(1,2))
(B,20,(3,4))
(C,30,(5,6))
(D,40,(7,8))
[22]:
%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;
(A,1,2)
(B,3,4)
(C,5,6)
(D,7,8)
[23]:
%%writefile /tmp/data.tsv
A 10 {(1),(2)}
B 20 {(3),(4)}
C 30 {(5),(6)}
D 40 {(7),(8)}
Overwriting /tmp/data.tsv
[24]:
!hadoop fs -rm data.tsv
!hadoop fs -put /tmp/data.tsv
Deleted data.tsv
[25]:
%%pig
u = LOAD 'data.tsv' AS (
f1:CHARARRAY,
f2:INT,
f3:BAG{t: TUPLE(p:INT)}
);
DUMP u;
(A,10,{(1),(2)})
(B,20,{(3),(4)})
(C,30,{(5),(6)})
(D,40,{(7),(8)})
[26]:
%%pig
r = FOREACH u GENERATE f1, FLATTEN(f3);
DUMP r;
(A,1)
(A,2)
(B,3)
(B,4)
(C,5)
(C,6)
(D,7)
(D,8)
[27]:
%%pig
r = FOREACH u GENERATE FLATTEN(f3);
DUMP r;
(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)
[28]:
%%pig
--
-- se pueden colocar varios comandos dentro de
-- un FOREACH
--
r1 = FOREACH u {
GENERATE FLATTEN(f3);
};
DUMP r1;
(1)
(2)
(3)
(4)
(5)
(6)
(7)
(8)
[29]:
%%pig
r1 = FOREACH u GENERATE (DOUBLE) $1;
DUMP r1;
(10.0)
(20.0)
(30.0)
(40.0)
[30]:
%quit