Programación Avanzada
30 min | Última modificación: Noviembre 07, 2019
Cell magic %%pig
[1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn
# Seconds the %%pig magic waits for a prompt after sending each statement.
TIMEOUT = 120
# Command line used to start the interactive child process.
PROG = "pig"
# Byte sequences that mark the end of the child's output: the main "grunt> "
# prompt and ">> " (presumably the continuation prompt -- TODO confirm).
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
# Output lines containing any of these substrings are not printed.
DISCARD = ["INFO org.apache", "WARN org.apache"]
# Command sent to the child by the %quit magic.
QUIT = "quit"
@magics_class
class Magic(Magics):
    """IPython magics that drive one long-lived interactive Pig session.

    A single child process (``PROG``) is spawned when the magics object is
    created and is reused by every ``%%pig`` cell, so aliases defined in one
    cell remain available in later cells. ``%quit`` ends the session.
    """

    def __init__(self, shell):
        """Start the child process and wait for its first prompt.

        Args:
            shell: The IPython shell the magics are registered on.
        """
        super().__init__(shell)
        # 60 s is the child's default expect() timeout; it bounds the wait
        # for the initial prompt below.
        self.app = spawn(PROG, timeout=60)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        """Send each non-blank line of the cell to Pig and print its output.

        Every statement is sent on its own and the method waits (up to
        ``TIMEOUT`` seconds) for the next prompt before sending the next one.
        Output lines that merely echo a statement of the cell, or that
        contain one of the ``DISCARD`` markers, are not printed.

        Args:
            line: Text following ``%%pig`` on the magic line (unused).
            cell: Body of the cell; one Pig statement per line.
        """
        statements = [raw.strip() for raw in cell.split("\n")]
        statements = [statement for statement in statements if statement != ""]
        # The pty echoes what we type; a set makes the echo test O(1).
        echoed = set(statements)

        for statement in statements:
            self.app.sendline(statement)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            # Job output may contain bytes that are not valid UTF-8; replace
            # them instead of aborting the whole cell with a decode error.
            text = self.app.before.decode(errors="replace")
            for output_line in text.replace("\r\n", "\n").split("\n"):
                output_line = output_line.strip()
                if output_line in echoed:
                    continue
                if any(word in output_line for word in DISCARD):
                    continue
                print(output_line)

    @line_magic
    def quit(self, line):
        """Ask Pig to exit, wait for it, and release the child process.

        Calling the magic again after the session has ended is a no-op.

        Args:
            line: Text following ``%quit`` (unused).
        """
        # Imported locally and aliased: the module-level TIMEOUT constant
        # would otherwise be shadowed by pexpect's TIMEOUT pattern.
        from pexpect import EOF, TIMEOUT as EXPECT_TIMEOUT

        if self.app.isalive():
            self.app.sendline(QUIT)
            # Give Pig the chance to shut down cleanly; listing the timeout
            # pattern makes expect() return instead of raising if it hangs.
            self.app.expect([EOF, EXPECT_TIMEOUT], timeout=TIMEOUT)
        # Closes the pty and terminates the child if it is still running.
        self.app.close()
def load_ipython_extension(ip):
    """Create the Pig magics for the shell ``ip`` and register them on it."""
    pig_magics = Magic(ip)
    ip.register_magics(pig_magics)
# Register the magics on the currently running shell as soon as this cell runs.
load_ipython_extension(ip=get_ipython())
Actividad
Explique el resultado que produce cada uno de los comandos presentados a continuación.
COGROUP
[2]:
%%writefile /tmp/data0.tsv
A 10 1
B 20 2
C 30 3
D 40 4
Overwriting /tmp/data0.tsv
[3]:
%%writefile /tmp/data1.tsv
A 50 5
B 60 6
C 70 7
D 80 8
Overwriting /tmp/data1.tsv
[4]:
!hadoop fs -put /tmp/*.tsv .
!hadoop fs -ls
put: `data.tsv': File exists
put: `data0.tsv': File exists
put: `data1.tsv': File exists
Found 5 items
-rw-r--r-- 1 root supergroup 60 2022-05-17 02:15 data.tsv
-rw-r--r-- 1 root supergroup 28 2022-05-17 02:18 data0.tsv
-rw-r--r-- 1 root supergroup 28 2022-05-17 02:18 data1.tsv
-rw-r--r-- 1 root supergroup 27 2022-05-17 02:12 jointable.csv
-rw-r--r-- 1 root supergroup 616 2022-05-17 02:08 persons.csv
[5]:
%%pig
u = LOAD 'data0.tsv' AS (f1:CHARARRAY, f2:INT, f3:INT);
v = LOAD 'data1.tsv' AS (f1:CHARARRAY, f4:INT, f5:INT);
s = COGROUP u BY f1, v BY f1;
DUMP s;
(A,{(A,10,1)},{(A,50,5)})
(B,{(B,20,2)},{(B,60,6)})
(C,{(C,30,3)},{(C,70,7)})
(D,{(D,40,4)},{(D,80,8)})
UNION
[6]:
%%pig
s = UNION u, v;
DUMP s;
(A,10,1)
(B,20,2)
(C,30,3)
(D,40,4)
(A,50,5)
(B,60,6)
(C,70,7)
(D,80,8)
CROSS
[7]:
%%pig
s = CROSS u, v;
DUMP s;
(D,40,4,D,80,8)
(D,40,4,C,70,7)
(D,40,4,B,60,6)
(D,40,4,A,50,5)
(C,30,3,D,80,8)
(C,30,3,C,70,7)
(C,30,3,B,60,6)
(C,30,3,A,50,5)
(B,20,2,D,80,8)
(B,20,2,C,70,7)
(B,20,2,B,60,6)
(B,20,2,A,50,5)
(A,10,1,D,80,8)
(A,10,1,C,70,7)
(A,10,1,B,60,6)
(A,10,1,A,50,5)
RANK
[8]:
%%pig
s = RANK u BY f2;
DUMP s;
(1,A,10,1)
(2,B,20,2)
(3,C,30,3)
(4,D,40,4)
STREAM
Revise la documentación sobre este operador.
[9]:
%%pig
w = STREAM s THROUGH `tail -n 2 `;
DUMP w;
(3,C,30,3)
(4,D,40,4)
CUBE
[10]:
%%pig
DUMP u;
(A,10,1)
(B,20,2)
(C,30,3)
(D,40,4)
[11]:
%%pig
s = CUBE u BY cube(f1, f2);
DUMP s;
((A,10),{(A,10,1)})
((A,),{(A,,1)})
((B,20),{(B,20,2)})
((B,),{(B,,2)})
((C,30),{(C,30,3)})
((C,),{(C,,3)})
((D,40),{(D,40,4)})
((D,),{(D,,4)})
((,10),{(,10,1)})
((,20),{(,20,2)})
((,30),{(,30,3)})
((,40),{(,40,4)})
((,),{(,,4),(,,3),(,,1),(,,2)})
EXPLAIN
Revise la documentación sobre este operador.
ILLUSTRATE
[12]:
%%pig
ILLUSTRATE s;
(C,30,3)
------------------------------------------------------
| u | f1:chararray | f2:int | f3:int |
------------------------------------------------------
| | C | 30 | 3 |
| | D | 40 | 4 |
------------------------------------------------------
---------------------------------------------------------
| cube | f1:chararray | f2:int | f3:int |
---------------------------------------------------------
| | C | 30 | 3 |
| | C | | 3 |
| | | 30 | 3 |
| | | | 3 |
| | D | 40 | 4 |
| | D | | 4 |
| | | 40 | 4 |
| | | | 4 |
---------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------
| s | group:tuple(f1:chararray,f2:int) | cube:bag{:tuple(f1:chararray,f2:int,f3:int)} |
---------------------------------------------------------------------------------------------------------------------------
| | (C, 30) | {(C, 30, 3)} |
| | (C, ) | {(C, , 3)} |
| | (D, 40) | {(D, 40, 4)} |
| | (D, ) | {(D, , 4)} |
| | (, 30) | {(, 30, 3)} |
| | (, 40) | {(, 40, 4)} |
| | (, ) | {(, , 3), (, , 4)} |
---------------------------------------------------------------------------------------------------------------------------
[13]:
%quit