Programación Avanzada

  • 30 min | Última modificación: Noviembre 07, 2019

Cell magic %%pig

[1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn

# Timeout handed to pexpect while waiting for the shell prompt after each line.
TIMEOUT = 120
# Command spawned as the interactive child process.
PROG = "pig"
# Patterns that mark the end of the shell's reply: the ">> " prompt and the
# regular "grunt> " prompt.
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
# Output lines containing any of these substrings are not printed.
DISCARD = ["INFO  org.apache", "WARN  org.apache"]
# Command sent to the shell by the %quit line magic.
QUIT = "quit"


@magics_class
class Magic(Magics):
    """IPython magics that drive one long-lived Pig shell through pexpect.

    ``%%pig`` sends the cell body to the shell line by line and prints the
    shell's replies; ``%quit`` asks the shell to exit and releases the child
    process.
    """

    def __init__(self, shell):
        """Spawn the shell and block until its first prompt appears.

        Args:
            shell: IPython shell instance, forwarded to ``Magics.__init__``.
        """
        super().__init__(shell)
        # Use the shared TIMEOUT instead of a second hard-coded value so that
        # start-up and statement execution wait the same amount of time.
        self.app = spawn(PROG, timeout=TIMEOUT)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        """Run every non-empty line of the cell in the shell and print the replies.

        Args:
            line: text following ``%%pig`` on the magic line (unused).
            cell: cell body; it is split on newlines and each stripped,
                non-empty line is sent separately.

        Returns:
            None. Output is printed as a side effect.
        """
        statements = [raw.strip() for raw in cell.split("\n")]
        statements = [statement for statement in statements if statement != ""]
        # The shell echoes what it receives; a set makes the echo test O(1).
        echoed = set(statements)
        for statement in statements:
            self.app.sendline(statement)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            reply = self.app.before.decode().replace("\r\n", "\n")
            for output_line in reply.split("\n"):
                output_line = output_line.strip()
                if output_line in echoed:
                    continue  # echo of a line from this cell
                if any(word in output_line for word in DISCARD):
                    continue  # log noise listed in DISCARD
                print(output_line)
        return None

    @line_magic
    def quit(self, line):
        """Ask the shell to exit, wait for it, and release the child process.

        Args:
            line: text following ``%quit`` (unused).
        """
        # Local import: the file only imports ``spawn`` from pexpect, and the
        # EOF/TIMEOUT markers are needed solely here.
        import pexpect

        if self.app.isalive():
            self.app.sendline(QUIT)
            try:
                # Give the shell the chance to shut down cleanly.
                self.app.expect(pexpect.EOF, timeout=TIMEOUT)
            except pexpect.TIMEOUT:
                pass  # fall through to the forced close below
        # Always release the pty and reap the child so nothing is leaked.
        self.app.close(force=True)


def load_ipython_extension(ip):
    """Create the Magic instance for ``ip`` and register it on that shell."""
    pig_magics = Magic(ip)
    ip.register_magics(pig_magics)


# Register the magics on the shell that is running this notebook.
load_ipython_extension(get_ipython())

Actividad

Explique el resultado que produce cada uno de los comandos presentados a continuación.

COGROUP

[2]:
%%writefile /tmp/data0.tsv
A   10      1
B   20      2
C   30      3
D   40      4
Overwriting /tmp/data0.tsv
[3]:
%%writefile /tmp/data1.tsv
A   50      5
B   60      6
C   70      7
D   80      8
Overwriting /tmp/data1.tsv
[4]:
# -f overwrites files already present in HDFS, so re-running this cell no
# longer fails with "File exists".
!hadoop fs -put -f /tmp/*.tsv .
!hadoop fs -ls
put: `data.tsv': File exists
put: `data0.tsv': File exists
put: `data1.tsv': File exists
Found 5 items
-rw-r--r--   1 root supergroup         60 2022-05-17 02:15 data.tsv
-rw-r--r--   1 root supergroup         28 2022-05-17 02:18 data0.tsv
-rw-r--r--   1 root supergroup         28 2022-05-17 02:18 data1.tsv
-rw-r--r--   1 root supergroup         27 2022-05-17 02:12 jointable.csv
-rw-r--r--   1 root supergroup        616 2022-05-17 02:08 persons.csv
[5]:
%%pig
u = LOAD 'data0.tsv' AS (f1:CHARARRAY, f2:INT, f3:INT);
v = LOAD 'data1.tsv' AS (f1:CHARARRAY, f4:INT, f5:INT);
s = COGROUP u BY f1, v BY f1;
DUMP s;
(A,{(A,10,1)},{(A,50,5)})
(B,{(B,20,2)},{(B,60,6)})
(C,{(C,30,3)},{(C,70,7)})
(D,{(D,40,4)},{(D,80,8)})

UNION

[6]:
%%pig
s = UNION u, v;
DUMP s;
(A,10,1)
(B,20,2)
(C,30,3)
(D,40,4)
(A,50,5)
(B,60,6)
(C,70,7)
(D,80,8)

CROSS

[7]:
%%pig
s = CROSS u, v;
DUMP s;
(D,40,4,D,80,8)
(D,40,4,C,70,7)
(D,40,4,B,60,6)
(D,40,4,A,50,5)
(C,30,3,D,80,8)
(C,30,3,C,70,7)
(C,30,3,B,60,6)
(C,30,3,A,50,5)
(B,20,2,D,80,8)
(B,20,2,C,70,7)
(B,20,2,B,60,6)
(B,20,2,A,50,5)
(A,10,1,D,80,8)
(A,10,1,C,70,7)
(A,10,1,B,60,6)
(A,10,1,A,50,5)

RANK

[8]:
%%pig
s = RANK u BY f2;
DUMP s;
(1,A,10,1)
(2,B,20,2)
(3,C,30,3)
(4,D,40,4)

STREAM

Revise la documentación sobre este operador.

[9]:
%%pig
w = STREAM s THROUGH  `tail -n 2 `;
DUMP w;
(3,C,30,3)
(4,D,40,4)

CUBE

[10]:
%%pig
DUMP u;
(A,10,1)
(B,20,2)
(C,30,3)
(D,40,4)
[11]:
%%pig
s = CUBE u BY cube(f1, f2);
DUMP s;
((A,10),{(A,10,1)})
((A,),{(A,,1)})
((B,20),{(B,20,2)})
((B,),{(B,,2)})
((C,30),{(C,30,3)})
((C,),{(C,,3)})
((D,40),{(D,40,4)})
((D,),{(D,,4)})
((,10),{(,10,1)})
((,20),{(,20,2)})
((,30),{(,30,3)})
((,40),{(,40,4)})
((,),{(,,4),(,,3),(,,1),(,,2)})

EXPLAIN

Revise la documentación sobre este operador.

ILLUSTRATE

[12]:
%%pig
ILLUSTRATE s;
(C,30,3)
------------------------------------------------------
| u     | f1:chararray     | f2:int     | f3:int     |
------------------------------------------------------
|       | C                | 30         | 3          |
|       | D                | 40         | 4          |
------------------------------------------------------
---------------------------------------------------------
| cube     | f1:chararray     | f2:int     | f3:int     |
---------------------------------------------------------
|          | C                | 30         | 3          |
|          | C                |            | 3          |
|          |                  | 30         | 3          |
|          |                  |            | 3          |
|          | D                | 40         | 4          |
|          | D                |            | 4          |
|          |                  | 40         | 4          |
|          |                  |            | 4          |
---------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------
| s     | group:tuple(f1:chararray,f2:int)             | cube:bag{:tuple(f1:chararray,f2:int,f3:int)}                     |
---------------------------------------------------------------------------------------------------------------------------
|       | (C, 30)                                      | {(C, 30, 3)}                                                     |
|       | (C, )                                        | {(C, , 3)}                                                       |
|       | (D, 40)                                      | {(D, 40, 4)}                                                     |
|       | (D, )                                        | {(D, , 4)}                                                       |
|       | (, 30)                                       | {(, 30, 3)}                                                      |
|       | (, 40)                                       | {(, 40, 4)}                                                      |
|       | (, )                                         | {(, , 3), (, , 4)}                                               |
---------------------------------------------------------------------------------------------------------------------------

[13]:
%quit