Movimiento de datos entre MySQL (Maria DB) y el HDFS#
Última modificación: Mayo 19, 2022
Descarga de datos#
[1]:
filenames = [
"drivers.csv",
"timesheet.csv",
"truck_event_text_partition.csv",
]
url = "https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/drivers/"
for filename in filenames:
!wget --quiet {url + filename} -P /tmp/
Preparación de la base de datos#
[2]:
import mariadb
conn = mariadb.connect(
user="root",
password="",
)
cur = conn.cursor()
#
# Creación de la BD
#
cur.execute("DROP DATABASE IF EXISTS demo_db;")
cur.execute("CREATE DATABASE demo_db;")
cur.execute("USE demo_db;")
cur.execute(
"""
DROP TABLE IF EXISTS drivers;
"""
)
cur.execute(
"""
CREATE TABLE drivers (
driverId INT,
name VARCHAR(20),
ssn VARCHAR(20),
location VARCHAR(40),
certified VARCHAR(20),
wage_plan VARCHAR(20)
);
"""
)
cur.execute(
"""
DROP TABLE IF EXISTS timesheet;
"""
)
cur.execute(
"""
CREATE TABLE timesheet (
driverId INT,
week INT,
hours_logged INT,
miles_logged INT
);
"""
)
conn.commit()
import pandas as pd
drivers = pd.read_csv("/tmp/drivers.csv")
for i, row in drivers.iterrows():
sql = "INSERT INTO drivers VALUES (%s,%s,%s,%s,%s,%s)"
cur.execute(sql, tuple(row))
conn.commit()
cur.execute("SELECT * FROM drivers LIMIT 5;")
result = cur.fetchall()
#
# Creación y permisos para el usuario remoto
#
cur.execute("CREATE USER 'sqoop'@'%' IDENTIFIED BY 'secret'; ")
cur.execute("GRANT ALL ON demo_db.* TO 'sqoop'@'%';")
conn.close()
result
[2]:
[(10, 'George Vetticaden', '621011971', '244-4532 Nulla Rd.', 'N', 'miles'),
(11, 'Jamie Engesser', '262112338', '366-4125 Ac Street', 'N', 'miles'),
(12, 'Paul Coddin', '198041975', 'Ap #622-957 Risus. Street', 'Y', 'hours'),
(13, 'Joe Niemiec', '139907145', '2071 Hendrerit. Ave', 'Y', 'hours'),
(14, 'Adis Cesir', '820812209', 'Ap #810-1228 In St.', 'Y', 'hours')]
[3]:
%%writefile list-databases.sh
sqoop list-databases \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret
Overwriting list-databases.sh
[4]:
#
# El error es debido a que SQLite3 no tiene databases internamente.
#
!bash list-databases.sh
information_schema
demo_db
Listado de las tablas existentes en la base de datos de MySQL#
[5]:
%%writefile list-tables.sh
sqoop list-tables \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret
Overwriting list-tables.sh
[6]:
!bash list-tables.sh
timesheet
drivers
Verificación de los registros en MySQL con query#
[7]:
%%writefile query.sh
sqoop eval \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret \
--query "SELECT * FROM drivers LIMIT 3"
Overwriting query.sh
[8]:
!bash query.sh
----------------------------------------------------------------------------------------------------------------------------------
| driverId | name | ssn | location | certified | wage_plan |
----------------------------------------------------------------------------------------------------------------------------------
| 10 | George Vetticaden | 621011971 | 244-4532 Nulla Rd. | N | miles |
| 11 | Jamie Engesser | 262112338 | 366-4125 Ac Street | N | miles |
| 12 | Paul Coddin | 198041975 | Ap #622-957 Risus. Street | Y | hours |
----------------------------------------------------------------------------------------------------------------------------------
Importación de una tabla completa al HDFS#
[9]:
%%writefile full_import.sh
sqoop import \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret \
--table drivers \
--target-dir /tmp/drivers \
--m 1
Writing /tmp/full_import.sh
[10]:
!bash full_import.sh
Note: /tmp/sqoop-root/compile/a1208aef74d58ca4992fb5cdd2864896/drivers.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
[11]:
!hdfs dfs -ls /tmp/drivers
Found 2 items
-rw-r--r-- 1 root supergroup 0 2022-05-27 15:50 /tmp/drivers/_SUCCESS
-rw-r--r-- 1 root supergroup 1963 2022-05-27 15:50 /tmp/drivers/part-m-00000
[12]:
!hdfs dfs -cat /tmp/drivers/part-m-00000
10,George Vetticaden,621011971,244-4532 Nulla Rd.,N,miles
11,Jamie Engesser,262112338,366-4125 Ac Street,N,miles
12,Paul Coddin,198041975,Ap #622-957 Risus. Street,Y,hours
13,Joe Niemiec,139907145,2071 Hendrerit. Ave,Y,hours
14,Adis Cesir,820812209,Ap #810-1228 In St.,Y,hours
15,Rohit Bakshi,239005227,648-5681 Dui- Rd.,Y,hours
16,Tom McCuch,363303105,P.O. Box 313- 962 Parturient Rd.,Y,hours
17,Eric Mizell,123808238,P.O. Box 579- 2191 Gravida. Street,Y,hours
18,Grant Liu,171010151,Ap #928-3159 Vestibulum Av.,Y,hours
19,Ajay Singh,160005158,592-9430 Nonummy Avenue,Y,hours
20,Chris Harris,921812303,883-2691 Proin Avenue,Y,hours
21,Jeff Markham,209408086,Ap #852-7966 Facilisis St.,Y,hours
22,Nadeem Asghar,783204269,154-9147 Aliquam Ave,Y,hours
23,Adam Diaz,928312208,P.O. Box 260- 6127 Vitae Road,Y,hours
24,Don Hilborn,254412152,4361 Ac Road,Y,hours
25,Jean-Philippe Playe,913310051,P.O. Box 812- 6238 Ac Rd.,Y,hours
26,Michael Aube,124705141,P.O. Box 213- 8948 Nec Ave,Y,hours
27,Mark Lochbihler,392603159,8355 Ipsum St.,Y,hours
28,Olivier Renault,959908181,P.O. Box 243- 6509 Erat. Avenue,Y,hours
29,Teddy Choi,185502192,P.O. Box 106- 7003 Amet Rd.,Y,hours
30,Dan Rice,282307061,Ap #881-9267 Mollis Avenue,Y,hours
31,Rommel Garcia,858912101,P.O. Box 945- 6015 Sociis St.,Y,hours
32,Ryan Templeton,290304287,765-6599 Egestas. Av.,Y,hours
33,Sridhara Sabbella,967409015,Ap #477-2507 Sagittis Avenue,Y,hours
34,Frank Romano,391407216,Ap #753-6814 Quis Ave,Y,hours
35,Emil Siemes,971401151,321-2976 Felis Rd.,Y,hours
36,Andrew Grande,245303216,Ap #685-9598 Egestas Rd.,Y,hours
37,Wes Floyd,190504074,P.O. Box 269- 9611 Nulla Street,Y,hours
38,Scott Shaw,386411175,276 Lobortis Road,Y,hours
39,David Kaiser,967706052,9185 At Street,Y,hours
40,Nicolas Maillard,208510217,1027 Quis Rd.,Y,hours
41,Greg Phillips,308103116,P.O. Box 847- 5961 Arcu. Road,Y,hours
42,Randy Gelhausen,853302254,145-4200 In- Avenue,Y,hours
43,Dave Patton,977706052,3028 A- St.,Y,hours
Importación de un subconjunto de datos de una tabla al HDFS#
[13]:
%%writefile partial-import.sh
hdfs dfs -rm -r /tmp/drivers
sqoop import \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret \
--table drivers \
--target-dir /tmp/drivers/ \
-m 1 \
--where "driverId=10"
Overwriting partial-import.sh
[14]:
!bash partial-import.sh
Deleted /tmp/drivers
Note: /tmp/sqoop-root/compile/3edc8ddf5f6c6a3ece00d9a79c112858/drivers.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
[15]:
!hdfs dfs -ls /tmp/drivers/
Found 2 items
-rw-r--r-- 1 root supergroup 0 2022-05-27 15:50 /tmp/drivers/_SUCCESS
-rw-r--r-- 1 root supergroup 58 2022-05-27 15:50 /tmp/drivers/part-m-00000
[16]:
!hdfs dfs -cat /tmp/drivers/part-m-00000
10,George Vetticaden,621011971,244-4532 Nulla Rd.,N,miles
Movimiento de timesheet.csv
al HDFS#
[17]:
!tail +2 /tmp/timesheet.csv > /tmp/timesheet1.csv
!head /tmp/timesheet1.csv
10,1,70,3300
10,2,70,3300
10,3,60,2800
10,4,70,3100
10,5,70,3200
10,6,70,3300
10,7,70,3000
10,8,70,3300
10,9,70,3200
10,10,50,2500
[18]:
!hdfs dfs -rm /tmp/timesheet.csv
!hdfs dfs -copyFromLocal /tmp/timesheet1.csv /tmp/timesheet.csv
!hdfs dfs -ls /tmp/
rm: `/tmp/timesheet.csv': No such file or directory
Found 4 items
drwxr-xr-x - root supergroup 0 2022-05-27 15:50 /tmp/drivers
drwxrwx--- - root supergroup 0 2022-05-27 15:49 /tmp/hadoop-yarn
drwxrwxrwx - root supergroup 0 2022-05-27 15:49 /tmp/hive
-rw-r--r-- 1 root supergroup 26164 2022-05-27 15:51 /tmp/timesheet.csv
Exportación de datos del HDFS a MySQL#
[ ]:
%%writefile export.sh
sqoop export \
--connect jdbc:mysql://localhost:3306/demo_db \
--username sqoop \
--password secret \
--table timesheet \
--export-dir /tmp/timesheet.csv
[ ]:
!bash export.sh
Verificación#
[ ]:
conn = mariadb.connect(user="root", password="", database="demo_db")
cur = conn.cursor()
cur.execute(
"""
SELECT * FROM timesheet LIMIT 5;
"""
)
result = cur.fetchall()
conn.close()
result
[ ]:
!rm *.java *.sh *.log