Introducción a csvkit#
Descarga de los datos#
[1]:
#
# Crea un archivo temporal para el tutorial
#
!mkdir -p /tmp/csvkit_tutorial
#
# Cambia el directorio de trabajo del notebook al temporal
#
%cd /tmp/csvkit_tutorial
/tmp/csvkit_tutorial
[2]:
#
# Descarga de los datos
#
!curl -L -O https://raw.githubusercontent.com/wireservice/csvkit/master/examples/realdata/ne_1033_data.xlsx
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 65331 100 65331 0 0 428k 0 --:--:-- --:--:-- --:--:-- 428k
[3]:
!ls
data.csv ne_1033_data.xlsx
in2csv#
[4]:
#
# Exporta el archivo xlsx a CSV
#
!in2csv ne_1033_data.xlsx > data.csv
[5]:
!head data.csv
state,county,fips,nsn,item_name,quantity,ui,acquisition_cost,total_cost,ship_date,federal_supply_category,federal_supply_category_name,federal_supply_class,federal_supply_class_name
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,ADAMS,31001.0,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1.0,Each,138.0,138.0,2008-07-11,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,BUFFALO,31019.0,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1.0,Each,499.0,499.0,2008-09-24,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,BUFFALO,31019.0,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1.0,Each,499.0,499.0,2008-09-24,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
NE,BUFFALO,31019.0,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1.0,Each,499.0,499.0,2008-09-24,10.0,WEAPONS,1005.0,"Guns, through 30 mm"
csvlook#
[6]:
#
# En el *Terminal* use: csvlook data.csv | less -S
#
!csvlook data.csv | head
| state | county | fips | nsn | item_name | quantity | ui | acquisition_cost | total_cost | ship_date | federal_supply_category | federal_supply_category_name | federal_supply_class | federal_supply_class_name |
| ----- | ---------- | ------ | ---------------- | -------------------------------------------------------------- | -------- | ------- | ---------------- | ---------- | ---------- | ----------------------- | ----------------------------------- | -------------------- | --------------------------------------------------------------- |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | ADAMS | 31,001 | 1005-00-589-1271 | RIFLE,7.62 MILLIMETER | 1 | Each | 138.00 | 138.00 | 2008-07-11 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | BUFFALO | 31,019 | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 499.00 | 2008-09-24 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
| NE | BUFFALO | 31,019 | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 499.00 | 2008-09-24 | 10 | WEAPONS | 1,005 | Guns, through 30 mm |
csvcut#
[7]:
#
# Columnas en los datos
#
!csvcut -n data.csv
1: state
2: county
3: fips
4: nsn
5: item_name
6: quantity
7: ui
8: acquisition_cost
9: total_cost
10: ship_date
11: federal_supply_category
12: federal_supply_category_name
13: federal_supply_class
14: federal_supply_class_name
[8]:
#
# Extracción de columnas por posición
#
!csvcut -c 2,5,6 data.csv | head
county,item_name,quantity
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
ADAMS,"RIFLE,7.62 MILLIMETER",1.0
BUFFALO,"RIFLE,5.56 MILLIMETER",1.0
BUFFALO,"RIFLE,5.56 MILLIMETER",1.0
BUFFALO,"RIFLE,5.56 MILLIMETER",1.0
[9]:
#
# Extracción de columnas por nombre
#
!csvcut -c county,item_name,quantity data.csv | csvlook | head
| county | item_name | quantity |
| ---------- | -------------------------------------------------------------- | -------- |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| BUFFALO | RIFLE,5.56 MILLIMETER | 1 |
| BUFFALO | RIFLE,5.56 MILLIMETER | 1 |
Pipeline#
[10]:
!in2csv ne_1033_data.xlsx | csvcut -c county,item_name,quantity | csvlook | head
| county | item_name | quantity |
| ---------- | -------------------------------------------------------------- | -------- |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| ADAMS | RIFLE,7.62 MILLIMETER | 1 |
| BUFFALO | RIFLE,5.56 MILLIMETER | 1 |
| BUFFALO | RIFLE,5.56 MILLIMETER | 1 |