Introducción a DataTest#
Última modificación: Mayo 14, 2022
[13]:
from datatest import validate
import pandas as pd
Validación#
Comparaciones inteligentes
[2]:
data = ['A', 'B', 'A']
requirement = {'A', 'B'}
validate(data, requirement)
[3]:
data = [2, 4, 6, 8]
def is_even(x):
    """Return True when *x* is evenly divisible by 2 (used as a predicate requirement)."""
    return x % 2 == 0
validate(data, requirement=is_even)
[4]:
data = [2, 4, 6, 8]
requirement = int
validate(data, requirement)
[5]:
data = [('a', 2), ('b', 4), ('c', 6)]
def is_even(x):
    """Return True when *x* is evenly divisible by 2 (checked against the second tuple field)."""
    return x % 2 == 0
requirement = (str, is_even)
validate(data, requirement)
Manejo automático de datos
[7]:
#
# Elemento
#
data = 42
requirement = int # <- Same for all formats.
validate(data, requirement)
[8]:
#
# Grupo de elementos
#
data = [1, 2, 3]
requirement = int # <- Same for all formats.
validate(data, requirement)
[9]:
#
# Mapping
#
data = {'A': 1, 'B': 2, 'C': 3}
requirement = int # <- Same for all formats.
validate(data, requirement)
[10]:
#
# Mapping of groups
#
data = {'X': [1, 2, 3], 'Y': [4, 5, 6], 'Z': [7, 8, 9]}
requirement = int # <- Same for all formats.
validate(data, requirement)
[14]:
#
# Pandas
#
df = pd.DataFrame([('x', 1, 12.25),
('y', 2, 33.75),
('z', 3, 101.5)],
columns=['A', 'B', 'C'])
validate(df[['A', 'B']], (str, int))
[16]:
#
# Pandas (Integrated API)
#
import datatest as dt
dt.register_accessors()
df = pd.DataFrame([('x', 1, 12.25),
('y', 2, 33.75),
('z', 3, 101.5)],
columns=['A', 'B', 'C'])
df[['A', 'B']].validate((str, int))
[17]:
#
# NumPy
#
import numpy as np
a = np.array([('x', 1, 12.25),
('y', 2, 33.75),
('z', 3, 101.5)],
dtype='U10, int32, float32')
validate(a[['f0', 'f1']], (str, int))
[18]:
#
# Bases de datos
#
import sqlite3
conn = sqlite3.connect(':memory:')
conn.executescript('''
CREATE TABLE mydata(A, B, C);
INSERT INTO mydata VALUES('x', 1, 12.25);
INSERT INTO mydata VALUES('y', 2, 33.75);
INSERT INTO mydata VALUES('z', 3, 101.5);
''')
cursor = conn.cursor()
cursor.execute('SELECT A, B FROM mydata;')
validate(cursor, (str, int))
Errores#
“Missing” Differences
[19]:
data = ['A', 'B']
requirement = {'A', 'B', 'C', 'D'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [19], in <cell line: 3>()
1 data = ['A', 'B']
2 requirement = {'A', 'B', 'C', 'D'}
----> 3 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy set membership (2 differences): [
Missing('C'),
Missing('D'),
]
“Extra” Differences
[20]:
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [20], in <cell line: 3>()
1 data = ['A', 'B', 'C', 'D']
2 requirement = {'A', 'B'}
----> 3 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy set membership (2 differences): [
Extra('C'),
Extra('D'),
]
“Invalid” Differences
[21]:
data = [('a', 2), ('b', 4), ('c', 6), (1.25, 8), ('e', 9)]
def is_even(x):  # predicate requirement: True when x is evenly divisible by 2
    return x % 2 == 0
requirement = (str, is_even)
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [21], in <cell line: 7>()
4 return x % 2 == 0
6 requirement = (str, is_even)
----> 7 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy `(str, is_even())` (2 differences): [
Invalid((1.25, 8)),
Invalid(('e', 9)),
]
“Deviation” Differences
[22]:
data = {
'A': 100,
'B': 200,
'C': 299,
'D': 405,
}
requirement = {
'A': 100,
'B': 200,
'C': 300,
'D': 400,
}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [22], in <cell line: 15>()
1 data = {
2 'A': 100,
3 'B': 200,
4 'C': 299,
5 'D': 405,
6 }
8 requirement = {
9 'A': 100,
10 'B': 200,
11 'C': 300,
12 'D': 400,
13 }
---> 15 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy mapping requirements (2 differences): {
'C': Deviation(-1, 300),
'D': Deviation(+5, 400),
}
Acceptances#
[23]:
#
# Using acceptance
#
from datatest import (
validate,
accepted,
Extra,
)
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
# Passing the Extra class accepts every Extra difference raised inside the block.
with accepted(Extra):
    validate(data, requirement)
[24]:
#
# No Acceptance
#
from datatest import (
validate,
accepted,
Extra,
)
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [24], in <cell line: 12>()
10 data = ['A', 'B', 'C', 'D']
11 requirement = {'A', 'B'}
---> 12 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy set membership (2 differences): [
Extra('C'),
Extra('D'),
]
Accepted Instance
[ ]:
#
# Using Acceptance
#
from datatest import (
validate,
accepted,
Extra,
)
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
# NOTE(review): only the exact difference Extra('C') is accepted here; with this
# data Extra('D') is still reported, so validate() is expected to raise.
with accepted(Extra('C')):
    validate(data, requirement)
[25]:
#
# No Acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [25], in <cell line: 6>()
4 data = ['A', 'B', 'C', 'D']
5 requirement = {'A', 'B'}
----> 6 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy set membership (2 differences): [
Extra('C'),
Extra('D'),
]
Accepted Container of Instances
[26]:
#
# Using Acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
# A container of difference instances accepts exactly the listed differences.
with accepted([Extra('C'), Extra('D')]):
    validate(data, requirement)
[27]:
#
# No acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [27], in <cell line: 6>()
4 data = ['A', 'B', 'C', 'D']
5 requirement = {'A', 'B'}
----> 6 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy set membership (2 differences): [
Extra('C'),
Extra('D'),
]
Accepted Tolerance
[28]:
#
# Using Acceptance
#
data = {
'A': 100,
'B': 200,
'C': 299,
'D': 405,
}
requirement = {
'A': 100,
'B': 200,
'C': 300,
'D': 400,
}
with accepted.tolerance(5):  # accepts ±5
    validate(data, requirement)
[29]:
#
# No Acceptance
#
data = {
'A': 100,
'B': 200,
'C': 299,
'D': 405,
}
requirement = {
'A': 100,
'B': 200,
'C': 300,
'D': 400,
}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [29], in <cell line: 16>()
4 data = {
5 'A': 100,
6 'B': 200,
7 'C': 299,
8 'D': 405,
9 }
10 requirement = {
11 'A': 100,
12 'B': 200,
13 'C': 300,
14 'D': 400,
15 }
---> 16 validate(data, requirement)
File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
294 if isinstance(requirement_object, sequence_or_order_types):
295 err._sorted_str = False
--> 296 raise err
ValidationError: does not satisfy mapping requirements (2 differences): {
'C': Deviation(-1, 300),
'D': Deviation(+5, 400),
}
Otras Acceptances
accepted.keys()
accepted.args()
accepted.percent()
Otras
Combining Acceptances
from datatest import (
    validate,
    accepted,
    Missing,  # referenced by accepted(Missing) below
)

# Accept up to five missing differences.
# `&` combines acceptances as an intersection: both must accept a difference.
with accepted(Missing) & accepted.count(5):
    validate(..., ...)

# Accept differences of ±10 or ±5%.
# `|` combines acceptances as a union: either one may accept a difference.
with accepted.tolerance(10) | accepted.percent(0.05):
    validate(..., ...)
Herramientas para el manejo de datos#
Directorio de trabajo
import pandas as pd
from datatest import working_directory
# Inside the block, relative paths resolve against the directory containing this file.
with working_directory(__file__):
    my_df = pd.read_csv('myfile.csv')
Repeating Container
Se usa para operar sobre diferentes archivos al mismo tiempo en vez de duplicar operaciones.
# -----------------------------------------------------------------------------
# Using RepeatingContainer
#
import pandas as pd
from datatest import RepeatingContainer

repeating = RepeatingContainer([
    pd.read_csv('file1.csv'),
    pd.read_csv('file2.csv'),
])

# Each operation is applied to every contained DataFrame; the results
# unpack in the same order as the files were listed.
counted1, counted2 = repeating['C'].count()
# bfill() is the equivalent of fillna(method='backfill'); the `method`
# keyword of fillna() is deprecated in recent pandas releases.
filled1, filled2 = repeating.bfill()
summed1, summed2 = repeating[['A', 'C']].groupby('A').sum()

# -----------------------------------------------------------------------------
# No RepeatingContainer
#
import pandas as pd

df1 = pd.read_csv('file1.csv')
df2 = pd.read_csv('file2.csv')

# The same three operations, written out once per DataFrame.
counted1 = df1['C'].count()
counted2 = df2['C'].count()
filled1 = df1.bfill()
filled2 = df2.bfill()
summed1 = df1[['A', 'C']].groupby('A').sum()
summed2 = df2[['A', 'C']].groupby('A').sum()