Introducción a DataTest#

  • Última modificación: Mayo 14, 2022

[13]:
from datatest import validate
import pandas as pd

Validación#

Comparaciones inteligentes

[2]:
data = ['A', 'B', 'A']
requirement = {'A', 'B'}
validate(data, requirement)
[3]:

data = [2, 4, 6, 8]

def is_even(x):
    """Return True when x leaves no remainder after division by 2."""
    return x % 2 == 0

# A callable requirement: validate() is given the predicate itself.
validate(data, requirement=is_even)
[4]:
data = [2, 4, 6, 8]
requirement = int
validate(data, requirement)

[5]:
data = [('a', 2), ('b', 4), ('c', 6)]

def is_even(x):
    """Return True when x leaves no remainder after division by 2."""
    remainder = x % 2
    return remainder == 0

requirement = (str, is_even)
validate(data, requirement)

Manejo automático de datos

[7]:
#
# Elemento
#
data = 42
requirement = int  # <- Same for all formats.
validate(data, requirement)
[8]:
#
# Grupo de elementos
#
data = [1, 2, 3]
requirement = int  # <- Same for all formats.
validate(data, requirement)
[9]:
#
# Mapping
#
data = {'A': 1, 'B': 2, 'C': 3}
requirement = int  # <- Same for all formats.
validate(data, requirement)
[10]:
#
# Mapping of groups
#
data = {'X': [1, 2, 3], 'Y': [4, 5, 6], 'Z': [7, 8, 9]}
requirement = int  # <- Same for all formats.
validate(data, requirement)
[14]:
#
# Pandas
#
df = pd.DataFrame([('x', 1, 12.25),
                   ('y', 2, 33.75),
                   ('z', 3, 101.5)],
                  columns=['A', 'B', 'C'])

validate(df[['A', 'B']], (str, int))
[16]:
#
# Pandas (Integrated API)
#
import datatest as dt

dt.register_accessors()

df = pd.DataFrame([('x', 1, 12.25),
                   ('y', 2, 33.75),
                   ('z', 3, 101.5)],
                  columns=['A', 'B', 'C'])

df[['A', 'B']].validate((str, int))
[17]:
#
# NumPy
#
import numpy as np

a = np.array([('x', 1, 12.25),
              ('y', 2, 33.75),
              ('z', 3, 101.5)],
             dtype='U10, int32, float32')

validate(a[['f0', 'f1']], (str, int))
[18]:
#
# Bases de datos
#
import sqlite3

conn = sqlite3.connect(':memory:')
conn.executescript('''
    CREATE TABLE mydata(A, B, C);
    INSERT INTO mydata VALUES('x', 1, 12.25);
    INSERT INTO mydata VALUES('y', 2, 33.75);
    INSERT INTO mydata VALUES('z', 3, 101.5);
''')
cursor = conn.cursor()

cursor.execute('SELECT A, B FROM mydata;')
validate(cursor, (str, int))

Errores#

“Missing” Differences

[19]:
data = ['A', 'B']
requirement = {'A', 'B', 'C', 'D'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [19], in <cell line: 3>()
      1 data = ['A', 'B']
      2 requirement = {'A', 'B', 'C', 'D'}
----> 3 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy set membership (2 differences): [
    Missing('C'),
    Missing('D'),
]

“Extra” Differences

[20]:
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [20], in <cell line: 3>()
      1 data = ['A', 'B', 'C', 'D']
      2 requirement = {'A', 'B'}
----> 3 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy set membership (2 differences): [
    Extra('C'),
    Extra('D'),
]

“Invalid” Differences

[21]:
data = [('a', 2), ('b', 4), ('c', 6), (1.25, 8), ('e', 9)]

def is_even(x):
    return x % 2 == 0

requirement = (str, is_even)
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [21], in <cell line: 7>()
      4     return x % 2 == 0
      6 requirement = (str, is_even)
----> 7 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy `(str, is_even())` (2 differences): [
    Invalid((1.25, 8)),
    Invalid(('e', 9)),
]

“Deviation” Differences

[22]:
data = {
    'A': 100,
    'B': 200,
    'C': 299,
    'D': 405,
}

requirement = {
    'A': 100,
    'B': 200,
    'C': 300,
    'D': 400,
}

validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [22], in <cell line: 15>()
      1 data = {
      2     'A': 100,
      3     'B': 200,
      4     'C': 299,
      5     'D': 405,
      6 }
      8 requirement = {
      9     'A': 100,
     10     'B': 200,
     11     'C': 300,
     12     'D': 400,
     13 }
---> 15 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy mapping requirements (2 differences): {
    'C': Deviation(-1, 300),
    'D': Deviation(+5, 400),
}

Acceptances#

[23]:
#
# Using acceptance
#
from datatest import (
    validate,
    accepted,
    Extra,
)

data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
with accepted(Extra):
    validate(data, requirement)
[24]:
#
# No Acceptance
#
from datatest import (
    validate,
    accepted,
    Extra,
)

data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [24], in <cell line: 12>()
     10 data = ['A', 'B', 'C', 'D']
     11 requirement = {'A', 'B'}
---> 12 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy set membership (2 differences): [
    Extra('C'),
    Extra('D'),
]

Accepted Instance

[ ]:
#
# Using Acceptance
#
from datatest import (
    validate,
    accepted,
    Extra,
)

data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
# Accept one specific difference instance instead of the whole Extra class.
# NOTE(review): only Extra('C') is listed while `data` also holds 'D';
# presumably Extra('D') is still raised as a ValidationError -- confirm, or
# drop 'D' from `data` if this cell is meant to pass.
with accepted(Extra('C')):
    validate(data, requirement)
[25]:
#
# No Acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [25], in <cell line: 6>()
      4 data = ['A', 'B', 'C', 'D']
      5 requirement = {'A', 'B'}
----> 6 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy set membership (2 differences): [
    Extra('C'),
    Extra('D'),
]

Accepted Container of Instances

[26]:
#
# Using Acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
with accepted([Extra('C'), Extra('D')]):
    validate(data, requirement)
[27]:
#
# No acceptance
#
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [27], in <cell line: 6>()
      4 data = ['A', 'B', 'C', 'D']
      5 requirement = {'A', 'B'}
----> 6 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy set membership (2 differences): [
    Extra('C'),
    Extra('D'),
]

Accepted Tolerance

[28]:
#
# Using Acceptance
#
data = {
    'A': 100,
    'B': 200,
    'C': 299,
    'D': 405,
}
requirement = {
    'A': 100,
    'B': 200,
    'C': 300,
    'D': 400,
}
# 'C' is 1 below and 'D' is 5 above its required value, so both fall
# inside the ±5 window accepted below.
with accepted.tolerance(5):  # accepts ±5
    validate(data, requirement)
[29]:
#
# No Acceptance
#
data = {
    'A': 100,
    'B': 200,
    'C': 299,
    'D': 405,
}
requirement = {
    'A': 100,
    'B': 200,
    'C': 300,
    'D': 400,
}
validate(data, requirement)
---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Input In [29], in <cell line: 16>()
      4 data = {
      5     'A': 100,
      6     'B': 200,
      7     'C': 299,
      8     'D': 405,
      9 }
     10 requirement = {
     11     'A': 100,
     12     'B': 200,
     13     'C': 300,
     14     'D': 400,
     15 }
---> 16 validate(data, requirement)

File /usr/local/lib/python3.8/dist-packages/datatest/validation.py:296, in ValidateType.__call__(self, data, requirement, msg)
    294 if isinstance(requirement_object, sequence_or_order_types):
    295     err._sorted_str = False
--> 296 raise err

ValidationError: does not satisfy mapping requirements (2 differences): {
    'C': Deviation(-1, 300),
    'D': Deviation(+5, 400),
}

Otras Acceptances

  • accepted.keys()

  • accepted.args()

  • accepted.percent()

  • Otras

Combining Acceptances

from datatest import (
    validate,
    accepted,
    Missing,  # required by accepted(Missing) below
)

# NOTE: validate(..., ...) is a placeholder; substitute real data and
# requirement arguments.

# Accept up to five missing differences.
with accepted(Missing) & accepted.count(5):
    validate(..., ...)

# Accept differences of ±10 or ±5%.
with accepted.tolerance(10) | accepted.percent(0.05):
    validate(..., ...)

Herramientas para el manejo de datos#

Directorio de trabajo

import pandas as pd
from datatest import working_directory

with working_directory(__file__):
    my_df = pd.read_csv('myfile.csv')

Repeating Container

Se usa para operar sobre diferentes archivos al mismo tiempo en vez de duplicar operaciones.

# -----------------------------------------------------------------------------
# Using RepeatingContainer
#
import pandas as pd
from datatest import RepeatingContainer

# Wrap both DataFrames so each operation below is written only once.
repeating = RepeatingContainer([
    pd.read_csv('file1.csv'),
    pd.read_csv('file2.csv'),
])

# Each expression is unpacked into one result per wrapped DataFrame.
counted1, counted2 = repeating['C'].count()

# bfill() is the equivalent of fillna(method='backfill'); the `method`
# keyword of fillna() is deprecated in recent pandas releases.
filled1, filled2 = repeating.bfill()

summed1, summed2 = repeating[['A', 'C']].groupby('A').sum()
# -----------------------------------------------------------------------------
# No RepeatingContainer
#
import pandas as pd

# Without a container, every operation is written once per DataFrame.
df1 = pd.read_csv('file1.csv')
df2 = pd.read_csv('file2.csv')

counted1 = df1['C'].count()
counted2 = df2['C'].count()

# bfill() is the equivalent of fillna(method='backfill'); the `method`
# keyword of fillna() is deprecated in recent pandas releases.
filled1 = df1.bfill()
filled2 = df2.bfill()

summed1 = df1[['A', 'C']].groupby('A').sum()
summed2 = df2[['A', 'C']].groupby('A').sum()