Commit c3a5c042 authored by Pierre Dittgen

Release supernumerary columns

parents 1995952b d6ebc64a
Pipeline #1280 passed with stages in 2 minutes and 55 seconds
## 0.5.0

- Handle supernumerary columns:
  - allow 'repairing' the tabular file before content validation (see the sketch below)

## 0.4.2

- Add `french_siren_value` custom check (thanks to Antoine Augusti)
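A minimal usage sketch of the new option, based on the `validate` signature and the test helpers further down this diff (the inline CSV content and schema path are hypothetical):

```python
from validata_core import validate

# Repair is enabled by default; pass with_repair=False to validate the
# source exactly as supplied (as the updated tests below do).
report = validate(
    source='Name,Country,Year\nNeil Armstrong,USA,1958\n',
    scheme='text', format='csv',
    schema='schema.json',  # hypothetical: a path, URL or tableschema.Schema
    with_repair=False,
)
```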
......
@@ -12,7 +12,7 @@ with readme_filepath.open('rt', encoding='utf-8') as fd_in:
setup(
name='validata_core',
version='0.4.2',
version='0.5.0',
description="Validata Core library",
long_description=LONG_DESCRIPTION,
@@ -57,8 +57,9 @@ setup(
'goodtables',
'importlib_resources',
'requests',
'tabulator',
'tableschema',
'tablib[pandas]',
'tabulator',
'toolz',
# for custom_checks
......
@@ -3,6 +3,7 @@ from io import BytesIO
import pytest
from openpyxl import Workbook
from validata_core import validate
@@ -89,7 +90,7 @@ def schema_siren():
def validate_csv_text(**options):
return validate(scheme='text', format='csv', **options)
return validate(scheme='text', format='csv', with_repair=False, **options)
def test_empty_file(schema_abc):
@@ -269,7 +270,7 @@ def test_invalid_custom_check_siren(schema_siren):
def validate_xlsx_bytes(**options):
return validate(scheme='bytes', format='xlsx', **options)
return validate(scheme='bytes', format='xlsx', with_repair=False, **options)
def build_one_cell_xlsx(cell):
......
import pytest
import tablib
from validata_core import EMPTY_HEADER, repair_core
def gen_dataset(rows):
"""Turns rows into tablib.Dataset"""
return tablib.Dataset(*rows[1:], headers=rows[0])
def err_msg_data(err):
"""Shortcut"""
return dict(err)['message-data']
ASTRONAUTS_ROWS = [
['Name', 'Country', 'Year'],
['Neil Armstrong', 'USA', 1958],
['Scott Carpenter', 'USA', 1959],
['Ivan Anikeyev', 'USSR', 1960],
['Neil Armstrong', 'USA', 1960],
['Tatyana Kuznetsova', 'USSR', 1962],
['Neil Armstrong', 'USA', 1962],
]
@pytest.fixture()
def astronauts_dataset():
# https://en.wikipedia.org/wiki/List_of_astronauts_by_year_of_selection
return gen_dataset(ASTRONAUTS_ROWS)
def test_no_repair_to_be_done(astronauts_dataset):
source_dataset = astronauts_dataset
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset == source_dataset
assert len(report) == 0
def test_reorder_columns(astronauts_dataset):
source_dataset = astronauts_dataset
fixed_dataset, report = repair_core(source_dataset, ['Country', 'Year', 'Name'])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('USA', 1958, 'Neil Armstrong')
assert len(report) == 1
assert report[0].code == 'wrong-headers-order'
def test_missing_column_at_start():
rows = [row[1:] for row in ASTRONAUTS_ROWS]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('', 'USA', 1958)
assert len(report) == 1
assert report[0].code == 'missing-header'
assert err_msg_data(report[0]).get('column-name') == 'Name'
def test_missing_column_inside():
rows = [[row[0]] + row[2:] for row in ASTRONAUTS_ROWS]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('Neil Armstrong', '', 1958)
assert len(report) == 1
assert report[0].code == 'missing-header'
assert err_msg_data(report[0]).get('column-name') == 'Country'
def test_missing_column_at_end():
rows = [row[:-1] for row in ASTRONAUTS_ROWS]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('Neil Armstrong', 'USA', '')
assert len(report) == 1
assert report[0].code == 'missing-header'
assert err_msg_data(report[0]).get('column-name') == 'Year'
def test_empty_column_at_start():
rows = [[EMPTY_HEADER if i == 0 else ''] + row for i, row in enumerate(ASTRONAUTS_ROWS)]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('Neil Armstrong', 'USA', 1958)
assert len(report) == 1
assert report[0].code == 'blank-header'
def test_empty_column_at_end():
rows = [row + [EMPTY_HEADER if i == 0 else ''] for i, row in enumerate(ASTRONAUTS_ROWS)]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('Neil Armstrong', 'USA', 1958)
assert len(report) == 1
assert report[0].code == 'blank-header'
def test_empty_column_inside():
rows = [[row[0]] + [EMPTY_HEADER if i == 0 else ''] + row[1:] for i, row in enumerate(ASTRONAUTS_ROWS)]
source_dataset = tablib.Dataset(*rows[1:], headers=rows[0])
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset[0] == ('Neil Armstrong', 'USA', 1958)
assert len(report) == 1
assert report[0].code == 'blank-header'
def test_wrong_named_column(astronauts_dataset):
source_dataset = astronauts_dataset
source_dataset.headers = ('Name', 'Land', 'Year')
fixed_dataset, report = repair_core(source_dataset, ASTRONAUTS_ROWS[0])
assert fixed_dataset != source_dataset
assert fixed_dataset.headers == ['Name', 'Country', 'Year', 'Land']
assert fixed_dataset[0] == ('Neil Armstrong', '', 1958, 'USA')
assert len(report) == 2
assert any(r.code == 'extra-header' and err_msg_data(r).get('column-name') == 'Land' for r in report)
assert any(r.code == 'missing-header' and err_msg_data(r).get('column-name') == 'Country' for r in report)
import csv
import io
import itertools
import logging
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import goodtables
import importlib_resources
import requests
import tableschema
import tablib
from toolz import get_in, thread_first, update_in
import goodtables
import tabulator
from . import csv_helpers, loaders, messages
from .custom_checks import (cohesive_columns_value, compare_columns_value, extra_or_missing_header, french_siret_value,
nomenclature_actes_value, sum_columns_value, year_interval_value, french_siren_value)
from .custom_checks import (cohesive_columns_value, compare_columns_value, extra_or_missing_header, french_siren_value,
french_siret_value, nomenclature_actes_value, sum_columns_value, year_interval_value)
from .spec import spec
log = logging.getLogger(__name__)
@@ -116,10 +121,6 @@ def amend_report(report):
# Tag 'structure' or 'value'
errors = [{**err, 'tag': categorize_err(err)} for err in report['tables'][0]['errors']]
# Remove 'value' errors if 'structure' errors other than 'invalid-column-delimiter'
if any([err['tag'] == 'structure' and err['code'] != 'invalid-column-delimiter' for err in errors]):
errors = [err for err in errors if err['tag'] != 'value']
    # Among value errors, keep only a single error per cell
    # => the first one encountered
filtered_errors = []
@@ -144,8 +145,8 @@ def amend_report(report):
return report
def validate(source, schema, **options):
"""Validate a `source` using a `schema`.
def retrieve_schema_descriptor(schema):
"""Transforms a schema into a schema_descriptor
`schema` can be either:
- a `pathlib.Path`
@@ -159,28 +160,195 @@ def validate(source, schema, **options):
schema = str(schema)
if not isinstance(schema, tableschema.Schema):
schema = tableschema.Schema(schema)
schema_descriptor = schema.descriptor
return schema.descriptor
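# Usage sketch (hypothetical values): `schema` may be a pathlib.Path, a path
# or URL string, or an already built tableschema.Schema, e.g.:
#   descriptor = retrieve_schema_descriptor(Path('schema.json'))
#   descriptor = retrieve_schema_descriptor('https://example.org/schema.json')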
# Needed here because tablib Dataset doesn't allow empty column headers
EMPTY_HEADER = '__empty_header__'
def repair_core(dataset: tablib.Dataset, schema_field_names):
"""Core engine of repair function:
Check tabular data and return transformed dataset and report log
"""
report = []
def append_col(dataset: tablib.Dataset, column_values, header):
"""work around a tablib bug on append_col,
see https://github.com/vinayak-mehta/tablib/issues/33"""
dataset.append_col(column_values, header=header)
if dataset.headers is None:
dataset.headers = [header]
# Same field names, same order, just return dataset as is
if dataset.headers == schema_field_names:
return dataset, report
# else, work!
rows_nb = len(dataset.dict)
content_dataset = tablib.Dataset()
rejected_cols_dataset = tablib.Dataset()
column_names_dict = dict()
last_nonempty_header_col = None
first_nonempty_header_col = None
empty_header_cols = []
for i, head in enumerate(dataset.headers):
# Don't keep empty header column
if head == EMPTY_HEADER:
empty_header_cols.append(i)
continue
# Remember first non-empty header
if first_nonempty_header_col is None:
first_nonempty_header_col = i
# Remember last non-empty header
last_nonempty_header_col = i
        # Move unknown columns to a special dataset
if head not in schema_field_names:
report.append(goodtables.Error(code='extra-header',
message_substitutions={'column-name': head}))
append_col(rejected_cols_dataset, dataset.get_col(i), head)
continue
        # Move duplicate columns to a special dataset
if head in column_names_dict:
report.append(goodtables.Error(code='duplicate-header',
message_substitutions={'column-number': i+1, 'column-name': head}))
append_col(rejected_cols_dataset, dataset.get_col(i), head)
# Normal case
else:
append_col(content_dataset, dataset.get_col(i), head)
column_names_dict[head] = i
# add blank-header errors
def create_blank_header_error(col_id, pos_type, addon={}):
return goodtables.Error(code='blank-header', message_substitutions={
'column-number': col_id + 1,
'position': pos_type,
**addon
})
# With context to ease repairing
for col_id in empty_header_cols:
if col_id < first_nonempty_header_col:
report.append(create_blank_header_error(col_id, 'leading'))
elif col_id > last_nonempty_header_col:
report.append(create_blank_header_error(col_id, 'trailing'))
else:
before_header = list(filter(lambda elt: elt != EMPTY_HEADER, dataset.headers[:col_id][::-1]))[0]
after_header = list(filter(lambda elt: elt != EMPTY_HEADER, dataset.headers[col_id+1:]))[0]
position_addon = {
'before-header-name': before_header,
'after-header-name': after_header,
}
report.append(create_blank_header_error(col_id, 'in', addon=position_addon))
# Compare ordering
if content_dataset.headers:
schema_order_extract = [h for h in schema_field_names if h in content_dataset.headers]
if content_dataset.headers != schema_order_extract:
report.append(goodtables.Error(code='wrong-headers-order',
message_substitutions={'actual-order': content_dataset.headers,
'wanted-order': schema_order_extract}))
# Then reorder and create empty columns if no content found
fixed_dataset = tablib.Dataset()
    for i, h in enumerate(schema_field_names):
if content_dataset.headers and h in content_dataset.headers:
col_id = content_dataset.headers.index(h)
append_col(fixed_dataset, content_dataset.get_col(col_id), h)
else:
append_col(fixed_dataset, [''] * rows_nb, h)
report.append(goodtables.Error(code='missing-header',
message_substitutions={'column-number': i+1, 'column-name': h}))
    # Add rejected columns at the end, if any
if len(rejected_cols_dataset) != 0:
for i, h in enumerate(rejected_cols_dataset.headers):
append_col(fixed_dataset, rejected_cols_dataset.get_col(i), h)
return fixed_dataset, report
def repair(source, schema_descriptor, **repair_options):
"""Try to repair a `source` using a `schema
Returns (fixed_source, report)
"""
def to_inline_data(dataset):
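        # Build 'inline' data: a headers row followed by the body rows, e.g.
        # [['Name', 'Country', 'Year'], ('Neil Armstrong', 'USA', 1958), ...]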
return [dataset.headers] + [dataset[i] for i in range(len(dataset))]
def consume_source(source, **options):
stream = tabulator.stream.Stream(source, **options)
stream.open()
# Get source headers
headers = next(stream.iter())
# And source body rows
body_rows = list(stream.iter())
return headers, body_rows
    # Get schema field names
schema_field_names = [f.get('name') for f in schema_descriptor.get('fields')]
    # Consume source to get headers and content
try:
headers, body_rows = consume_source(source, **repair_options)
except StopIteration:
return (source, [])
# Create dataset for easier post processing
dataset = tablib.Dataset(*body_rows, headers=[h if h else EMPTY_HEADER for h in headers])
# Repair dataset!
fixed_dataset, column_errors = repair_core(dataset, schema_field_names)
# Return fixed source with potential errors
return (to_inline_data(fixed_dataset), column_errors)
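# Usage sketch (hypothetical arguments): repair a CSV file before validation;
# the returned fixed_source is inline data (headers row followed by body rows):
#   fixed_source, column_errors = repair('data.csv', schema_descriptor,
#                                        scheme='file', format='csv')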
def validate(source, schema, with_repair=True, **options):
"""Validate a `source` using a `schema`."""
schema_descriptor = retrieve_schema_descriptor(schema)
base_options = {
**options,
"custom_loaders": loaders.custom_loaders, # to use Validata BytesLoader
}
fixed_source, structure_errors = source, None
checks = ['structure', 'schema', {'extra-or-missing-header': {}}]
if with_repair:
fixed_source, structure_errors = repair(source, schema_descriptor, **base_options)
checks = ['structure', 'schema']
custom_checks_config = schema_descriptor.get('custom_checks')
if custom_checks_config:
for custom_check_conf in custom_checks_config:
checks.append({custom_check_conf['name']: custom_check_conf['params']})
inspector = goodtables.Inspector(
checks=checks,
skip_checks=['non-matching-header', 'extra-header', 'missing-header'],
row_limit=VALIDATA_MAX_ROWS,
)
options = {**options, "custom_loaders": loaders.custom_loaders}
report = inspector.inspect(source=source, schema=schema_descriptor, **options)
options = {**base_options}
if with_repair:
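        # The repaired source is inline data built by repair(), so override
        # the caller's scheme and format accordingly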
options['scheme'] = 'stream'
options['format'] = 'inline'
report = inspector.inspect(source=fixed_source, schema=schema_descriptor, **options)
if report['tables'][0].get('format') == "csv" and not any(
get_in(['errors', err['code'], 'type'], spec, default=None) == 'source'
for err in report['tables'][0]['errors']
):
standard_csv_delimiter = ","
dialect = csv_helpers.detect_dialect(source, **options)
dialect = csv_helpers.detect_dialect(fixed_source, **options)
if dialect is None:
error = goodtables.Error(code='unknown-csv-dialect')
report = prepend_error(report, table_index=0, error=dict(error))
@@ -196,6 +364,11 @@ def validate(source, schema, **options):
)
report = prepend_error(report, table_index=0, error=dict(error))
    # If some errors were encountered during the repair process
if structure_errors:
for structure_err in structure_errors[::-1]:
report = prepend_error(report, table_index=0, error=dict(structure_err))
# Translate error messages
report = improve_messages(report, schema_descriptor)
......
@@ -58,7 +58,8 @@ def et_join(values):
def blank_header(err, schema):
"""blank-header error"""
return u_err(err, 'Colonne sans en-tête', 'Toutes les colonnes doivent contenir une en-tête')
column_id = err['message-data']['column-number']
    return u_err(err, 'En-tête manquant', "La colonne n°{} n'a pas d'en-tête".format(column_id))
def blank_row(err, schema):
@@ -68,7 +69,7 @@ def blank_row(err, schema):
def duplicate_header(err, schema):
"""duplicate-header error"""
return u_err(err, 'En-tête répétée', 'Les colonnes doivent avoir des en-têtes uniques')
return u_err(err, 'En-tête répété', 'Les colonnes doivent avoir des en-têtes uniques')
def duplicate_row(err, schema):
@@ -83,6 +84,14 @@ def duplicate_row(err, schema):
return u_err(err, 'Ligne dupliquée', msg)
def missing_header(err, schema):
"""missing header"""
column_name = err['message-data']['column-name']
msg = 'La colonne `{}` présente dans le schéma est introuvable dans le fichier'.format(column_name)
return u_err(err, 'Colonne manquante', msg)
def extra_value(err, schema):
"""extra-value error"""
return u_err(err, 'Valeur surnuméraire', "Le nombre de cellules de cette ligne excède" +
@@ -291,7 +300,7 @@ def extra_headers(err, schema):
"""extra-headers"""
cols = err['message-data']['headers']
if len(cols) == 1:
err['message'] = "La colonne \"{}\" est inconnue dans le schéma".format(cols[0])
err['message'] = "La colonne `{}` est inconnue dans le schéma".format(cols[0])
else:
col_list = ''.join(['- {}\n'.format(col) for col in cols])
# addon_info = 'Utilisez-vous le bon schéma ?' if len(cols) == len(headers) else ''
@@ -313,6 +322,7 @@ ERROR_MESSAGE_FUNC = {
'blank-row': blank_row,
'duplicate-header': duplicate_header,
'duplicate-row': duplicate_row,
'missing-header': missing_header,
'enumerable-constraint': enumerable_constraint,
'extra-value': extra_value,
'maximum-constraint': maximum_constraint,
@@ -353,10 +363,11 @@ ERROR_MESSAGE_FUNC = {
ERROR_MESSAGE_DEFAULT_TITLE = {
# Core checks
'blank-header': 'colonne sans en-tête',
'blank-header': 'en-tête manquant',
'blank-row': 'ligne vide',
'duplicate-header': 'en-tête répété',
'duplicate-row': 'ligne dupliquée',
'missing-header': 'colonne manquante',
'enumerable-constraint': 'valeur incorrecte',
'maximum-constraint': 'valeur maximale non respectée',
'maximum-length-constraint': 'longueur maximale non respectée',
......