Commit 62148385 authored by Pierre Dittgen
Browse files

Amend report

parent ed3d4ab3
......@@ -160,6 +160,17 @@ a,b,c,d"""
assert report['tables'][0]['errors'][0]['message-data']['headers'] == ['D']
def test_extra_multiple():
    """Two unknown columns give one 'extra-headers' error that counts twice in the stats."""
    csv_text = """A,B,C,X,Y
a,b,c,x,y"""
    report = validate_csv_text(source=csv_text, schema=schema_abc)
    table = report['tables'][0]

    # A single aggregated error is reported...
    assert table['error-count'] == 1
    errors = table['errors']
    assert len(errors) == 1
    # ...but the statistics count one error per extra header.
    assert table['error-stats']['total'] == 2

    extra = errors[0]
    assert extra['code'] == 'extra-headers'
    assert extra['message-data']['headers'] == ['X', 'Y']
def test_missing_and_extra_headers_multiple():
source = """A,Z,D
a,z,d"""
......@@ -181,18 +192,6 @@ a,c,b"""
assert report['tables'][0]['errors'][0]['code'] == 'wrong-headers-order'
def test_missing_and_duplicate_headers():
    """A repeated 'A' column and an absent 'C' column give two distinct errors."""
    csv_text = """A,A,B
a,a,b"""
    report = validate_csv_text(source=csv_text, schema=schema_abc)
    table = report['tables'][0]

    assert table['error-count'] == 2
    errors = table['errors']
    assert len(errors) == 2

    # The missing-headers error is expected first, then the duplicate one.
    missing, duplicate = errors
    assert missing['code'] == 'missing-headers'
    assert missing['message-data']['headers'] == ['C']
    assert duplicate['code'] == 'duplicate-header'
    assert duplicate['message-data']['column_numbers'] == '1'
def test_invalid_delimiter_and_missing_header():
source = """A;C
a;c"""
......
import importlib.util
import json
import logging
from collections import defaultdict
from datetime import datetime
from pathlib import Path
......@@ -65,6 +66,85 @@ def improve_messages(report, schema):
return report
def compute_error_statistics(errors):
    """Summarise tagged errors per category.

    Each error is a dict providing 'tag' ('structure' or 'value'), 'code'
    and 'title'. Errors whose code is 'extra-headers' or 'missing-headers'
    weigh as many errors as there are entries in
    err['message-data']['headers']; every other error weighs 1.

    Returns a dict shaped like:
        {
            'structure-errors': {
                'nb': 3,                       # weighted structure error count
                'distribution': [('foo', 2), ('bar', 1)],
            },
            'value-errors': {
                'nb': 2,                       # weighted value error count
                'distribution': [('baz', 2)],
            },
            'total': 5,                        # sum of both 'nb' values
        }
    where each distribution is a list of (title, weighted count) pairs.
    """
    categories = ('structure', 'value')
    # Weighted number of errors per category
    counts = dict.fromkeys(categories, 0)
    # Weighted number of errors per title, for each category
    distributions = {category: defaultdict(int) for category in categories}

    for error in errors:
        category = error['tag']
        if error['code'] in ('extra-headers', 'missing-headers'):
            # Aggregated header errors count once per header involved
            weight = len(error['message-data']['headers'])
        else:
            weight = 1
        counts[category] += weight
        distributions[category][error['title']] += weight

    def summarize(category):
        """Build the 'nb' / 'distribution' entry of one category."""
        return {
            'nb': counts[category],
            'distribution': list(distributions[category].items()),
        }

    return {
        'structure-errors': summarize('structure'),
        'value-errors': summarize('value'),
        'total': counts['structure'] + counts['value'],
    }
def amend_report(report):
    """Tag, filter and summarise the errors of the report's first table.

    - every error gets a 'tag' key: 'structure' when it carries neither a
      'column-number' nor a 'row-number', 'value' otherwise;
    - 'value' errors are dropped as soon as one 'structure' error other
      than 'invalid-column-delimiter' is present;
    - 'errors', 'error-count' and 'error-stats' of the first table are
      rewritten accordingly.

    The report is modified in place and returned.
    """
    table = report['tables'][0]

    def tag_for(err):
        """Return 'value' for an error located on a row or column, else 'structure'."""
        is_located = (err.get('column-number') is not None
                      or err.get('row-number') is not None)
        return 'value' if is_located else 'structure'

    tagged = [{**err, 'tag': tag_for(err)} for err in table['errors']]

    # An invalid delimiter alone does not hide value errors; any other
    # structure error does.
    structure_codes = [err['code'] for err in tagged if err['tag'] == 'structure']
    if any(code != 'invalid-column-delimiter' for code in structure_codes):
        tagged = [err for err in tagged if err['tag'] != 'value']

    # Write the enhanced errors back before computing statistics
    table['errors'] = tagged
    table['error-count'] = len(tagged)
    table['error-stats'] = compute_error_statistics(tagged)

    return report
class Validator:
def __init__(self, schemas_config=None):
    """Store the given schemas configuration, or the one from get_schemas_config() when None."""
    if schemas_config is None:
        schemas_config = get_schemas_config()
    self.schemas_config = schemas_config
......@@ -126,6 +206,10 @@ class Validator:
# Translate error messages
report = improve_messages(report, schema_descriptor)
# Tag errors ('structure' or 'value')
# Compute statistics
report = amend_report(report)
# Add date
report['date'] = datetime.now().isoformat()
......
......@@ -56,11 +56,21 @@ def et_join(values):
# This is adapted to popover display
def blank_header(err, schema):
    """Handle a 'blank-header' error: pass its French title and message to u_err."""
    title = 'Colonne sans en-tête'
    message = 'Toutes les colonnes doivent contenir une en-tête'
    return u_err(err, title, message)
def blank_row(err, schema):
    """Handle a 'blank-row' error: pass its French title and message to u_err."""
    title = 'Ligne vide'
    message = 'Les lignes vides doivent être retirées de la table'
    return u_err(err, title, message)
def duplicate_header(err, schema):
    """Handle a 'duplicate-header' error: pass its French title and message to u_err."""
    title = 'En-tête répétée'
    message = 'Les colonnes doivent avoir des en-têtes uniques'
    return u_err(err, title, message)
def duplicate_row(err, schema):
"""duplicate-row error"""
msg_prefix = 'La ligne est identique '
......@@ -203,6 +213,11 @@ def french_siret_value(err, schema):
return u_err(err, 'Numéro SIRET non valide',
'Le numéro de SIRET indiqué n\'est pas valide selon la définition de l\'[INSEE](https://www.insee.fr/fr/metadonnees/definition/c1841)')
def compare_columns_value(err, schema):
    """Handle a 'compare-columns-value' error: keep its existing message, set a French title."""
    # The message already carried by the error is reused unchanged
    message = err['message']
    return u_err(err, 'Comparaison de colonne', message)
# Validata pre-checks
#
# -> Error message is stored in 'message' key
......@@ -214,7 +229,7 @@ def invalid_column_delimiter(err, schema):
msg_tpl = 'Le fichier CSV utilise le délimiteur de colonne « {} » au lieu du délimiteur attendu « {} ».'
err['message'] = msg_tpl.format(md.get('detected'), md.get(
'expected')) + "\n\nPour vous permettre de continuer la validation, un remplacement automatique a été réalisé."
return err
return u_err(err, 'Délimiteur de colonne invalide', err['message'])
def missing_headers(err, schema):
......@@ -228,7 +243,7 @@ def missing_headers(err, schema):
addon_info = '\nUtilisez-vous le bon schéma ?' if len(cols) == fields_nb else ''
err['message'] = "Les colonnes suivantes n'ont pas été trouvées dans le fichier :\n{}{}".format(
col_list, addon_info)
return err
return u_err(err, 'Colonne(s) manquante(s)', err['message'])
def extra_headers(err, schema):
......@@ -240,34 +255,37 @@ def extra_headers(err, schema):
col_list = ''.join(['- {}\n'.format(col) for col in cols])
# addon_info = 'Utilisez-vous le bon schéma ?' if len(cols) == len(headers) else ''
err['message'] = "Les colonnes suivantes sont inconnues dans le schéma :\n{}".format(col_list)
return err
return u_err(err, 'Colonne(s) surnuméraire(s)', err['message'])
# def wrong_headers_order(err, headers, schema):
# """ wrong-headers-order """
# fields = [f['name'] for f in schema.get('fields', [])]
# assert len(headers) == len(fields), 'Wrong column order between two lists of different lengths'
# msgs = []
# for i, (header, field) in enumerate(zip(headers, fields)):
# if header == field:
# continue
# msgs.append('la colonne {} devrait être « {} » (au lieu de « {} »)'.format(i+1, field, header))
# errors_str = '<ul>\n' + '\n'.join(['<li>{}</li>\n'.format(msg) for msg in msgs]) + '\n</ul>'
# err['message'] = "Les colonnes du tableau ne sont pas dans l'ordre attendu :\n{}".format(errors_str)
# return err
def wrong_headers_order(err, schema):
    """Handle a 'wrong-headers-order' error: pass its French title and message to u_err."""
    title = 'Ordre des colonnes'
    message = "Les colonnes du tableau ne sont pas dans l'ordre défini par le schéma"
    return u_err(err, title, message)
ERROR_MESSAGE_FUNC = {
# Core checks
# blank-header
'blank-header': blank_header,
'blank-row': blank_row,
# duplicate-header
'duplicate-header': duplicate_header,
'duplicate-row': duplicate_row,
'enumerable-constraint': enumerable_constraint,
'maximum-constraint': maximum_constraint,
'maximum-length-constraint': maximum_length_constraint,
'minimum-constraint': minimum_constraint,
'minimum-length-constraint': minimum_length_constraint,
# These 3 errors are skipped
# - non-matching-header
# - extra-header
# - missing-header
# and replaced by 3 aggregated errors:
# - missing-headers
# - extra-headers
# - wrong-headers-order
# missing-value
'pattern-constraint': pattern_constraint,
'required-constraint': required_constraint,
......@@ -278,8 +296,9 @@ ERROR_MESSAGE_FUNC = {
'extra-headers': extra_headers,
'invalid-column-delimiter': invalid_column_delimiter,
'missing-headers': missing_headers,
# 'wrong-headers-order': wrong_headers_order,
'wrong-headers-order': wrong_headers_order,
# Validata custom checks
'french-siret-value': french_siret_value,
'compare-columns-value': compare_columns_value,
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment