Commit 57168281 authored by Antoine Augusti

Error stats: add a count by column and code

parent cc6f7a8b
......@@ -48,6 +48,31 @@ def schema_date():
}
@pytest.fixture
def schema_types_and_required():
    """Return a Table Schema declaring two required fields.

    Field ``A`` is typed ``number`` and field ``B`` is typed ``date``; both
    carry a ``required`` constraint.
    """
    # (name, title, type) for each declared field, in column order.
    field_specs = (
        ("A", "Field A", "number"),
        ("B", "Field B", "date"),
    )
    fields = [
        {
            "name": name,
            "title": title,
            "type": field_type,
            # A fresh dict per field so the two entries never alias.
            "constraints": {"required": True},
        }
        for name, title, field_type in field_specs
    ]
    return {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "fields": fields,
    }
@pytest.fixture
def schema_number():
return {
......@@ -266,6 +291,24 @@ def test_invalid_custom_check_siren(schema_siren):
assert report['tables'][0]['errors'][0]['code'] == 'french-siren-value'
def test_error_stats(schema_types_and_required):
    """Check the error statistics attached to the first table of the report.

    The CSV has one clean row and one row with an empty ``A`` cell and the
    text ``foo`` in ``B``; the expected statistics count one
    ``required-constraint`` error on ``A`` and one ``type-or-format-error``
    on ``B``, both overall and in the per-column breakdown.
    """
    csv_text = """A,B
2,2020-04-01
,foo
"""
    expected_value_errors = {
        'count': 2,
        'rows-count': 1,
        'count-by-code': {'required-constraint': 1, 'type-or-format-error': 1},
        'count-by-col-and-code': {
            'A': {'required-constraint': 1},
            'B': {'type-or-format-error': 1},
        },
    }
    expected_stats = {
        'structure-errors': {'count': 0, 'count-by-code': {}},
        'value-errors': expected_value_errors,
        'count': 2,
    }

    report = validate_csv_text(source=csv_text, schema=schema_types_and_required)
    first_table = report['tables'][0]

    assert first_table['error-stats'] == expected_stats
# XLSX
......
......@@ -54,7 +54,7 @@ def improve_messages(report, schema):
return report
def compute_error_statistics(errors):
def compute_error_statistics(errors, columns):
"""Computes error statistics as a dict:
{
'count': 12,
......@@ -71,6 +71,10 @@ def compute_error_statistics(errors):
'type-or-format-error': 2,
'pattern-constraint': 7,
'french-siret-value': 1,
},
'count-by-col-and-code': {
'A': {'required-constraint': 7},
'B': {'type-or-format-error': 3}
}
},
}
......@@ -82,6 +86,9 @@ def compute_error_statistics(errors):
# Errors distribution by category
errors_dist_dict = {'structure': defaultdict(int), 'value': defaultdict(int)}
# Errors by column and category
errors_col_code = {}
# Fill in error stats
for err in errors:
err_tag = err['tag']
......@@ -89,6 +96,11 @@ def compute_error_statistics(errors):
if err['code'] in ('extra-headers', 'missing-headers') else 1
errors_nb_dict[err_tag] += errors_nb
errors_dist_dict[err_tag][err['code']] += errors_nb
if err_tag == "value":
col = columns[(err["column-number"] - 1)]
if col not in errors_col_code:
errors_col_code[col] = defaultdict(int)
errors_col_code[col][err["code"]] += 1
# Compute statistics
return {
......@@ -100,6 +112,7 @@ def compute_error_statistics(errors):
'count': errors_nb_dict['value'],
'rows-count': len(set([err['row-number'] for err in errors if err['tag'] == 'value'])),
'count-by-code': errors_dist_dict['value'],
'count-by-col-and-code': errors_col_code,
},
'count': errors_nb_dict['structure'] + errors_nb_dict['value']
}
......@@ -138,7 +151,10 @@ def amend_report(report):
report['tables'][0]['error-count'] = len(errors)
# Store statistics
stats = compute_error_statistics(errors)
columns = {}
if 'headers' in report['tables'][0]:
columns = report['tables'][0]['headers']
stats = compute_error_statistics(errors, columns)
report['tables'][0]['error-stats'] = stats
return report
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment