Commit fcec6a34 authored by Pierre Dittgen

wip

parent cd8c734a
Pipeline #2353 failed with stage
in 2 minutes and 57 seconds
......@@ -6,18 +6,16 @@ from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import goodtables
import frictionless
import importlib_resources
import requests
import tableschema
import tablib
import tabulator
from toolz import get_in, thread_first, update_in
from . import csv_helpers, loaders, messages
from .custom_checks import (cohesive_columns_value, compare_columns_value, extra_or_missing_header, french_siren_value,
french_siret_value, nomenclature_actes_value, sum_columns_value, year_interval_value)
from .spec import spec
from .custom_checks import available_checks
# from .spec import spec
log = logging.getLogger(__name__)
......@@ -46,7 +44,7 @@ def improve_messages(report, schema):
if report is None:
return None
for table_id in range(report['table-count']):
for table_id in range(report['stats']['tables']):
table = report['tables'][table_id]
table['errors'] = messages.improve_messages(table['errors'], schema)
......@@ -123,13 +121,16 @@ def amend_report(report):
Remove 'value' errors if 'structure' errors
Computes statistics
"""
def categorize_err(err):
    """Return the category of an error dict: 'structure' or 'value'.

    An error carrying neither a 'column-number' nor a 'row-number' entry
    (missing key or None) is a 'structure' error; any other error is a
    'value' error.
    """
    is_located = any(err.get(key) is not None for key in ('column-number', 'row-number'))
    return 'value' if is_located else 'structure'
if report["stats"]["tables"] == 0:
import ipdb; ipdb.set_trace()
# No table!
# Tag 'structure' or 'value'
errors = [{**err, 'tag': categorize_err(err)} for err in report['tables'][0]['errors']]
......@@ -173,9 +174,9 @@ def retrieve_schema_descriptor(schema):
"""
if isinstance(schema, Path):
schema = str(schema)
if not isinstance(schema, tableschema.Schema):
schema = tableschema.Schema(schema)
return schema.descriptor
if not isinstance(schema, frictionless.schema.Schema):
schema = frictionless.Schema(schema)
return schema
# Needed here because tablib Dataset doesn't allow empty column headers
......@@ -307,13 +308,12 @@ def repair(source, schema_descriptor, **repair_options):
return [dataset.headers] + [dataset[i] for i in range(len(dataset))]
def consume_source(source, **options):
    """Read a tabular source and return its header and its body rows.

    :param source: anything accepted by `frictionless.Table` as a source
    :param options: extra keyword arguments forwarded to `frictionless.Table`
    :return: a `(headers, body_rows)` tuple, `body_rows` being a list
    """
    # The context manager closes the table once the data has been read,
    # including when reading raises (the table was previously left open).
    with frictionless.Table(source, **options) as table:
        # Get source headers
        headers = table.header

        # And source body rows; materialized before the table is closed
        body_rows = list(table.data_stream)

    return headers, body_rows
......@@ -341,62 +341,70 @@ def validate(source, schema, with_repair=True, **options):
schema_descriptor = retrieve_schema_descriptor(schema)
base_options = {
**options,
"custom_loaders": loaders.custom_loaders, # to use Validata BytesLoader
}
fixed_source, structure_errors = source, None
checks = ['structure', 'schema', {'extra-or-missing-header': {}}]
if with_repair:
fixed_source, structure_errors = repair(source, schema_descriptor, **base_options)
checks = ['structure', 'schema']
# TODO: handle repair
#checks = ['structure', 'schema', {'extra-or-missing-header': {}}]
#if with_repair:
# fixed_source, structure_errors = repair(source, schema_descriptor, **options)
# checks = ['structure', 'schema']
# Extract custom checks reference from table schema
extra_checks = None
custom_checks_config = schema_descriptor.get('custom_checks')
if custom_checks_config:
for custom_check_conf in custom_checks_config:
checks.append({custom_check_conf['name']: custom_check_conf['params']})
extra_checks = []
for cc_conf in custom_checks_config:
cc_name = cc_conf["name"]
if cc_name in available_checks:
cc_class = available_checks[cc_name]
cc_descriptor = cc_conf["params"]
extra_checks.append((cc_class, cc_descriptor))
inspector_options_keys = [
'checks', 'skip_checks', 'infer_schema',
'infer_fields', 'order_fields', 'error_limit',
'table_limit', 'row_limit',
'pick_errors', 'skip_errors', 'infer_schema',
'infer_fields', 'sync_schema', 'limit_errors',
'table_limit', 'query',
]
# TODO: merge options
inspector_options = {
**{
'checks': checks,
'skip_checks': ['non-matching-header', 'extra-header', 'missing-header'],
'row_limit': VALIDATA_MAX_ROWS,
# TODO: Fix `pick_errors` content
#'pick_errors': checks,
'skip_errors': ['non-matching-header', 'extra-header', 'missing-header'],
'query': frictionless.Query(limit_rows=VALIDATA_MAX_ROWS),
'extra_checks': extra_checks,
},
**{k: v for k, v in options.items() if k in inspector_options_keys}
}
inspector = goodtables.Inspector(**inspector_options)
options = {**base_options}
validate_options = {**options, **inspector_options}
if with_repair:
options['scheme'] = 'stream'
options['format'] = 'inline'
report = inspector.inspect(source=fixed_source, schema=schema_descriptor, **options)
if report['tables'][0].get('format') == "csv" and not any(
get_in(['errors', err['code'], 'type'], spec, default=None) == 'source'
for err in report['tables'][0]['errors']
):
standard_csv_delimiter = ","
dialect = csv_helpers.detect_dialect(fixed_source, **options)
if dialect is None:
error = goodtables.Error(code='unknown-csv-dialect')
report = prepend_error(report, table_index=0, error=dict(error))
else:
detected_delimiter = dialect.delimiter
if detected_delimiter != standard_csv_delimiter:
error = goodtables.Error(
code='invalid-column-delimiter',
message_substitutions={
"detected": detected_delimiter,
"expected": standard_csv_delimiter,
},
)
report = prepend_error(report, table_index=0, error=dict(error))
validate_options['scheme'] = 'stream'
validate_options['format'] = 'inline'
report = frictionless.validate_table(fixed_source, schema=schema_descriptor, **validate_options)
# TODO: support error types
# if report['tables'][0].get('format') == "csv" and not any(
# get_in(['errors', err['code'], 'type'], spec, default=None) == 'source'
# for err in report['tables'][0]['errors']
# ):
# standard_csv_delimiter = ","
# dialect = csv_helpers.detect_dialect(fixed_source, **options)
# if dialect is None:
# error = goodtables.Error(code='unknown-csv-dialect')
# report = prepend_error(report, table_index=0, error=dict(error))
# else:
# detected_delimiter = dialect.delimiter
# if detected_delimiter != standard_csv_delimiter:
# error = goodtables.Error(
# code='invalid-column-delimiter',
# message_substitutions={
# "detected": detected_delimiter,
# "expected": standard_csv_delimiter,
# },
# )
# report = prepend_error(report, table_index=0, error=dict(error))
# If some errors have been encountered during repair process
if structure_errors:
......
import csv
import tabulator
import logging
from frictionless import Table
log = logging.getLogger(__name__)
def detect_dialect(source, **stream_params):
    """Open `source` as a `frictionless.Table` and return its dialect.

    :param source: anything accepted by `frictionless.Table` as a source
    :param stream_params: extra keyword arguments forwarded to `frictionless.Table`
    :return: the dialect exposed by the opened table, or None if the source
        could not be opened
    """
    # NOTE(review): the earlier implementation sniffed the delimiter with
    # `csv.Sniffer` because the tabulator parser only tried a fixed set of
    # delimiters; confirm that the frictionless dialect is good enough here.
    try:
        with Table(source, **stream_params) as table:
            return table.dialect
    except Exception as e:
        # Best-effort detection: failures are logged and reported as None.
        log.exception(e)
        return None
def prepare_sample(stream):
......
from .french_siren_value import FrenchSirenValue
from .french_siret_value import FrenchSiretValue
from .year_interval_value import YearIntervalValue
# Registry of the custom checks shipped with this package: maps a check name
# (a string key) to the class implementing that check.
# Please keep the below dict up-to-date
available_checks = {
"french-siren-value": FrenchSirenValue,
"french-siret-value": FrenchSiretValue,
"year-interval-value": YearIntervalValue,
}
\ No newline at end of file
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import stdnum.fr.siren
from goodtables.registry import check
from goodtables.error import Error
@check("french-siren-value", type="custom", context="body")
class FrenchSirenValue(object):
def __init__(self, column, **options):
self.__column = column
def check_row(self, cells):
# Get cell
cell = None
for item in cells:
if "header" not in item:
# Skip columns without headers. This can occur in particular with formatted Excel files.
continue
if self.__column in [item["column-number"], item["header"]]:
cell = item
break
# Check cell
if not cell:
return
from frictionless import Check, errors
class FrenchSirenValueError(errors.CellError):
    """Cell error reported for a value that is not a valid French SIREN number."""

    # Identification of the error
    code = "french-siren-value"
    name = "French SIREN value"
    tags = ["#body"]

    # User-facing texts (in French)
    template = "La valeur {cell} n'est pas un numéro SIREN français valide."
    description = (
        "Le numéro de SIREN indiqué n'est pas valide selon la définition de l'"
        "[INSEE](https://www.insee.fr/fr/metadonnees/definition/c2047)"
    )
class FrenchSirenValue(Check):
    """Custom check reporting cells that are not valid French SIREN numbers.

    The column to check is read from the `column` entry of the check
    descriptor.
    """

    possible_Errors = [FrenchSirenValueError]  # type: ignore

    # Shape of the check descriptor: an object with a mandatory `column` string
    metadata_profile = {  # type: ignore
        "type": "object",
        "required": ["column"],
        "properties": {"column": {"type": "string"}},
    }

    def prepare(self):
        """Read the name of the column to check from the descriptor."""
        self.__column = self.get("column")

    def validate_task(self):
        """Yield a task error if the configured column is not a schema field."""
        column = self.__column
        if column in self.table.schema.field_names:
            return
        yield errors.TaskError(
            note='french siren value check requires field "%s"' % column
        )

    def validate_row(self, row):
        """Yield a `FrenchSirenValueError` if the row holds an invalid SIREN."""
        siren = row[self.__column]

        # Empty cells are not checked
        if siren and not stdnum.fr.siren.is_valid(siren):
            yield FrenchSirenValueError.from_row(row, note="", field_name=self.__column)
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
from simpleeval import simple_eval
import stdnum.fr.siret
from goodtables.registry import check
from goodtables.error import Error
from frictionless import Check, errors
class FrenchSiretValueError(errors.CellError):
    """Cell error reported for a value that is not a valid French SIRET number."""

    # Identification of the error
    code = "french-siret-value"
    name = "French SIRET value"
    tags = ["#body"]

    # User-facing texts (in French)
    template = "La valeur {cell} n'est pas un numéro SIRET français valide."
    description = (
        "Le numéro de SIRET indiqué n'est pas valide selon la définition de l'"
        "[INSEE](https://www.insee.fr/fr/metadonnees/definition/c1841)"
    )
# Module API
@check('french-siret-value', type='custom', context='body')
class FrenchSiretValue(object):
class FrenchSiretValue(Check):
    """Custom check reporting cells that are not valid French SIRET numbers.

    The column to check is read from the `column` entry of the check
    descriptor.
    """

    possible_Errors = [FrenchSiretValueError]  # type: ignore

    # Shape of the check descriptor: an object with a mandatory `column` string
    metadata_profile = {  # type: ignore
        "type": "object",
        "required": ["column"],
        "properties": {"column": {"type": "string"}},
    }

    def prepare(self):
        """Read the name of the column to check from the descriptor."""
        self.__column = self.get("column")

    def validate_task(self):
        """Yield a task error if the configured column is not a schema field."""
        column = self.__column
        if column in self.table.schema.field_names:
            return
        yield errors.TaskError(
            note='french siret value check requires field "%s"' % column
        )

    def validate_row(self, row):
        """Yield a `FrenchSiretValueError` if the row holds an invalid SIRET."""
        siret = row[self.__column]

        # Empty cells are not checked
        if siret and not stdnum.fr.siret.is_valid(siret):
            yield FrenchSiretValueError.from_row(row, note="", field_name=self.__column)
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
"""
Year Interval Value check
......@@ -19,87 +14,83 @@ from __future__ import unicode_literals
Pierre Dittgen, Jailbreak
"""
import re
from goodtables.registry import check
from goodtables.error import Error
from frictionless import Check, errors
YEAR_INTERVAL_RE = re.compile(r'^(\d{4})/(\d{4})$')
YEAR_RE = re.compile(r'^\d{4}$')
# Module API
class YearIntervalValueError(errors.CellError):
    """Cell error reported for an invalid year or year interval value."""

    # Identification of the error
    code = "year-interval-value"
    name = "Year interval value"
    tags = ["#body"]

    # User-facing texts (in French); the template embeds the cell and a note
    description = "Année ou intervalle d'année"
    template = "Erreur sur l'année ou l'intervalle d'année '{cell}' ({note})."
@check('year-interval-value', type='custom', context='body')
class YearIntervalValue(object):
"""
Year Interval Value check class
"""
# Public
def __init__(self, column, **options):
self.__column = column
self.allow_year_only = options.get('allow-year-only') in ('true', 'yes')
class YearIntervalValue(Check):
    """Check that a column holds a year interval written `YYYY/YYYY`.

    Descriptor entries (see `metadata_profile`):

    - `column` (required): name of the field to check.
    - `allow-year-only`: when set to "true" or "yes", a bare four-digit
      year is accepted as well as an interval.

    An interval is valid when it matches `YYYY/YYYY` and the second year is
    strictly greater than the first one.
    """

    possible_Errors = [YearIntervalValueError]  # type: ignore

    def prepare(self):
        """Extract custom params from descriptor."""
        self.__column = self.get("column")
        self.__allow_year_only = self.get("allow-year-only") in ("true", "yes")

    def validate_task(self):
        """Yield a task error if the configured column is not a schema field."""
        if self.__column not in self.table.schema.field_names:
            note = 'year interval value check requires field "%s"' % self.__column
            yield errors.TaskError(note=note)

    def validate_row(self, row):
        """Yield at most one `YearIntervalValueError` for the checked cell."""
        cell_value = row[self.__column]

        # Empty cell, don't check!
        if not cell_value:
            return

        # Checks for interval format
        rm = YEAR_INTERVAL_RE.match(cell_value)
        if not rm:

            # Not an interval, is this a year only?
            if self.__allow_year_only:
                ym = YEAR_RE.match(cell_value)

                # No -> add error
                if not ym:
                    note = "format attendu: année (AAAA) ou intervale (AAAA/AAAA)"
                    yield YearIntervalValueError.from_row(row, note=note, field_name=self.__column)

                # Year ok (or error already reported): nothing more to check
                return

            # not a period -> add error
            note = "format attendu: AAAA/AAAA"
            yield YearIntervalValueError.from_row(row, note=note, field_name=self.__column)
            return

        year1 = int(rm.group(1))
        year2 = int(rm.group(2))

        if year1 == year2:
            note = "les deux années doivent être différentes"
            yield YearIntervalValueError.from_row(row, note=note, field_name=self.__column)
            return

        if year1 > year2:
            # year2 is the second year of the interval and year1 the first:
            # the arguments were previously passed in the opposite order,
            # which produced a message contradicting the cell content.
            note = "la deuxième année ({}) doit être postérieure à la première ({})".format(year2, year1)
            yield YearIntervalValueError.from_row(row, note=note, field_name=self.__column)
            return

    # Shape of the check descriptor
    metadata_profile = {  # type: ignore
        "type": "object",
        "required": ["column"],
        "properties": {"column": {"type": "string"}, "allow-year-only": {"type": "string"}},
    }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment