Commit ed3d4ab3 authored by Christophe Benz's avatar Christophe Benz
Browse files

Remove pre-checks, load custom-checks from schema

parent 10cfdd39
## 0.1.1 -> 0.2.0
Breaking changes:
- Add `validata_core.Validator` class.
- Remove `validata_core.validate` function.
## 0.1.0 -> 0.1.1
Non-breaking changes:
......
......@@ -14,7 +14,7 @@ License :: OSI Approved :: GNU Affero General Public License v3
setup(
name='validata_core',
version='0.1.1',
version='0.2.0',
author='Christophe Benz',
author_email='christophe.benz@jailbreak.paris',
......
......@@ -3,7 +3,7 @@ from io import BytesIO
from openpyxl import Workbook
from validata_core import validate
from validata_core import Validator
schema_abc = {
"$schema": "https://frictionlessdata.io/schemas/table-schema.json",
......@@ -51,21 +51,9 @@ schema_number = {
]
}
# Pre-check configuration used by the tests: a `column-delimiter` entry whose
# `delimiter` parameter is "," and a `missing-columns` entry without parameters.
pre_checks_conf = [
{
"name": "column-delimiter",
"params": {
"delimiter": ","
}
},
{
"name": "missing-columns",
}
]
def validate_csv_text(**options):
    """Validate CSV content passed as text and return the validation report.

    All keyword `options` (e.g. `source`, `schema`) are forwarded unchanged to
    `Validator.validate`, together with `scheme='text'` and `format='csv'`.
    """
    # A single return: the former call to the module-level `validate` function
    # (with `pre_checks_conf`) shadowed this one and made it unreachable.
    return Validator().validate(scheme='text', format='csv', **options)
def test_valid_delimiter():
......@@ -91,8 +79,6 @@ def test_invalid_delimiter_percent():
source = """A%B%C
a%b%c"""
report = validate_csv_text(source=source, schema=schema_abc)
assert report['tables'][0]['error-count'] == 1
assert len(report['tables'][0]['errors']) == 1
assert report['tables'][0]['errors'][0]['code'] == 'invalid-column-delimiter'
assert report['tables'][0]['errors'][0]['message-data']['detected'] == '%'
assert report['tables'][0]['errors'][0]['message-data']['expected'] == ','
......@@ -204,7 +190,6 @@ a,a,b"""
assert report['tables'][0]['errors'][0]['code'] == 'missing-headers'
assert report['tables'][0]['errors'][0]['message-data']['headers'] == ['C']
assert report['tables'][0]['errors'][1]['code'] == 'duplicate-header'
assert report['tables'][0]['errors'][1]['column-number'] == 2
assert report['tables'][0]['errors'][1]['message-data']['column_numbers'] == '1'
......@@ -224,7 +209,7 @@ a;c"""
def validate_xlsx_bytes(**options):
    """Validate XLSX content passed as bytes and return the validation report.

    All keyword `options` (e.g. `source`, `schema`) are forwarded unchanged to
    `Validator.validate`, together with `scheme='bytes'` and `format='xlsx'`.
    """
    # A single return: the former call to the module-level `validate` function
    # (with `pre_checks_conf`) shadowed this one and made it unreachable.
    return Validator().validate(scheme='bytes', format='xlsx', **options)
def build_one_cell_xlsx(cell):
......
......@@ -12,7 +12,7 @@ from toolz import thread_first, update_in
import goodtables
from . import loaders, messages, pre_checks
from . import csv_helpers, loaders, messages
log = logging.getLogger(__name__)
......@@ -65,106 +65,68 @@ def improve_messages(report, schema):
return report
def load_schema_and_checks(schema_value, checks):
    """Resolve a schema and load the goodtables checks that go with it.

    `schema_value` is passed to `resolve_schema`. When it is a string found in
    the schemas configuration (an SCDL tag), the checks JSON is downloaded from
    the `goodtables_checks_json_url` of that configuration entry and `checks`
    must be `None`. Otherwise `checks` must be a `pathlib.Path` to a JSON file.

    Returns a `(schema, pre_checks_conf, checks)` tuple, the last two values
    being those returned by `build_checks`.
    """
    # Only a string can be an SCDL tag. Looking up any other value would fail
    # for unhashable ones, such as a schema given directly as a `dict`.
    schema_config = None
    if isinstance(schema_value, str):
        schema_config = get_schemas_config().get(schema_value)
    schema = resolve_schema(schema_value, schema_config)

    if schema_config is not None:
        # `schema_value` is an SCDL tag: checks come from the configuration.
        assert checks is None, checks
        checks_url = schema_config["goodtables_checks_json_url"]
        goodtables_checks_json = requests.get(checks_url).json()
    else:
        # The schema was given explicitly: checks come from a local file.
        assert isinstance(checks, Path), checks
        with checks.open() as fp:
            goodtables_checks_json = json.load(fp)

    pre_checks_conf, checks = build_checks(goodtables_checks_json)
    return schema, pre_checks_conf, checks
def build_checks(goodtables_checks_json):
    """Split a goodtables-checks JSON document into pre-checks and checks.

    `goodtables_checks_json` is a mapping that may contain a `pre_checks` list
    and a `custom_checks` list; each custom check has a `name` and, optionally,
    `params`.

    Returns a `(pre_checks_conf, checks)` tuple:
    - `pre_checks_conf` is the `pre_checks` list, or an empty list when absent;
    - `checks` starts with `'structure'` and `'schema'`, followed by one
      `{name: params}` mapping per custom check.
    """
    # Fall back to an empty list so callers always receive an iterable.
    pre_checks_conf = goodtables_checks_json.get('pre_checks') or []

    checks = ['structure', 'schema']
    for custom_check_conf in goodtables_checks_json.get('custom_checks') or []:
        # `params` is optional: a check without parameters gets an empty mapping.
        checks.append({custom_check_conf['name']: custom_check_conf.get('params', {})})
    return pre_checks_conf, checks
def resolve_schema(source, schema_config=None):
    """Build a `tableschema.Schema` instance from `source`.

    `source` can be either:
    - a `pathlib.Path`
    - a `str` containing either:
        - a file path
        - an URL
        - a SCDL tag as described in `schemas.toml` (i.e. `scdl-prenoms`)
    - a `dict` representing the schema in JSON
    - a `tableschema.Schema` instance

    When `source` is a string and `schema_config` is given, the schema is
    loaded from `schema_config["schema_json_url"]` instead.
    """
    # Paths are handed over as plain strings.
    location = str(source) if isinstance(source, Path) else source
    if schema_config is not None and isinstance(location, str):
        # `source` is a SCDL tag (i.e. `scdl-prenoms`): use the configured URL.
        location = schema_config["schema_json_url"]
    return Schema(location)
def validate(source, schema_source, checks=None, **options):
"""Validate a `source` applying pre-checks and checks.
`schema` is resolved using `resolve_schema`.
"""
schema, pre_checks_conf, checks = load_schema_and_checks(schema_source, checks)
schema_descriptor = schema.descriptor
report = None
inspector = goodtables.Inspector(
checks=(['structure', 'schema'] if checks is None else checks) + [{'extra-or-missing-header': {}}],
skip_checks=['non-matching-header', 'extra-header', 'missing-header'],
row_limit=VALIDATA_MAX_ROWS)
options = {**options, "custom_loaders": loaders.custom_loaders}
for pre_check_conf in pre_checks_conf or []:
pre_check_name = pre_check_conf['name']
pre_check_options = {**options, **pre_check_conf.get('params', {})}
pre_check_class = pre_checks.pre_checks_classes.get(pre_check_name)
if pre_check_class is None:
log.error("Could not find pre-check class for %r", pre_check_name)
continue
pre_check = pre_check_class(source, **pre_check_options)
error = pre_check.run()
if error is not None:
log.debug("error: %r", dict(error))
with pre_check.get_fixed_stream() as fixed_stream:
rows = list(fixed_stream)
report = inspector.inspect(source=rows, schema=schema_descriptor,
**{**options, "format": "inline", "scheme": None})
log.debug("report: %r", report)
report = add_error(report, table_index=0, error=dict(error))
if report is None:
# If no pre-checks have been executed, or all pre-checks were successful, no report was computed,
# so fallback to a normal validation, without pre-checks.
class Validator:
    """Validate tabular sources against a table schema.

    `schemas_config` maps SCDL tags (i.e. `scdl-prenoms`) to configuration
    entries whose `schema` key gives the schema location. When it is not
    provided, the configuration returned by `get_schemas_config()` is used.
    """

    def __init__(self, schemas_config=None):
        self.schemas_config = get_schemas_config() if schemas_config is None else schemas_config

    def load_schema(self, source):
        """Return a `tableschema.Schema` instance from `source`.

        A `pathlib.Path` is converted to `str`. A string starting with `scdl-`
        is looked up in `self.schemas_config` and, when found, replaced by the
        `schema` value of its entry; otherwise `source` is used as is.
        """
        if isinstance(source, Path):
            source = str(source)
        if isinstance(source, str) and source.startswith("scdl-"):
            # `source` seems to be a SCDL tag (i.e. `scdl-prenoms`)
            schema_config = self.schemas_config.get(source)
            if schema_config is not None:
                source = schema_config["schema"]
        return Schema(source)

    @staticmethod
    def _build_checks(schema_descriptor):
        """Return the goodtables checks list, including the schema's `custom_checks`."""
        checks = ['structure', 'schema', {'extra-or-missing-header': {}}]
        for custom_check_conf in schema_descriptor.get('custom_checks') or []:
            # `params` is optional: a check without parameters gets an empty mapping.
            checks.append({custom_check_conf['name']: custom_check_conf.get('params', {})})
        return checks

    def validate(self, source, schema, **options):
        """Validate a `source` using a `schema` and return the report.

        `schema` can be either:
        - a `pathlib.Path`
        - a `str` containing either:
            - a file path
            - an URL
            - a SCDL tag as described in `schemas.toml` (i.e. `scdl-prenoms`)
        - a `dict` representing the schema in JSON
        - a `tableschema.Schema` instance

        Extra `options` are forwarded to `goodtables.Inspector.inspect`.
        The returned report has translated messages and a `date` entry; for a
        CSV source whose detected delimiter is not ",", it also contains an
        `invalid-column-delimiter` error on the first table.
        """
        if not isinstance(schema, Schema):
            schema = self.load_schema(schema)
        schema_descriptor = schema.descriptor

        # The built-in header checks are skipped; `extra-or-missing-header`
        # (part of the checks list) is used in their place.
        inspector = goodtables.Inspector(
            checks=self._build_checks(schema_descriptor),
            skip_checks=['non-matching-header', 'extra-header', 'missing-header'],
            row_limit=VALIDATA_MAX_ROWS,
        )

        options = {**options, "custom_loaders": loaders.custom_loaders}
        report = inspector.inspect(source=source, schema=schema_descriptor, **options)

        # The delimiter check must run before the report is returned,
        # otherwise a wrong CSV delimiter is never reported.
        if report['tables'][0]['format'] == "csv":
            scdl_delimiter = ","
            detected_delimiter = csv_helpers.detect_dialect(source, **options).delimiter
            if detected_delimiter != scdl_delimiter:
                error = goodtables.Error(
                    code='invalid-column-delimiter',
                    message_substitutions={
                        "detected": detected_delimiter,
                        "expected": scdl_delimiter,
                    },
                )
                report = add_error(report, table_index=0, error=dict(error))

        # Translate error messages
        report = improve_messages(report, schema_descriptor)

        # Add date
        report['date'] = datetime.now().isoformat()

        return report
......@@ -12,27 +12,17 @@ from pathlib import Path
import toml
from . import get_schemas_config, validate
from . import Validator
def cli():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('source', help='URL or path to tabular file (CSV, XLS, etc.) to validate')
parser.add_argument('--checks', type=Path, help='path to goodtables-checks JSON file')
parser.add_argument('--config', type=Path, help='use alternate `schemas.toml` config file')
parser.add_argument('--log', default='WARNING', help='level of logging messages')
parser.add_argument('--schema', help='URL or path to table schema JSON file')
args = parser.parse_args()
if args.config is not None:
with args.config.open() as fp:
schemas_config = toml.load(fp)
else:
schemas_config = get_schemas_config()
schema_config = schemas_config.get(args.schema)
if schema_config is not None and args.checks is not None:
parser.error("When using a SCDL tag, checks are defined in `schemas.toml`. Don't use --checks option.")
numeric_level = getattr(logging, args.log.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: {}'.format(args.log))
......@@ -42,11 +32,12 @@ def cli():
stream=sys.stderr, # script outputs data
)
report = validate(
source=args.source,
checks=args.checks,
schema=args.schema,
)
schemas_config = None
if args.config is not None:
with args.config.open() as fp:
schemas_config = toml.load(fp)
report = Validator(schemas_config).validate(source=args.source, schema=args.schema)
json.dump(report, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
......
"""
Pre-checks are checks that aren't handled by Goodtables, and can sometimes be fixed to reach the real checks.
"""
import abc
import tabulator
import goodtables
from . import csv_helpers
class PreCheck(abc.ABC):
    """Abstract base class of pre-checks.

    A pre-check is built from a `source` and tabulator `options`. Subclasses
    implement `get_error` and `get_fixed_stream`, and may restrict the formats
    they apply to through `accepted_formats`.
    """

    name = None
    # Formats this pre-check applies to, or `None` to apply to every format.
    accepted_formats = None

    def __init__(self, source, **options):
        self.options = options
        self.source = source

    @abc.abstractmethod
    def get_fixed_stream(self):
        """Return a `tabulator.Stream` which content fixes the error detected by this check."""

    @abc.abstractmethod
    def get_error(self):
        """To be implemented by each pre-check class.

        Return a `goodtables.Error` or `None`.
        """

    def run(self):
        """Check that `self.source` format is accepted, then call `self.get_error`.

        Return a `goodtables.Error` or `None`.
        """
        # The stream is opened only to learn the format of the source.
        with tabulator.Stream(self.source, **self.options) as stream:
            formats = self.accepted_formats
            format_rejected = formats is not None and stream.format not in formats
        if format_rejected:
            return None
        # NOTE(review): indentation was lost in the reviewed text; `get_error`
        # is assumed to run once the format has been checked - confirm.
        return self.get_error()
class ColumnDelimiter(PreCheck):
    """Pre-check comparing the detected CSV column delimiter with the expected one.

    The expected delimiter is read from the required `delimiter` option.
    """

    accepted_formats = {'csv'}
    # Delimiter detected in the source; `None` until detection has run.
    detected_delimiter = None

    def _detect_delimiter(self):
        """Detect the delimiter of `self.source`, remember it and return it."""
        self.detected_delimiter = csv_helpers.detect_dialect(self.source, **self.options).delimiter
        return self.detected_delimiter

    def get_error(self):
        """Return an `invalid-column-delimiter` error, or `None` if the delimiter is the expected one."""
        delimiter = self.options['delimiter']  # required
        detected = self._detect_delimiter()
        if delimiter == detected:
            return None
        return goodtables.Error(
            code='invalid-column-delimiter',
            message_substitutions={
                "detected": detected,
                "expected": delimiter,
            },
        )

    def get_fixed_stream(self):
        """Return a `tabulator.Stream` reading the source with the detected delimiter."""
        detected = self.detected_delimiter
        if detected is None:
            # `get_error` has not run yet: detect now instead of failing on a
            # missing attribute.
            detected = self._detect_delimiter()
        options = {**self.options, "delimiter": detected}
        return tabulator.Stream(self.source, **options)
# Maps a pre-check name, as found in the `name` key of a pre-check
# configuration entry, to the class implementing it.
pre_checks_classes = {
"column-delimiter": ColumnDelimiter,
}
[scdl-adresses]
schema_json_url = "https://git.opendatafrance.net/scdl/adresses/raw/master/schema-scdl-adresses.json"
goodtables_checks_json_url = "https://git.opendatafrance.net/scdl/adresses/raw/master/goodtables-checks.json"
schema = "https://git.opendatafrance.net/scdl/adresses/raw/master/schema.json"
title = "Adresses locales"
description = "Liste des adresses locales d'une collectivité"
specurl = "http://www.opendatafrance.net/SCDL_Adresses_Locales"
docurl = "https://dev.validata.fr/docs/schemas/scdl-adresses.html"
doc_url = "https://dev.validata.fr/docs/schemas/scdl-adresses.html"
[[scdl-adresses.examples]]
name = "Adresses fictives invalides"
......@@ -16,12 +14,10 @@ url = "https://git.opendatafrance.net/scdl/adresses/raw/v1.1/exemples/20180424_b
[scdl-deliberations]
schema_json_url = "https://git.opendatafrance.net/scdl/deliberations/raw/master/schema.json"
goodtables_checks_json_url = "https://git.opendatafrance.net/scdl/deliberations/raw/master/goodtables-checks.json"
schema = "https://git.opendatafrance.net/scdl/deliberations/raw/master/schema.json"
title = "Délibérations"
description = "Liste des délibérations adoptées par une assemblée locale"
specurl = "http://www.opendatafrance.net/SCDL_Deliberations"
docurl = "https://dev.validata.fr/docs/schemas/scdl-deliberations.html"
doc_url = "https://dev.validata.fr/docs/schemas/scdl-deliberations.html"
[[scdl-deliberations.examples]]
name = "Délibérations fictives valides"
......@@ -33,12 +29,10 @@ url = "https://git.opendatafrance.net/scdl/deliberations/raw/v2.0/examples/Delib
[scdl-marches-publics]
schema_json_url = "https://git.opendatafrance.net/scdl/marches-publics/raw/master/schema.json"
goodtables_checks_json_url = "https://git.opendatafrance.net/scdl/marches-publics/raw/master/goodtables-checks.json"
schema = "https://git.opendatafrance.net/scdl/marches-publics/raw/master/schema.json"
title = "Marchés publics"
description = "Liste des marchés publics attribués par une collectivité"
specurl = "http://www.opendatafrance.net/SCDL_Marches_Publics"
docurl = "https://dev.validata.fr/docs/schemas/scdl-marches-publics.html"
doc_url = "https://dev.validata.fr/docs/schemas/scdl-marches-publics.html"
[[scdl-marches-publics.examples]]
name = "Marchés publics fictifs valides"
......@@ -50,12 +44,10 @@ url = "https://git.opendatafrance.net/scdl/marches-publics/raw/master/exemples/e
[scdl-prenoms]
schema_json_url = "https://github.com/Jailbreak-Paris/liste-prenoms-nouveaux-nes/raw/v1.1.2/prenom-schema.json"
goodtables_checks_json_url = "https://github.com/Jailbreak-Paris/liste-prenoms-nouveaux-nes/raw/v1.1.2/goodtables-checks.json"
schema = "https://github.com/Jailbreak-Paris/liste-prenoms-nouveaux-nes/raw/v1.1.2/prenom-schema.json"
title = "Prénoms des nouveaux-nés"
description = "Liste des prénoms des nouveaux-nés déclarés à l'état-civil"
specurl = "https://docs.google.com/document/d/1KuAUWX2nfdsxZnizM1mR54_o6n0827HID52zpMIOaF8/edit#"
docurl = "https://dev.validata.fr/docs/schemas/scdl-prenoms.html"
doc_url = "https://dev.validata.fr/docs/schemas/scdl-prenoms.html"
[[scdl-prenoms.examples]]
name = "Prénoms des nouveaux-nés Digne-les-Bains 2017"
......@@ -71,12 +63,10 @@ url = "https://data.opendatasoft.com/explore/dataset/prenoms@hauts-de-seine/down
[scdl-subventions]
schema_json_url = "https://git.opendatafrance.net/scdl/subventions/raw/master/schema.json"
goodtables_checks_json_url = "https://git.opendatafrance.net/scdl/subventions/raw/master/goodtables-checks.json"
schema = "https://git.opendatafrance.net/scdl/subventions/raw/master/schema.json"
title = "Subventions"
description = "Liste des subventions publiques attribuées par une collectivité"
specurl = "http://www.opendatafrance.net/SCDL_Subventions"
docurl = "https://dev.validata.fr/docs/schemas/scdl-subventions.html"
doc_url = "https://dev.validata.fr/docs/schemas/scdl-subventions.html"
[[scdl-subventions.examples]]
name = "Subventions fictives invalides"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment