Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Validata
Validata Core
Commits
fcec6a34
Commit
fcec6a34
authored
Nov 24, 2020
by
Pierre Dittgen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
wip
parent
cd8c734a
Pipeline
#2353
failed with stage
in 2 minutes and 57 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
205 additions
and
217 deletions
+205
-217
validata_core/__init__.py
validata_core/__init__.py
+67
-59
validata_core/csv_helpers.py
validata_core/csv_helpers.py
+7
-14
validata_core/custom_checks/__init__.py
validata_core/custom_checks/__init__.py
+10
-0
validata_core/custom_checks/french_siren_value.py
validata_core/custom_checks/french_siren_value.py
+38
-42
validata_core/custom_checks/french_siret_value.py
validata_core/custom_checks/french_siret_value.py
+33
-43
validata_core/custom_checks/year_interval_value.py
validata_core/custom_checks/year_interval_value.py
+50
-59
No files found.
validata_core/__init__.py
View file @
fcec6a34
...
...
@@ -6,18 +6,16 @@ from collections import defaultdict
from
datetime
import
datetime
,
timezone
from
pathlib
import
Path
import
goodtables
import
frictionless
import
importlib_resources
import
requests
import
tableschema
import
tablib
import
tabulator
from
toolz
import
get_in
,
thread_first
,
update_in
from
.
import
csv_helpers
,
loaders
,
messages
from
.custom_checks
import
(
cohesive_columns_value
,
compare_columns_value
,
extra_or_missing_header
,
french_siren_value
,
french_siret_value
,
nomenclature_actes_value
,
sum_columns_value
,
year_interval_value
)
from
.spec
import
spec
from
.custom_checks
import
available_checks
# from .spec import spec
log
=
logging
.
getLogger
(
__name__
)
...
...
@@ -46,7 +44,7 @@ def improve_messages(report, schema):
if
report
is
None
:
return
None
for
table_id
in
range
(
report
[
'
table-count
'
]):
for
table_id
in
range
(
report
[
'
stats'
][
'tables
'
]):
table
=
report
[
'tables'
][
table_id
]
table
[
'errors'
]
=
messages
.
improve_messages
(
table
[
'errors'
],
schema
)
...
...
@@ -123,13 +121,16 @@ def amend_report(report):
Remove 'value' errors if 'structure' errors
Computes statistics
"""
def
categorize_err
(
err
):
"""Computes error category: 'structure' or 'value'"""
if
err
.
get
(
'column-number'
)
is
None
and
err
.
get
(
'row-number'
)
is
None
:
return
'structure'
return
'value'
if
report
[
"stats"
][
"tables"
]
==
0
:
import
ipdb
;
ipdb
.
set_trace
()
# No table!
# Tag 'structure' or 'value'
errors
=
[{
**
err
,
'tag'
:
categorize_err
(
err
)}
for
err
in
report
[
'tables'
][
0
][
'errors'
]]
...
...
@@ -173,9 +174,9 @@ def retrieve_schema_descriptor(schema):
"""
if
isinstance
(
schema
,
Path
):
schema
=
str
(
schema
)
if
not
isinstance
(
schema
,
table
schema
.
Schema
):
schema
=
tableschema
.
Schema
(
schema
)
return
schema
.
descriptor
if
not
isinstance
(
schema
,
frictionless
.
schema
.
Schema
):
schema
=
frictionless
.
Schema
(
schema
)
return
schema
# Needed here because tablib Dataset doesn't allow empty column headers
...
...
@@ -307,13 +308,12 @@ def repair(source, schema_descriptor, **repair_options):
return
[
dataset
.
headers
]
+
[
dataset
[
i
]
for
i
in
range
(
len
(
dataset
))]
def
consume_source
(
source
,
**
options
):
stream
=
tabulator
.
stream
.
Stream
(
source
,
**
options
)
stream
.
open
()
table
=
frictionless
.
Table
(
source
,
**
options
)
table
.
open
()
# Get source headers
headers
=
next
(
stream
.
iter
())
headers
=
table
.
header
# And source body rows
body_rows
=
list
(
stream
.
iter
()
)
body_rows
=
list
(
table
.
data_stream
)
return
headers
,
body_rows
...
...
@@ -341,62 +341,70 @@ def validate(source, schema, with_repair=True, **options):
schema_descriptor
=
retrieve_schema_descriptor
(
schema
)
base_options
=
{
**
options
,
"custom_loaders"
:
loaders
.
custom_loaders
,
# to use Validata BytesLoader
}
fixed_source
,
structure_errors
=
source
,
None
checks
=
[
'structure'
,
'schema'
,
{
'extra-or-missing-header'
:
{}}]
if
with_repair
:
fixed_source
,
structure_errors
=
repair
(
source
,
schema_descriptor
,
**
base_options
)
checks
=
[
'structure'
,
'schema'
]
# TODO: handle repair
#checks = ['structure', 'schema', {'extra-or-missing-header': {}}]
#if with_repair:
# fixed_source, structure_errors = repair(source, schema_descriptor, **options)
# checks = ['structure', 'schema']
# Extract custom checks reference from table schema
extra_checks
=
None
custom_checks_config
=
schema_descriptor
.
get
(
'custom_checks'
)
if
custom_checks_config
:
for
custom_check_conf
in
custom_checks_config
:
checks
.
append
({
custom_check_conf
[
'name'
]:
custom_check_conf
[
'params'
]})
extra_checks
=
[]
for
cc_conf
in
custom_checks_config
:
cc_name
=
cc_conf
[
"name"
]
if
cc_name
in
available_checks
:
cc_class
=
available_checks
[
cc_name
]
cc_descriptor
=
cc_conf
[
"params"
]
extra_checks
.
append
((
cc_class
,
cc_descriptor
))
inspector_options_keys
=
[
'
check
s'
,
'skip_
check
s'
,
'infer_schema'
,
'infer_fields'
,
'
order_fields'
,
'error_limit
'
,
'table_limit'
,
'
row_limit
'
,
'
pick_error
s'
,
'skip_
error
s'
,
'infer_schema'
,
'infer_fields'
,
'
sync_schema'
,
'limit_errors
'
,
'table_limit'
,
'
query
'
,
]
# TODO: merge options
inspector_options
=
{
**
{
'checks'
:
checks
,
'skip_checks'
:
[
'non-matching-header'
,
'extra-header'
,
'missing-header'
],
'row_limit'
:
VALIDATA_MAX_ROWS
,
# TODO: Fix `pick_errors` content
#'pick_errors': checks,
'skip_errors'
:
[
'non-matching-header'
,
'extra-header'
,
'missing-header'
],
'query'
:
frictionless
.
Query
(
limit_rows
=
VALIDATA_MAX_ROWS
),
'extra_checks'
:
extra_checks
,
},
**
{
k
:
v
for
k
,
v
in
options
.
items
()
if
k
in
inspector_options_keys
}
}
inspector
=
goodtables
.
Inspector
(
**
inspector_options
)
options
=
{
**
base
_options
}
validate_
options
=
{
**
options
,
**
inspector
_options
}
if
with_repair
:
options
[
'scheme'
]
=
'stream'
options
[
'format'
]
=
'inline'
report
=
inspector
.
inspect
(
source
=
fixed_source
,
schema
=
schema_descriptor
,
**
options
)
if
report
[
'tables'
][
0
].
get
(
'format'
)
==
"csv"
and
not
any
(
get_in
([
'errors'
,
err
[
'code'
],
'type'
],
spec
,
default
=
None
)
==
'source'
for
err
in
report
[
'tables'
][
0
][
'errors'
]
):
standard_csv_delimiter
=
","
dialect
=
csv_helpers
.
detect_dialect
(
fixed_source
,
**
options
)
if
dialect
is
None
:
error
=
goodtables
.
Error
(
code
=
'unknown-csv-dialect'
)
report
=
prepend_error
(
report
,
table_index
=
0
,
error
=
dict
(
error
))
else
:
detected_delimiter
=
dialect
.
delimiter
if
detected_delimiter
!=
standard_csv_delimiter
:
error
=
goodtables
.
Error
(
code
=
'invalid-column-delimiter'
,
message_substitutions
=
{
"detected"
:
detected_delimiter
,
"expected"
:
standard_csv_delimiter
,
},
)
report
=
prepend_error
(
report
,
table_index
=
0
,
error
=
dict
(
error
))
validate_options
[
'scheme'
]
=
'stream'
validate_options
[
'format'
]
=
'inline'
report
=
frictionless
.
validate_table
(
fixed_source
,
schema
=
schema_descriptor
,
**
validate_options
)
# TODO: support error types
# if report['tables'][0].get('format') == "csv" and not any(
# get_in(['errors', err['code'], 'type'], spec, default=None) == 'source'
# for err in report['tables'][0]['errors']
# ):
# standard_csv_delimiter = ","
# dialect = csv_helpers.detect_dialect(fixed_source, **options)
# if dialect is None:
# error = goodtables.Error(code='unknown-csv-dialect')
# report = prepend_error(report, table_index=0, error=dict(error))
# else:
# detected_delimiter = dialect.delimiter
# if detected_delimiter != standard_csv_delimiter:
# error = goodtables.Error(
# code='invalid-column-delimiter',
# message_substitutions={
# "detected": detected_delimiter,
# "expected": standard_csv_delimiter,
# },
# )
# report = prepend_error(report, table_index=0, error=dict(error))
# If some errors have been encountered during repair process
if
structure_errors
:
...
...
validata_core/csv_helpers.py
View file @
fcec6a34
import
csv
import
tabulator
import
logging
from
frictionless
import
Table
log
=
logging
.
getLogger
(
__name__
)
def
detect_dialect
(
source
,
**
stream_params
):
"""Read source using `tabulator.loader`, but use `csv.Sniffer` to detect delimiter,
because `tabulator.parsers.csv` tries
only those delimiters: '',
\t
;|''.
"""Read source using `tabulator.loader`, but use `csv.Sniffer` to detect delimiter,
because `tabulator.parsers.csv` tries
only those delimiters: '',
\t
;|''.
"""
with
tabulator
.
Stream
(
source
,
**
stream_params
)
as
stream
:
parser
=
stream
.
_Stream__parser
parser
.
reset
()
sample
=
prepare_sample
(
stream
.
_Stream__parser
.
_CSVParser__chars
)
try
:
return
csv
.
Sniffer
().
sniff
(
sample
)
except
Exception
as
e
:
log
.
exception
(
e
)
return
None
with
Table
(
source
,
**
table_params
)
as
table
:
table
.
open
()
return
dialect
def
prepare_sample
(
stream
):
...
...
validata_core/custom_checks/__init__.py
View file @
fcec6a34
from
.french_siren_value
import
FrenchSirenValue
from
.french_siret_value
import
FrenchSiretValue
from
.year_interval_value
import
YearIntervalValue
# Please keep the below dict up-to-date
available_checks
=
{
"french-siren-value"
:
FrenchSirenValue
,
"french-siret-value"
:
FrenchSiretValue
,
"year-interval-value"
:
YearIntervalValue
,
}
\ No newline at end of file
validata_core/custom_checks/french_siren_value.py
View file @
fcec6a34
# -*- coding: utf-8 -*-
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
absolute_import
from
__future__
import
unicode_literals
import
stdnum.fr.siren
from
goodtables.registry
import
check
from
goodtables.error
import
Error
@
check
(
"french-siren-value"
,
type
=
"custom"
,
context
=
"body"
)
class
FrenchSirenValue
(
object
):
def
__init__
(
self
,
column
,
**
options
):
self
.
__column
=
column
def
check_row
(
self
,
cells
):
# Get cell
cell
=
None
for
item
in
cells
:
if
"header"
not
in
item
:
# Skip columns without headers. This can occur in particular with formatted Excel files.
continue
if
self
.
__column
in
[
item
[
"column-number"
],
item
[
"header"
]]:
cell
=
item
break
# Check cell
if
not
cell
:
return
from
frictionless
import
Check
,
errors
class
FrenchSirenValueError
(
errors
.
CellError
):
"""Custom error."""
code
=
"french-siren-value"
name
=
"French SIREN value"
tags
=
[
"#body"
]
template
=
"La valeur {cell} n'est pas un numéro SIREN français valide."
description
=
"Le numéro de SIREN indiqué n'est pas valide selon la définition de l'[INSEE](https://www.insee.fr/fr/metadonnees/definition/c2047)"
class
FrenchSirenValue
(
Check
):
"""Check french SIREN number validity."""
possible_Errors
=
[
FrenchSirenValueError
]
# type: ignore
def
prepare
(
self
):
"""Extract custom params from descriptor."""
self
.
__column
=
self
.
get
(
"column"
)
# Check value
value
=
cell
.
get
(
'value'
)
if
not
value
:
def
validate_task
(
self
):
if
self
.
__column
not
in
self
.
table
.
schema
.
field_names
:
note
=
'french siren value check requires field "%s"'
%
self
.
__column
yield
errors
.
TaskError
(
note
=
note
)
def
validate_row
(
self
,
row
):
cell_value
=
row
[
self
.
__column
]
# Empty cell, don't check!
if
not
cell_value
:
return
if
not
stdnum
.
fr
.
siren
.
is_valid
(
value
):
message
=
'La valeur "{value}" n
\'
est pas un numéro SIREN français valide.'
message_substitutions
=
{
"value"
:
value
}
error
=
Error
(
"french-siren-value"
,
cell
,
message
=
message
,
message_substitutions
=
message_substitutions
,
)
return
[
error
]
if
not
stdnum
.
fr
.
siren
.
is_valid
(
cell_
value
):
yield
FrenchSirenValueError
.
from_row
(
row
,
note
=
""
,
field_name
=
self
.
__column
)
metadata_profile
=
{
# type: ignore
"type"
:
"object"
,
"required"
:
[
"column"
]
,
"properties"
:
{
"column"
:
{
"type"
:
"string"
}}
,
}
validata_core/custom_checks/french_siret_value.py
View file @
fcec6a34
# -*- coding: utf-8 -*-
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
absolute_import
from
__future__
import
unicode_literals
from
simpleeval
import
simple_eval
import
stdnum.fr.siret
from
goodtables.registry
import
check
from
goodtables.error
import
Error
from
frictionless
import
Check
,
errors
class
FrenchSiretValueError
(
errors
.
CellError
):
"""Custom error."""
code
=
"french-siret-value"
name
=
"French SIRET value"
tags
=
[
"#body"
]
template
=
"La valeur {cell} n'est pas un numéro SIRET français valide."
description
=
"Le numéro de SIRET indiqué n'est pas valide selon la définition de l
\'
[INSEE](https://www.insee.fr/fr/metadonnees/definition/c1841)"
# Module API
@
check
(
'f
rench
-s
iret
-v
alue
'
,
type
=
'custom'
,
context
=
'body'
)
class
FrenchSiretValue
(
object
):
class
F
rench
S
iret
V
alue
(
Check
):
"""Check french SIRET number validity."""
# Public
possible_Errors
=
[
FrenchSiretValueError
]
# type: ignore
def
__init__
(
self
,
column
,
**
options
):
self
.
__column
=
column
def
prepare
(
self
):
"""Extract custom params from descriptor."""
self
.
__column
=
self
.
get
(
"column"
)
def
check_row
(
self
,
cells
):
# Get cell
cell
=
None
for
item
in
cells
:
if
'header'
not
in
item
:
# Skip columns without headers. This can occur in particular with formatted Excel files.
continue
if
self
.
__column
in
[
item
[
'column-number'
],
item
[
'header'
]]:
cell
=
item
break
# Check cell
if
not
cell
:
return
def
validate_task
(
self
):
if
self
.
__column
not
in
self
.
table
.
schema
.
field_names
:
note
=
'french siret value check requires field "%s"'
%
self
.
__column
yield
errors
.
TaskError
(
note
=
note
)
# Check value
value
=
cell
.
get
(
'value'
)
if
not
value
:
def
validate_row
(
self
,
row
):
cell_value
=
row
[
self
.
__column
]
# Empty cell, don't check!
if
not
cell_value
:
return
if
not
stdnum
.
fr
.
siret
.
is_valid
(
value
):
message
=
"La valeur
\"
{value}
\"
n'est pas un numéro SIRET français valide."
message_substitutions
=
{
'value'
:
value
,
}
error
=
Error
(
'french-siret-value'
,
cell
,
message
=
message
,
message_substitutions
=
message_substitutions
)
return
[
error
]
if
not
stdnum
.
fr
.
siret
.
is_valid
(
cell_value
):
yield
FrenchSiretValueError
.
from_row
(
row
,
note
=
""
,
field_name
=
self
.
__column
)
metadata_profile
=
{
# type: ignore
"type"
:
"object"
,
"required"
:
[
"column"
],
"properties"
:
{
"column"
:
{
"type"
:
"string"
}},
}
validata_core/custom_checks/year_interval_value.py
View file @
fcec6a34
# -*- coding: utf-8 -*-
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
absolute_import
from
__future__
import
unicode_literals
"""
Year Interval Value check
...
...
@@ -19,87 +14,83 @@ from __future__ import unicode_literals
Pierre Dittgen, Jailbreak
"""
import
re
from
goodtables.registry
import
check
from
goodtables.error
import
E
rror
from
frictionless
import
Check
,
e
rror
s
YEAR_INTERVAL_RE
=
re
.
compile
(
r
'^(\d{4})/(\d{4})$'
)
YEAR_RE
=
re
.
compile
(
r
'^\d{4}$'
)
# Module API
class
YearIntervalValueError
(
errors
.
CellError
):
"""Custom error."""
code
=
"year-interval-value"
name
=
"Year interval value"
tags
=
[
"#body"
]
template
=
"Erreur sur l'année ou l'intervalle d'année '{cell}' ({note})."
description
=
"Année ou intervalle d'année"
@
check
(
'year-interval-value'
,
type
=
'custom'
,
context
=
'body'
)
class
YearIntervalValue
(
object
):
"""
Year Interval Value check class
"""
# Public
def
__init__
(
self
,
column
,
**
options
):
self
.
__column
=
column
self
.
allow_year_only
=
options
.
get
(
'allow-year-only'
)
in
(
'true'
,
'yes'
)
class
YearIntervalValue
(
Check
):
"""Year Interval Value check class."""
def
check_row
(
self
,
cells
):
# Get cell
cell
=
None
for
item
in
cells
:
if
self
.
__column
in
[
item
[
'column-number'
],
item
[
'header'
]]:
cell
=
item
break
possible_Errors
=
[
YearIntervalValueError
]
# type: ignore
def
prepare
(
self
):
"""Extract custom params from descriptor."""
self
.
__column
=
self
.
get
(
"column"
)
self
.
__allow_year_only
=
self
.
get
(
"allow-year-only"
)
in
(
"true"
,
"yes"
)
def
validate_task
(
self
):
if
self
.
__column
not
in
self
.
table
.
schema
.
field_names
:
note
=
'year interval value check requires field "%s"'
%
self
.
__column
yield
errors
.
TaskError
(
note
=
note
)
# Check cell
if
not
cell
:
return
# Check value
value
=
cell
.
get
(
'value'
)
if
not
value
:
def
validate_row
(
self
,
row
):
cell_value
=
row
[
self
.
__column
]
# Empty cell, don't check!
if
not
cell_value
:
return
# Checks for interval format
rm
=
YEAR_INTERVAL_RE
.
match
(
value
)
rm
=
YEAR_INTERVAL_RE
.
match
(
cell_
value
)
if
not
rm
:
# Not an interval, is this a year only?
if
self
.
allow_year_only
:
ym
=
YEAR_RE
.
match
(
value
)
if
self
.
__
allow_year_only
:
ym
=
YEAR_RE
.
match
(
cell_
value
)
# No -> add error
if
not
ym
:
return
self
.
err
(
cell
,
"La valeur
\"
{value}
\"
n'a pas le format attendu pour une année (AAAA) ou un intervalle d'année (AAAA/AAAA))"
,
{
'value'
:
value
})
note
=
"format attendu: année (AAAA) ou intervale (AAAA/AAAA)"
yield
YearIntervalValueError
.
from_row
(
row
,
note
=
note
,
field_name
=
self
.
__column
)
# This is a year, cool!
else
:
return
# Year ok
return
# not a period -> add error
return
self
.
err
(
cell
,
"La valeur
\"
{value}
\"
n'a pas le format attendu pour une période (AAAA/AAAA)."
,
{
'value'
:
value
})
note
=
"format attendu: AAAA/AAAA"
yield
YearIntervalValueError
.
from_row
(
row
,
note
=
note
,
field_name
=
self
.
__column
)
return
year1
=
int
(
rm
.
group
(
1
))
year2
=
int
(
rm
.
group
(
2
))
if
year1
==
year2
:
return
self
.
err
(
cell
,
"Période
\"
{value}
\"
invalide. Les deux années doivent être différentes)."
,
{
'value'
:
value
})
note
=
"les deux années doivent être différentes"
yield
YearIntervalValueError
.
from_row
(
row
,
note
=
note
,
field_name
=
self
.
__column
)
return
if
year1
>
year2
:
return
self
.
err
(
cell
,
"Période
\"
{value}
\"
invalide. La deuxième année doit être postérieure à la première"
+
" ({tip})."
,
{
'value'
:
value
,
'tip'
:
'{}/{}'
.
format
(
year2
,
year1
)})
def
err
(
self
,
cell
,
msg
,
msg_substitutions
):
""" Create and return formatted error """
error
=
Error
(
'year-interval-value'
,
cell
,
message
=
msg
,
message_substitutions
=
msg_substitutions
)
return
[
error
]
note
=
"la deuxième année ({}) doit être postérieure à la première ({})"
.
format
(
year1
,
year2
)
yield
YearIntervalValueError
.
from_row
(
row
,
note
=
note
,
field_name
=
self
.
__column
)
return
metadata_profile
=
{
# type: ignore
"type"
:
"object"
,
"required"
:
[
"column"
],
"properties"
:
{
"column"
:
{
"type"
:
"string"
},
"allow-year-only"
:
{
"type"
:
"string"
}},
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment