__init__.py 7.33 KB
Newer Older
Christophe Benz's avatar
Christophe Benz committed
1
import logging
Pierre Dittgen's avatar
Pierre Dittgen committed
2
from collections import Counter
Pierre Dittgen's avatar
Pierre Dittgen committed
3
from datetime import datetime, timezone
4
from typing import Optional
Christophe Benz's avatar
Christophe Benz committed
5

Pierre Dittgen's avatar
wip    
Pierre Dittgen committed
6
7
import frictionless

8
9
from validata_core.custom_checks.utils import build_check_error

10
from .custom_checks import MissingRequiredHeader, available_checks
11
from .error_messages import error_translate
12
13
14
15
16
from .helpers import (
    _extract_header_and_rows_from_frictionless_source,
    is_body_error,
    is_structure_error,
)
Pierre Dittgen's avatar
Pierre Dittgen committed
17
from .structure_warnings import iter_structure_warnings
Christophe Benz's avatar
Christophe Benz committed
18
19
20

# Module-level logger, named after the importing module.
log = logging.getLogger(__name__)

# Row limit handed to frictionless.Layout(limit_rows=...) by validate().
VALIDATA_MAX_ROWS = 100_000

Pierre Dittgen's avatar
Pierre Dittgen committed
23

Pierre Dittgen's avatar
Pierre Dittgen committed
24
def extract_required_field_names(schema: frictionless.schema.Schema):
    """Return the names of the schema fields flagged as required.

    A field is kept when it has a non-empty ``constraints`` mapping whose
    ``required`` entry is truthy. Names are returned in schema order; a
    schema without a ``fields`` entry yields an empty list.
    """
    required_names = []
    for field_descriptor in schema.get("fields", []):
        constraints = field_descriptor.get("constraints")
        if constraints and constraints.get("required", False):
            required_names.append(field_descriptor["name"])
    return required_names
Pierre Dittgen's avatar
Pierre Dittgen committed
30

31

32
def compute_badge_metrics(report, config) -> Optional[dict]:
    """Compute badge metrics from report statistics and badge configuration.

    Only the first entry of ``report["tasks"]`` is taken into account.

    Args:
        report: validation report. From its first task this function reads
            ``stats["errors"]``, ``structure_warnings``, ``errors`` and,
            only when body errors are present, ``resource["data"]``.
        config: badge configuration. ``config["body"]["errors-weight"]``
            maps an error code to its weight (codes not listed weigh 1.0)
            and ``config["body"]["acceptability-threshold"]`` is the ratio
            below which the body is a "WARN" rather than a "KO". The
            configuration is only read when body errors are present.

    Returns:
        ``None`` when the report has no task, ``{"structure": "KO"}`` when
        at least one structure error is found, otherwise a dict with the
        keys ``"structure"``, ``"body"`` and ``"error-ratio"``.
    """

    def build_badge(structure_status, body_status=None, error_ratio=None):
        """Badge info creation"""
        if structure_status == "KO":
            # A broken structure makes body metrics meaningless.
            return {"structure": "KO"}
        return {
            "structure": structure_status,
            "body": body_status,
            "error-ratio": error_ratio,
        }

    if not report["tasks"]:
        return None

    table_report = report["tasks"][0]
    errors = table_report["errors"]
    has_structure_warnings = bool(table_report["structure_warnings"])

    # No errors
    if table_report["stats"]["errors"] == 0 and not has_structure_warnings:
        return build_badge("OK", "OK", 0.0)

    # Structure part: any structure error is a plain "KO"
    if any(is_structure_error(err) for err in errors):
        return build_badge("KO")
    structure_status = "WARN" if has_structure_warnings else "OK"

    # Body part
    value_errors = [err for err in errors if is_body_error(err)]
    if not value_errors:
        return build_badge(structure_status, "OK", 0.0)

    # Number of cells. Computed only here, where it is actually needed, so
    # that reports without body errors never touch the resource data (which
    # may be empty or absent).
    resource_data = table_report["resource"]["data"]
    cell_total_number = len(resource_data[0]) * len(resource_data)

    # Error ratio: weighted number of body errors divided by number of cells
    weight_dict = config["body"]["errors-weight"]
    err_code_counter = Counter(err.code for err in value_errors)
    weighted_error_count = sum(
        nb * weight_dict.get(err_code, 1.0)
        for err_code, nb in err_code_counter.items()
    )
    ratio = weighted_error_count / cell_total_number
    body_status = "WARN" if ratio < config["body"]["acceptability-threshold"] else "KO"

    return build_badge(structure_status, body_status, ratio)


95
def validate(source, schema, **options):
    """Validate a `source` using a `schema`.

    The schema is first wrapped into a ``frictionless.schema.Schema``, then
    the source is validated with ``frictionless.validate_resource`` using a
    ``MissingRequiredHeader`` check plus every custom check named in the
    schema's ``custom_checks`` entry. Each task of the resulting report
    receives a ``structure_warnings`` list, errors are passed through
    ``error_translate`` and a UTC ISO-8601 ``date`` is added.

    Only these keyword options are forwarded to frictionless: ``format``,
    ``infer_schema``, ``infer_fields``, ``limit_errors``, ``pick_errors``,
    ``query``, ``skip_errors``, ``scheme`` and ``table_limit``; any other
    option is ignored.

    Returns the report produced by frictionless, enriched as described.
    """
    schema_cls = frictionless.schema.Schema

    # Handle the different schema formats. The dict test must stay first:
    # it is kept exactly as in the original so that whatever passes it
    # (possibly a Schema instance itself -- NOTE(review): confirm) is
    # wrapped the same way.
    if isinstance(schema, dict) or (
        isinstance(schema, str) and schema.startswith("http")
    ):
        schema = schema_cls(schema)
    elif not isinstance(schema, schema_cls):
        schema = schema_cls(str(schema))

    # The first check is devoted to detecting missing required headers
    required_field_names = extract_required_field_names(schema)
    checks = [MissingRequiredHeader({"required_field_names": required_field_names})]

    # Add the custom checks requested by the schema; unknown names are
    # collected as errors instead of aborting the validation
    unknown_check_errors = []
    for check_conf in schema.get("custom_checks", []):
        check_name = check_conf["name"]
        if check_name in available_checks:
            checks.append(available_checks[check_name](check_conf["params"]))
        else:
            unknown_check_errors.append(
                build_check_error(check_name, note="custom check inconnu.")
            )

    # Merge options to pass to frictionless
    forwarded_option_names = {
        "format",
        "infer_schema",
        "infer_fields",
        "limit_errors",
        "pick_errors",
        "query",
        "skip_errors",
        "scheme",
        "table_limit",
    }
    validate_options = {
        "layout": frictionless.Layout(limit_rows=VALIDATA_MAX_ROWS),
        "checks": checks,
        # Don't care about missing, extra or unordered columns
        "detector": frictionless.Detector(schema_sync=True),
    }
    validate_options.update(
        (name, value)
        for name, value in options.items()
        if name in forwarded_option_names
    )

    # Snapshot the schema before validation; the copy is what the structure
    # warnings are computed against
    original_schema = schema.copy()
    report = frictionless.validate_resource(source, schema=schema, **validate_options)

    # Options needed to read the source header again
    header_option_names = {"dialect", "encoding", "format", "scheme"}
    source_options = {
        name: value
        for name, value in validate_options.items()
        if name in header_option_names
    }

    # Add structure warnings (and unknown custom check errors) to each task
    if report["tasks"]:
        source_header, _rows = _extract_header_and_rows_from_frictionless_source(
            source, **source_options
        )
    else:
        source_header = None
    for task in report["tasks"]:
        warnings_iter = iter_structure_warnings(
            source_header, required_field_names, original_schema
        )
        task["structure_warnings"] = list(warnings_iter)
        if unknown_check_errors:
            task["errors"].extend(unknown_check_errors)
            report["valid"] = False

    # Translate errors, report level first, then task level
    report["errors"] = [error_translate(error, schema) for error in report["errors"]]
    for task in report["tasks"]:
        task["errors"] = [error_translate(error, schema) for error in task["errors"]]

    # Add date
    report["date"] = datetime.now(timezone.utc).isoformat()

    return report
182
183
184
185
186


def compute_badge(report, config) -> dict:
    """Compute badge from report statistics and badge configuration.

    Reads the first entry of ``report["tables"]``: its ``error-stats``,
    ``headers`` and ``row-count``. The result is ``{"structure": "KO"}``
    when structure errors other than a lone ``invalid-column-delimiter``
    are present, otherwise a dict with the keys ``"structure"``, ``"body"``
    and ``"error-ratio"``. The ratio is the weighted number of value errors
    (weights from ``config["body"]["errors-weight"]``, default 1.0) divided
    by the number of cells; the body is "WARN" when the ratio is below
    ``config["body"]["acceptability-threshold"]`` and "KO" otherwise.
    """

    def build_badge(structure_status, body_status=None, error_ratio=None):
        """Badge info creation"""
        if structure_status != "KO":
            return {
                "structure": structure_status,
                "body": body_status,
                "error-ratio": error_ratio,
            }
        return {"structure": "KO"}

    first_table = report["tables"][0]

    # Error statistics, then total number of cells
    stats = first_table["error-stats"]
    cell_total_number = len(first_table["headers"]) * first_table["row-count"]

    # Nothing wrong at all
    if stats["count"] == 0:
        return build_badge("OK", "OK", 0.0)

    # Structure part: a lone "invalid-column-delimiter" is only a warning
    structure_stats = stats["structure-errors"]
    if structure_stats["count"] == 0:
        structure_status = "OK"
    else:
        codes = structure_stats["count-by-code"]
        if not (len(codes) == 1 and "invalid-column-delimiter" in codes):
            return build_badge("KO")
        structure_status = "WARN"

    # Body part
    value_stats = stats["value-errors"]
    if value_stats["count"] == 0:
        return build_badge(structure_status, "OK", 0.0)

    # Weighted error ratio
    body_config = config["body"]
    weights = body_config["errors-weight"]
    weighted_total = sum(
        occurrences * weights.get(code, 1.0)
        for code, occurrences in value_stats["count-by-code"].items()
    )
    ratio = weighted_total / cell_total_number
    if ratio < body_config["acceptability-threshold"]:
        body_status = "WARN"
    else:
        body_status = "KO"

    return build_badge(structure_status, body_status, ratio)