......@@ -7,6 +7,7 @@ import json
import logging
import subprocess
import tempfile
from collections import defaultdict
from datetime import datetime
from operator import itemgetter
from pathlib import Path
......@@ -139,6 +140,18 @@ def extract_source_data(source: ValidataResource, schema_descriptor, preview_row
"""Transform value into string"""
return '' if val is None else str(val)
def compute_duplicate_header_column_indices(source_header, duplicate_header_names):
    """Return the column indices of headers to flag as duplicates.

    Args:
        source_header: sequence of header names, in column order.
        duplicate_header_names: collection of header names reported as
            duplicated.

    Returns:
        A set of 0-based column indices. For each duplicated name, every
        occurrence after the first one is included.

    NOTE(review): keeping the first occurrence unflagged is an assumption
    drawn from the per-name index lists built below -- confirm that the
    first column of a duplicated name must not be displayed as in error.
    """
    # Group the positions of each duplicated name, preserving column order.
    column_name_to_indices = defaultdict(list)
    for i, h in enumerate(source_header):
        if h in duplicate_header_names:
            column_name_to_indices[h].append(i)

    # The first position is the legitimate column; the rest are duplicates.
    col_indices = set()
    for indices in column_name_to_indices.values():
        col_indices.update(indices[1:])
    return col_indices
header = None
rows = []
nb_rows = 0
......@@ -166,11 +179,21 @@ def extract_source_data(source: ValidataResource, schema_descriptor, preview_row
preview_rows_nb = min(preview_rows_nb, nb_rows)
# Compute how the original headers are displayed
# wrong-headers-order: display every header as being in error
if any([err.code == 'wrong-headers-order' for err in repair_report]):
source_header_info = [(h, True) for h in source_header]
# else display header error for:
# - blank-header
# - unknown-header
# - duplicate-header
schema_field_names = [f['name'] for f in schema_descriptor.get('fields') or []]
source_header_info = [(h, not h or h not in schema_field_names) for h in source_header]
#import ipdb; ipdb.set_trace()
duplicate_header_names = [err._message_substitutions['column-name'] for err in repair_report if err.code == 'duplicate-header']
duplicate_col_indices = compute_duplicate_header_column_indices(source_header, duplicate_header_names)
source_header_info = [(h, not h or h not in schema_field_names or i in duplicate_col_indices) for i, h in enumerate(source_header)]
return {
'source_header_info': source_header_info,
