Commit 9c3d1ee8 authored by Pierre Dittgen

xlsx source: drop trailing empty columns

parent 955b176b
import csv
import io
import itertools
import logging
from collections import defaultdict
from datetime import datetime, timezone
......@@ -270,17 +271,25 @@ def repair(source, schema_descriptor, **repair_options):
# And source body rows
body_rows = list(stream.iter())
return headers, body_rows
return headers, body_rows, stream.format
# Get the field names declared in the schema
schema_field_names = [f.get('name') for f in schema_descriptor.get('fields')]
# consume source to get headers and content
headers, body_rows = consume_source(source, **repair_options)
headers, body_rows, stream_format = consume_source(source, **repair_options)
except StopIteration:
return (source, [])
# Special case for XLSX format
# => remove trailing empty columns
if stream_format == 'xlsx':
empty_cols_nb = len(list(itertools.takewhile(lambda h: h is None or h == '', headers[::-1])))
if empty_cols_nb:
headers = headers[: -empty_cols_nb]
body_rows = [row[: -empty_cols_nb] for row in body_rows]
# Create dataset for easier post processing
dataset = tablib.Dataset(*body_rows, headers=[h if h else EMPTY_HEADER for h in headers])
