Commit 2c80a336 authored by Pierre Dittgen
Browse files

Clean siren values containing left-to-right-mark in organisations.csv

parent af8029c4
......@@ -21,7 +21,9 @@ download_opendatafrance_data() {
echo "Downloading OpenDataFrance data..."
mkdir -p $ODF_DUMP_DIR
download_file_or_exit $ORGA_CSV_URL $ODF_ORGA_FILE
download_file_or_exit $ORGA_CSV_URL $ODF_DUMP_DIR/raw_organisations.csv
$PYTHON $LIB_DIR/download/clean_siren.py $ODF_DUMP_DIR/raw_organisations.csv $ODF_ORGA_FILE
rm $ODF_DUMP_DIR/raw_organisations.csv
download_file_or_exit $PLF_CSV_URL $ODF_DUMP_DIR/plateformes.csv
}
......
#!/usr/bin/env python3
"""Clean CSV file.
- strip spaces and '\u200e' (Left-to-right mark) in siren column
- de-duplicate siren
"""
import argparse
import csv
from pathlib import Path
def _copy_unique_rows(reader, writer):
    """Copy rows from *reader* to *writer*, cleaning and de-duplicating sirens.

    The first cell of each row is treated as the siren: surrounding spaces
    and left-to-right marks (U+200E) are stripped. Only the first row seen
    for a given cleaned siren is written; later ones are reported on stdout
    and dropped.
    """
    seen_sirens = set()
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines: there is no siren cell.
            continue
        siren = row[0].strip("\u200e ")
        if siren in seen_sirens:
            print(f"duplicate siren found: {siren!r}")
            continue
        seen_sirens.add(siren)
        writer.writerow([siren, *row[1:]])


def main(argv=None):
    """Clean the siren column of a CSV file and write the result.

    Args:
        argv: command-line arguments (source path, target path). Defaults
            to ``sys.argv[1:]`` when ``None``.

    Raises:
        SystemExit: if the arguments are invalid or the source is missing.
        ValueError: if the source file is empty or its first column is not
            named ``siren``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_csv_file", type=Path, help="source CSV file")
    parser.add_argument("target_csv_file", type=Path, help="target CSV file")
    args = parser.parse_args(argv)

    source_csv_file = args.source_csv_file
    target_csv_file = args.target_csv_file
    if not source_csv_file.exists():
        # parser.error() accepts a single, already formatted message.
        parser.error(f"Source file {str(source_csv_file)!r} not found")

    # newline="" lets the csv module handle line endings itself, as its
    # documentation requires for both reading and writing.
    with source_csv_file.open("rt", encoding="utf-8", newline="") as fin:
        reader = csv.reader(fin)
        header = next(reader, None)
        # Validate before opening the target so a bad source never leaves
        # a truncated target file behind.
        if not header or header[0] != "siren":
            found = header[0] if header else None
            raise ValueError(
                f"first column of {str(source_csv_file)!r} must be 'siren', "
                f"found {found!r}"
            )
        with target_csv_file.open("wt", encoding="utf-8", newline="") as fout:
            writer = csv.writer(fout)
            writer.writerow(header)
            _copy_unique_rows(reader, writer)


if __name__ == "__main__":
    main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment