Commit 37c0b849 authored by Pierre Dittgen

Fix siren cleaning

parent d21a8a24
Pipeline #2619 failed with stages in 13 minutes and 51 seconds
......@@ -18,12 +18,13 @@ download_opendatafrance_data() {
ODF_DUMP_DIR=$DUMPS_DIR/opendatafrance
ORGA_CSV_URL="https://docs.google.com/spreadsheets/d/1iNHCRV4mfu4P0-VeiMFV6VVJvqQS8yf0sHNwkVcmGiQ/export?format=csv&id=1iNHCRV4mfu4P0-VeiMFV6VVJvqQS8yf0sHNwkVcmGiQ&gid=0"
PLF_CSV_URL="https://docs.google.com/spreadsheets/d/1iNHCRV4mfu4P0-VeiMFV6VVJvqQS8yf0sHNwkVcmGiQ/export?format=csv&id=1iNHCRV4mfu4P0-VeiMFV6VVJvqQS8yf0sHNwkVcmGiQ&gid=1319740109"
ORGA_CSV_TMP_FILE=$ODF_DUMP_DIR/raw_organisations.csv
echo "Downloading OpenDataFrance data..."
mkdir -p $ODF_DUMP_DIR
download_file_or_exit $ORGA_CSV_URL $ODF_DUMP_DIR/raw_organisations.csv
$PYTHON $LIB_DIR/download/clean_siren.py $ODF_DUMP_DIR/raw_organisations.csv $ODF_ORGA_FILE
rm $ODF_DUMP_DIR/raw_organisations.csv
download_file_or_exit $ORGA_CSV_URL $ORGA_CSV_TMP_FILE
$PYTHON $LIB_DIR/download/clean_siren.py $ORGA_CSV_TMP_FILE $ODF_ORGA_FILE
rm $ORGA_CSV_TMP_FILE
download_file_or_exit $PLF_CSV_URL $ODF_DUMP_DIR/plateformes.csv
}
......
......@@ -5,6 +5,8 @@
- de-duplicate siren
"""
import argparse
import csv
import sys
from pathlib import Path
......@@ -14,7 +16,7 @@ def main():
parser.add_argument("target_csv_file", type=Path, help="target CSV file")
args = parser.parse_args()
if not args.source_csv_file.exist():
if not args.source_csv_file.exists():
parser.error("Source file %r not found", args.source_csv_file)
source_csv_file = args.source_csv_file
......@@ -26,14 +28,18 @@ def main():
header = next(reader)
assert header[0] == "siren", header[0]
with target_csv_file.open("wt", encoding="utf-8") as fout:
writer = csv.writer()
writer = csv.writer(fout)
writer.writerow(header)
for row in reader:
siren = row[0].strip("\u200e ")
if siren in siren_set:
print(f"duplicate siren found: {siren!r}")
print(f"Duplicate siren found: {siren!r}")
continue
siren_set.add(siren)
writer.writerow([siren, *row[1:]])
if __name__ == "__main__":
sys.exit(main())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment