Commit aa9b328e authored by Pierre Dittgen
Browse files

Rename clean_siren.py into clean_odf_orga.py / deal with coordinates...

Rename clean_siren.py into clean_odf_orga.py / deal with coordinates containing commas as decimal separator
parent cdf1fe21
Pipeline #3602 passed with stages
in 184 minutes and 25 seconds
......@@ -24,7 +24,7 @@ download_opendatafrance_data() {
echo "Downloading OpenDataFrance data..."
mkdir -p $ODF_DUMP_DIR
download_file_or_exit $ORGA_CSV_URL $ORGA_CSV_TMP_FILE
$PYTHON $LIB_DIR/download/clean_siren.py $ORGA_CSV_TMP_FILE $ODF_ORGA_FILE
$PYTHON $LIB_DIR/download/clean_odf_orga.py $ORGA_CSV_TMP_FILE $ODF_ORGA_FILE
rm $ORGA_CSV_TMP_FILE
download_file_or_exit $PLF_CSV_URL $ODF_DUMP_DIR/plateformes.csv
}
......
......@@ -3,6 +3,7 @@
- strip spaces and '\u200e' (Left-to-right mark) in siren column
- de-duplicate siren
- replace comma (,) by dot (.) in lat and lng coordinates
"""
import argparse
import csv
......@@ -27,18 +28,26 @@ def main():
reader = csv.reader(fin)
header = next(reader)
assert header[0] == "siren", header[0]
assert header[11] == "lat", header[11]
assert header[12] == "long", header[12]
with target_csv_file.open("wt", encoding="utf-8") as fout:
writer = csv.writer(fout)
writer.writerow(header)
for row in reader:
# SIREN cleaning
siren = row[0].strip("\u200e ")
if siren in siren_set:
print(f"Duplicate siren found: {siren!r}")
continue
siren_set.add(siren)
row[0] = siren
writer.writerow([siren, *row[1:]])
# Coords formatting
row[11] = row[11].replace(",", ".")
row[12] = row[12].replace(",", ".")
writer.writerow(row)
if __name__ == "__main__":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment