Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Observatoire
observatoire-scripts
Commits
2c80a336
Commit
2c80a336
authored
Nov 09, 2020
by
Pierre Dittgen
Browse files
Clean siren values containing left-to-right-mark in organisations.csv
parent
af8029c4
Changes
2
Hide whitespace changes
Inline
Side-by-side
download_and_prepare_data
View file @
2c80a336
...
...
@@ -21,7 +21,9 @@ download_opendatafrance_data() {
echo
"Downloading OpenDataFrance data..."
mkdir
-p
$ODF_DUMP_DIR
download_file_or_exit
$ORGA_CSV_URL
$ODF_ORGA_FILE
download_file_or_exit
$ORGA_CSV_URL
$ODF_DUMP_DIR
/raw_organisations.csv
$PYTHON
$LIB_DIR
/download/clean_siren.py
$ODF_DUMP_DIR
/raw_organisations.csv
$ODF_ORGA_FILE
rm
$ODF_DUMP_DIR
/raw_organisations.csv
download_file_or_exit
$PLF_CSV_URL
$ODF_DUMP_DIR
/plateformes.csv
}
...
...
lib/download/clean_siren.py
0 → 100644
View file @
2c80a336
#!/usr/bin/env python3
"""Clean CSV file.
- strip spaces and '
\u200e
' (Left-to-right mark) in siren column
- de-duplicate siren
"""
import argparse
import csv
from pathlib import Path
def main(argv=None):
    """Clean the ``siren`` column of a CSV file and drop duplicate rows.

    Reads the source CSV, strips surrounding spaces and U+200E
    (left-to-right mark) characters from the first column of every row,
    and writes the rows to the target CSV. When a cleaned siren value has
    already been seen, the row is reported on stdout and skipped, so only
    the first occurrence is kept.

    Args:
        argv: Command-line arguments (source path, target path). Defaults
            to ``sys.argv[1:]`` when ``None``.

    Exits with an argparse usage error (status 2) when the source file is
    missing, is empty, or when its first header column is not ``siren``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_csv_file", type=Path, help="source CSV file")
    parser.add_argument("target_csv_file", type=Path, help="target CSV file")
    args = parser.parse_args(argv)

    source_csv_file = args.source_csv_file
    target_csv_file = args.target_csv_file
    if not source_csv_file.exists():
        # parser.error() accepts a single, already formatted message.
        parser.error(f"Source file {str(source_csv_file)!r} not found")

    # Characters removed from both ends of a siren value:
    # plain space and U+200E (left-to-right mark).
    strip_chars = " \u200e"

    siren_set = set()
    # newline="" lets the csv module handle line endings itself.
    with source_csv_file.open("rt", encoding="utf-8", newline="") as fin:
        reader = csv.reader(fin)

        # Validate the header before creating the target file so that a bad
        # input never leaves a truncated output behind.
        header = next(reader, None)
        if not header:
            parser.error(f"Source file {str(source_csv_file)!r} is empty")
        if header[0] != "siren":
            parser.error(
                f"Expected first column 'siren', found {header[0]!r}"
            )

        with target_csv_file.open("wt", encoding="utf-8", newline="") as fout:
            writer = csv.writer(fout)
            writer.writerow(header)
            for row in reader:
                if not row:
                    # Blank line in the source: there is no siren to clean.
                    continue
                siren = row[0].strip(strip_chars)
                if siren in siren_set:
                    print(f"duplicate siren found: {siren!r}")
                    continue
                siren_set.add(siren)
                writer.writerow([siren, *row[1:]])


if __name__ == "__main__":
    main()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment