#!/usr/bin/env python
"""
AIDMOIt: collect data and insert it into an HDFS cluster.
Steps:
1/ Parse the CSV file listing the data sources
2/ Download the files
3/ Insert the files into the HDFS cluster
4/ Build an ISO19139 XML document describing the metadata
5/ Insert the ISO19139 record into GeoNetwork
"""
import json
import os
import re
import subprocess
import urllib.request

import pandas as pd
import requests
from pywebhdfs.webhdfs import PyWebHdfsClient


def getUrlFromOpendata3M(inputCSV):
    """
    Data from the 3M opendata website are collected in 4 steps:
    1/ Parse the CSV
    2/ From this file, get all links to the 3M opendata website
    3/ For each link, get the 3M ID of the dataset
    4/ For each 3M dataset ID, ask the 3M opendata API for the URLs of:
        - metadata
        - data
    :return: dictionary keyed by the node ID of the 3M opendata dataset, each value being a dictionary containing the data and metadata
    Example of return value:
    {"['9795']": {'data': ['http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies.zip', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies_geojson.zip', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies.ods', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduNotice.pdf'], 'metadata': [too big to print in this example]},
     "['3413']": {'data': ['http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupationSol.zip', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupSol_Lyr.zip', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupSol_Nomenclature_2018.pdf', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupationSol_Archives.zip'], 'metadata': [too big to print in this example]},
     "['9860']": {'data': ['http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine.zip', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine_Description.docx', 'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine_Schema.docx'], 'metadata': [too big to print in this example]}}
    """
    # Steps 1 and 2
    dataInventoryFile = pd.read_csv(inputCSV, sep=';')
    weblinks = dataInventoryFile['datasetURL']
    # Step 3
    idNodePattern = re.compile(r"https?://data\.montpellier3m\.fr/node/(\d+)")
    idNodeList = []
    for weblink in weblinks:
        response = requests.get(weblink)
        idNodeList.append(re.findall(idNodePattern, response.text))
    # Step 4
    opendata3mDataMetadata = {}
    idcsv = 1
    for node in idNodeList:
        opendata3mData = []
        nodeDataMetadata = {'metadata': None, 'data': None, 'idCSV': idcsv}
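        # package_show is CKAN's standard action API; the response shape assumed below
        # ({'result': {'resources': [{'url': ...}, ...]}}) is inferred from the fields
        # this code reads, not from 3M opendata documentation.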
        metadata = requests.get("http://data.montpellier3m.fr/api/3/action/package_show?id=" + node[0]).json()
        # Get the resources (download URLs)
        for resource in metadata['result']['resources']:
            opendata3mData.append(resource['url'])
        nodeDataMetadata['data'] = opendata3mData
        nodeDataMetadata['metadata'] = metadata
        opendata3mDataMetadata.update({str(node): nodeDataMetadata})
        idcsv = idcsv + 1
    return opendata3mDataMetadata


def downloadOpendata3MFiles(opendata3mDataMetadata, pathToSaveDownloadedData):
    """
    Download all given resources.
    :param opendata3mDataMetadata: dictionary containing, per 3M opendata node ID, the metadata and the data to download
    :param pathToSaveDownloadedData: directory where the downloaded files are saved
    :return: number of files downloaded
    """
    nboffiledl = 0
    for node in opendata3mDataMetadata:
        for fileToDownload in opendata3mDataMetadata[node]['data']:
            urllib.request.urlretrieve(fileToDownload, os.path.join(pathToSaveDownloadedData, fileToDownload.split('/')[-1]))
            nboffiledl = nboffiledl + 1
    return nboffiledl


if __name__ == '__main__':
    # Init variables
    dirname = os.path.dirname(__file__)
    inputCSV = os.path.join(dirname, '../input/datasources.csv')
    pathToSaveDownloadedData = os.path.join(dirname, '../output/data')
    pathToSaveDownloadedMeta = os.path.join(dirname, '../output/meta/meta.json')
    pathToSaveHDFSPath = os.path.join(dirname, '../output/hdfspath/hdfspath.csv')
    nboffiledl = 0
    namenode = "namenode"  # hostname or IP address of the HDFS cluster's namenode
    namenodePort = "9870"  # default WebHDFS HTTP port of a Hadoop 3.x namenode
    hdfsuser = "hadoop"
    # End of init variables
print("AIDMOIt ingestion module starts")
"""Get URL of data and metadata from 3M Opendata website"""
opendata3mDataMetada = getUrlFromOpendata3M(inputCSV)
jsonfile = open(pathToSaveDownloadedMeta, "w")
jsonfile.write(json.dumps(opendata3mDataMetada))
jsonfile.close()
"""Download File"""
nboffiledl = downloadOpendata3MFiles(opendata3mDataMetada, pathToSaveDownloadedData)
"""Insert files inside HDFS and store file"""
# connect to HDFS
hdfs = PyWebHdfsClient(host=namenode, port=namenodePort, user_name=hdfsuser)
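    # pywebhdfs' create_file(path, file_data) takes the target HDFS path first and the
    # file contents second, so each local file is opened and its bytes streamed to HDFS.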
    for filename in os.listdir(pathToSaveDownloadedData):
        if filename != ".forgit":
            try:
                # Upload the local file under the same name in HDFS
                with open(os.path.join(pathToSaveDownloadedData, filename), 'rb') as localfile:
                    hdfs.create_file(filename, localfile)
            except Exception as e:
                print('Failed to upload in HDFS: ' + str(e))
"""Build and insert iso19139 xml to geonetwork"""
    try:
        # Without shell=True, subprocess.call needs the command as an argument list
        subprocess.call(["/usr/bin/Rscript", "addServicesToGN.R"])
    except Exception:
        print("R error (possibly due to OSM)? Re-launching with R -f")
        subprocess.call("R -f addServicesToGN.R", shell=True)
    print(str(nboffiledl) + " files downloaded to: " + pathToSaveDownloadedData)
    print("AIDMOIt ingestion module ends")