#!/usr/bin/env python

"""
AIDMOIt : Collect data and insert to HDFS cluster

Step :
    1/ Browse CSV file
    2/ Download File
    3/ Insert in HDFS cluster
    4/ Build ISO19139 XML describing metadata
    5/ Insert ISO19139 in geonetwork
"""
import os
import re
import urllib.request
import pandas as pd
import requests
import json
from pywebhdfs.webhdfs import PyWebHdfsClient
import subprocess
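
# Expected input (an assumption, inferred from the column read in
# getUrlFromOpendata3M below): a ';'-separated CSV with a 'datasetURL' column
# whose rows point at dataset pages on http://data.montpellier3m.fr, e.g.
#   datasetURL
#   http://data.montpellier3m.fr/dataset/<some-dataset>   (hypothetical row)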


def getUrlFromOpendata3M(inputCSV):
    """
    Data from 3M opendata website are collected in 4 steps :
        1/ Parse CSV
        2/ From this file, get all links to 3M opendata website
        3/ For each link get the 3M ID of dataset
        4/ For each 3M dataset's ID get url asking 3M opendata's API:
            metadata
            data
    :return:dictionnary with id node of 3M opendata dataset as key and a dictionnary as value containing data & metadata
        Exemple of return :
        {"[u'9795']": {'data': [u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies.zip', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies_geojson.zip', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduHierarchieVoies.ods', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PduNotice.pdf'], 'metadata': [TO BIG TO PRINT for this example!!!]},
        "[u'3413']": {'data': [u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupationSol.zip', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupSol_Lyr.zip', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupSol_Nomenclature_2018.pdf', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_OccupationSol_Archives.zip'], 'metadata': [TO BIG TO PRINT for this example!!!]},
        " [u'9860']": {'data': [u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine.zip', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine_Description.docx', u'http://data.montpellier3m.fr/sites/default/files/ressources/MMM_MMM_PopFine_Schema.docx'], 'metadata': [TO BIG TO PRINT for this example!!!]},
    """
    # Steps 1 and 2
    dataInventoryFile = pd.read_csv(inputCSV, sep=';')
    weblinks = dataInventoryFile['datasetURL']

    # Step 3: each dataset page links to http(s)://data.montpellier3m.fr/node/<id>;
    # capture the numeric node ID
    idNodePattern = re.compile(r"https?://data\.montpellier3m\.fr/node/(\d+)")
    idNodeList = []
    for weblink in weblinks:
        html = requests.get(weblink)
        idNodeList.append(re.findall(idNodePattern, html.text))

    # Step 4
    opendata3mDataMetada = {}
    idcsv = 1
    for node in idNodeList:
        if not node:  # no node ID was found on the page; skip this entry
            continue
        opendata3mData = []
        nodeDataMetada = {'metadata': None, 'data': None, 'idCSV': idcsv}
        # package_show is the standard CKAN API action exposed by the 3M portal
        metadata = requests.get("http://data.montpellier3m.fr/api/3/action/package_show?id=" + node[0]).json()
        # get resources
        for resource in metadata['result']['resources']:
            opendata3mData.append(resource['url'])
        nodeDataMetada['data'] = opendata3mData
        nodeDataMetada['metadata'] = metadata
        opendata3mDataMetada.update({str(node): nodeDataMetada})
        idcsv = idcsv + 1

    return opendata3mDataMetada
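
# Usage sketch (hypothetical path): given a datasources.csv whose 'datasetURL'
# column points at 3M dataset pages, the call returns the dictionary documented
# above, e.g.
#   urls = getUrlFromOpendata3M('../input/datasources.csv')
#   for node, entry in urls.items():
#       print(node, len(entry['data']), 'resource URLs')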


def downloadOpendata3MFiles(opendata3mDataMetada, pathToSaveDownloadedData):
    """
    Download all resources given

    :param opendata3mDataMetada: dictionary containing metadata and data to download by Id node from 3M opendata
    :return: None
    """
    nboffiledl = 0
    for node in opendata3mDataMetada:
        for fileToDownoald in opendata3mDataMetada[node]['data']:
            urllib.request.urlretrieve(fileToDownoald, os.path.join(pathToSaveDownloadedData, fileToDownoald.split('/')[-1]))
            nboffiledl = nboffiledl + 1

    return nboffiledl
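

# Alternative download helper (a sketch, not wired into the pipeline):
# urllib.request.urlretrieve offers no timeout control, so streaming the
# response with requests makes failures easier to handle. The timeout and
# chunk size below are arbitrary choices.
def downloadWithRequests(url, destDir):
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()  # fail loudly on HTTP errors
    destPath = os.path.join(destDir, url.split('/')[-1])
    with open(destPath, 'wb') as out:
        for chunk in response.iter_content(chunk_size=8192):
            out.write(chunk)
    return destPath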


if __name__ == '__main__':
    # Init variables
    dirname = os.path.dirname(__file__)
    inputCSV = os.path.join(dirname, '../input/datasources.csv')
    pathToSaveDownloadedData = os.path.join(dirname, '../output/data')
    pathToSaveDownloadedMeta = os.path.join(dirname, '../output/meta/meta.json')
    pathToSaveHDFSPath = os.path.join(dirname, '../output/hdfspath/hdfspath.csv')
    nboffiledl = 0

    namenode = "namenode" # hostname or IP address for HDFS cluster's namenode
    namenodePort = "9870"
    hdfsuser = "hadoop"
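    # Note (assumption about the target cluster): 9870 is the default namenode
    # HTTP/WebHDFS port in Hadoop 3.x; Hadoop 2.x clusters typically use 50070.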
    # end of init variables

    print("AIDMOIt ingestion module starts")

    """Get URL of data and metadata from 3M Opendata website"""
    opendata3mDataMetada = getUrlFromOpendata3M(inputCSV)
    jsonfile = open(pathToSaveDownloadedMeta, "w")
    jsonfile.write(json.dumps(opendata3mDataMetada))
    jsonfile.close()
    """Download File"""
    nboffiledl = downloadOpendata3MFiles(opendata3mDataMetada, pathToSaveDownloadedData)

    """Insert files inside HDFS and store file"""
    # connect to HDFS
    hdfs = PyWebHdfsClient(host=namenode, port=namenodePort, user_name=hdfsuser)
    for file in os.listdir(pathToSaveDownloadedData):
        if(str(file) != ".forgit"):
            try:
                pathInDL = "."
                file_data = str(file)
                hdfs.create_file(file_data, pathInDL)
            except Exception as e:
                print('Failed to upload in HDFS: '+ str(e))
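
    # Optional sanity check (a sketch): ask the namenode for an uploaded file's
    # status to confirm it landed; get_file_dir_status raises
    # pywebhdfs.errors.FileNotFound when the path does not exist, e.g.:
    #   status = hdfs.get_file_dir_status(someUploadedFileName)
    #   print(status['FileStatus']['length'])  # size in bytes on HDFS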

    """Build and insert iso19139 xml to geonetwork"""
    try:
        subprocess.call("/usr/bin/Rscript  addServicesToGN.R")
    except :
        print("R error due to OSM ? Try re-launched")
        subprocess.call("R -f addServicesToGN.R", shell=True)

    print(str(nboffiledl) + " files downloaded to: " + pathToSaveDownloadedData)
    print("AIDMOIt ingestion module ends")