From e8b6b8d12ef63e6ec82c05e8fa8fce909f7ddba3 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 29 Apr 2019 10:50:40 +0200
Subject: [PATCH] add geoname + hierarchy inclusion process + normalization of
 Wikidata ID

---
 .gitignore                                    |   6 +
 Readme.md                                     |   5 +
 config/configuration.json                     |   3 +-
 config/mappings.json                          |  12 ++
 config/mappingsv2.json                        |  97 +++++++++++++
 .../__pycache__/__init__.cpython-36.pyc       | Bin 145 -> 148 bytes
 .../basic_extraction.cpython-36.pyc           | Bin 2413 -> 2757 bytes
 .../class_extraction.cpython-36.pyc           | Bin 1936 -> 1939 bytes
 .../property_extract.cpython-36.pyc           | Bin 1592 -> 1595 bytes
 .../__pycache__/wiki_links.cpython-36.pyc     | Bin 1141 -> 1144 bytes
 custom_process/basic_extraction.py            |   3 +
 gazetteer.py                                  | 134 ++++++++++++------
 gazetteer2es.py                               |  65 ++++-----
 hierarchy.py                                  |  60 ++++++++
 utils.py                                      |  25 ++++
 wikidata/helpers.py                           |   1 +
 wikidata/property_wd.py                       |   6 +-
 wikidata/reader.py                            |   5 +
 wikidata/types_wd.py                          |   6 +-
 19 files changed, 349 insertions(+), 79 deletions(-)
 create mode 100644 config/mappingsv2.json
 create mode 100644 hierarchy.py
 create mode 100644 utils.py

diff --git a/.gitignore b/.gitignore
index 308a8d4..9166bcb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,9 @@ latest-all.json.gz
 temp/*
 custom_process/__pycache__/*
 out_final.json
+__pycache__
+temp
+geodict*
+out*
+.idea
+.DS_Store
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
index 2132e08..a48c1df 100644
--- a/Readme.md
+++ b/Readme.md
@@ -25,4 +25,9 @@ Simply run the command line
     $ python3 gazeteer2es.py [ES host if not localhost]
     
 
+## TODO
+
+* Add missing GeoNames entries (look into save.py)
+* Add a step that creates a unique ID for each Geodict entry
+
 **Gaurav Shrivastava, Jacques Fize @ 2017**
\ No newline at end of file
diff --git a/config/configuration.json b/config/configuration.json
index e14e1e9..076015f 100644
--- a/config/configuration.json
+++ b/config/configuration.json
@@ -1,7 +1,8 @@
 {
   "osm_boundaries_dir":"osm-boundaries",
+  "pre_dl_osm": "/Users/jacquesfize/install/osm-boundaries",
   "temp_dir":"temp",
-  "wikidata_dump":"latest-all.json.gz",
+  "wikidata_dump":"/Volumes/Sauvegarde/latest-all.json.gz",
   "lang_list":["en","fr","de","es"],
   "properties_to_extract":[
     {"id":"P47","isMultiple":true,"type":"EntityID","mappings":"keyword","mappings_details":{}},
diff --git a/config/mappings.json b/config/mappings.json
index 027778a..9d49384 100644
--- a/config/mappings.json
+++ b/config/mappings.json
@@ -58,6 +58,9 @@
                 "osmID": {
                     "type": "keyword"
                 },
+                "wikidataID": {
+                    "type": "keyword"
+                },
                 "path": {
                     "type": "keyword"
                 },
@@ -70,6 +73,15 @@
                 "P706": {
                     "type": "keyword"
                 },
+              "inc_P131": {
+                    "type": "keyword"
+                },
+              "inc_P706": {
+                    "type": "keyword"
+                },
+              "inc_geoname": {
+                    "type": "keyword"
+                },
                  "geometry": {
                     "type": "geo_shape",
                     "tree": "quadtree",
diff --git a/config/mappingsv2.json b/config/mappingsv2.json
new file mode 100644
index 0000000..36e6f95
--- /dev/null
+++ b/config/mappingsv2.json
@@ -0,0 +1,97 @@
+{
+    "mappings": {
+        "_default_": {
+            "properties": {
+                "de": {
+                    "type": "keyword"
+                },
+                "en": {
+                    "type": "keyword"
+                },
+                "es": {
+                    "type": "keyword"
+                },
+                "fr": {
+                    "type": "keyword"
+                },
+                "aliases": {
+                    "type": "nested",
+                    "properties": {
+                        "de": {
+                            "type": "keyword"
+                        },
+                        "en": {
+                            "type": "keyword"
+                        },
+                        "es": {
+                            "type": "keyword"
+                        },
+                        "fr": {
+                            "type": "keyword"
+                        }
+                    }
+                },
+                "instance_of": {
+                    "type": "keyword"
+                },
+                "coord": {
+                    "type": "geo_point"
+                },
+                "geonameID": {
+                    "type": "keyword"
+                },
+                "class": {
+                    "type": "keyword"
+                },
+                "id": {
+                    "type": "keyword"
+                },
+                "country": {
+                    "type": "keyword"
+                },
+                "continent": {
+                    "type": "keyword"
+                },
+                "score": {
+                    "type": "float"
+                },
+                "osmID": {
+                    "type": "keyword"
+                },
+                "wikidataID": {
+                    "type": "keyword"
+                },
+                "path": {
+                    "type": "keyword"
+                },
+                "P47": {
+                    "type": "keyword"
+                },
+                "share_border_with": {
+                    "type": "keyword"
+                },
+                "P131": {
+                    "type": "keyword"
+                },
+                "P706": {
+                    "type": "keyword"
+                },
+                "located_in_adm_terr_ent": {
+                    "type": "keyword"
+                },
+                "located_in_terr_feature": {
+                    "type": "keyword"
+                },
+                "inc_geoname": {
+                    "type": "keyword"
+                },
+                "geometry": {
+                    "type": "geo_shape",
+                    "tree": "quadtree",
+                    "precision": "100m"
+                }
+
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/custom_process/__pycache__/__init__.cpython-36.pyc b/custom_process/__pycache__/__init__.cpython-36.pyc
index 32b2ceac0dd176fee582e3e5d3deace746143615..c31ddfe621f9a7f7589d980786e78ce493b880b0 100644
GIT binary patch
delta 50
zcmbQpIE9hjn3tEU);MS)yM?@?erR!OQL%njVsc?=YH?a-RjR&wW=TL%epYI7N%6#J
FGXQc&5P<*y

delta 47
zcmbQjIFXUvn3tDp+D+$)>=rV1`WgATsrp%o$%Un<#rh?w#U=6SiB+j3si{R1<IDhC
C=@1_P

diff --git a/custom_process/__pycache__/basic_extraction.cpython-36.pyc b/custom_process/__pycache__/basic_extraction.cpython-36.pyc
index c1b722257898fa6ddc3fd047271d81092b6ac3d8..696cff8ebe4c37a5a81ddf6d79d1281e710accb3 100644
GIT binary patch
delta 872
zcmY*Y&rcIU6rMNTnf`Douu!oev<V5dMB9R>7ZNQ(2r7gK1|=A>>8^E`+QRMvg)HGv
zjfodc=4L#4@M=7J@ZbfbM-L4L<K2+>7x-o?jqc`snR(y$X5M_W`%C+rh+XdJh<$zb
zY3*nHee68IMUjD&W-x<4C9CW?QOGJ^BH6H5gbe%^bvR<05pRIFPBYLc%G5~h58kQi
zhE*-6y~k#Hu{b+(x43XSJzZQbrgOR6hcEORq~`MXT+?y$wqDxWHr+?ozL}r3yhW#G
zn<dW`LrQ`75JKD$v8)W!2#8%}Li+)P_{1hQ_|Sk?;2evO$`~Ar6J=oRj88F3;U9(b
zW={+Bs?_^*y<>$`&jFFI7Z-UpkovyYiwed5t1##!`A=i9!KfB!n5~MJEO8k3!GI)b
z0FVcyO7F}4I@=u;g4yY6%A{mNZFV7;ow6)oX`8JeOWU3BfbD9tFOcoB)uGl=jBr4A
zx9N9CcVml1-ZQM(Jq@zK(V%ab)mjZFiBF9vR#D9Lid{a`*<__!%`sM4&3!UA-4t*6
z2qeT8J|BygO>bSVSh_3v!kdXvc@<L#1mZ9boNAzk#Ov_VVJRfD_;3t{aTCK`L$Js}
zsD8`C%bF$+ZyYFP;)i@`h=P5ffg9XW>m#kPuOtbo&K@wSNumutGib}?t??DIrLM?o
zG=7p81V6ygLNhGeJ9JIA$#(S0wwVn%Io#ICBQ&|bxoK97EO&-w?uvv7NwDBhO{Jn&
z%T3sEGLi}4f*chPmN48n0s#e7h+t^Y5Btd}-L*<Hb<fdDo>i;n(4$DpB>W*x!lV2u
f);I;xAGy@oEf<lVM5x*q$%^I35#&TFIuG;@Ha*OY

delta 548
zcmY+BOG_L<6ov1tR8`wO_B?ttM0_nA92CQt7|6zjiU<mVDEOF-4%1CSi?+_Rh@p*!
zhGZ+;t$#pt6I{3#+{{K7{sch~T)1^*b*qh1aN*wXoT^h*7p?b}F=U#??!eU2y!qeg
zKrB!ZyN!Rqp)BU0PtYP3@s9SfL9?uYY1(G?=_C7&9TTOW-kMW#E7i>{r(W`%y1)FR
z^2_lZr$#e;Q04%jUZB6chmQF->ly(9aKXg@Lp;FR6+Q4TqgTERVVC(@zyegf$=6*Y
z<oHHgJJ_*eXDV_AY`J*>);}Rc%Qwjr?cI%?7Ny|si^68Z@J@3iI&~RqgqxyE(Yj{_
zXk#0Mh_D3fY%w}7on$M--Ro|AIHDgCIu>`YRK%1ebVX5?B#|-AB=onU88_ASd=*=Z
zaW=t+3TIWsKQiBwbFC0orFyD!HO)N-Zi9f?x^gi!%hQ?c%9KUb^J;ckc=ff(@iN`W
zUTmS1HceH{%NL_>7*Pc<jbBkRi_@(($ND6V=>0fE8~SH68;Q6oP1yXAPV@#2(6%v+
F_yiM)gpdFL

diff --git a/custom_process/__pycache__/class_extraction.cpython-36.pyc b/custom_process/__pycache__/class_extraction.cpython-36.pyc
index 80b54e87a17d6d781a3714190e7c09dddb421502..7b0e8984cc218fb24a251281dfabdbffed609a78 100644
GIT binary patch
delta 53
zcmbQhKbfE1n3tEU);MS*dk&Mlr+#R0YEiL%R$_8tX=-s=W>u=bduB;MQGQlxa!K*#
IekMj%0IMnyApigX

delta 50
zcmbQtKY^d!n3tDp+D+$;>^V#_Zu%MdxvBbDiOGefsm1ywsl_Gn>4{aTC8?=Jn<p_b
GvH}2;P!PWW

diff --git a/custom_process/__pycache__/property_extract.cpython-36.pyc b/custom_process/__pycache__/property_extract.cpython-36.pyc
index de7db971e619abc91937ff3e89f474013442c533..3a1cbbcf4f4c92d01d8e3736d6f759b25ac0e81b 100644
GIT binary patch
delta 53
zcmdnNvzv$An3tEU);MS*`y(cKPyNv1)S_bjti<HP($wO#%&Js<_so)jqWrAX<dWje
ItjxyD0LpI>hyVZp

delta 50
zcmdnZvxA4-n3tDpN2Aw9_D4)IZu%MdxvBbDiOGefsm1ywsl_Gn>4{aTC8?=Jn>m<`
FnE|O75cdE8

diff --git a/custom_process/__pycache__/wiki_links.cpython-36.pyc b/custom_process/__pycache__/wiki_links.cpython-36.pyc
index 6763cf3c0d9ad9053ea0cc91d166545483c417f6..4c1647e55e5c3f7e32394fa4ed56cbf842addf52 100644
GIT binary patch
delta 53
zcmey$@q>fin3tDpOX<pu>`xixo%KVDQ;UlAvl5dFOH+%}GOJSc-7`x9it@8klS_&>
Jb1>yH0RRMX69E7K

delta 50
zcmeyt@s)$!n3tEU{b#^N_NR<84*D7SxvBbDiOGefsm1ywsl_Gn>4{aTC8?=Jo4J{C
FnE=Yd5$yl~

diff --git a/custom_process/basic_extraction.py b/custom_process/basic_extraction.py
index 68f00ce..7969e3b 100644
--- a/custom_process/basic_extraction.py
+++ b/custom_process/basic_extraction.py
@@ -13,7 +13,9 @@ from wikidata.reader import Reader
 from wikidata.process_wd import *
 
 config=Configuration("config/configuration.json")
+
 class BasicExtraction(Process):
+
     def __init__(self, id, labels_fn,page_rank):
         super(BasicExtraction, Process.__init__(self, id))
         self.dataframe = {}
@@ -32,6 +34,7 @@ class BasicExtraction(Process):
         self.labels_list = json.load(f)
         f.close()
 
+        print("Loading the PAGERANK DATA ...")
         f = open(page_rank,encoding = 'utf-8')
         self.scores = json.load(f)
         f.close()
diff --git a/gazetteer.py b/gazetteer.py
index e16a7be..bf3bcf0 100644
--- a/gazetteer.py
+++ b/gazetteer.py
@@ -8,6 +8,8 @@ from custom_process.wiki_links import *
 from custom_process.class_extraction import *
 from custom_process.property_extract import *
 from gis.convex_hull import get_convex_hull
+from tqdm import tqdm
+from utils import wc_l
 
 __config=Configuration("config/configuration.json")
 
@@ -15,7 +17,7 @@ __config=Configuration("config/configuration.json")
 
 
 def temp(filename):
-    return os.path.join(__config.temp_dir,filename)
+    return os.path.join(__config.temp_dir, filename)
 
 def import_data():
 
@@ -30,25 +32,36 @@ def import_data():
 
     print("Downloading Geonames ...")
     filename=temp("allCountries.zip")
-    urllib.request.urlretrieve(
-        "http://download.geonames.org/export/dump/allCountries.zip",filename)
-    print("Geonames data retrieved !!")
-    print("Extracting the geonames data!")
-    zip_ref = zipfile.ZipFile(filename, 'r')
-    zip_ref.extractall("./{0}".format(__config.temp_dir))
-    print("Extracted !")
+    if not os.path.exists(temp("allCountries.txt")):
+        urllib.request.urlretrieve(
+            "http://download.geonames.org/export/dump/allCountries.zip",filename)
+        print("Geonames data retrieved !!")
+
+        print("Extracting the geonames data!")
+        zip_ref = zipfile.ZipFile(filename, 'r')
+        zip_ref.extractall("./{0}".format(__config.temp_dir))
+        print("Extracted !")
+
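+    # allCountries.txt is tab-separated; field 1 is the geonameID, field 2 the
+    # name, fields 7 and 8 the feature class and feature code (GeoNames dump format).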
     print("Extracting labels")
     os.system('cut -f 1,2 {0} > {1}'.format(temp("allCountries.txt"),temp("labels.txt")))
+
     print("Extracting the class")
     os.system('cut -f 1,7,8 {0} > {1}'.format(temp("allCountries.txt"),temp("class_codes.txt")))
+
+    size_label_txt=wc_l(temp("labels.txt"))
     f = open(temp("labels.txt"), encoding = 'utf-8')
     labels = {}
-    for line in f:
+    for line in tqdm(f,total=size_label_txt,desc="Create JSON containing labels for every GeonameID "):
         line = line.strip().split("\t")
         labels[line[0]] = line[1]
     f.close()
+
     open(temp("labels.json"), "w").write(json.dumps(labels))#, ensure_ascii=False))
-    os.system('git clone https://github.com/missinglink/osm-boundaries.git')
+
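+    # Reuse a local osm-boundaries checkout when "pre_dl_osm" is set in the
+    # configuration, otherwise clone the repository from GitHub.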
+    if not "pre_dl_osm" in config:
+        os.system('git clone https://github.com/missinglink/osm-boundaries.git')
+    else:
+        config["osm_boundaries_dir"]=config["pre_dl_osm"]
 
 
 def basic_gazetteer(outfile):
@@ -62,17 +75,16 @@ def basic_gazetteer(outfile):
     """
 
     if not os.path.isfile(os.path.join(__config.temp_dir,"labels.json")):
-        print("Give correct labels file name!!")
-        return False
+        raise FileNotFoundError("Give correct labels file name!!")
+
     if not os.path.isfile(__config.wikidata_dump):
-        print('Give correct path to wikidata json dump ')
-        return False
+        raise FileNotFoundError('Give correct path to wikidata json dump ')
 
-    proc1 = BasicExtraction(1,os.path.join(__config.temp_dir,"labels.json"),"resources/wd_page_rank.json")
-    dump = Reader(__config.wikidata_dump,'utf-8')
+    proc1 = BasicExtraction(1, os.path.join(__config.temp_dir, "labels.json"), "resources/wd_page_rank.json")
+    dump = Reader(__config.wikidata_dump, 'utf-8')
     controller = WDController(dump,proc1)
     controller.process_all()
-    open(outfile, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
+    open(outfile, 'w').write(json.dumps(proc1.dataframe))
     return True
 
 
@@ -92,7 +104,7 @@ def add_properties(input_gazetteer,output_gazetteer,configuration_file):
     dump = Reader(__config.wikidata_dump,'utf-8')
     controller = WDController(dump,proc1)
     controller.process_all()
-    open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
+    open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))
     return True
 
 
@@ -106,8 +118,8 @@ def extract_classes(gazeteer):
     :return:
     """
     if not os.path.isfile(__config.wikidata_dump):
-        print('Give correct path to wikidata json dump')
-        return None
+        raise FileNotFoundError('Give correct path to wikidata json dump')
+
     proc3 = ClassExtraction(1, os.path.join(__config.temp_dir,"class_codes.txt"), gazeteer)
     dump = Reader(__config.wikidata_dump, 'utf-8')
     controller = WDController(dump, proc3)
@@ -130,7 +142,7 @@ def add_classes(gazeteer,outfile):
     iterations = 0
     places = 0
     keys = set(data.keys())
-    for key in keys:
+    for key in tqdm(keys,desc="Add Classes"):
         iterations = iterations + 1
         temp_ = []
         if 'instance_of' in data[key].keys():
@@ -163,7 +175,7 @@ def extract_missing_WikiIDS(interm_outfile,outfile):
     iterations = 0
     output=open(interm_outfile,"w")
     total=len(paths)
-    output.write(json.dumps(finding_links(paths)))#,ensure_ascii=False))
+    output.write(json.dumps(finding_links(paths)))
     proc2 = WikipediaURI(2, outfile, interm_outfile)
     dump = Reader(__config.wikidata_dump, 'utf-8')
     controller = WDController(dump, proc2)
@@ -189,8 +201,8 @@ def missing_wikidata_IDS(missing_ids):
     df = read_tsv(os.path.join(__config.osm_boundaries_dir,'meta.tsv'),encoding = 'utf-8',columns = True)#'./osm-boundaries/meta.tsv'
     wikidata_IDs = []
     paths = [os.path.join(__config.osm_boundaries_dir,'data',path) for path in df['path']]
-    iterations = 0
-    for path in paths:
+    # iterations = 0
+    for path in tqdm(paths,desc="Browsing OSM data"):
         f = open(path,encoding = 'utf-8')
         dataframe = json.load(f)
         f.close()
@@ -207,9 +219,9 @@ def missing_wikidata_IDS(missing_ids):
                 wikidata_IDs.append(None)
         else:
             wikidata_IDs.append(None)
-        if iterations%1000 == 0:
-            sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
-        iterations = iterations + 1
+        # if iterations%1000 == 0:
+        #     sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
+        # iterations = iterations + 1
     df['Wiki_IDs'] = wikidata_IDs
     df.to_csv(temp('meta_all.csv'),index = False)#'temp/meta_all.csv'
 
@@ -230,11 +242,11 @@ def adding_geometry(infile,out_file,output_final_fn):
     Wiki_IDs = set(list(path_association.keys()))
     data = json.loads(open(out_file).read())
     outfile = open(output_final_fn, 'w')
-    iterations = 0
+    # iterations = 0
     places = 0
     keys = set(data.keys())
-    for key in keys:
-        iterations = iterations + 1
+    for key in tqdm(keys,desc="Browsing Geodict"):
+        # iterations = iterations + 1
         temp= data[key]
         temp["id"]=key
         if key in Wiki_IDs:
@@ -247,43 +259,85 @@ def adding_geometry(infile,out_file,output_final_fn):
         outfile.write(json.dumps(temp)+"\n")#,ensure_ascii=False
         del data[key]
 
-        if iterations % 100 == 0:
-            sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
-
+        # if iterations % 100 == 0:
+        #     sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
 
+def add_final_spatial_entities(input,output):
+    """
+    Add the missing GeoNames entries and build the Geodict IDs (work in progress).
+    :param input: Geodict dump produced by the previous step (one JSON document per line)
+    :param output: path of the extended dump to write
+    :return:
+    """
+    d_geo = {}
+    geonames_i = open(temp("allCountries.txt"))
+    for entry in geonames_i:
+        row = entry.split("\t")
+
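+        # 0-indexed allCountries.txt fields: 0=geonameID, 1=name, 3=alternate names,
+        # 4=latitude, 5=longitude, 6=feature class, 7=feature code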
+        d_geo[row[0]] = {lang: row[1] for lang in ["en", "fr", "es", "de"]}
+        d_geo[row[0]]["aliases"] = {lang: row[3].split(",") for lang in ["en", "fr", "es", "de"]}
+        d_geo[row[0]]["coord"] = {"lat": float(row[4]), "lon": float(row[5])}
+        d_geo[row[0]]["class"] = ["{0}-{1}".format(row[6], row[7])]
+
+    geoname_id_index = set(d_geo.keys())
+
+    already_in_geodict = set([])
+    for line in open(input):
+        data = json.loads(line.strip())
+        if "geonameID" in data:
+            already_in_geodict.add(data["geonameID"])
+
+    diff = geoname_id_index.difference(already_in_geodict)
+    prefix = "GD"
+    i = 1
+    output = open(output, 'w')
+    size_input = wc_l(input)
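+    # First pass: renumber the existing Geodict entries with a "GD<n>" ID and keep
+    # their Wikidata ID under "wikidataID". Second pass: append the GeoNames
+    # entries not covered by Geodict yet, continuing the same ID sequence.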
+    for line in tqdm(open(input),total=size_input,desc="Browsing Geodict"):
+        data = json.loads(line.strip())
+        data["wikidataID"] = data["id"]
+        data["id"] = prefix + str(i)
+        output.write(json.dumps(data) + "\n")
+        i += 1
+    for geo_id in tqdm(diff):
+        data = d_geo[geo_id]
+        data["id"] = prefix + str(i)
+        data["geonameID"] = geo_id
+        output.write(json.dumps(data) + "\n")
+        i += 1
 
 def main():
     start=time.time()
     if not os.path.exists(__config.temp_dir):
         os.makedirs(__config.temp_dir)
     # Import the data sources required to be harvested for creation of gazetteer
-    print("[1/6] Download required datasets...")
+    print("[1/7] Download required datasets...")
     import_data()
 
     # Create a first basic gazeteer
-    print("[2/6] Building the core gazetteer...")
+    print("[2/7] Building the core gazetteer...")
     basic_gazetteer(temp("1stoutput.json"))
 
     # Associate geonames classe to the instance_of(P31) values
-    print("[3/6] Associate a class to each entry...")
+    print("[3/7] Associate a class to each entry...")
     extract_classes(temp("1stoutput.json"))
-
     # Add class to each entity
     add_classes(temp("1stoutput.json"),temp("2ndoutput.json"))
 
     # Extract missing wikidata IDs in the boundary data
-    print("[4/6] Find missing WD ids within boundary data...")
+    print("[4/7] Find missing WD ids within boundary data...")
     extract_missing_WikiIDS(temp('found_missing_links.json'),temp('missing_Wikidata_IDS.txt'))
     missing_wikidata_IDS(temp('missing_Wikidata_IDS.txt'))
 
     # Adding properties from configuration_file
-    print("[5/6] Add user properties...")
+    print("[5/7] Add user properties...")
     add_properties(temp("2ndoutput.json"),temp("3rdoutput.json"),'config/configuration.json')
 
     # Add boundaries in the final data
-    print("[6/6] Adding adminstrative boundary/ies...")
+    print("[6/7] Adding adminstrative boundary/ies...")
     adding_geometry(temp("meta_all.csv"),temp("3rdoutput.json"),'out_final.json')
 
+    print("7/7")
+    add_final_spatial_entities("out_final.json","out_final_extended.json")
     print("The gazeteer was created in {0} hours".format(((time.time()-start)/60)/60))
 
 
diff --git a/gazetteer2es.py b/gazetteer2es.py
index f393a25..fd4fc11 100644
--- a/gazetteer2es.py
+++ b/gazetteer2es.py
@@ -2,74 +2,75 @@ import argparse, json, sys
 from elasticsearch import Elasticsearch,helpers
 from elasticsearch import helpers
 import copy
+import os
+from tqdm import tqdm
+from mytoolbox.text.size import wc_l
 
-def polygon_transformation4ES(temp,simple=True):
-    final = []
-    if simple:
-        final=copy.copy(temp)
-        final.append(temp[0])
-        final=final
-    else:
-        for i in temp:
-            t=copy.copy(i)
-            t.append(i[0])
-            final.append(t)
-    return final
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("input", help="give input json file")
     parser.add_argument("-e", "--es_host", help="Elasticsearch Host address", default="127.0.0.1")
+    parser.add_argument("-p", "--es_port", help="Elasticsearch Host port", default="9200")
     args = parser.parse_args()
+
+    if not os.path.exists(args.input):
+        raise FileNotFoundError("Input File '{0}' not found !".format(args.input))
+
     file_name = args.input
-    es_client = Elasticsearch(args.es_host)
+    es_client = Elasticsearch([{"host": args.es_host, "port": int(args.es_port)}])
+
     if not es_client.ping():
-        print("Can't connect to ES ! ")
-        sys.exit(1)
+        raise ConnectionError("Could not connect to Elasticserver at {0}".format(args.es_host))
+
+    # If the index already exists in the database, delete it
     if es_client.indices.exists(index="gazetteer"):
         es_client.indices.delete(index="gazetteer")
+
+    # Open input file
     gazetteer = open(file_name, encoding='utf-8')
-    i = 1
-    mappings = json.load(open("config/mappings.json"))
+
+    mappings = json.load(open("config/mappings.json"))
+    # Complete the mapping with the custom properties declared in the configuration
     property_to_be_mapped = json.load(open('config/configuration.json'))
     for prop in property_to_be_mapped["properties_to_extract"]:
         mappings['mappings']['_default_']['properties'][prop['id']] = {'type':prop["mappings"]}
         if prop["mappings_details"]:
             for k,v in prop["mappings_details"].items():
                 mappings['mappings']['_default_']['properties'][prop['id']][k]=v
-    print(mappings)
+    print("Mapping of Geodict index: ", mappings)
+
+    # Create the index in the Elasticsearch database
     es_client.indices.create(index="gazetteer", body=mappings)
     action_list=[]
-    for line in gazetteer:
+
+    number_of_entries = wc_l(file_name)
+
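+    # Buffer the documents in action_list and send them to Elasticsearch in
+    # bulk batches of 1,000.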
+    for line in tqdm(gazetteer,desc="Importing ...",total=number_of_entries):
         data = json.loads(line.strip())
         if '_score' in data.keys():
             data['score'] = data['_score']
             del data['_score']
         if "geometry" in data:
-            del data["geometry"]
+            del data["geometry"] # Difficult with ES ... so we delete it
         if "coord" in data:
-            if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:
-                i+=1
-                continue
-        if not data["fr"]:
-            i+=1
-            continue
-                #print("AFTER",data["geometry"])
-                #return
-        #es_client.index("gazetteer", "place", data)
+            data["coord"]["lat"]=float(data["coord"]["lat"])
+            data["coord"]["lon"]= float(data["coord"]["lon"])
+
+            if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:continue
+
+        if not data["fr"]:continue
+
         actions = {
         "_index": "gazetteer",
         "_type": "place",
         "_source": data
         }
-        #print(data["fr"])
         action_list.append(actions)
-        if i % 1000 == 0:
-            #print(action_list)
+        if len(action_list) % 1000 == 0:
             helpers.bulk(es_client,action_list,request_timeout=30)
             sys.stdout.write("\rEntity transferred: " + '{:,}'.format(i))
             action_list = []
-        i += 1
+
+    # Flush the remaining documents (fewer than 1,000) once the loop ends
+    if action_list:
+        helpers.bulk(es_client, action_list, request_timeout=30)
 
 
 if __name__ == '__main__':
diff --git a/hierarchy.py b/hierarchy.py
new file mode 100644
index 0000000..1e33428
--- /dev/null
+++ b/hierarchy.py
@@ -0,0 +1,60 @@
+import pandas as pd
+from tqdm import tqdm
+
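+# hierarchy.txt (presumably the GeoNames hierarchy dump): one
+# "parentId<TAB>childId<TAB>type" row per inclusion relation.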
+df = pd.read_csv("hierarchy.txt",sep="\t",header=None,names="parentId childId type".split())
+
+ids = df.parentId.values.tolist()
+ids.extend(df.childId.values.tolist())
+ids = list(set(ids))
+
+inclusion_relations_ = dict(df["childId parentId".split()].values)
+
+
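+# For every child, walk the parent links upwards to build its full chain of
+# ancestors; the chain already computed for a parent is reused when available.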
+inc_dict_geonames = {} 
+for childId,parentId in tqdm(inclusion_relations_.items()):
+    if not childId in inc_dict_geonames:
+        inc_dict_geonames[childId] = [parentId]
+        if parentId in inc_dict_geonames:
+            inc_dict_geonames[childId].extend(inc_dict_geonames[parentId])
+        else:
+            B = parentId
+            while 1:
+                if B in inclusion_relations_:
+                    inc_dict_geonames[childId].append(inclusion_relations_[B])
+                    B = inclusion_relations_[B]
+                else:
+                    break
+            inc_dict_geonames[parentId] = inc_dict_geonames[childId][1:]
+            
+import json
+path="out_final_extended.json"
+geonames2GD,wikidata2GD = {}, {}
+
+from mytoolbox.text.size import wc_l
+
+size_data = wc_l(path)
+
+for line in tqdm(open(path),total=size_data):
+    data = json.loads(line.strip("\n,"))
+    if "geonameID" in data:
+        geonames2GD[data["geonameID"]]=data["id"]
+    if "wikidataID" in data:
+        wikidata2GD[data["wikidataID"]]=data["id"]
+
+output_path = "geodict_final_29_04_19.json"
+
+output = open(output_path,'w')
+
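+# Rename the Wikidata property columns to explicit names and translate their
+# values (Wikidata IDs) into Geodict IDs; attach the GeoNames ancestor chain
+# under "geoname_hierarchy" when the entry has a geonameID.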
+name_col = {"P131":"located_in_adm_terr_ent",
+            "P706":"located_in_terr_feature",
+            "P47":"share_border_with"}
+
+for line in tqdm(open(path),total=size_data):
+    data = json.loads(line.strip("\n,"))
+    for property_ in ["P131","P706","P47"]:
+        if not property_ in data:
+            continue
+        data[name_col[property_]] = [wikidata2GD[id_] for id_ in data[property_] if id_ in wikidata2GD]
+    if "geonameID" in data and data["geonameID"] in inc_dict_geonames:
+        data["geoname_hierarchy"] = inc_dict_geonames[data["geonameID"]]
+    output.write("{0}\n,".format(json.dumps(data)))
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..9ef9562
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,25 @@
+# coding = utf-8
+import os
+
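+# Yield the file content in fixed-size chunks so wc_l can count newlines
+# without loading the whole file into memory.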
+def blocks(files, size=65536):
+    while True:
+        b = files.read(size)
+        if not b: break
+        yield b
+
+def wc_l(text_input_fn):
+    """
+    Count the number of lines in a file
+
+    Parameters
+    ----------
+    text_input_fn : str
+        filepath
+
+    """
+    if not os.path.exists(text_input_fn):
+        raise FileNotFoundError("{0} does not exists !".format(text_input_fn))
+
+    with open(text_input_fn, "r", encoding="utf-8", errors='ignore') as f:
+        return sum(bl.count("\n") for bl in blocks(f))
+
diff --git a/wikidata/helpers.py b/wikidata/helpers.py
index 05040f1..623a062 100644
--- a/wikidata/helpers.py
+++ b/wikidata/helpers.py
@@ -62,6 +62,7 @@ def read_Tsv(filename,encoding='ascii'):
     column = text[0]
     del text[0]
     return pd.DataFrame(text,columns = column)
+
 #finding the missing link for wikipedia pages for which wikidata_IDs are not available
 def finding_links(files):
     missing_uri=[]
diff --git a/wikidata/property_wd.py b/wikidata/property_wd.py
index d75243e..6181cf8 100644
--- a/wikidata/property_wd.py
+++ b/wikidata/property_wd.py
@@ -2,9 +2,9 @@
 
 class Property(object):
     """docstring for property."""
-    def __init__(self, id,isMultiple,type_):
+    def __init__(self, id, isMultiple, type_):
         self.id=id
-        self.isMultiple=isMultiple
+        self.isMultiple = isMultiple
         self.type=type_
 
     def exists(self,data):
@@ -14,4 +14,4 @@ class Property(object):
         return False
 
     def extractData(self,data):
-        return self.type.extractData(self.id,self.isMultiple,data)
+        return self.type.extractData(self.id, self.isMultiple, data)
diff --git a/wikidata/reader.py b/wikidata/reader.py
index cc6516d..56ea181 100644
--- a/wikidata/reader.py
+++ b/wikidata/reader.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 from gzip import GzipFile
 import json
+
+from utils import wc_l
+
+
 class Reader(object):
     """docstring for Reader."""
     def __init__(self, name, decoding):
@@ -9,6 +13,7 @@ class Reader(object):
         self.decoding = decoding
         self.dump = GzipFile(name,'r')
         self.line = self.dump.readline()
+        self.size_file = wc_l(name)
 
     def has_next(self):
         self.line = self.dump.readline().decode(self.decoding)
diff --git a/wikidata/types_wd.py b/wikidata/types_wd.py
index 42332e4..4564266 100644
--- a/wikidata/types_wd.py
+++ b/wikidata/types_wd.py
@@ -125,15 +125,15 @@ class Time(Type):
     def extractMultiple(self, propID, data):
         result = []
         for i in range(len(data['claims'][propID])):
-            result.append(parsedate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
+            result.append(parseDate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
         return result
 
     def extractSingle(self, propID, data):
-        return parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
+        return parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
 
     def check_conformity(self, propID, data):
         try:
-            parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
+            parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
             return True
         except Exception as e:
             return False
-- 
GitLab