Commit e8b6b8d1 authored by Fize Jacques's avatar Fize Jacques

add geoname + hierarchy inclusion process + normalization of Wikidata ID

parent 4d2ee963
Showing with 349 additions and 79 deletions
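Concretely, the "normalization of Wikidata ID" mentioned in the commit message (see add_final_spatial_entities further down) moves each entry's Wikidata identifier into a dedicated wikidataID field and replaces the primary id with a sequential Geodict identifier; a minimal sketch of that step:

# Sketch of the ID normalization performed in add_final_spatial_entities().
entry = {"id": "Q90", "en": "Paris"}   # Wikidata ID previously used as the primary key
entry["wikidataID"] = entry["id"]      # keep the Wikidata identifier in its own field
entry["id"] = "GD" + str(1)            # sequential Geodict identifier, e.g. GD1
# -> {"id": "GD1", "en": "Paris", "wikidataID": "Q90"}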
......@@ -7,3 +7,9 @@ latest-all.json.gz
temp/*
custom_process/__pycache__/*
out_final.json
__pycache__
temp
geodict*
out*
.idea
.DS_Store
\ No newline at end of file
......@@ -25,4 +25,9 @@ Simply run the command line
$ python3 gazeteer2es.py [ES host if not localhost]
## TODO
* Add missing Geonames entries (look into save.py)
* Add a step that creates a unique ID for each Geodict entry
**Gaurav Shrivastava, Jacques Fize @ 2017**
\ No newline at end of file
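Note that gazeteer2es.py (shown further down in this commit) takes the gazetteer file as a positional argument and the Elasticsearch host and port through options, so a full invocation presumably looks like:

$ python3 gazeteer2es.py out_final_extended.json -e 127.0.0.1 -p 9200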
{
"osm_boundaries_dir":"osm-boundaries",
"pre_dl_osm": "/Users/jacquesfize/install/osm-boundaries",
"temp_dir":"temp",
"wikidata_dump":"latest-all.json.gz",
"wikidata_dump":"/Volumes/Sauvegarde/latest-all.json.gz",
"lang_list":["en","fr","de","es"],
"properties_to_extract":[
{"id":"P47","isMultiple":true,"type":"EntityID","mappings":"keyword","mappings_details":{}},
......
......@@ -58,6 +58,9 @@
"osmID": {
"type": "keyword"
},
"wikidataID": {
"type": "keyword"
},
"path": {
"type": "keyword"
},
......@@ -70,6 +73,15 @@
"P706": {
"type": "keyword"
},
"inc_P131": {
"type": "keyword"
},
"inc_P706": {
"type": "keyword"
},
"inc_geoname": {
"type": "keyword"
},
"geometry": {
"type": "geo_shape",
"tree": "quadtree",
......
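Assuming the new inc_* keyword fields hold the identifiers of the entities an entry is (transitively) included in, which is what the hierarchy/inclusion code added later in this commit computes, descendants of a given place could then be fetched with a plain term query. A minimal sketch using the Python client already used by gazeteer2es.py, with a hypothetical ancestor identifier:

# Hypothetical query: entries whose Geonames inclusion chain contains a given ancestor.
from elasticsearch import Elasticsearch

es = Elasticsearch("127.0.0.1")
hits = es.search(index="gazetteer",
                 body={"query": {"term": {"inc_geoname": "3017382"}}})
print(hits["hits"]["total"])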
{
"mappings": {
"_default_": {
"properties": {
"de": {
"type": "keyword"
},
"en": {
"type": "keyword"
},
"es": {
"type": "keyword"
},
"fr": {
"type": "keyword"
},
"aliases": {
"type": "nested",
"properties": {
"de": {
"type": "keyword"
},
"en": {
"type": "keyword"
},
"es": {
"type": "keyword"
},
"fr": {
"type": "keyword"
}
}
},
"instance_of": {
"type": "keyword"
},
"coord": {
"type": "geo_point"
},
"geonameID": {
"type": "keyword"
},
"class": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"country": {
"type": "keyword"
},
"continent": {
"type": "keyword"
},
"score": {
"type": "float"
},
"osmID": {
"type": "keyword"
},
"wikidataID": {
"type": "keyword"
},
"path": {
"type": "keyword"
},
"P47": {
"type": "keyword"
},
"share_border_with": {
"type": "keyword"
},
"P131": {
"type": "keyword"
},
"P706": {
"type": "keyword"
},
"located_in_adm_terr_ent": {
"type": "keyword"
},
"located_in_terr_feature": {
"type": "keyword"
},
"inc_geoname": {
"type": "keyword"
},
"geometry": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "100m"
}
}
}
}
}
\ No newline at end of file
......@@ -13,7 +13,9 @@ from wikidata.reader import Reader
from wikidata.process_wd import *
config=Configuration("config/configuration.json")
class BasicExtraction(Process):
def __init__(self, id, labels_fn,page_rank):
super(BasicExtraction, self).__init__(id)
self.dataframe = {}
......@@ -32,6 +34,7 @@ class BasicExtraction(Process):
self.labels_list = json.load(f)
f.close()
print("Loading the PAGERANK DATA ...")
f = open(page_rank,encoding = 'utf-8')
self.scores = json.load(f)
f.close()
......
......@@ -8,6 +8,8 @@ from custom_process.wiki_links import *
from custom_process.class_extraction import *
from custom_process.property_extract import *
from gis.convex_hull import get_convex_hull
from tqdm import tqdm
from utils import wc_l
__config=Configuration("config/configuration.json")
......@@ -15,7 +17,7 @@ __config=Configuration("config/configuration.json")
def temp(filename):
return os.path.join(__config.temp_dir,filename)
return os.path.join(__config.temp_dir, filename)
def import_data():
......@@ -30,25 +32,36 @@ def import_data():
print("Downloading Geonames ...")
filename=temp("allCountries.zip")
urllib.request.urlretrieve(
"http://download.geonames.org/export/dump/allCountries.zip",filename)
print("Geonames data retrieved !!")
print("Extracting the geonames data!")
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall("./{0}".format(__config.temp_dir))
print("Extracted !")
if not os.path.exists(temp("allCountries.txt")):
urllib.request.urlretrieve(
"http://download.geonames.org/export/dump/allCountries.zip",filename)
print("Geonames data retrieved !!")
print("Extracting the geonames data!")
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall("./{0}".format(__config.temp_dir))
print("Extracted !")
print("Extracting labels")
os.system('cut -f 1,2 {0} > {1}'.format(temp("allCountries.txt"),temp("labels.txt")))
print("Extracting the class")
os.system('cut -f 1,7,8 {0} > {1}'.format(temp("allCountries.txt"),temp("class_codes.txt")))
size_label_txt=wc_l(temp("labels.txt"))
f = open(temp("labels.txt"), encoding = 'utf-8')
labels = {}
for line in f:
for line in tqdm(f,total=size_label_txt,desc="Create JSON containing labels for every GeonameID "):
line = line.strip().split("\t")
labels[line[0]] = line[1]
f.close()
open(temp("labels.json"), "w").write(json.dumps(labels))#, ensure_ascii=False))
os.system('git clone https://github.com/missinglink/osm-boundaries.git')
if not "pre_dl_osm" in config:
os.system('git clone https://github.com/missinglink/osm-boundaries.git')
else:
config["osm_boundaries_dir"]=config["pre_dl_osm"]
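For reference, the two cut commands above rely on the tab-separated layout of the Geonames allCountries.txt dump; the columns this pipeline actually reads can be sketched as follows (made-up, truncated row):

# Made-up row in the Geonames allCountries.txt layout (truncated to eight columns).
row = "2988507\tParis\tParis\tLutece,Parigi\t48.85\t2.35\tP\tPPLC".split("\t")
geoname_id, name = row[0], row[1]           # fields kept by `cut -f 1,2`  -> labels.txt
aliases = row[3].split(",")                 # alternate names, reused in add_final_spatial_entities
lat, lon = float(row[4]), float(row[5])     # coordinates
feature = "{0}-{1}".format(row[6], row[7])  # fields kept by `cut -f 1,7,8` -> class_codes.txt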
def basic_gazetteer(outfile):
......@@ -62,17 +75,16 @@ def basic_gazetteer(outfile):
"""
if not os.path.isfile(os.path.join(__config.temp_dir,"labels.json")):
print("Give correct labels file name!!")
return False
raise FileNotFoundError("Give correct labels file name!!")
if not os.path.isfile(__config.wikidata_dump):
print('Give correct path to wikidata json dump ')
return False
raise FileNotFoundError('Give correct path to wikidata json dump ')
proc1 = BasicExtraction(1,os.path.join(__config.temp_dir,"labels.json"),"resources/wd_page_rank.json")
dump = Reader(__config.wikidata_dump,'utf-8')
proc1 = BasicExtraction(1, os.path.join(__config.temp_dir, "labels.json"), "resources/wd_page_rank.json")
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump,proc1)
controller.process_all()
open(outfile, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
open(outfile, 'w').write(json.dumps(proc1.dataframe))
return True
......@@ -92,7 +104,7 @@ def add_properties(input_gazetteer,output_gazetteer,configuration_file):
dump = Reader(__config.wikidata_dump,'utf-8')
controller = WDController(dump,proc1)
controller.process_all()
open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))
return True
......@@ -106,8 +118,8 @@ def extract_classes(gazeteer):
:return:
"""
if not os.path.isfile(__config.wikidata_dump):
print('Give correct path to wikidata json dump')
return None
raise FileNotFoundError('Give correct path to wikidata json dump')
proc3 = ClassExtraction(1, os.path.join(__config.temp_dir,"class_codes.txt"), gazeteer)
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump, proc3)
......@@ -130,7 +142,7 @@ def add_classes(gazeteer,outfile):
iterations = 0
places = 0
keys = set(data.keys())
for key in keys:
for key in tqdm(keys,desc="Add Classes"):
iterations = iterations + 1
temp_ = []
if 'instance_of' in data[key].keys():
......@@ -163,7 +175,7 @@ def extract_missing_WikiIDS(interm_outfile,outfile):
iterations = 0
output=open(interm_outfile,"w")
total=len(paths)
output.write(json.dumps(finding_links(paths)))#,ensure_ascii=False))
output.write(json.dumps(finding_links(paths)))
proc2 = WikipediaURI(2, outfile, interm_outfile)
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump, proc2)
......@@ -189,8 +201,8 @@ def missing_wikidata_IDS(missing_ids):
df = read_tsv(os.path.join(__config.osm_boundaries_dir,'meta.tsv'),encoding = 'utf-8',columns = True)#'./osm-boundaries/meta.tsv'
wikidata_IDs = []
paths = [os.path.join(__config.osm_boundaries_dir,'data',path) for path in df['path']]
iterations = 0
for path in paths:
# iterations = 0
for path in tqdm(paths,desc="Browsing OSM data"):
f = open(path,encoding = 'utf-8')
dataframe = json.load(f)
f.close()
......@@ -207,9 +219,9 @@ def missing_wikidata_IDS(missing_ids):
wikidata_IDs.append(None)
else:
wikidata_IDs.append(None)
if iterations%1000 == 0:
sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
iterations = iterations + 1
# if iterations%1000 == 0:
# sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
# iterations = iterations + 1
df['Wiki_IDs'] = wikidata_IDs
df.to_csv(temp('meta_all.csv'),index = False)#'temp/meta_all.csv'
......@@ -230,11 +242,11 @@ def adding_geometry(infile,out_file,output_final_fn):
Wiki_IDs = set(list(path_association.keys()))
data = json.loads(open(out_file).read())
outfile = open(output_final_fn, 'w')
iterations = 0
# iterations = 0
places = 0
keys = set(data.keys())
for key in keys:
iterations = iterations + 1
for key in tqdm(keys,desc="Browsing Geodict"):
# iterations = iterations + 1
temp= data[key]
temp["id"]=key
if key in Wiki_IDs:
......@@ -247,43 +259,85 @@ def adding_geometry(infile,out_file,output_final_fn):
outfile.write(json.dumps(temp)+"\n")#,ensure_ascii=False
del data[key]
if iterations % 100 == 0:
sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
# if iterations % 100 == 0:
# sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
def add_final_spatial_entities(input,output):
"""
Add missing Geonames entries and build Geodict IDs (work in progress)
:param input:
:param output:
:return:
"""
d_geo = {}
geonames_i = open(temp("allCountries.txt"))
for entry in geonames_i:
row = entry.split("\t")
d_geo[row[0]] = {lang: row[1] for lang in ["en", "fr", "es", "de"]}
d_geo[row[0]]["aliases"] = {lang: row[3].split(",") for lang in ["en", "fr", "es", "de"]}
d_geo[row[0]]["coord"] = {"lat": float(row[4]), "lon": float(row[5])}
d_geo[row[0]]["class"] = ["{0}-{1}".format(row[6], row[7])]
geoname_id_index = set(d_geo.keys())
already_in_geodict = set([])
for line in open(input):
data = json.loads(line.strip())
if "geonameID" in data:
already_in_geodict.add(data["geonameID"])
diff = geoname_id_index.difference(already_in_geodict)
prefix = "GD"
i = 1
output = open(output, 'w')
size_input = wc_l(input)
for line in tqdm(open(input),total=size_input,desc="Browsing Geodict"):
data = json.loads(line.strip())
data["wikidataID"] = data["id"]
data["id"] = prefix + str(i)
output.write(json.dumps(data) + "\n")
i += 1
for geo_id in tqdm(diff):
data = d_geo[geo_id]
data["id"] = prefix + str(i)
data["geonameID"] = geo_id
output.write(json.dumps(data) + "\n")
i += 1
def main():
start=time.time()
if not os.path.exists(__config.temp_dir):
os.makedirs(__config.temp_dir)
# Import the data sources required to be harvested for creation of gazetteer
print("[1/6] Download required datasets...")
print("[1/7] Download required datasets...")
import_data()
# Create a first basic gazeteer
print("[2/6] Building the core gazetteer...")
print("[2/7] Building the core gazetteer...")
basic_gazetteer(temp("1stoutput.json"))
# Associate a Geonames class to the instance_of (P31) values
print("[3/6] Associate a class to each entry...")
print("[3/7] Associate a class to each entry...")
extract_classes(temp("1stoutput.json"))
# Add class to each entity
add_classes(temp("1stoutput.json"),temp("2ndoutput.json"))
# Extract missing wikidata IDs in the boundary data
print("[4/6] Find missing WD ids within boundary data...")
print("[4/7] Find missing WD ids within boundary data...")
extract_missing_WikiIDS(temp('found_missing_links.json'),temp('missing_Wikidata_IDS.txt'))
missing_wikidata_IDS(temp('missing_Wikidata_IDS.txt'))
# Adding properties from configuration_file
print("[5/6] Add user properties...")
print("[5/7] Add user properties...")
add_properties(temp("2ndoutput.json"),temp("3rdoutput.json"),'config/configuration.json')
# Add boundaries in the final data
print("[6/6] Adding adminstrative boundary/ies...")
print("[6/7] Adding adminstrative boundary/ies...")
adding_geometry(temp("meta_all.csv"),temp("3rdoutput.json"),'out_final.json')
print("[7/7] Add missing Geonames entries and assign Geodict IDs...")
add_final_spatial_entities("out_final.json","out_final_extended.json")
print("The gazetteer was created in {0} hours".format(((time.time()-start)/60)/60))
......
......@@ -2,74 +2,75 @@ import argparse, json, sys
from elasticsearch import Elasticsearch,helpers
from elasticsearch import helpers
import copy
from tqdm import tqdm
from mytoolbox.text.size import wc_l
def polygon_transformation4ES(temp,simple=True):
final = []
if simple:
final=copy.copy(temp)
final.append(temp[0])
final=final
else:
for i in temp:
t=copy.copy(i)
t.append(i[0])
final.append(t)
return final
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input", help="give input json file")
parser.add_argument("-e", "--es_host", help="Elasticsearch Host address", default="127.0.0.1")
parser.add_argument("-p", "--es_port", help="Elasticsearch Host port", default="9200")
args = parser.parse_args()
if not os.path.exists(args.input):
raise FileNotFoundError("Input File '{0}' not found !".format(args.input))
file_name = args.input
es_client = Elasticsearch(args.es_host)
if not es_client.ping():
print("Can't connect to ES ! ")
sys.exit(1)
raise ConnectionError("Could not connect to the Elasticsearch server at {0}".format(args.es_host))
# If the index already exists in the database, delete it!
if es_client.indices.exists(index="gazetteer"):
es_client.indices.delete(index="gazetteer")
# Open input file
gazetteer = open(file_name, encoding='utf-8')
i = 1
mappings = json.load(open("config/mappings.json"))
mappings = json.load(open("config/mappings.json"))
# complete Mapping depending on custom properties extracted
property_to_be_mapped = json.load(open('config/configuration.json'))
for prop in property_to_be_mapped["properties_to_extract"]:
mappings['mappings']['_default_']['properties'][prop['id']] = {'type':prop["mappings"]}
if prop["mappings_details"]:
for k,v in prop["mappings_details"].items():
mappings['mappings']['_default_']['properties'][prop['id']][k]=v
print(mappings)
print("Mapping of Geodict index: ", mappings)
# Create the index in the Elasticsearch database
es_client.indices.create(index="gazetteer", body=mappings)
action_list=[]
for line in gazetteer:
number_of_entries = wc_l(file_name)
for line in tqdm(gazetteer,desc="Importing ...",total=number_of_entries):
data = json.loads(line.strip())
if '_score' in data.keys():
data['score'] = data['_score']
del data['_score']
if "geometry" in data:
del data["geometry"]
del data["geometry"] # Difficult with ES ... so we delete it
if "coord" in data:
if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:
i+=1
continue
if not data["fr"]:
i+=1
continue
#print("AFTER",data["geometry"])
#return
#es_client.index("gazetteer", "place", data)
data["coord"]["lat"]=float(data["coord"]["lat"])
data["coord"]["lon"]= float(data["coord"]["lon"])
if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:continue
if not data["fr"]:continue
actions = {
"_index": "gazetteer",
"_type": "place",
"_source": data
}
#print(data["fr"])
action_list.append(actions)
if i % 1000 == 0:
#print(action_list)
if len(action_list) % 1000 == 0:
helpers.bulk(es_client,action_list,request_timeout=30)
sys.stdout.write("\rEntity transferred: " + '{:,}'.format(i))
action_list = []
i += 1
if __name__ == '__main__':
......
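One detail about the bulk loop above: the actions still in action_list when the input file ends (fewer than 1000) are never sent. A minimal, hypothetical trailing flush reusing the same names would be:

# Hypothetical trailing flush for the last, partial batch of actions.
if action_list:
    helpers.bulk(es_client, action_list, request_timeout=30)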
hierarchy.py 0 → 100644
import pandas as pd
from tqdm import tqdm
df = pd.read_csv("hierarchy.txt",sep="\t",header=None,names="parentId childId type".split())
ids = df.parentId.values.tolist()
ids.extend(df.childId.values.tolist())
ids = list(set(ids))
inclusion_relations_ = dict(df["childId parentId".split()].values)
inc_dict_geonames = {}
for childId,parentId in tqdm(inclusion_relations_.items()):
if not childId in inc_dict_geonames:
inc_dict_geonames[childId] = [parentId]
if parentId in inc_dict_geonames:
inc_dict_geonames[childId].extend(inc_dict_geonames[parentId])
else:
B = parentId
while 1:
if B in inclusion_relations_:
inc_dict_geonames[childId].append(inclusion_relations_[B])
B = inclusion_relations_[B]
else:
break
inc_dict_geonames[parentId] = inc_dict_geonames[childId][1:]
import json
path="out_final_extended.json"
geonames2GD,wikidata2GD = {}, {}
from mytoolbox.text.size import wc_l
size_data = wc_l(path)
for line in tqdm(open(path),total=size_data):
data = json.loads(line.strip("\n,"))
if "geonameID" in data:
geonames2GD[data["geonameID"]]=data["id"]
if "wikidataID" in data:
wikidata2GD[data["wikidataID"]]=data["id"]
output_path = "geodict_final_29_04_19.json"
output = open(output_path,'w')
name_col = {"P131":"located_in_adm_terr_ent",
"P706":"located_in_terr_feature",
"P47":"share_border_with"}
for line in tqdm(open(path),total=size_data):
data = json.loads(line.strip("\n,"))
for property_ in ["P131","P706","P47"]:
if not property_ in data:
continue
data[name_col[property_]] = [wikidata2GD[id_] for id_ in data[property_] if id_ in wikidata2GD]
if "geonameID" in data and data["geonameID"] in inc_dict_geonames:
data["geoname_hierarchy"] = inc_dict_geonames[data["geonameID"]]
output.write("{0}\n,".format(json.dumps(data)))
\ No newline at end of file
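To make the loop above concrete, here is a minimal sketch of the transitive closure it builds, using made-up Geonames IDs (1 contains 2, 2 contains 3):

# Minimal sketch of the inclusion closure computed above (illustrative IDs only).
parent_of = {3: 2, 2: 1}            # childId -> parentId, like inclusion_relations_
closure = {}
for child, parent in parent_of.items():
    chain = [parent]
    while chain[-1] in parent_of:   # walk up the hierarchy until a root is reached
        chain.append(parent_of[chain[-1]])
    closure[child] = chain
print(closure)                      # {3: [2, 1], 2: [1]} -> every ancestor of each entity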
utils.py 0 → 100644
# coding = utf-8
import os
def blocks(files, size=65536):
while True:
b = files.read(size)
if not b: break
yield b
def wc_l(text_input_fn):
"""
Count the number of lines in a file
Parameters
----------
text_input_fn : str
filepath
"""
if not os.path.exists(text_input_fn):
raise FileNotFoundError("{0} does not exist!".format(text_input_fn))
with open(text_input_fn, "r", encoding="utf-8", errors='ignore') as f:
return sum(bl.count("\n") for bl in blocks(f))
......@@ -62,6 +62,7 @@ def read_Tsv(filename,encoding='ascii'):
column = text[0]
del text[0]
return pd.DataFrame(text,columns = column)
# Find the missing links for Wikipedia pages whose wikidata_IDs are not available
def finding_links(files):
missing_uri=[]
......
......@@ -2,9 +2,9 @@
class Property(object):
"""docstring for property."""
def __init__(self, id,isMultiple,type_):
def __init__(self, id, isMultiple, type_):
self.id=id
self.isMultiple=isMultiple
self.isMultiple = isMultiple
self.type=type_
def exists(self,data):
......@@ -14,4 +14,4 @@ class Property(object):
return False
def extractData(self,data):
return self.type.extractData(self.id,self.isMultiple,data)
return self.type.extractData(self.id, self.isMultiple, data)
# coding=utf-8
from gzip import GzipFile
import json
from utils import wc_l
class Reader(object):
"""docstring for Reader."""
def __init__(self, name, decoding):
......@@ -9,6 +13,7 @@ class Reader(object):
self.decoding = decoding
self.dump = GzipFile(name,'r')
self.line = self.dump.readline()
self.size_file = wc_l(name)
def has_next(self):
self.line = self.dump.readline().decode(self.decoding)
......
......@@ -125,15 +125,15 @@ class Time(Type):
def extractMultiple(self, propID, data):
result = []
for i in range(len(data['claims'][propID])):
result.append(parsedate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
result.append(parseDate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
return result
def extractSingle(self, propID, data):
return parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
return parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
def check_conformity(self, propID, data):
try:
parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
return True
except Exception as e:
return False
......