diff --git a/.gitignore b/.gitignore index 308a8d4ece879df5bb3c154d77db471faba7ea9e..9166bcb9475f6e2fa67a0cab977517784f3d686a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,9 @@ latest-all.json.gz temp/* custom_process/__pycache__/* out_final.json +__pycache__ +temp +geodict* +out* +.idea +.DS_Store \ No newline at end of file diff --git a/Readme.md b/Readme.md index 2132e088aa5e33e346ae6468a2a490ed3e6b0542..a48c1df75e444a0ae73b732fe7938ae0680bce33 100644 --- a/Readme.md +++ b/Readme.md @@ -25,4 +25,9 @@ Simply run the command line $ python3 gazeteer2es.py [ES host if not localhost] +## TODO + +* Add geonames missing entry (look into save.py) +* Add a step that create unique ID for Geodict + **Gaurav Shrivastava, Jacques Fize @ 2017** \ No newline at end of file diff --git a/config/configuration.json b/config/configuration.json index e14e1e9a3a461639887b9c8e6fd82684c9fdd965..076015f3e65056a9ea1eb72c3ed5524d97481c31 100644 --- a/config/configuration.json +++ b/config/configuration.json @@ -1,7 +1,8 @@ { "osm_boundaries_dir":"osm-boundaries", + "pre_dl_osm": "/Users/jacquesfize/install/osm-boundaries", "temp_dir":"temp", - "wikidata_dump":"latest-all.json.gz", + "wikidata_dump":"/Volumes/Sauvegarde/latest-all.json.gz", "lang_list":["en","fr","de","es"], "properties_to_extract":[ {"id":"P47","isMultiple":true,"type":"EntityID","mappings":"keyword","mappings_details":{}}, diff --git a/config/mappings.json b/config/mappings.json index 027778a38f5fa53f27caf3a6fcc20903a94df5a7..9d493847220e620efe127d139c78255cc839050b 100644 --- a/config/mappings.json +++ b/config/mappings.json @@ -58,6 +58,9 @@ "osmID": { "type": "keyword" }, + "wikidataID": { + "type": "keyword" + }, "path": { "type": "keyword" }, @@ -70,6 +73,15 @@ "P706": { "type": "keyword" }, + "inc_P131": { + "type": "keyword" + }, + "inc_P706": { + "type": "keyword" + }, + "inc_geoname": { + "type": "keyword" + }, "geometry": { "type": "geo_shape", "tree": "quadtree", diff --git a/config/mappingsv2.json b/config/mappingsv2.json new file mode 100644 index 0000000000000000000000000000000000000000..36e6f952a7a52c37e97dc438421d273ce92574cf --- /dev/null +++ b/config/mappingsv2.json @@ -0,0 +1,97 @@ +{ + "mappings": { + "_default_": { + "properties": { + "de": { + "type": "keyword" + }, + "en": { + "type": "keyword" + }, + "es": { + "type": "keyword" + }, + "fr": { + "type": "keyword" + }, + "aliases": { + "type": "nested", + "properties": { + "de": { + "type": "keyword" + }, + "en": { + "type": "keyword" + }, + "es": { + "type": "keyword" + }, + "fr": { + "type": "keyword" + } + } + }, + "instance_of": { + "type": "keyword" + }, + "coord": { + "type": "geo_point" + }, + "geonameID": { + "type": "keyword" + }, + "class": { + "type": "keyword" + }, + "id": { + "type": "keyword" + }, + "country": { + "type": "keyword" + }, + "continent": { + "type": "keyword" + }, + "score": { + "type": "float" + }, + "osmID": { + "type": "keyword" + }, + "wikidataID": { + "type": "keyword" + }, + "path": { + "type": "keyword" + }, + "P47": { + "type": "keyword" + }, + "share_border_with": { + "type": "keyword" + }, + "P131": { + "type": "keyword" + }, + "P706": { + "type": "keyword" + }, + "located_in_adm_terr_ent": { + "type": "keyword" + }, + "located_in_terr_feature": { + "type": "keyword" + }, + "inc_geoname": { + "type": "keyword" + }, + "geometry": { + "type": "geo_shape", + "tree": "quadtree", + "precision": "100m" + } + + } + } + } +} \ No newline at end of file diff --git a/custom_process/__pycache__/__init__.cpython-36.pyc 
b/custom_process/__pycache__/__init__.cpython-36.pyc index 32b2ceac0dd176fee582e3e5d3deace746143615..c31ddfe621f9a7f7589d980786e78ce493b880b0 100644 Binary files a/custom_process/__pycache__/__init__.cpython-36.pyc and b/custom_process/__pycache__/__init__.cpython-36.pyc differ diff --git a/custom_process/__pycache__/basic_extraction.cpython-36.pyc b/custom_process/__pycache__/basic_extraction.cpython-36.pyc index c1b722257898fa6ddc3fd047271d81092b6ac3d8..696cff8ebe4c37a5a81ddf6d79d1281e710accb3 100644 Binary files a/custom_process/__pycache__/basic_extraction.cpython-36.pyc and b/custom_process/__pycache__/basic_extraction.cpython-36.pyc differ diff --git a/custom_process/__pycache__/class_extraction.cpython-36.pyc b/custom_process/__pycache__/class_extraction.cpython-36.pyc index 80b54e87a17d6d781a3714190e7c09dddb421502..7b0e8984cc218fb24a251281dfabdbffed609a78 100644 Binary files a/custom_process/__pycache__/class_extraction.cpython-36.pyc and b/custom_process/__pycache__/class_extraction.cpython-36.pyc differ diff --git a/custom_process/__pycache__/property_extract.cpython-36.pyc b/custom_process/__pycache__/property_extract.cpython-36.pyc index de7db971e619abc91937ff3e89f474013442c533..3a1cbbcf4f4c92d01d8e3736d6f759b25ac0e81b 100644 Binary files a/custom_process/__pycache__/property_extract.cpython-36.pyc and b/custom_process/__pycache__/property_extract.cpython-36.pyc differ diff --git a/custom_process/__pycache__/wiki_links.cpython-36.pyc b/custom_process/__pycache__/wiki_links.cpython-36.pyc index 6763cf3c0d9ad9053ea0cc91d166545483c417f6..4c1647e55e5c3f7e32394fa4ed56cbf842addf52 100644 Binary files a/custom_process/__pycache__/wiki_links.cpython-36.pyc and b/custom_process/__pycache__/wiki_links.cpython-36.pyc differ diff --git a/custom_process/basic_extraction.py b/custom_process/basic_extraction.py index 68f00cea6e8809ae7ef174bfe4ad0e19dbf36076..7969e3b36840acdc9acd07b9f8f66e9ee0c5b109 100644 --- a/custom_process/basic_extraction.py +++ b/custom_process/basic_extraction.py @@ -13,7 +13,9 @@ from wikidata.reader import Reader from wikidata.process_wd import * config=Configuration("config/configuration.json") + class BasicExtraction(Process): + def __init__(self, id, labels_fn,page_rank): super(BasicExtraction, Process.__init__(self, id)) self.dataframe = {} @@ -32,6 +34,7 @@ class BasicExtraction(Process): self.labels_list = json.load(f) f.close() + print("Loading the PAGERANK DATA ...") f = open(page_rank,encoding = 'utf-8') self.scores = json.load(f) f.close() diff --git a/gazetteer.py b/gazetteer.py index e16a7bec3b45501c0258a2565e588381687732c8..bf3bcf08b1741ea68cfa7f78c949e31ef2f26cef 100644 --- a/gazetteer.py +++ b/gazetteer.py @@ -8,6 +8,8 @@ from custom_process.wiki_links import * from custom_process.class_extraction import * from custom_process.property_extract import * from gis.convex_hull import get_convex_hull +from tqdm import tqdm +from utils import wc_l __config=Configuration("config/configuration.json") @@ -15,7 +17,7 @@ __config=Configuration("config/configuration.json") def temp(filename): - return os.path.join(__config.temp_dir,filename) + return os.path.join(__config.temp_dir, filename) def import_data(): @@ -30,25 +32,36 @@ def import_data(): print("Downloading Geonames ...") filename=temp("allCountries.zip") - urllib.request.urlretrieve( - "http://download.geonames.org/export/dump/allCountries.zip",filename) - print("Geonames data retrieved !!") - print("Extracting the geonames data!") - zip_ref = zipfile.ZipFile(filename, 'r') - 
zip_ref.extractall("./{0}".format(__config.temp_dir)) - print("Extracted !") + if not os.path.exists(temp("allCountries.txt")): + urllib.request.urlretrieve( + "http://download.geonames.org/export/dump/allCountries.zip",filename) + print("Geonames data retrieved !!") + + print("Extracting the geonames data!") + zip_ref = zipfile.ZipFile(filename, 'r') + zip_ref.extractall("./{0}".format(__config.temp_dir)) + print("Extracted !") + print("Extracting labels") os.system('cut -f 1,2 {0} > {1}'.format(temp("allCountries.txt"),temp("labels.txt"))) + print("Extracting the class") os.system('cut -f 1,7,8 {0} > {1}'.format(temp("allCountries.txt"),temp("class_codes.txt"))) + + size_label_txt=wc_l(temp("labels.txt")) f = open(temp("labels.txt"), encoding = 'utf-8') labels = {} - for line in f: + for line in tqdm(f,total=size_label_txt,desc="Create JSON containing labels for every GeonameID "): line = line.strip().split("\t") labels[line[0]] = line[1] f.close() + open(temp("labels.json"), "w").write(json.dumps(labels))#, ensure_ascii=False)) - os.system('git clone https://github.com/missinglink/osm-boundaries.git') + + if not "pre_dl_osm" in config: + os.system('git clone https://github.com/missinglink/osm-boundaries.git') + else: + config["osm_boundaries_dir"]=config["pre_dl_osm"] def basic_gazetteer(outfile): @@ -62,17 +75,16 @@ def basic_gazetteer(outfile): """ if not os.path.isfile(os.path.join(__config.temp_dir,"labels.json")): - print("Give correct labels file name!!") - return False + raise FileNotFoundError("Give correct labels file name!!") + if not os.path.isfile(__config.wikidata_dump): - print('Give correct path to wikidata json dump ') - return False + raise FileNotFoundError('Give correct path to wikidata json dump ') - proc1 = BasicExtraction(1,os.path.join(__config.temp_dir,"labels.json"),"resources/wd_page_rank.json") - dump = Reader(__config.wikidata_dump,'utf-8') + proc1 = BasicExtraction(1, os.path.join(__config.temp_dir, "labels.json"), "resources/wd_page_rank.json") + dump = Reader(__config.wikidata_dump, 'utf-8') controller = WDController(dump,proc1) controller.process_all() - open(outfile, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False)) + open(outfile, 'w').write(json.dumps(proc1.dataframe)) return True @@ -92,7 +104,7 @@ def add_properties(input_gazetteer,output_gazetteer,configuration_file): dump = Reader(__config.wikidata_dump,'utf-8') controller = WDController(dump,proc1) controller.process_all() - open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False)) + open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe)) return True @@ -106,8 +118,8 @@ def extract_classes(gazeteer): :return: """ if not os.path.isfile(__config.wikidata_dump): - print('Give correct path to wikidata json dump') - return None + raise FileNotFoundError('Give correct path to wikidata json dump') + proc3 = ClassExtraction(1, os.path.join(__config.temp_dir,"class_codes.txt"), gazeteer) dump = Reader(__config.wikidata_dump, 'utf-8') controller = WDController(dump, proc3) @@ -130,7 +142,7 @@ def add_classes(gazeteer,outfile): iterations = 0 places = 0 keys = set(data.keys()) - for key in keys: + for key in tqdm(keys,desc="Add Classes"): iterations = iterations + 1 temp_ = [] if 'instance_of' in data[key].keys(): @@ -163,7 +175,7 @@ def extract_missing_WikiIDS(interm_outfile,outfile): iterations = 0 output=open(interm_outfile,"w") total=len(paths) - output.write(json.dumps(finding_links(paths)))#,ensure_ascii=False)) + 
@@ -163,7 +175,7 @@ def extract_missing_WikiIDS(interm_outfile,outfile):
     iterations = 0
     output=open(interm_outfile,"w")
     total=len(paths)
-    output.write(json.dumps(finding_links(paths)))#,ensure_ascii=False))
+    output.write(json.dumps(finding_links(paths)))
     proc2 = WikipediaURI(2, outfile, interm_outfile)
     dump = Reader(__config.wikidata_dump, 'utf-8')
     controller = WDController(dump, proc2)

@@ -189,8 +201,8 @@ def missing_wikidata_IDS(missing_ids):
     df = read_tsv(os.path.join(__config.osm_boundaries_dir,'meta.tsv'),encoding = 'utf-8',columns = True)#'./osm-boundaries/meta.tsv'
     wikidata_IDs = []
     paths = [os.path.join(__config.osm_boundaries_dir,'data',path) for path in df['path']]
-    iterations = 0
-    for path in paths:
+    # iterations = 0
+    for path in tqdm(paths, desc="Browsing OSM data"):
         f = open(path,encoding = 'utf-8')
         dataframe = json.load(f)
         f.close()
@@ -207,9 +219,9 @@
             wikidata_IDs.append(None)
         else:
             wikidata_IDs.append(None)
-        if iterations%1000 == 0:
-            sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
-        iterations = iterations + 1
+        # if iterations%1000 == 0:
+        #     sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
+        # iterations = iterations + 1
     df['Wiki_IDs'] = wikidata_IDs
     df.to_csv(temp('meta_all.csv'),index = False)#'temp/meta_all.csv'

@@ -230,11 +242,11 @@ def adding_geometry(infile,out_file,output_final_fn):
     Wiki_IDs = set(list(path_association.keys()))
     data = json.loads(open(out_file).read())
     outfile = open(output_final_fn, 'w')
-    iterations = 0
+    # iterations = 0
     places = 0
     keys = set(data.keys())
-    for key in keys:
-        iterations = iterations + 1
+    for key in tqdm(keys, desc="Browsing Geodict"):
+        # iterations = iterations + 1
         temp= data[key]
         temp["id"]=key
         if key in Wiki_IDs:
@@ -247,43 +259,85 @@
             outfile.write(json.dumps(temp)+"\n")#,ensure_ascii=False
         del data[key]
-        if iterations % 100 == 0:
-            sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
-
+        # if iterations % 100 == 0:
+        #     sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
+
+
+def add_final_spatial_entities(input, output):
+    """
+    Add missing Geonames entries and build Geodict IDs (work in progress).
+    :param input: path to the gazetteer written by adding_geometry(), one JSON entry per line
+    :param output: path of the extended gazetteer to write
+    :return:
+    """
+    d_geo = {}
+    geonames_i = open(temp("allCountries.txt"))
+    for entry in geonames_i:
+        row = entry.split("\t")
+
+        d_geo[row[0]] = {lang: row[1] for lang in ["en", "fr", "es", "de"]}
+        d_geo[row[0]]["aliases"] = {lang: row[3].split(",") for lang in ["en", "fr", "es", "de"]}
+        d_geo[row[0]]["coord"] = {"lat": float(row[4]), "lon": float(row[5])}
+        d_geo[row[0]]["class"] = ["{0}-{1}".format(row[6], row[7])]
+
+    geoname_id_index = set(d_geo.keys())
+
+    # Geonames entries already covered by a Wikidata-based Geodict entry
+    already_in_geodict = set([])
+    for line in open(input):
+        data = json.loads(line.strip())
+        if "geonameID" in data:
+            already_in_geodict.add(data["geonameID"])
+
+    diff = geoname_id_index.difference(already_in_geodict)
+    prefix = "GD"
+    i = 1
+    output = open(output, 'w')
+    size_input = wc_l(input)
+    for line in tqdm(open(input), total=size_input, desc="Browsing Geodict"):
+        data = json.loads(line.strip())
+        data["wikidataID"] = data["id"]
+        data["id"] = prefix + str(i)
+        output.write(json.dumps(data) + "\n")
+        i += 1
+    for geo_id in tqdm(diff):
+        data = d_geo[geo_id]
+        data["id"] = prefix + str(i)
+        data["geonameID"] = geo_id
+        output.write(json.dumps(data) + "\n")
+        i += 1
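The slicing in add_final_spatial_entities() relies on the Geonames allCountries.txt layout, a tab-separated table whose first columns are geonameid, name, asciiname, alternatenames, latitude, longitude, feature class and feature code. A short illustration with a hand-written row (not real dump content):

    # Illustrative only: this row is invented but follows the allCountries.txt column order.
    sample = "2988507\tParis\tParis\tParis,Lutece\t48.85341\t2.3488\tP\tPPLC"
    row = sample.split("\t")

    entry = {lang: row[1] for lang in ["en", "fr", "es", "de"]}        # same label reused for every language
    entry["aliases"] = {lang: row[3].split(",") for lang in ["en", "fr", "es", "de"]}
    entry["coord"] = {"lat": float(row[4]), "lon": float(row[5])}
    entry["class"] = ["{0}-{1}".format(row[6], row[7])]                # e.g. "P-PPLC"
    print(entry["coord"], entry["class"])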
datasets...") + print("[1/7] Download required datasets...") import_data() # Create a first basic gazeteer - print("[2/6] Building the core gazetteer...") + print("[2/7] Building the core gazetteer...") basic_gazetteer(temp("1stoutput.json")) # Associate geonames classe to the instance_of(P31) values - print("[3/6] Associate a class to each entry...") + print("[3/7] Associate a class to each entry...") extract_classes(temp("1stoutput.json")) - # Add class to each entity add_classes(temp("1stoutput.json"),temp("2ndoutput.json")) # Extract missing wikidata IDs in the boundary data - print("[4/6] Find missing WD ids within boundary data...") + print("[4/7] Find missing WD ids within boundary data...") extract_missing_WikiIDS(temp('found_missing_links.json'),temp('missing_Wikidata_IDS.txt')) missing_wikidata_IDS(temp('missing_Wikidata_IDS.txt')) # Adding properties from configuration_file - print("[5/6] Add user properties...") + print("[5/7] Add user properties...") add_properties(temp("2ndoutput.json"),temp("3rdoutput.json"),'config/configuration.json') # Add boundaries in the final data - print("[6/6] Adding adminstrative boundary/ies...") + print("[6/7] Adding adminstrative boundary/ies...") adding_geometry(temp("meta_all.csv"),temp("3rdoutput.json"),'out_final.json') + print("7/7") + add_final_spatial_entities("out_final.json","out_final_extended.json") print("The gazeteer was created in {0} hours".format(((time.time()-start)/60)/60)) diff --git a/gazetteer2es.py b/gazetteer2es.py index f393a2546759a4885d1ea2f68704fcef7bcdafae..fd4fc1127bea7d914ac31d50a6c8f2f30e8a28e6 100644 --- a/gazetteer2es.py +++ b/gazetteer2es.py @@ -2,74 +2,75 @@ import argparse, json, sys from elasticsearch import Elasticsearch,helpers from elasticsearch import helpers import copy +from tqdm import tqdm +from mytoolbox.text.size import wc_l -def polygon_transformation4ES(temp,simple=True): - final = [] - if simple: - final=copy.copy(temp) - final.append(temp[0]) - final=final - else: - for i in temp: - t=copy.copy(i) - t.append(i[0]) - final.append(t) - return final def main(): parser = argparse.ArgumentParser() parser.add_argument("input", help="give input json file") parser.add_argument("-e", "--es_host", help="Elasticsearch Host address", default="127.0.0.1") + parser.add_argument("-p", "--es_port", help="Elasticsearch Host port", default="9200") args = parser.parse_args() + + if not os.path.exists(args.input): + raise FileNotFoundError("Input File '{0}' not found !".format(args.input)) + file_name = args.input es_client = Elasticsearch(args.es_host) + if not es_client.ping(): - print("Can't connect to ES ! ") - sys.exit(1) + raise ConnectionError("Could not connect to Elasticserver at {0}".format(args.es_host)) + + # If exists in the dataase, delete ! 
diff --git a/gazetteer2es.py b/gazetteer2es.py
index f393a2546759a4885d1ea2f68704fcef7bcdafae..fd4fc1127bea7d914ac31d50a6c8f2f30e8a28e6 100644
--- a/gazetteer2es.py
+++ b/gazetteer2es.py
@@ -2,74 +2,75 @@ import argparse, json, sys
 from elasticsearch import Elasticsearch,helpers
 from elasticsearch import helpers
 import copy
+import os
+from tqdm import tqdm
+from utils import wc_l

-def polygon_transformation4ES(temp,simple=True):
-    final = []
-    if simple:
-        final=copy.copy(temp)
-        final.append(temp[0])
-        final=final
-    else:
-        for i in temp:
-            t=copy.copy(i)
-            t.append(i[0])
-            final.append(t)
-    return final

 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("input", help="give input json file")
     parser.add_argument("-e", "--es_host", help="Elasticsearch Host address", default="127.0.0.1")
+    parser.add_argument("-p", "--es_port", help="Elasticsearch Host port", default="9200")
     args = parser.parse_args()
+
+    if not os.path.exists(args.input):
+        raise FileNotFoundError("Input file '{0}' not found !".format(args.input))
+
     file_name = args.input
-    es_client = Elasticsearch(args.es_host)
+    es_client = Elasticsearch([{"host": args.es_host, "port": int(args.es_port)}])
+
     if not es_client.ping():
-        print("Can't connect to ES ! ")
-        sys.exit(1)
+        raise ConnectionError("Could not connect to the Elasticsearch server at {0}".format(args.es_host))
+
+    # If the index already exists in the database, delete it!
     if es_client.indices.exists(index="gazetteer"):
         es_client.indices.delete(index="gazetteer")
+
+    # Open input file
     gazetteer = open(file_name, encoding='utf-8')
-    i = 1
-    mappings = json.load(open("config/mappings.json"))
+
+    mappings = json.load(open("config/mappings.json"))
+    # complete the mapping depending on the custom properties extracted
     property_to_be_mapped = json.load(open('config/configuration.json'))
     for prop in property_to_be_mapped["properties_to_extract"]:
         mappings['mappings']['_default_']['properties'][prop['id']] = {'type':prop["mappings"]}
         if prop["mappings_details"]:
             for k,v in prop["mappings_details"].items():
                 mappings['mappings']['_default_']['properties'][prop['id']][k]=v
-    print(mappings)
+    print("Mapping of Geodict index: ", mappings)
+
+    # Create the index in the Elasticsearch database
     es_client.indices.create(index="gazetteer", body=mappings)

     action_list=[]
-    for line in gazetteer:
+
+    number_of_entries = wc_l(file_name)
+
+    for line in tqdm(gazetteer, desc="Importing ...", total=number_of_entries):
         data = json.loads(line.strip())
         if '_score' in data.keys():
             data['score'] = data['_score']
             del data['_score']
         if "geometry" in data:
-            del data["geometry"]
+            del data["geometry"]  # geometries are difficult to index in ES, so we drop them
         if "coord" in data:
-            if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:
-                i+=1
-                continue
-        if not data["fr"]:
-            i+=1
-            continue
-        #print("AFTER",data["geometry"])
-        #return
-        #es_client.index("gazetteer", "place", data)
+            data["coord"]["lat"] = float(data["coord"]["lat"])
+            data["coord"]["lon"] = float(data["coord"]["lon"])
+
+            if data["coord"]["lat"] > 90 or data["coord"]["lon"] > 180: continue
+
+        if not data["fr"]: continue
+
         actions = {
             "_index": "gazetteer",
             "_type": "place",
             "_source": data
         }
-        #print(data["fr"])
         action_list.append(actions)
-        if i % 1000 == 0:
-            #print(action_list)
+        if len(action_list) % 1000 == 0:
             helpers.bulk(es_client,action_list,request_timeout=30)
-            sys.stdout.write("\rEntity transferred: " + '{:,}'.format(i))
             action_list = []
-        i += 1
+
+    # flush the actions left over from the last, incomplete batch
+    if action_list:
+        helpers.bulk(es_client, action_list, request_timeout=30)


 if __name__ == '__main__':
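The manual batching above can also be delegated to the Elasticsearch client itself: helpers.bulk() accepts any iterable of actions and chunks it internally, which removes the need for a hand-rolled action_list. A sketch of that alternative (index and type names match the ones used above; host and file name are only defaults):

    import json
    from elasticsearch import Elasticsearch, helpers

    def actions(path):
        with open(path, encoding="utf-8") as f:
            for line in f:
                doc = json.loads(line.strip())
                yield {"_index": "gazetteer", "_type": "place", "_source": doc}

    es = Elasticsearch("127.0.0.1")
    helpers.bulk(es, actions("out_final_extended.json"), chunk_size=1000, request_timeout=30)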
{"P131":"located_in_adm_terr_ent", + "P706":"located_in_terr_feature", + "P47":"share_border_with"} + +for line in tqdm(open(path),total=size_data): + data = json.loads(line.strip("\n,")) + for property_ in ["P131","P706","P47"]: + if not property_ in data: + continue + data[name_col[property_]] = [wikidata2GD[id_] for id_ in data[property_] if id_ in wikidata2GD] + if "geonameID" in data and data["geonameID"] in inc_dict_geonames: + data["geoname_hierarchy"] = inc_dict_geonames[data["geonameID"]] + output.write("{0}\n,".format(json.dumps(data))) \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ef956215d06dfb9c1c7b04d06df1ce170ab3454 --- /dev/null +++ b/utils.py @@ -0,0 +1,25 @@ +# coding = utf-8 +import os + +def blocks(files, size=65536): + while True: + b = files.read(size) + if not b: break + yield b + +def wc_l(text_input_fn): + """ + Count the number of line in a file + + Parameters + ---------- + text_input_fn : str + filepath + + """ + if not os.path.exists(text_input_fn): + raise FileNotFoundError("{0} does not exists !".format(text_input_fn)) + + with open(text_input_fn, "r", encoding="utf-8", errors='ignore') as f: + return sum(bl.count("\n") for bl in blocks(f)) + diff --git a/wikidata/helpers.py b/wikidata/helpers.py index 05040f1512d31f8dbb321e150f0e437476a55c03..623a06292a5cbafa3e5cfb548f4d26acf56c37b7 100644 --- a/wikidata/helpers.py +++ b/wikidata/helpers.py @@ -62,6 +62,7 @@ def read_Tsv(filename,encoding='ascii'): column = text[0] del text[0] return pd.DataFrame(text,columns = column) + #finding the missing link for wikipedia pages for which wikidata_IDs are not available def finding_links(files): missing_uri=[] diff --git a/wikidata/property_wd.py b/wikidata/property_wd.py index d75243e9afa670e10ff1f08cd0c274d3e055b18e..6181cf85007388a25c6adcc234d881d83e66658d 100644 --- a/wikidata/property_wd.py +++ b/wikidata/property_wd.py @@ -2,9 +2,9 @@ class Property(object): """docstring for property.""" - def __init__(self, id,isMultiple,type_): + def __init__(self, id, isMultiple, type_): self.id=id - self.isMultiple=isMultiple + self.isMultiple = isMultiple self.type=type_ def exists(self,data): @@ -14,4 +14,4 @@ class Property(object): return False def extractData(self,data): - return self.type.extractData(self.id,self.isMultiple,data) + return self.type.extractData(self.id, self.isMultiple, data) diff --git a/wikidata/reader.py b/wikidata/reader.py index cc6516d2303c88f179ebaaf91758cbfe1ad4b39a..56ea1817e461df5b66a073e86036a05091708703 100644 --- a/wikidata/reader.py +++ b/wikidata/reader.py @@ -1,6 +1,10 @@ # coding=utf-8 from gzip import GzipFile import json + +from utils import wc_l + + class Reader(object): """docstring for Reader.""" def __init__(self, name, decoding): @@ -9,6 +13,7 @@ class Reader(object): self.decoding = decoding self.dump = GzipFile(name,'r') self.line = self.dump.readline() + self.size_file = wc_l(name) def has_next(self): self.line = self.dump.readline().decode(self.decoding) diff --git a/wikidata/types_wd.py b/wikidata/types_wd.py index 42332e44bba00f8fc49903cf0b7dd59189c3f788..4564266ab77f09e5a2c46464454d8d9e5afdb0ed 100644 --- a/wikidata/types_wd.py +++ b/wikidata/types_wd.py @@ -125,15 +125,15 @@ class Time(Type): def extractMultiple(self, propID, data): result = [] for i in range(len(data['claims'][propID])): - result.append(parsedate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time'])) + 
diff --git a/wikidata/types_wd.py b/wikidata/types_wd.py
index 42332e44bba00f8fc49903cf0b7dd59189c3f788..4564266ab77f09e5a2c46464454d8d9e5afdb0ed 100644
--- a/wikidata/types_wd.py
+++ b/wikidata/types_wd.py
@@ -125,15 +125,15 @@ class Time(Type):
     def extractMultiple(self, propID, data):
         result = []
         for i in range(len(data['claims'][propID])):
-            result.append(parsedate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
+            result.append(parseDate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
         return result

     def extractSingle(self, propID, data):
-        return parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
+        return parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])

     def check_conformity(self, propID, data):
         try:
-            parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
+            parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
             return True
         except Exception as e:
             return False
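The Time type above hands Wikidata time literals to parseDate. Those literals look like '+2017-01-31T00:00:00Z', and the month or day can be '00' when a statement's precision is coarser than a day, so whatever parseDate resolves to has to tolerate that. A small illustrative helper, not the repository's implementation:

    from datetime import datetime

    def parse_wikidata_time(value):
        # Parse a Wikidata time literal such as '+2017-01-31T00:00:00Z'.
        # '00' months/days fall back to January / the 1st; BCE dates (leading '-') are not handled.
        date_part = value.lstrip("+").split("T")[0]
        year, month, day = date_part.split("-")
        return datetime(int(year), max(int(month), 1), max(int(day), 1))

    print(parse_wikidata_time("+2017-00-00T00:00:00Z"))   # -> 2017-01-01 00:00:00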