Commit e8b6b8d1 authored by Fize Jacques's avatar Fize Jacques

add geoname + hierarchy inclusion process + normalization of Wikidata ID

parent 4d2ee963
Showing with 349 additions and 79 deletions
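Concretely, the "normalization of Wikidata ID" mentioned in the commit message (see add_final_spatial_entities further down) moves each entry's Wikidata identifier into a dedicated wikidataID field and replaces the primary id with a sequential Geodict identifier; a minimal sketch of that step:

# Sketch of the ID normalization performed in add_final_spatial_entities().
entry = {"id": "Q90", "en": "Paris"}   # Wikidata ID previously used as the primary key
entry["wikidataID"] = entry["id"]      # keep the Wikidata identifier in its own field
entry["id"] = "GD" + str(1)            # sequential Geodict identifier, e.g. GD1
# -> {"id": "GD1", "en": "Paris", "wikidataID": "Q90"}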
......@@ -7,3 +7,9 @@ latest-all.json.gz
temp/*
custom_process/__pycache__/*
out_final.json
__pycache__
temp
geodict*
out*
.idea
.DS_Store
\ No newline at end of file
......@@ -25,4 +25,9 @@ Simply run the command line
$ python3 gazeteer2es.py [ES host if not localhost]
## TODO
* Add missing Geonames entries (look into save.py)
* Add a step that creates a unique ID for each Geodict entry
**Gaurav Shrivastava, Jacques Fize @ 2017**
\ No newline at end of file
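Note that gazeteer2es.py (shown further down in this commit) takes the gazetteer file as a positional argument and the Elasticsearch host and port through options, so a full invocation presumably looks like:

$ python3 gazeteer2es.py out_final_extended.json -e 127.0.0.1 -p 9200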
{
"osm_boundaries_dir":"osm-boundaries",
"pre_dl_osm": "/Users/jacquesfize/install/osm-boundaries",
"temp_dir":"temp",
"wikidata_dump":"latest-all.json.gz",
"wikidata_dump":"/Volumes/Sauvegarde/latest-all.json.gz",
"lang_list":["en","fr","de","es"],
"properties_to_extract":[
{"id":"P47","isMultiple":true,"type":"EntityID","mappings":"keyword","mappings_details":{}},
......
......@@ -58,6 +58,9 @@
"osmID": {
"type": "keyword"
},
"wikidataID": {
"type": "keyword"
},
"path": {
"type": "keyword"
},
......@@ -70,6 +73,15 @@
"P706": {
"type": "keyword"
},
"inc_P131": {
"type": "keyword"
},
"inc_P706": {
"type": "keyword"
},
"inc_geoname": {
"type": "keyword"
},
"geometry": {
"type": "geo_shape",
"tree": "quadtree",
......
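Assuming the new inc_* keyword fields hold the identifiers of the entities an entry is (transitively) included in, which is what the hierarchy/inclusion code added later in this commit computes, descendants of a given place could then be fetched with a plain term query. A minimal sketch using the Python client already used by gazeteer2es.py, with a hypothetical ancestor identifier:

# Hypothetical query: entries whose Geonames inclusion chain contains a given ancestor.
from elasticsearch import Elasticsearch

es = Elasticsearch("127.0.0.1")
hits = es.search(index="gazetteer",
                 body={"query": {"term": {"inc_geoname": "3017382"}}})
print(hits["hits"]["total"])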
{
"mappings": {
"_default_": {
"properties": {
"de": {
"type": "keyword"
},
"en": {
"type": "keyword"
},
"es": {
"type": "keyword"
},
"fr": {
"type": "keyword"
},
"aliases": {
"type": "nested",
"properties": {
"de": {
"type": "keyword"
},
"en": {
"type": "keyword"
},
"es": {
"type": "keyword"
},
"fr": {
"type": "keyword"
}
}
},
"instance_of": {
"type": "keyword"
},
"coord": {
"type": "geo_point"
},
"geonameID": {
"type": "keyword"
},
"class": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"country": {
"type": "keyword"
},
"continent": {
"type": "keyword"
},
"score": {
"type": "float"
},
"osmID": {
"type": "keyword"
},
"wikidataID": {
"type": "keyword"
},
"path": {
"type": "keyword"
},
"P47": {
"type": "keyword"
},
"share_border_with": {
"type": "keyword"
},
"P131": {
"type": "keyword"
},
"P706": {
"type": "keyword"
},
"located_in_adm_terr_ent": {
"type": "keyword"
},
"located_in_terr_feature": {
"type": "keyword"
},
"inc_geoname": {
"type": "keyword"
},
"geometry": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "100m"
}
}
}
}
}
\ No newline at end of file
......@@ -13,7 +13,9 @@ from wikidata.reader import Reader
from wikidata.process_wd import *
config=Configuration("config/configuration.json")
class BasicExtraction(Process):
def __init__(self, id, labels_fn,page_rank):
super(BasicExtraction, self).__init__(id)
self.dataframe = {}
......@@ -32,6 +34,7 @@ class BasicExtraction(Process):
self.labels_list = json.load(f)
f.close()
print("Loading the PAGERANK DATA ...")
f = open(page_rank,encoding = 'utf-8')
self.scores = json.load(f)
f.close()
......
......@@ -8,6 +8,8 @@ from custom_process.wiki_links import *
from custom_process.class_extraction import *
from custom_process.property_extract import *
from gis.convex_hull import get_convex_hull
from tqdm import tqdm
from utils import wc_l
__config=Configuration("config/configuration.json")
......@@ -15,7 +17,7 @@ __config=Configuration("config/configuration.json")
def temp(filename):
return os.path.join(__config.temp_dir,filename)
return os.path.join(__config.temp_dir, filename)
def import_data():
......@@ -30,25 +32,36 @@ def import_data():
print("Downloading Geonames ...")
filename=temp("allCountries.zip")
urllib.request.urlretrieve(
"http://download.geonames.org/export/dump/allCountries.zip",filename)
print("Geonames data retrieved !!")
print("Extracting the geonames data!")
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall("./{0}".format(__config.temp_dir))
print("Extracted !")
if not os.path.exists(temp("allCountries.txt")):
urllib.request.urlretrieve(
"http://download.geonames.org/export/dump/allCountries.zip",filename)
print("Geonames data retrieved !!")
print("Extracting the geonames data!")
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall("./{0}".format(__config.temp_dir))
print("Extracted !")
print("Extracting labels")
os.system('cut -f 1,2 {0} > {1}'.format(temp("allCountries.txt"),temp("labels.txt")))
print("Extracting the class")
os.system('cut -f 1,7,8 {0} > {1}'.format(temp("allCountries.txt"),temp("class_codes.txt")))
size_label_txt=wc_l(temp("labels.txt"))
f = open(temp("labels.txt"), encoding = 'utf-8')
labels = {}
for line in f:
for line in tqdm(f,total=size_label_txt,desc="Create JSON containing labels for every GeonameID "):
line = line.strip().split("\t")
labels[line[0]] = line[1]
f.close()
open(temp("labels.json"), "w").write(json.dumps(labels))#, ensure_ascii=False))
os.system('git clone https://github.com/missinglink/osm-boundaries.git')
if not "pre_dl_osm" in config:
os.system('git clone https://github.com/missinglink/osm-boundaries.git')
else:
config["osm_boundaries_dir"]=config["pre_dl_osm"]
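For reference, the two cut commands above rely on the tab-separated layout of the Geonames allCountries.txt dump; the columns this pipeline actually reads can be sketched as follows (made-up, truncated row):

# Made-up row in the Geonames allCountries.txt layout (truncated to eight columns).
row = "2988507\tParis\tParis\tLutece,Parigi\t48.85\t2.35\tP\tPPLC".split("\t")
geoname_id, name = row[0], row[1]           # fields kept by `cut -f 1,2`  -> labels.txt
aliases = row[3].split(",")                 # alternate names, reused in add_final_spatial_entities
lat, lon = float(row[4]), float(row[5])     # coordinates
feature = "{0}-{1}".format(row[6], row[7])  # fields kept by `cut -f 1,7,8` -> class_codes.txt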
def basic_gazetteer(outfile):
......@@ -62,17 +75,16 @@ def basic_gazetteer(outfile):
"""
if not os.path.isfile(os.path.join(__config.temp_dir,"labels.json")):
print("Give correct labels file name!!")
return False
raise FileNotFoundError("Give correct labels file name!!")
if not os.path.isfile(__config.wikidata_dump):
print('Give correct path to wikidata json dump ')
return False
raise FileNotFoundError('Give correct path to wikidata json dump ')
proc1 = BasicExtraction(1,os.path.join(__config.temp_dir,"labels.json"),"resources/wd_page_rank.json")
dump = Reader(__config.wikidata_dump,'utf-8')
proc1 = BasicExtraction(1, os.path.join(__config.temp_dir, "labels.json"), "resources/wd_page_rank.json")
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump,proc1)
controller.process_all()
open(outfile, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
open(outfile, 'w').write(json.dumps(proc1.dataframe))
return True
......@@ -92,7 +104,7 @@ def add_properties(input_gazetteer,output_gazetteer,configuration_file):
dump = Reader(__config.wikidata_dump,'utf-8')
controller = WDController(dump,proc1)
controller.process_all()
open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))#,ensure_ascii=False))
open(output_gazetteer, 'w').write(json.dumps(proc1.dataframe))
return True
......@@ -106,8 +118,8 @@ def extract_classes(gazeteer):
:return:
"""
if not os.path.isfile(__config.wikidata_dump):
print('Give correct path to wikidata json dump')
return None
raise FileNotFoundError('Give correct path to wikidata json dump')
proc3 = ClassExtraction(1, os.path.join(__config.temp_dir,"class_codes.txt"), gazeteer)
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump, proc3)
......@@ -130,7 +142,7 @@ def add_classes(gazeteer,outfile):
iterations = 0
places = 0
keys = set(data.keys())
for key in keys:
for key in tqdm(keys,desc="Add Classes"):
iterations = iterations + 1
temp_ = []
if 'instance_of' in data[key].keys():
......@@ -163,7 +175,7 @@ def extract_missing_WikiIDS(interm_outfile,outfile):
iterations = 0
output=open(interm_outfile,"w")
total=len(paths)
output.write(json.dumps(finding_links(paths)))#,ensure_ascii=False))
output.write(json.dumps(finding_links(paths)))
proc2 = WikipediaURI(2, outfile, interm_outfile)
dump = Reader(__config.wikidata_dump, 'utf-8')
controller = WDController(dump, proc2)
......@@ -189,8 +201,8 @@ def missing_wikidata_IDS(missing_ids):
df = read_tsv(os.path.join(__config.osm_boundaries_dir,'meta.tsv'),encoding = 'utf-8',columns = True)#'./osm-boundaries/meta.tsv'
wikidata_IDs = []
paths = [os.path.join(__config.osm_boundaries_dir,'data',path) for path in df['path']]
iterations = 0
for path in paths:
# iterations = 0
for path in tqdm(paths,desc="Browsing OSM data"):
f = open(path,encoding = 'utf-8')
dataframe = json.load(f)
f.close()
......@@ -207,9 +219,9 @@ def missing_wikidata_IDS(missing_ids):
wikidata_IDs.append(None)
else:
wikidata_IDs.append(None)
if iterations%1000 == 0:
sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
iterations = iterations + 1
# if iterations%1000 == 0:
# sys.stdout.write("\r iterations: "+'{:,}'.format(iterations))
# iterations = iterations + 1
df['Wiki_IDs'] = wikidata_IDs
df.to_csv(temp('meta_all.csv'),index = False)#'temp/meta_all.csv'
......@@ -230,11 +242,11 @@ def adding_geometry(infile,out_file,output_final_fn):
Wiki_IDs = set(list(path_association.keys()))
data = json.loads(open(out_file).read())
outfile = open(output_final_fn, 'w')
iterations = 0
# iterations = 0
places = 0
keys = set(data.keys())
for key in keys:
iterations = iterations + 1
for key in tqdm(keys,desc="Browsing Geodict"):
# iterations = iterations + 1
temp= data[key]
temp["id"]=key
if key in Wiki_IDs:
......@@ -247,43 +259,85 @@ def adding_geometry(infile,out_file,output_final_fn):
outfile.write(json.dumps(temp)+"\n")#,ensure_ascii=False
del data[key]
if iterations % 100 == 0:
sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
# if iterations % 100 == 0:
# sys.stdout.write("\rEntity Parsed: " + '{:,}'.format(iterations) + " Places with boundaries parsed: " + '{:,}'.format(places))
def add_final_spatial_entities(input,output):
"""
Add missing Geonames entries and build Geodict IDs (work in progress)
:param input:
:param output:
:return:
"""
d_geo = {}
geonames_i = open(temp("allCountries.txt"))
for entry in geonames_i:
row = entry.split("\t")
d_geo[row[0]] = {lang: row[1] for lang in ["en", "fr", "es", "de"]}
d_geo[row[0]]["aliases"] = {lang: row[3].split(",") for lang in ["en", "fr", "es", "de"]}
d_geo[row[0]]["coord"] = {"lat": float(row[4]), "lon": float(row[5])}
d_geo[row[0]]["class"] = ["{0}-{1}".format(row[6], row[7])]
geoname_id_index = set(d_geo.keys())
already_in_geodict = set([])
for line in open(input):
data = json.loads(line.strip())
if "geonameID" in data:
already_in_geodict.add(data["geonameID"])
diff = geoname_id_index.difference(already_in_geodict)
prefix = "GD"
i = 1
output = open(output, 'w')
size_input = wc_l(input)
for line in tqdm(open(input),total=size_input,desc="Browsing Geodict"):
data = json.loads(line.strip())
data["wikidataID"] = data["id"]
data["id"] = prefix + str(i)
output.write(json.dumps(data) + "\n")
i += 1
for geo_id in tqdm(diff):
data = d_geo[geo_id]
data["id"] = prefix + str(i)
data["geonameID"] = geo_id
output.write(json.dumps(data) + "\n")
i += 1
def main():
start=time.time()
if not os.path.exists(__config.temp_dir):
os.makedirs(__config.temp_dir)
# Import the data sources required to be harvested for creation of gazetteer
print("[1/6] Download required datasets...")
print("[1/7] Download required datasets...")
import_data()
# Create a first basic gazeteer
print("[2/6] Building the core gazetteer...")
print("[2/7] Building the core gazetteer...")
basic_gazetteer(temp("1stoutput.json"))
# Associate a Geonames class to the instance_of (P31) values
print("[3/6] Associate a class to each entry...")
print("[3/7] Associate a class to each entry...")
extract_classes(temp("1stoutput.json"))
# Add class to each entity
add_classes(temp("1stoutput.json"),temp("2ndoutput.json"))
# Extract missing wikidata IDs in the boundary data
print("[4/6] Find missing WD ids within boundary data...")
print("[4/7] Find missing WD ids within boundary data...")
extract_missing_WikiIDS(temp('found_missing_links.json'),temp('missing_Wikidata_IDS.txt'))
missing_wikidata_IDS(temp('missing_Wikidata_IDS.txt'))
# Adding properties from configuration_file
print("[5/6] Add user properties...")
print("[5/7] Add user properties...")
add_properties(temp("2ndoutput.json"),temp("3rdoutput.json"),'config/configuration.json')
# Add boundaries in the final data
print("[6/6] Adding adminstrative boundary/ies...")
print("[6/7] Adding adminstrative boundary/ies...")
adding_geometry(temp("meta_all.csv"),temp("3rdoutput.json"),'out_final.json')
print("[7/7] Add missing Geonames entries and assign Geodict IDs...")
add_final_spatial_entities("out_final.json","out_final_extended.json")
print("The gazetteer was created in {0} hours".format(((time.time()-start)/60)/60))
......
......@@ -2,74 +2,75 @@ import argparse, json, sys
from elasticsearch import Elasticsearch,helpers
from elasticsearch import helpers
import copy
from tqdm import tqdm
from mytoolbox.text.size import wc_l
def polygon_transformation4ES(temp,simple=True):
final = []
if simple:
final=copy.copy(temp)
final.append(temp[0])
final=final
else:
for i in temp:
t=copy.copy(i)
t.append(i[0])
final.append(t)
return final
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input", help="give input json file")
parser.add_argument("-e", "--es_host", help="Elasticsearch Host address", default="127.0.0.1")
parser.add_argument("-p", "--es_port", help="Elasticsearch Host port", default="9200")
args = parser.parse_args()
if not os.path.exists(args.input):
raise FileNotFoundError("Input File '{0}' not found !".format(args.input))
file_name = args.input
es_client = Elasticsearch(args.es_host)
if not es_client.ping():
print("Can't connect to ES ! ")
sys.exit(1)
raise ConnectionError("Could not connect to the Elasticsearch server at {0}".format(args.es_host))
# If the index already exists in the database, delete it!
if es_client.indices.exists(index="gazetteer"):
es_client.indices.delete(index="gazetteer")
# Open input file
gazetteer = open(file_name, encoding='utf-8')
i = 1
mappings = json.load(open("config/mappings.json"))
mappings = json.load(open("config/mappings.json"))
# complete Mapping depending on custom properties extracted
property_to_be_mapped = json.load(open('config/configuration.json'))
for prop in property_to_be_mapped["properties_to_extract"]:
mappings['mappings']['_default_']['properties'][prop['id']] = {'type':prop["mappings"]}
if prop["mappings_details"]:
for k,v in prop["mappings_details"].items():
mappings['mappings']['_default_']['properties'][prop['id']][k]=v
print(mappings)
print("Mapping of Geodict index: ", mappings)
# Create the index in the Elasticsearch database
es_client.indices.create(index="gazetteer", body=mappings)
action_list=[]
for line in gazetteer:
number_of_entries = wc_l(file_name)
for line in tqdm(gazetteer,desc="Importing ...",total=number_of_entries):
data = json.loads(line.strip())
if '_score' in data.keys():
data['score'] = data['_score']
del data['_score']
if "geometry" in data:
del data["geometry"]
del data["geometry"] # Difficult with ES ... so we delete it
if "coord" in data:
if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:
i+=1
continue
if not data["fr"]:
i+=1
continue
#print("AFTER",data["geometry"])
#return
#es_client.index("gazetteer", "place", data)
data["coord"]["lat"]=float(data["coord"]["lat"])
data["coord"]["lon"]= float(data["coord"]["lon"])
if data["coord"]["lat"] >90 or data["coord"]["lon"] >180:continue
if not data["fr"]:continue
actions = {
"_index": "gazetteer",
"_type": "place",
"_source": data
}
#print(data["fr"])
action_list.append(actions)
if i % 1000 == 0:
#print(action_list)
if len(action_list) % 1000 == 0:
helpers.bulk(es_client,action_list,request_timeout=30)
sys.stdout.write("\rEntity transferred: " + '{:,}'.format(i))
action_list = []
i += 1
if __name__ == '__main__':
......
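One detail about the bulk loop above: the actions still in action_list when the input file ends (fewer than 1000) are never sent. A minimal, hypothetical trailing flush reusing the same names would be:

# Hypothetical trailing flush for the last, partial batch of actions.
if action_list:
    helpers.bulk(es_client, action_list, request_timeout=30)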
hierarchy.py 0 → 100644
import pandas as pd
from tqdm import tqdm
df = pd.read_csv("hierarchy.txt",sep="\t",header=None,names="parentId childId type".split())
ids = df.parentId.values.tolist()
ids.extend(df.childId.values.tolist())
ids = list(set(ids))
inclusion_relations_ = dict(df["childId parentId".split()].values)
inc_dict_geonames = {}
for childId,parentId in tqdm(inclusion_relations_.items()):
if not childId in inc_dict_geonames:
inc_dict_geonames[childId] = [parentId]
if parentId in inc_dict_geonames:
inc_dict_geonames[childId].extend(inc_dict_geonames[parentId])
else:
B = parentId
while 1:
if B in inclusion_relations_:
inc_dict_geonames[childId].append(inclusion_relations_[B])
B = inclusion_relations_[B]
else:
break
inc_dict_geonames[parentId] = inc_dict_geonames[childId][1:]
import json
path="out_final_extended.json"
geonames2GD,wikidata2GD = {}, {}
from mytoolbox.text.size import wc_l
size_data = wc_l(path)
for line in tqdm(open(path),total=size_data):
data = json.loads(line.strip("\n,"))
if "geonameID" in data:
geonames2GD[data["geonameID"]]=data["id"]
if "wikidataID" in data:
wikidata2GD[data["wikidataID"]]=data["id"]
output_path = "geodict_final_29_04_19.json"
output = open(output_path,'w')
name_col = {"P131":"located_in_adm_terr_ent",
"P706":"located_in_terr_feature",
"P47":"share_border_with"}
for line in tqdm(open(path),total=size_data):
data = json.loads(line.strip("\n,"))
for property_ in ["P131","P706","P47"]:
if not property_ in data:
continue
data[name_col[property_]] = [wikidata2GD[id_] for id_ in data[property_] if id_ in wikidata2GD]
if "geonameID" in data and data["geonameID"] in inc_dict_geonames:
data["geoname_hierarchy"] = inc_dict_geonames[data["geonameID"]]
output.write("{0}\n,".format(json.dumps(data)))
\ No newline at end of file
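To make the loop above concrete, here is a minimal sketch of the transitive closure it builds, using made-up Geonames IDs (1 contains 2, 2 contains 3):

# Minimal sketch of the inclusion closure computed above (illustrative IDs only).
parent_of = {3: 2, 2: 1}            # childId -> parentId, like inclusion_relations_
closure = {}
for child, parent in parent_of.items():
    chain = [parent]
    while chain[-1] in parent_of:   # walk up the hierarchy until a root is reached
        chain.append(parent_of[chain[-1]])
    closure[child] = chain
print(closure)                      # {3: [2, 1], 2: [1]} -> every ancestor of each entity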
utils.py 0 → 100644
# coding = utf-8
import os
def blocks(files, size=65536):
while True:
b = files.read(size)
if not b: break
yield b
def wc_l(text_input_fn):
"""
Count the number of lines in a file
Parameters
----------
text_input_fn : str
filepath
"""
if not os.path.exists(text_input_fn):
raise FileNotFoundError("{0} does not exist!".format(text_input_fn))
with open(text_input_fn, "r", encoding="utf-8", errors='ignore') as f:
return sum(bl.count("\n") for bl in blocks(f))
......@@ -62,6 +62,7 @@ def read_Tsv(filename,encoding='ascii'):
column = text[0]
del text[0]
return pd.DataFrame(text,columns = column)
# Find the missing links for Wikipedia pages whose wikidata_IDs are not available
def finding_links(files):
missing_uri=[]
......
......@@ -2,9 +2,9 @@
class Property(object):
"""docstring for property."""
def __init__(self, id,isMultiple,type_):
def __init__(self, id, isMultiple, type_):
self.id=id
self.isMultiple=isMultiple
self.isMultiple = isMultiple
self.type=type_
def exists(self,data):
......@@ -14,4 +14,4 @@ class Property(object):
return False
def extractData(self,data):
return self.type.extractData(self.id,self.isMultiple,data)
return self.type.extractData(self.id, self.isMultiple, data)
# coding=utf-8
from gzip import GzipFile
import json
from utils import wc_l
class Reader(object):
"""docstring for Reader."""
def __init__(self, name, decoding):
......@@ -9,6 +13,7 @@ class Reader(object):
self.decoding = decoding
self.dump = GzipFile(name,'r')
self.line = self.dump.readline()
self.size_file = wc_l(name)
def has_next(self):
self.line = self.dump.readline().decode(self.decoding)
......
......@@ -125,15 +125,15 @@ class Time(Type):
def extractMultiple(self, propID, data):
result = []
for i in range(len(data['claims'][propID])):
result.append(parsedate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
result.append(parseDate(data['claims'][propID][i]['mainsnak']['datavalue']['value']['time']))
return result
def extractSingle(self, propID, data):
return parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
return parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
def check_conformity(self, propID, data):
try:
parsedate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
parseDate(data['claims'][propID][0]['mainsnak']['datavalue']['value']["time"])
return True
except Exception as e:
return False
......