Commit 6140bbb5 authored by Pokiros

Debug + First Beta Release

Showing with 22 additions and 19 deletions
.gitignore 0 → 100644
+wikidata/__pycache__/*
# coding=utf-8
import os,json,sys,time,argparse,zipfile,urllib.request
-import pandas as import pd
+import pandas as pd
from gzip import GzipFile
from collections import defaultdict
from wikidata.helpers import *
@@ -9,7 +9,7 @@ from wikidata.entity_wd import *
from wikidata.property_wd import *
from wikidata.reader import Reader
from wikidata.process_wd import *
-from custom_process.wiki_extract import *
+from custom_process.basic_extraction import *
from custom_process.wiki_links import *
from custom_process.class_extraction import *
from custom_process.property_extract import *
@@ -161,7 +161,7 @@ def extract_missing_WikiIDS(wikidata_dump,interm_outfile,outfile):
    if not os.path.isfile(wikidata_dump):
        print('Give correct path to wikidata json dump ""\(-_-)/""')
        return None
-    df = read_tsv('./osm-boundaries/meta.tsv', encoding = 'utf-8', columns = True)#C:\\Users\shrivastava\Desktop
+    df = read_Tsv('./osm-boundaries/meta.tsv', encoding = 'utf-8')#C:\\Users\shrivastava\Desktop
    paths = ['./osm-boundaries/data/'+ path for path in df['path']]
    del df
    iterations = 0
@@ -257,26 +257,26 @@ def main():
    start=time.time()
    # Import the data sources required to be harvested for creation of gazetteer
-    import_data('./temp/labels.txt','./temp/class_codes.txt','./temp/labels.json')
+    # import_data('./temp/labels.txt','./temp/class_codes.txt','./temp/labels.json')
    # Create a first basic gazeteer
-    basic_gazetteer("./temp/labels.json", wikidata_dump,'./temp/output.json','resources/wd_page_rank.json')
+    # basic_gazetteer("./temp/labels.json", wikidata_dump,'./temp/output.json','resources/wd_page_rank.json')
    # Associate geonames classe to the instance_of(P31) values
-    extract_classes(wikidata_dump,'./temp/class_codes.txt','./temp/output.json','./temp/class_mapped.json')
+    # extract_classes(wikidata_dump,'./temp/class_codes.txt','./temp/output.json','./temp/class_mapped.json')
    # Add class to each entity
-    add_classes('./temp/class_mapped.json','./temp/output.json','temp/output_temp.json')
+    # add_classes('./temp/class_mapped.json','./temp/output.json','temp/output_temp.json')
    # Extract missing wikidata IDs in the boundary data
-    extract_missing_WikiIDS(wikidata_dump,'./temp/found_missing_links.json','./temp/missing_Wikidata_IDS.txt')
-    missing_wikidata_IDS('./temp/missing_Wikidata_IDS.txt','./osm-boundaries/meta.tsv','temp/meta_all.csv')
+    # extract_missing_WikiIDS(wikidata_dump,'./temp/found_missing_links.json','./temp/missing_Wikidata_IDS.txt')
+    # missing_wikidata_IDS('./temp/missing_Wikidata_IDS.txt','./osm-boundaries/meta.tsv','temp/meta_all.csv')
    # Adding properties from configuration_file
-    add_properties(wikidata_dump,'final_output.json','finale_output.json','configuration.json')
+    add_properties(wikidata_dump,'temp/output_temp.json','temp/finale_output.json','Configuration/configuration.json')
    # Add boundaries in the final data
-    adding_geometry('temp/meta_all.csv','temp/output_temp.json','out_final.json')
+    adding_geometry('temp/meta_all.csv','temp/finale_output.json','out_final.json')
    print("The gazeteer was created in {0} hours".format(((time.time()-start)/60)/60))
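With the earlier stages commented out, main() now runs only the property-enrichment and geometry steps against temp files left by a previous full run. A minimal sketch of an alternative that skips completed stages automatically instead of toggling comments by hand (run_stage is a hypothetical helper, not part of this repository; it assumes each stage's output path is a reliable completion marker):

import os

def run_stage(stage_fn, output_path, *args):
    # Skip the stage when its output already exists from an earlier run.
    if os.path.isfile(output_path):
        print("skipping {0}: {1} already exists".format(stage_fn.__name__, output_path))
        return
    stage_fn(*args)

# Example, reusing the arguments from main() above:
# run_stage(import_data, './temp/labels.json',
#           './temp/labels.txt', './temp/class_codes.txt', './temp/labels.json')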
File added
File added
File added
File added
File added
@@ -13,7 +13,7 @@ from wikidata.process_wd import *
class BasicExtraction(Process):
    def __init__(self, id, labels_fn,page_rank):
-        super(P1, Process.__init__(self, id))
+        super(BasicExtraction, Process.__init__(self, id))
        self.dataframe = {}
        self.instance_of_prop = Property("P31", True, EntityID())
        self.coord_prop = Property("P625", False, Coordinates())
@@ -69,7 +69,7 @@ class BasicExtraction(Process):
        if self.continent_prop.exists(entry):
            entity["continent"] = self.continent_prop.extractData(entry)
        if entry['id'] in self.scores.keys():
-            entity['score'] = self.scores[keys]
+            entity['score'] = self.scores[entry["id"]]
        # setting GeoName ID
        # self.dataframe.append(entity)
        self.dataframe[entry['id']] = entity
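Note that even after the rename, super(BasicExtraction, Process.__init__(self, id)) is not the conventional call: it executes Process.__init__ eagerly and passes its return value (None) as the second argument to super(). A minimal sketch of the usual pattern, assuming Process.__init__ takes the id:

class BasicExtraction(Process):
    def __init__(self, id, labels_fn, page_rank):
        # Let super() resolve Process.__init__ instead of calling it by hand.
        super(BasicExtraction, self).__init__(id)
        self.dataframe = {}

The score fix itself is correct; as a small detail, if entry['id'] in self.scores: works without the .keys() call, and self.scores.get(entry['id']) would do membership test and lookup in one step.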
@@ -15,7 +15,7 @@ from wikidata.process_wd import *
class PropertyExtract(Process):
    def __init__(self, id, properties, data):
-        super(Property_Extract, Process.__init__(self, id))
+        super(PropertyExtract, Process.__init__(self, id))
        self.dataframe = {}
        #self.extract_prop = Property(prop, istype, String())
        self.properties_to_extract = properties['properties_to_extract']
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
@@ -46,19 +46,21 @@ def columns_extract(text):
    return pd.DataFrame(text, columns=columns)
-def read_tsv(filename,encoding='ascii'):
+def read_Tsv(filename,encoding='ascii'):
-    f = open(filename,encoding)
+    f = open(filename,encoding=encoding)
    text = f.read()
    f.close()
    text = text.split('\n')
-    for line in text:
+    for line in range(len(text)):
        text[line] = text[line].split('\t')
    column = text[0]
-    text.remove(col)
+    del text[0]
    return pd.DataFrame(text,columns = column)
#finding the missing link for wikipedia pages for which wikidata_IDs are not available
def finding_links(files):
    missing_uri=[]
+    iterations=0
+    total=len(files)
    for file_name in files:
        f = open(file_name,encoding = 'utf-8')
        dataframe = json.load(f)
@@ -83,7 +85,7 @@ def columns_extract(text):
    return pd.DataFrame(text,columns = columns)
-def read_Tsv(filename,encoding='ascii',columns = False):
+def read_tsv(filename,encoding='ascii',columns = False):
    f = open(filename,encoding = encoding)
    text = f.read()
    f.close()
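Both hand-rolled TSV readers reimplement what pandas already ships. A minimal equivalent sketch using the standard pandas.read_csv API, assuming the first row carries the column names as read_Tsv expects:

import pandas as pd

def read_tsv(filename, encoding='utf-8'):
    # header=0 takes the first row as column names, like read_Tsv above;
    # dtype=str keeps every cell as text instead of letting pandas guess types.
    return pd.read_csv(filename, sep='\t', encoding=encoding, header=0, dtype=str)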