Commit 6140bbb5 authored by Pokiros

Debug + First Beta Release

Showing with 22 additions and 19 deletions
.gitignore 0 → 100644
+wikidata/__pycache__/*
# coding=utf-8
import os,json,sys,time,argparse,zipfile,urllib.request
-import pandas as import pd
+import pandas as pd
from gzip import GzipFile
from collections import defaultdict
from wikidata.helpers import *
@@ -9,7 +9,7 @@ from wikidata.entity_wd import *
from wikidata.property_wd import *
from wikidata.reader import Reader
from wikidata.process_wd import *
-from custom_process.wiki_extract import *
+from custom_process.basic_extraction import *
from custom_process.wiki_links import *
from custom_process.class_extraction import *
from custom_process.property_extract import *
@@ -161,7 +161,7 @@ def extract_missing_WikiIDS(wikidata_dump,interm_outfile,outfile):
    if not os.path.isfile(wikidata_dump):
        print('Give correct path to wikidata json dump ""\(-_-)/""')
        return None
-    df = read_tsv('./osm-boundaries/meta.tsv', encoding = 'utf-8', columns = True)#C:\\Users\shrivastava\Desktop
+    df = read_Tsv('./osm-boundaries/meta.tsv', encoding = 'utf-8')#C:\\Users\shrivastava\Desktop
    paths = ['./osm-boundaries/data/'+ path for path in df['path']]
    del df
    iterations = 0
@@ -257,26 +257,26 @@ def main():
    start=time.time()
    # Import the data sources required to be harvested for creation of gazetteer
-    import_data('./temp/labels.txt','./temp/class_codes.txt','./temp/labels.json')
+    # import_data('./temp/labels.txt','./temp/class_codes.txt','./temp/labels.json')
    # Create a first basic gazeteer
-    basic_gazetteer("./temp/labels.json", wikidata_dump,'./temp/output.json','resources/wd_page_rank.json')
+    # basic_gazetteer("./temp/labels.json", wikidata_dump,'./temp/output.json','resources/wd_page_rank.json')
    # Associate geonames classe to the instance_of(P31) values
-    extract_classes(wikidata_dump,'./temp/class_codes.txt','./temp/output.json','./temp/class_mapped.json')
+    # extract_classes(wikidata_dump,'./temp/class_codes.txt','./temp/output.json','./temp/class_mapped.json')
    # Add class to each entity
-    add_classes('./temp/class_mapped.json','./temp/output.json','temp/output_temp.json')
+    # add_classes('./temp/class_mapped.json','./temp/output.json','temp/output_temp.json')
    # Extract missing wikidata IDs in the boundary data
-    extract_missing_WikiIDS(wikidata_dump,'./temp/found_missing_links.json','./temp/missing_Wikidata_IDS.txt')
-    missing_wikidata_IDS('./temp/missing_Wikidata_IDS.txt','./osm-boundaries/meta.tsv','temp/meta_all.csv')
+    # extract_missing_WikiIDS(wikidata_dump,'./temp/found_missing_links.json','./temp/missing_Wikidata_IDS.txt')
+    # missing_wikidata_IDS('./temp/missing_Wikidata_IDS.txt','./osm-boundaries/meta.tsv','temp/meta_all.csv')
    # Adding properties from configuration_file
-    add_properties(wikidata_dump,'final_output.json','finale_output.json','configuration.json')
+    add_properties(wikidata_dump,'temp/output_temp.json','temp/finale_output.json','Configuration/configuration.json')
    # Add boundaries in the final data
-    adding_geometry('temp/meta_all.csv','temp/output_temp.json','out_final.json')
+    adding_geometry('temp/meta_all.csv','temp/finale_output.json','out_final.json')
    print("The gazeteer was created in {0} hours".format(((time.time()-start)/60)/60))
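With the earlier stages commented out, main() now runs only the property-enrichment and geometry steps against temp files left by a previous full run. A minimal sketch of an alternative that skips completed stages automatically instead of toggling comments by hand (run_stage is a hypothetical helper, not part of this repository; it assumes each stage's output path is a reliable completion marker):

import os

def run_stage(stage_fn, output_path, *args):
    # Skip the stage when its output already exists from an earlier run.
    if os.path.isfile(output_path):
        print("skipping {0}: {1} already exists".format(stage_fn.__name__, output_path))
        return
    stage_fn(*args)

# Example, reusing the arguments from main() above:
# run_stage(import_data, './temp/labels.json',
#           './temp/labels.txt', './temp/class_codes.txt', './temp/labels.json')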
File added
File added
File added
File added
File added
@@ -13,7 +13,7 @@ from wikidata.process_wd import *
class BasicExtraction(Process):
    def __init__(self, id, labels_fn,page_rank):
-        super(P1, Process.__init__(self, id))
+        super(BasicExtraction, Process.__init__(self, id))
        self.dataframe = {}
        self.instance_of_prop = Property("P31", True, EntityID())
        self.coord_prop = Property("P625", False, Coordinates())
@@ -69,7 +69,7 @@ class BasicExtraction(Process):
        if self.continent_prop.exists(entry):
            entity["continent"] = self.continent_prop.extractData(entry)
        if entry['id'] in self.scores.keys():
-            entity['score'] = self.scores[keys]
+            entity['score'] = self.scores[entry["id"]]
        # setting GeoName ID
        # self.dataframe.append(entity)
        self.dataframe[entry['id']] = entity
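Note that even after the rename, super(BasicExtraction, Process.__init__(self, id)) is not the conventional call: it executes Process.__init__ eagerly and passes its return value (None) as the second argument to super(). A minimal sketch of the usual pattern, assuming Process.__init__ takes the id:

class BasicExtraction(Process):
    def __init__(self, id, labels_fn, page_rank):
        # Let super() resolve Process.__init__ instead of calling it by hand.
        super(BasicExtraction, self).__init__(id)
        self.dataframe = {}

The score fix itself is correct; as a small detail, if entry['id'] in self.scores: works without the .keys() call, and self.scores.get(entry['id']) would do membership test and lookup in one step.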
@@ -15,7 +15,7 @@ from wikidata.process_wd import *
class PropertyExtract(Process):
    def __init__(self, id, properties, data):
-        super(Property_Extract, Process.__init__(self, id))
+        super(PropertyExtract, Process.__init__(self, id))
        self.dataframe = {}
        #self.extract_prop = Property(prop, istype, String())
        self.properties_to_extract = properties['properties_to_extract']
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
@@ -46,19 +46,21 @@ def columns_extract(text):
    return pd.DataFrame(text, columns=columns)
-def read_tsv(filename,encoding='ascii'):
+def read_Tsv(filename,encoding='ascii'):
-    f = open(filename,encoding)
+    f = open(filename,encoding=encoding)
    text = f.read()
    f.close()
    text = text.split('\n')
-    for line in text:
+    for line in range(len(text)):
        text[line] = text[line].split('\t')
    column = text[0]
-    text.remove(col)
+    del text[0]
    return pd.DataFrame(text,columns = column)
#finding the missing link for wikipedia pages for which wikidata_IDs are not available
def finding_links(files):
    missing_uri=[]
+    iterations=0
+    total=len(files)
    for file_name in files:
        f = open(file_name,encoding = 'utf-8')
        dataframe = json.load(f)
@@ -83,7 +85,7 @@ def columns_extract(text):
    return pd.DataFrame(text,columns = columns)
-def read_Tsv(filename,encoding='ascii',columns = False):
+def read_tsv(filename,encoding='ascii',columns = False):
    f = open(filename,encoding = encoding)
    text = f.read()
    f.close()
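Both hand-rolled TSV readers reimplement what pandas already ships. A minimal equivalent sketch using the standard pandas.read_csv API, assuming the first row carries the column names as read_Tsv expects:

import pandas as pd

def read_tsv(filename, encoding='utf-8'):
    # header=0 takes the first row as column names, like read_Tsv above;
    # dtype=str keeps every cell as text instead of letting pandas guess types.
    return pd.read_csv(filename, sep='\t', encoding=encoding, header=0, dtype=str)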