An error occurred while loading the file. Please try again.
-
Commandre Benjamin authored33ba8fa7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# coding: utf-8
import math
from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
from ...models.str import get_inclusion_chain
class GauravGeodict(Disambiguator):
    """Disambiguate place names by scoring candidates against fixed entities.

    A label with exactly one gazetteer match is "fixed"; a label with several
    matches is "ambiguous". Each candidate of an ambiguous label is scored
    against every fixed entity using P47 adjacency, hull collision and the
    number of shared inclusion-chain entries (see ``disambiguateOne``).

    ``gazetteer``, ``collisionTwoSEBoundaries`` and ``get_most_common_id_v3``
    are not defined in this class; presumably they come from the star imports
    at the top of the module.
    """

    def __init__(self):
        Disambiguator.__init__(self)

    def fib_formula(self, n):
        """Return the n-th Fibonacci number using Binet's closed form.

        Deviates from the usual sequence on purpose: both ``n == 0`` and
        ``n == 1`` yield 0.
        """
        if n in (0, 1):
            return 0  # deliberate modification of the Fibonacci behaviour
        sqrt5 = math.sqrt(5)
        golden_ratio = (1 + sqrt5) / 2
        val = (golden_ratio ** n - (1 - golden_ratio) ** n) / sqrt5
        return int(round(val))

    def inclusion_log(self, x, alpha=0.2):
        """Return ``math.log(x)``, or 1 when ``x`` is 0.

        ``alpha`` is accepted for interface compatibility but is not used.

        NOTE(review): ``x == 0`` yields 1 while ``x == 1`` yields
        ``log(1) == 0``, so zero scores higher than one. Confirm this is
        intended and not just a guard against ``log(0)``.
        """
        if x == 0:
            return 1
        return math.log(x)

    def get_inclusion_tree(self, id_, prop):
        """Return the chain of ``prop`` targets found by walking up from ``id_``.

        Starting from the entity ``id_``, the first value of ``prop`` is
        appended and followed until an entity without ``prop`` is reached.
        The id of the gazetteer entry labelled "Earth" (in "en") is then
        appended as the last element.
        """
        arr = []
        current_entity = gazetteer.get_by_id(id_)[0]
        # The loop only ends when an entity lacking `prop` is reached.
        while prop in current_entity:
            parent_id = current_entity[prop][0]
            arr.append(parent_id)
            # NOTE(review): unlike get_by_id above, this result is not indexed
            # with [0]; confirm get_by_other_id returns a single entity.
            current_entity = gazetteer.get_by_other_id(parent_id, "wikidata")
        arr.append(gazetteer.get_by_label("Earth", "en")[0].id)  # Earth ID
        return arr

    def _shared_chain_size(self, id1, id2, prop):
        """Count distinct entries common to both entities' ``prop`` chains."""
        chain1 = get_inclusion_chain(id1, prop)
        chain2 = get_inclusion_chain(id2, prop)
        return len(set(chain1).intersection(chain2))

    def get_inclusion_score(self, id1, id2):  # is it really inclusion ? :)
        """Score two entities by the overlap of their P131 and P706 chains.

        For each property the number of shared chain entries is passed
        through ``inclusion_log``; the two results are summed.
        """
        inter_p131 = self._shared_chain_size(id1, id2, 'P131')
        inter_p706 = self._shared_chain_size(id1, id2, 'P706')
        return self.inclusion_log(inter_p131) + self.inclusion_log(inter_p706)

    def Adjacency_P47(self, id1, id2):
        """Return True when either entity lists the other in its P47 values.

        Both entities must contain "P47"; otherwise the result is False.
        """
        data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
        # NOTE(review): presence is tested on the entity itself, but the values
        # are read from `.other.P47`; confirm both refer to the same data.
        if "P47" not in data_1 or "P47" not in data_2:
            return False
        return id1 in data_2.other.P47 or id2 in data_1.other.P47

    def Adjacency_Hull(self, id1, id2):
        """Return the result of ``collisionTwoSEBoundaries`` for the two ids."""
        return collisionTwoSEBoundaries(id1, id2)

    def disambiguateOne(self, spat_candidates, fixed_entities):
        """Pick the best candidate id for one ambiguous label.

        Every candidate is scored against every fixed entity:
        +3 if they are P47-adjacent, otherwise +2 if their hulls collide,
        plus the inclusion score in all cases.

        :param spat_candidates: iterable of candidate entities (each has ``.id``)
        :param fixed_entities: mapping label -> entity (each has ``.id``)
        :return: id of the highest-scoring candidate, or None when there are
            no candidates or the best score is below 4
        """
        score_dc = {}
        for cand in spat_candidates:
            id_cand = cand.id
            score = 0
            for fixed in fixed_entities.values():
                id_fixed = fixed.id
                if self.Adjacency_P47(id_cand, id_fixed):
                    score += 3
                elif self.Adjacency_Hull(id_cand, id_fixed):
                    score += 2
                score += self.get_inclusion_score(id_cand, id_fixed)
            score_dc[id_cand] = score

        # max() raises ValueError on an empty mapping; no candidates simply
        # means there is nothing to select.
        if not score_dc:
            return None

        best_id = max(score_dc, key=score_dc.get)
        if score_dc[best_id] < 4:
            return None
        # best_id is already a candidate id, so no second pass is needed.
        return best_id

    def eval(self, se_, lang):
        """Resolve every label in ``se_`` to a gazetteer id.

        Labels are looked up by label, then by alias when the label lookup is
        empty. A single match is fixed directly; several matches are
        disambiguated with ``disambiguateOne``, falling back to
        ``get_most_common_id_v3`` when no candidate is selected. Labels with
        no match at all are omitted from the result.

        :param se_: iterable of entity labels
        :param lang: language code passed to the gazetteer lookups
        :return: dict mapping label -> entity id
        """
        fixed_entities = {}
        ambiguous_entities = {}
        for en in se_:
            request = gazetteer.get_by_label(en, lang)
            if len(request) == 0:
                request = gazetteer.get_by_alias(en, lang)
            if len(request) > 1:
                ambiguous_entities[en] = request
            elif len(request) == 1:
                fixed_entities[en] = request[0]

        d_amb_results = {}
        for amb_ent, candidates in ambiguous_entities.items():
            d = self.disambiguateOne(candidates, fixed_entities)
            if d is None:
                # No candidate reached the threshold: use the most common id.
                d = get_most_common_id_v3(amb_ent, lang).id
            d_amb_results[amb_ent] = d

        # Fixed entities first, then disambiguated ones (key sets are disjoint).
        resolved = {label: entity.id for label, entity in fixed_entities.items()}
        resolved.update(d_amb_results)
        return resolved