# geodict_gaurav.py (3.83 KiB)
# coding: utf-8
import math

from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator

from ...models.str import get_inclusion_chain


class GauravGeodict(Disambiguator):
    """Disambiguate place names by scoring candidates against fixed entities.

    Labels that match exactly one gazetteer entry are treated as "fixed".
    Each candidate of an ambiguous label is scored against every fixed
    entity using adjacency (``P47`` data or boundary collision) and the
    number of shared entries in the ``P131`` / ``P706`` inclusion chains.
    """

    # Score added when two entities reference each other through "P47"
    # (presumably the Wikidata "shares border with" property -- confirm).
    P47_BONUS = 3
    # Score added when the boundaries of the two entities collide.
    HULL_BONUS = 2
    # A best candidate scoring below this value is rejected.
    MIN_SCORE = 4
    # Properties whose inclusion chains are compared by get_inclusion_score.
    INCLUSION_PROPERTIES = ("P131", "P706")

    def __init__(self):
        Disambiguator.__init__(self)

    def fib_formula(self, n):
        """Return the n-th Fibonacci number using the closed-form formula.

        The value for ``n == 1`` is deliberately 0 instead of 1 (modified
        Fibonacci behaviour). The computation uses floating point, so very
        large ``n`` may lose precision.
        """
        if n in (0, 1):
            return 0  # Modifying fibonacci behaviour
        sqrt5 = math.sqrt(5)
        golden_ratio = (1 + sqrt5) / 2
        val = (golden_ratio ** n - (1 - golden_ratio) ** n) / sqrt5
        return int(round(val))

    def inclusion_log(self, x, alpha=0.2):
        """Return ``log(x)``, or 1 when ``x`` is 0.

        :param x: number of shared inclusion-chain entries (non-negative).
        :param alpha: unused; kept so existing callers keep working.

        NOTE(review): ``x == 0`` yields 1 while ``x == 1`` yields 0, so no
        overlap scores higher than an overlap of one. Left unchanged because
        the intended weighting cannot be determined from this file.
        """
        if x == 0:
            return 1
        return math.log(x)

    def get_inclusion_tree(self, id_, prop):
        """
        For an entity, return its geographical inclusion tree using a property.

        Starting from the entity ``id_``, repeatedly follow the first value of
        ``prop`` and collect the identifiers met on the way. When an entity
        does not carry ``prop``, the id of the entry labelled "Earth" is
        appended and the walk stops.

        NOTE(review): the first lookup takes element ``[0]`` of the result
        while the lookup inside the loop does not -- confirm the return type
        of ``gazetteer.get_by_other_id``. There is also no guard against
        cyclic inclusion data.
        """
        arr = []
        current_entity = gazetteer.get_by_id(id_)[0]
        while True:
            if prop in current_entity:
                parent_id = current_entity[prop][0]
                arr.append(parent_id)
                current_entity = gazetteer.get_by_other_id(parent_id, "wikidata")
            else:
                arr.append(gazetteer.get_by_label("Earth", "en")[0].id)  # Earth ID
                break
        return arr

    def _shared_chain_size(self, id1, id2, prop):
        """Count distinct entries common to both entities' chains for ``prop``."""
        chain1 = get_inclusion_chain(id1, prop)
        chain2 = get_inclusion_chain(id2, prop)
        return len(set(chain1).intersection(chain2))

    def get_inclusion_score(self, id1, id2):  # is it really inclusion ? :)
        """Return the inclusion-based similarity of two entities.

        For every property in ``INCLUSION_PROPERTIES`` the number of entries
        shared by both inclusion chains is passed through ``inclusion_log``;
        the results are summed.
        """
        return sum(
            self.inclusion_log(self._shared_chain_size(id1, id2, prop))
            for prop in self.INCLUSION_PROPERTIES
        )

    def Adjacency_P47(self, id1, id2):
        """Return True when either entity lists the other in its P47 data.

        Both entities must carry a "P47" entry, otherwise False is returned.
        """
        data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
        if "P47" not in data_1 or "P47" not in data_2:
            return False
        return id1 in data_2.other.P47 or id2 in data_1.other.P47

    def Adjacency_Hull(self, id1, id2):
        """Return the result of the boundary collision test for two entities."""
        return collisionTwoSEBoundaries(id1, id2)

    def disambiguateOne(self, spat_candidates, fixed_entities):
        """Pick the best candidate id for one ambiguous label.

        :param spat_candidates: iterable of candidate entities (each has ``.id``).
        :param fixed_entities: mapping label -> unambiguous entity (with ``.id``).
        :return: id of the highest-scoring candidate, or None when there is
            no candidate or the best score is below ``MIN_SCORE``.
        """
        scores = {}
        for cand in spat_candidates:
            cand_id = cand.id
            total = 0
            for fixed in fixed_entities.values():
                fixed_id = fixed.id
                # P47 adjacency takes precedence; the hull test is only a
                # fallback, so the two bonuses never add up.
                if self.Adjacency_P47(cand_id, fixed_id):
                    total += self.P47_BONUS
                elif self.Adjacency_Hull(cand_id, fixed_id):
                    total += self.HULL_BONUS
                total += self.get_inclusion_score(cand_id, fixed_id)
            scores[cand_id] = total

        # max() raises ValueError on an empty mapping; no candidate means
        # no decision.
        if not scores:
            return None

        best_id = max(scores, key=scores.get)
        if scores[best_id] < self.MIN_SCORE:
            return None
        return best_id

    def eval(self, se_, lang):
        """Resolve every label of ``se_`` to a gazetteer id.

        :param se_: iterable of place labels.
        :param lang: language code passed to the gazetteer lookups.
        :return: dict mapping label -> selected id. Labels without any
            gazetteer match are omitted.
        """
        fixed_entities = {}
        ambiguous_entities = {}
        for en in se_:
            request = gazetteer.get_by_label(en, lang)
            # len() is used on purpose: the truthiness of the gazetteer
            # result type is not known from this file.
            if len(request) == 0:
                request = gazetteer.get_by_alias(en, lang)

            if len(request) > 1:
                ambiguous_entities[en] = request
            elif len(request) == 1:
                fixed_entities[en] = request[0]

        # fixed_entities must keep the entity objects while disambiguating,
        # so the ids are collected into a separate result dict.
        resolved = {label: entity.id for label, entity in fixed_entities.items()}
        for label, candidates in ambiguous_entities.items():
            chosen = self.disambiguateOne(candidates, fixed_entities)
            if chosen is None:
                # No confident choice: fall back to the most common entity.
                chosen = get_most_common_id_v3(label, lang).id
            resolved[label] = chosen

        return resolved