# disambiguator.py 2.02 KiB
# coding: utf-8

import copy
import string

import numpy as np

from ..ner.ner import NER


class Disambiguator(object):
    """
    Base class for toponym disambiguators.

    Subclasses are expected to override ``disambiguate_one_by_one`` and/or
    ``disambiguate_context_based``; both raise ``NotImplementedError`` here.
    """

    def __init__(self, one_by_one=False, context_based=False):
        """
        Constructor for Disambiguator

        Parameters
        ----------
        one_by_one : bool
            stored on the instance; not read by the dispatch in ``disambiguate``
        context_based : bool
            if True, ``disambiguate`` calls ``disambiguate_context_based``,
            otherwise it calls ``disambiguate_one_by_one``
        """
        self.one_by_one = one_by_one
        self.context_based = context_based

    def disambiguate(self, lang, ner_output=None, toponyms=None):
        """
        Run the disambiguation on the NER output or on a list of toponyms

        Parameters
        ----------
        lang : str
            language
        ner_output : 2D numpy array, optional
            NER output with two columns (token(s), tag). When it is a valid
            two-column array, the toponyms are extracted from it and the
            ``toponyms`` argument is ignored.
        toponyms : list, optional
            toponyms to disambiguate, used when no usable ``ner_output`` is given

        Returns
        -------
        dict
            {toponym : geodictID}; an empty dict when ``ner_output`` has a
            shape that is not two-dimensional (e.g. an empty NER result)

        Raises
        ------
        ValueError
            if neither a usable ``ner_output`` nor a non-empty ``toponyms``
            is provided
        """
        # ``ner_output`` is optional: read its shape defensively so that
        # calling with ``toponyms`` only (ner_output=None) does not fail
        # with an AttributeError.
        shape = getattr(ner_output, "shape", None)

        if isinstance(ner_output, np.ndarray) and len(shape) == 2 and shape[1] == 2:
            toponyms = self.parse_ner_output(ner_output)
        elif shape is not None and len(shape) != 2:
            # Malformed / empty NER output: nothing to disambiguate.
            return {}
        elif not toponyms:
            raise ValueError("Either enter a list of toponyms or give ner_output")

        if self.context_based:
            return self.disambiguate_context_based(toponyms, lang)
        return self.disambiguate_one_by_one(toponyms, lang)

    def disambiguate_one_by_one(self, toponyms, lang):
        """
        Disambiguation process when toponyms are geocoded one by one.

        Parameters
        ----------
        toponyms : list
            toponyms
        lang : str
            language

        Returns
        -------
        dict
            {toponym : geodictID}

        Raises
        ------
        NotImplementedError
            always, in this base class
        """
        raise NotImplementedError

    def disambiguate_context_based(self, toponyms, lang):
        """
        Disambiguation process when toponyms are geocoded using each one of them

        Parameters
        ----------
        toponyms : list
            toponyms
        lang : str
            language

        Returns
        -------
        dict
            {toponym : geodictID}

        Raises
        ------
        NotImplementedError
            always, in this base class
        """
        raise NotImplementedError

    def parse_ner_output(self, ner_output):
        """
        Extract the toponyms from the NER output

        Parameters
        ----------
        ner_output : 2D numpy array
            NER output; column 0 holds the token(s), column 1 holds the tag

        Returns
        -------
        list
            one string per row tagged as a place; a non-string first column
            is joined with single spaces
        """
        # Keep only the rows whose tag equals the unified "place" tag.
        place_rows = ner_output[ner_output[:, 1] == NER._unified_tag["place"]]
        return [
            row[0] if isinstance(row[0], str) else " ".join(row[0])
            for row in place_rows
        ]