# coding=utf-8
"""Text-processing pipeline: POS tagging, NER and disambiguation feeding an STR."""

import json
import re

from strpython.models.str import STR
from .models.transformation.transform import Generalisation, Expansion
from .nlp.disambiguator.disambiguator import Disambiguator
from .nlp.disambiguator.most_common import MostCommonDisambiguator
from .nlp.exception.disambiguator import NotADisambiguatorInstance
from .nlp.exception.ner import NotANERInstance
from .nlp.exception.tagger import NotATaggerInstance
from .nlp.ner.ner import NER
from .nlp.ner.stanford_ner import StanfordNER
from .nlp.pos_tagger.tagger import Tagger
from .nlp.pos_tagger.treetagger import TreeTagger


class Pipeline(object):
    """
    Run the whole treatment on a given text.

    A pipeline chains an optional POS tagger, a NER component and a
    disambiguator, and can build the STR object derived from their output.
    """

    def __init__(self, lang="english", **kwargs):
        """
        Create a pipeline, using default components for any that are not given.

        :param lang: language name; its first two characters are stored as
            ``self.lang`` and passed to the default NER, while the full name
            is passed to the default tagger
        :param kwargs: optional ``tagger``, ``ner`` and ``disambiguator``
            components overriding the defaults
        """
        self.lang = lang[:2]
        # Defaults are only instantiated when the caller did not supply a
        # component, so no default is constructed needlessly.
        self.tagger = kwargs["tagger"] if "tagger" in kwargs else TreeTagger(language=lang)
        self.ner = kwargs["ner"] if "ner" in kwargs else StanfordNER(lang=lang[:2])
        self.disambiguator = (
            kwargs["disambiguator"] if "disambiguator" in kwargs else MostCommonDisambiguator()
        )

    def parse(self, text, debug=False):
        """
        Tag (when the tagger is active), run NER, then disambiguate.

        :param text: input text
        :param debug: when true, print the disambiguated entities
        :return: tuple ``(count, output, se_identified)`` where ``output`` is
            the NER result and ``count`` / ``se_identified`` are the two
            values returned by the disambiguator
        """
        output = text
        # POS tagging is skipped entirely when the tagger is not active.
        if self.tagger.active:
            output = self.tagger.tag(output)
        # NER
        output = self.ner.identify(output)
        # Disambiguation
        count, se_identified = self.disambiguator.disambiguate(output, self.lang)
        if debug:
            print(se_identified)
        return count, output, se_identified

    def set_tagger(self, tagger):
        """
        Set the POS tagger used in the pipeline.

        :param tagger: a ``Tagger`` instance
        :raises NotATaggerInstance: if ``tagger`` is not a ``Tagger``
        """
        if isinstance(tagger, Tagger):
            self.tagger = tagger
        else:
            raise NotATaggerInstance()

    def set_ner(self, ner):
        """
        Set the NER component used in the pipeline.

        :param ner: a ``NER`` instance
        :raises NotANERInstance: if ``ner`` is not a ``NER``
        """
        if isinstance(ner, NER):
            self.ner = ner
        else:
            raise NotANERInstance()

    def set_disambiguator(self, disambiguator):
        """
        Set the disambiguator used in the pipeline.

        :param disambiguator: a ``Disambiguator`` instance
        :raises NotADisambiguatorInstance: if ``disambiguator`` is not a
            ``Disambiguator``
        """
        if isinstance(disambiguator, Disambiguator):
            self.disambiguator = disambiguator
        else:
            raise NotADisambiguatorInstance()

    @staticmethod
    def _is_usable_toponym(toponym, stop_words):
        """
        Tell whether a toponym should be sent to the disambiguator.

        A toponym is kept when its lowercase form is not in ``stop_words``,
        it contains no digit, and it is longer than three characters.
        """
        return (
            toponym.lower() not in stop_words
            and re.search(r"\d", toponym) is None
            and len(toponym) > 3
        )

    def build(self, text, se_identified=None, **kwargs):
        """
        Return the STR corresponding to a text.

        Recognised keyword arguments: ``adj`` and ``inc`` (both default to
        ``True``, passed to ``STR.build``), ``toponyms`` (default ``None``)
        and ``stop_words`` (default ``[]``). Every keyword argument is also
        forwarded unchanged to :meth:`transform`.

        :param text: input text
        :param se_identified: entities already identified; when falsy they
            are taken from :meth:`parse`
        :return: tuple ``(str_, count, str_.spatial_entities)``
        """
        adj = kwargs.get("adj", True)
        inc = kwargs.get("inc", True)
        toponyms = kwargs.get("toponyms", None)
        stop_words = kwargs.get("stop_words", [])

        if isinstance(toponyms, list):
            # A toponym list bypasses parsing: the filtered list is
            # disambiguated directly and the raw text is used as STR input.
            kept = [top for top in toponyms if self._is_usable_toponym(top, stop_words)]
            se_identified = self.disambiguator.disambiguate_list(kept, self.lang)
            count, output = {}, text
        elif not se_identified:
            count, output, se_identified = self.parse(text)
        else:
            # Keep the caller-supplied entities; only count and NER output
            # are taken from the parse.
            count, output, _ = self.parse(text)

        str_ = STR(output, se_identified)
        str_.build(adj=adj, inc=inc)
        str_ = self.transform(str_, **kwargs)
        # TODO : Add count
        return str_, count, str_.spatial_entities

    def transform(self, str_, **kwargs):
        """
        Apply the transformation selected by ``type_trans``, if any.

        :param str_: the STR to transform
        :param kwargs: when ``type_trans`` is absent ``str_`` is returned
            untouched; ``"gen"`` selects ``Generalisation`` and any other
            value selects ``Expansion``. The remaining keyword arguments are
            passed to the transformation.
        :return: the (possibly transformed) STR
        """
        if "type_trans" not in kwargs:
            return str_
        # Popped so that it is not forwarded to the transformation itself.
        type_trans = kwargs.pop("type_trans")
        if type_trans == "gen":
            str_ = Generalisation().transform(str_, **kwargs)
        else:
            str_ = Expansion().transform(str_, **kwargs)
        return str_


if __name__ == '__main__':
    pass