Commit 755998a6 authored by Fize Jacques

Debug, STR modification for faster generation, debug disambiguators, update pipeline, debug document selection
parent 5da5fbd1
Showing with 1112 additions and 301 deletions
@@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
from langdetect import detect
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from strpython.nlp.disambiguator.geodict_gaurav import *
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
import networkx as nx
......
@@ -6,7 +6,7 @@ import argparse,glob, string,time,re
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
from strpython.models.str import STR
from strpython.nlp.disambiguator.geodict_gaurav import *
from strpython.nlp.disambiguator.share_prop import *
from strpython.pipeline import *
import pandas as pd
import networkx as nx
......
@@ -33,7 +33,7 @@ selected = json.load(open(args.selectedFile))
for fn in matrix_fns:
measure = os.path.basename(fn).split("_")[0]
type_= "_".join(fn.split("_")[1:]).replace(".npy.bz2","")
type_= "_".join(os.path.basename(fn).split("_")[1:]).replace(".npy.bz2","")
print("Proceeding...",measure, type_)
df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
selected,
......
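The fixed line above applies os.path.basename before splitting, so underscores in directory names no longer leak into the parsed fields. A minimal sketch, using a hypothetical path:

import os

fn = "results/run_2018/cosine_str_ext.npy.bz2"  # hypothetical path

# Old parsing split the full path, so directory parts polluted type_:
"_".join(fn.split("_")[1:]).replace(".npy.bz2", "")  # -> "2018/cosine_str_ext"

# Fixed parsing takes the basename first:
base = os.path.basename(fn)                           # "cosine_str_ext.npy.bz2"
measure = base.split("_")[0]                          # "cosine"
type_ = "_".join(base.split("_")[1:]).replace(".npy.bz2", "")  # "str_ext"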
@@ -21,7 +21,7 @@ from strpython.nlp.ner.polyglot import Polyglot as poly_ner
from strpython.nlp.ner.stanford_ner import StanfordNER as stanford_ner
from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator as wiki_d
from strpython.nlp.disambiguator.geodict_gaurav import GauravGeodict as shared_geo_d
from strpython.nlp.disambiguator.share_prop import ShareProp as shared_geo_d
from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as most_common_d
from mytoolbox.text.clean import *
......
# coding = utf-8
from shapely.geometry import Point
from ..nlp.disambiguator.geodict_gaurav import GauravGeodict
from ..nlp.disambiguator.share_prop import ShareProp
from ..nlp.disambiguator.most_common import MostCommonDisambiguator
from ..nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator
......
This diff is collapsed.
# coding = utf-8
\ No newline at end of file
# coding = utf-8
from .most_common import MostCommonDisambiguator
from .share_prop import ShareProp
from .wikipedia_cooc import WikipediaDisambiguator
from .disambiguator import Disambiguator
\ No newline at end of file
@@ -10,53 +10,62 @@ from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
def __init__(self,one_by_one=False,context_based=False):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
self.one_by_one= one_by_one
self.context_based=context_based
def disambiguate(self,lang,ner_output=None,toponyms=None):
"""
Run the disambiguation on the NER output
Parameters
----------
lang : str
language of the input text
ner_output : 2D numpy array, optional
NER output as (token, tag) rows
toponyms : list, optional
toponyms to disambiguate, as an alternative to ner_output
Returns
-------
dict
{toponym : geodictID}
"""
if isinstance(ner_output, np.ndarray) and ner_output.shape[1] == 2:
toponyms = self.parse_ner_output(ner_output)
elif not toponyms:
raise ValueError("Either enter a list of toponyms or give ner_output")
if self.context_based:
return self.disambiguate_context_based(toponyms,lang)
else:
return self.disambiguate_one_by_one(toponyms,lang)
def disambiguate_one_by_one(self, toponyms, lang):
"""
Disambiguation process when toponyms are geocoded one by one.
Parameters
----------
toponyms : list
toponyms to geocode
lang : str
language of the toponyms
Returns
-------
dict
{toponym : geodictID}
"""
raise NotImplementedError
def disambiguate_context_based(self,toponyms,lang):
"""
Disambiguation process when toponyms are geocoded jointly, each toponym
serving as context for the others.
Parameters
----------
toponyms : list
toponyms to geocode
lang : str
language of the toponyms
Returns
-------
dict
{toponym : geodictID}
"""
raise NotImplementedError
def parse_ner_output(self,ner_output):
return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]]
\ No newline at end of file
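The refactored base class dispatches on the one_by_one and context_based flags set by each subclass. A minimal sketch of that contract, with a hypothetical DummyDisambiguator that is not part of this commit:

import numpy as np
from strpython.nlp.disambiguator.disambiguator import Disambiguator
from strpython.nlp.ner.ner import NER

class DummyDisambiguator(Disambiguator):
    def __init__(self):
        # declare the mode; the base-class disambiguate() dispatches on it
        Disambiguator.__init__(self, one_by_one=True)

    def disambiguate_one_by_one(self, toponyms, lang):
        # toy resolution: every toponym maps to a fake id
        return {t: "GD0" for t in toponyms}

d = DummyDisambiguator()
# either pass toponyms directly...
d.disambiguate("en", toponyms=["Paris", "Montpellier"])
# ...or a 2-column NER output of (token, tag) rows:
ner = np.array([["Paris", NER._unified_tag["place"]], ["eats", "O"]])
d.disambiguate("en", ner_output=ner)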
@@ -28,40 +28,28 @@ common_words = {
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
Disambiguator.__init__(self,one_by_one=True)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
def disambiguate_one_by_one(self, toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
result[toponym]=id_
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
if label.lower().rstrip("s") in stop_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
if plural.lower() in stop_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
......
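With disambiguate_one_by_one fixed to return {toponym: id}, the result now matches the base-class contract; the old disambiguate_list returned the inverted mapping. A usage sketch with illustrative ids, assuming the refactored base-class disambiguate():

from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator

d = MostCommonDisambiguator()
d.disambiguate("fr", toponyms=["Paris", "Lyon"])
# before this commit: {"<id_paris>": "Paris", "<id_lyon>": "Lyon"}
# after this commit:  {"Paris": "<id_paris>", "Lyon": "<id_lyon>"}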
# coding = utf-8
import math
from ...helpers.collision import *
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
from ...models.str import get_inclusion_chain
class ShareProp(Disambiguator):
def __init__(self):
Disambiguator.__init__(self,context_based=True)
def fib_formula(self, n):
"""
Return the n-th Fibonacci number.
Parameters
----------
n : int
parameter
Returns
-------
int
the n-th Fibonacci number
"""
if n in [0, 1]: return 0 # deviates from standard Fibonacci: F(1) is forced to 0
golden_ratio = (1 + math.sqrt(5)) / 2
val = (golden_ratio ** n - (1 - golden_ratio) ** n) / math.sqrt(5)
return int(round(val))
def inclusion_log(self, x):
"""
Return the inclusion log
Parameters
----------
x : int
size of an inclusion-chain intersection
Returns
-------
float
inclusion log (1 when x is 0)
"""
if x==0:
return 1
return math.log(x)
def get_inclusion_score(self, id1, id2):
"""
Return the inclusion score, which measures how much of the administrative (P131) and physical-feature (P706) inclusion chains two entities share.
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
float
inclusion score
"""
list1 = get_inclusion_chain(id1, 'P131')
list2 = get_inclusion_chain(id2, 'P131')
interP131 = len(list(set(list1).intersection(list2)))
list1 = get_inclusion_chain(id1, 'P706')
list2 = get_inclusion_chain(id2, 'P706')
interP706 = len(list(set(list1).intersection(list2)))
# return fib_no[interP131]+fib_no[interP706]
return self.inclusion_log(interP131) + self.inclusion_log(interP706)
def Adjacency_P47(self, id1, id2):
"""
Return True if two spatial entities are adjacent according to the Wikidata P47 property ("shares border with").
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
bool
true if adjacent using P47
"""
data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0]
if "P47" in data_1 and "P47" in data_2:
if id1 in data_2.other.P47 or id2 in data_1.other.P47:
return True
return False
def Adjacency_Hull(self, id1, id2):
"""
Return True if the hulls of the two spatial entities "collide" (intersect).
Parameters
----------
id1 : str
id of the first spatial entity
id2 : str
id of the second spatial entity
Returns
-------
bool
True if the hulls collide
"""
return collisionTwoSEBoundaries(id1, id2)
def disambiguateOne(self, spat_candidates, fixed_entities):
"""
Disambiguate one toponym
Parameters
----------
spat_candidates : list
candidate entries found in the gazetteer
fixed_entities : dict
unambiguous entities used as disambiguation context
Returns
-------
str or None
id of the highest-scoring candidate, or None if its score is below 4
"""
score_dc = {}
for cand in spat_candidates:
id_cand = cand.id
score_dc[id_cand] = 0
for fixed in fixed_entities:
id_fixed = fixed_entities[fixed].id
if self.Adjacency_P47(id_cand, id_fixed):
score_dc[id_cand] += 3
elif self.Adjacency_Hull(id_cand, id_fixed):
score_dc[id_cand] += 2
score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed)
m = max(score_dc, key=score_dc.get)
if score_dc[m] < 4:
return None
for cand in spat_candidates:
if cand.id == m:
return cand.id
def disambiguate_context_based(self,toponyms,lang):
selected_en = {}
fixed_entities = {}
ambiguous_entities = {}
for topo in toponyms:
request = gazetteer.get_by_label(topo, lang)
if len(request) == 0:
request = gazetteer.get_by_alias(topo, lang)
if len(request) > 1:
ambiguous_entities[topo] = request
elif len(request) == 1:
fixed_entities[topo] = request[0]
d_amb_results = {}
for amb_ent in ambiguous_entities:
d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities)
if not d:
mc = get_most_common_id_v3(amb_ent, lang)
d_amb_results[amb_ent] = mc.id if mc else None
else:
d_amb_results[amb_ent] = d
for k, v in fixed_entities.items():
selected_en[k] = v.id
for k, v in d_amb_results.items():
selected_en[k] = v
return selected_en
\ No newline at end of file
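For orientation: ShareProp scores each ambiguous candidate against every unambiguous entity (+3 for P47 adjacency, +2 for a hull collision, plus the inclusion score) and keeps the best candidate only when its total reaches 4. A worked sketch of the arithmetic for a single fixed entity, mirroring inclusion_log above (scores accumulate when there are several fixed entities):

import math

def inclusion_log(x):
    # mirrors ShareProp.inclusion_log: an empty intersection contributes 1
    return 1 if x == 0 else math.log(x)

# P47 adjacency, no shared inclusion chains:
3 + inclusion_log(0) + inclusion_log(0)  # = 5.0  -> >= 4, candidate selected
# hull collision only:
2 + inclusion_log(0) + inclusion_log(0)  # = 4.0  -> >= 4, candidate selected
# no adjacency, 3 shared P131 ancestors, no shared P706 chain:
inclusion_log(3) + inclusion_log(0)      # ~ 2.10 -> < 4, candidate rejected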
@@ -16,80 +16,71 @@ def read_pickle(fn):
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
Disambiguator.__init__(self,context_based=True)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
def disambiguate_context_based(self,toponyms,lang):
toponyms_filtered=[]
for toponym in toponyms:
if re.match("^\d+$", toponym):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
if lang in stop_words and toponym.lower().rstrip("s") in stop_words[lang]:# or toponym.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
plural = toponym.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
toponyms_filtered.append(toponym)
toponyms_filtered=list(set(toponyms_filtered))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to (possibly unnecessary)
group_candidate = {} #candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for toponym in toponyms_filtered:
candidates = get_top_candidate(toponym, lang, 5)
candidates = [c.id for c in candidates if c]
if not candidates:
candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c]
group_candidate[toponym] = candidates
betw_cand[toponym]=candidates
for n in candidates:
betw_cand[n]=set(candidates)-{n} # exclude the candidate itself; set(n) would be a set of characters
possible_candidates.extend(candidates)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
for candidate in possible_candidates:
g.add_node(candidate, label=gazetteer.get_by_id(candidate)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
for candidate in possible_candidates:
for candidate2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
d = data_candidate[candidate]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
prob = self.model.get_coocurence_probability(sc, candidate, candidate2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
if candidate2 in betw_cand[candidate] or candidate in betw_cand[candidate2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
if not candidate == candidate2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
if g.has_edge(candidate2, candidate) :
if g.edges[candidate2,candidate]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
g.add_edge(candidate, candidate2, weight=prob)
selected = {}
@@ -104,7 +95,8 @@ class WikipediaDisambiguator(Disambiguator):
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
except Exception:
most_common = get_most_common_id_v3(gr, lang)
if most_common and len(most_common)>0: selected[gr]=most_common[0].id
return selected
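For orientation: candidates of all toponyms become graph nodes, edge weights are co-occurrence probabilities, and for each toponym the candidate with the highest weighted degree wins. A minimal sketch with toy ids:

import networkx as nx

g = nx.Graph()
g.add_edge("paris_fr", "montpellier_fr", weight=0.9)  # hypothetical ids
g.add_edge("paris_tx", "montpellier_fr", weight=0.1)

group_candidate = {"Paris": ["paris_fr", "paris_tx"]}
selected = {topo: max(cands, key=lambda c: g.degree(c, weight="weight"))
            for topo, cands in group_candidate.items()}
# -> {"Paris": "paris_fr"}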
# coding = utf-8
\ No newline at end of file
# coding = utf-8
import copy
import string
import numpy as np
from ..ner.ner import NER
class Disambiguator(object):
def __init__(self):
"""Constructor for Disambiguator"""
pass
def extract_se_entities(self, input):
out = Disambiguator.parse_corpus(input)
en_ = out[out[:, 1] == NER._unified_tag["place"]]
return self.toponymes_frequencies(en_[:, 0]), np.unique(en_[:, 0])
def toponymes_frequencies(self, ens_):
count = {}
for en in ens_:
if not en in count: count[en] = 0
count[en] += 1
return count
@staticmethod
def parse_corpus(corpus):
final_corpus = []
t = 0
placeTag = NER._unified_tag["place"]
while t < len(corpus):
tag = copy.copy(corpus[t])
if tag[1] == "BEG-" + NER._unified_tag["place"] or tag[1] == placeTag:
lenw = 1
if tag[1] == "BEG-" + placeTag:
compound_tag = tag[0]
t += 1
while corpus[t][1] == "END-" + placeTag or corpus[t][1] == placeTag:
tag = copy.copy(corpus[t])
if tag[0][-1] in string.punctuation or compound_tag[-1] in string.punctuation:
compound_tag += tag[0]
else:
compound_tag += " " + tag[0]
t += 1
lenw += 1
tag[0] = compound_tag
tag[1] = placeTag
t += 1
else:
t += 1
final_corpus.append(tag)
return np.array(final_corpus)
def disambiguate(self, ner_result):
pass
def disambiguate_list(self,toponyms,lang):
pass
\ No newline at end of file
# coding = utf-8
\ No newline at end of file
# coding = utf-8
class BigramModel:
def __init__(self, freq=None, count=None):
# avoid shared mutable default arguments across instances
self.cooc_freq = freq if freq is not None else {}
self.count_associated = count if count is not None else {}
def append(self,uri1,uri2):
if not uri1 in self.cooc_freq:
self.cooc_freq[uri1]={}
if not uri2 in self.cooc_freq[uri1]:
self.cooc_freq[uri1][uri2]=0
self.cooc_freq[uri1][uri2]+=1
self.increment_count(uri2)
def increment_count(self,uri):
if not uri in self.count_associated:
self.count_associated[uri]=0
self.count_associated[uri]+=1
def get_coocurence_probability(self, pr1, *args):
if len(args) < 2:
print("Only one URI indicated")
return 0.
res_=1.
for u in range(1,len(args)):
res_*=self.get_bigram_probability(args[0],args[u],pr1)
return res_
def get_bigram_probability(self,uri1,uri2,pr1=1):
nna=0.00000001
if uri1 in self.cooc_freq:
if uri2 in self.cooc_freq[uri1]:
return self.cooc_freq[uri1][uri2]
#return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1
elif uri2 in self.cooc_freq:
if uri1 in self.cooc_freq[uri2]:
return self.cooc_freq[uri2][uri1]
#return (self.cooc_freq[uri2][uri1] / self.count_associated[uri1])+pr1
return nna
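A usage sketch of BigramModel with toy URIs. As written, get_bigram_probability returns the raw co-occurrence count (the normalised variant is commented out) and a small floor value for unseen pairs:

from strpython.nlp.disambiguator.models.bigram import BigramModel

m = BigramModel()
m.append("Q90", "Q142")  # toy URIs: Paris co-occurs with France
m.append("Q90", "Q142")

m.get_bigram_probability("Q90", "Q142")          # -> 2 (raw count)
m.get_bigram_probability("Q90", "Q30")           # -> 1e-08 (floor for unseen pairs)
m.get_coocurence_probability(1, "Q90", "Q142")   # -> 2 (product over pairs)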
# coding = utf-8
from ...helpers.geodict_helpers import *
from .disambiguator import Disambiguator
import re, json, os
from ...config.configuration import config
from inflector import Inflector,English,Spanish,French
inflectors= {
"en":Inflector(English()),
"fr":Inflector(French()),
"es":Inflector(Spanish())
}
stop_words = {
"fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")),
"en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n"))
}
common_words = {
"fr": set(json.load(open(os.path.join(config.language_resources_path,"dic_fr.json")))),
"en": set(open(os.path.join(config.language_resources_path,"english_common_words_filtered.txt")).read().split("\n"))
}
class MostCommonDisambiguator(Disambiguator):
def __init__(self):
Disambiguator.__init__(self)
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en = {}
for en in se_:
id_,score=self.disambiguate_(en,lang)
if not id_ == "O" and id_:
selected_en[id_] = en
new_count[id_] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result={}
for toponym in toponyms:
id_,_=self.disambiguate_(toponym,lang)
if id_:
result[id_]=toponym
return result
def disambiguate_(self, label, lang='fr'):
if re.match("^\d+$", label):
return 'O', -1
if lang in stop_words: #and lang in common_words:
if label.lower().rstrip("s") in stop_words[lang]: #or label.lower().rstrip("s") in common_words[lang]:
return 'O', -1
if lang in inflectors:
plural=inflectors[lang].singularize(label)
else:
plural = label.rstrip("s") + "s"
if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]:
return 'O', -1
data=get_most_common_id_v3(label, lang)
id_, score=None,0
if data:
id_,score=data.id,data.score
return id_, score
# coding = utf-8
import re
from .disambiguator import Disambiguator
from .models.bigram import BigramModel
import pickle
from ...config.configuration import config
#from ...helpers.geodict_helpers_old import *
from ...helpers.geodict_helpers import *
from .most_common import stop_words,common_words
import networkx as nx
def read_pickle(fn):
return pickle.load(open(fn,'rb'))
class WikipediaDisambiguator(Disambiguator):
def __init__(self,measure="degree"):
Disambiguator.__init__(self)
# Load model
self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
self.measure=measure
def disambiguate(self, ner_result, lang="en"):
count, se_ = self.extract_se_entities(ner_result)
new_count = {}
selected_en_rev = {}
selected_en = self.disambiguate_wiki(se_,lang)
for en in selected_en:
selected_en_rev[en]=selected_en[en]
#new_count[selected_en[en]] = count[en]
return new_count, selected_en
def disambiguate_list(self,toponyms,lang):
result=self.disambiguate_wiki(toponyms,lang)
return {k:v for k,v in result.items() if v}
def disambiguate_wiki(self, entities, lang):
spat_en=[]
for e in entities:
if re.match("^\d+$", e):
continue
if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]:
continue
plural = e.rstrip("s") + "s"
if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]:
continue
spat_en.append(e)
spat_en=list(set(spat_en))
g = nx.Graph()
possible_candidates = []
betw_cand={} # indicates which toponym group a candidate belongs to (possibly unnecessary)
group_candidate = {} #candidates per toponym
for e in spat_en:
cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4)
cand = [c.id for c in cand if c]
if not cand:
cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c]
group_candidate[e] = cand
betw_cand[e]=cand
for n in cand:
betw_cand[n]=set(cand)-set(n)
possible_candidates.extend(cand)
for cand in possible_candidates:
g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang])
data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates}
for cand in possible_candidates:
for cand2 in possible_candidates:
# Get PageRank score
d = data_candidate[cand]
sc = 1
sc=d.score
# Compute probability
prob = self.model.get_coocurence_probability(sc, cand, cand2)
if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
prob = 0.0
if prob < 0.0000001:
prob = 0.0
if not cand == cand2:
# keep the lowest co-occurrence probability between two candidates
if g.has_edge(cand2, cand) :
if g.edges[cand2,cand]["weight"] < prob:
continue
g.add_edge(cand, cand2, weight=prob)
selected = {}
#Take the candidates with the highest degree weighted
for gr in group_candidate:
try:
if self.measure == "degree":
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
elif self.measure == "centrality":
selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight"))
else:# degree by default
selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
#print(1)
except Exception as e:
selected[gr]=get_most_common_id_v3(gr,lang)
return selected
from .spacy import Spacy
from .nltk import NLTK
from .polyglot import Polyglot
from .stanford_ner import StanfordNER
from .ner import NER
\ No newline at end of file
@@ -12,7 +12,43 @@ class NER:
self._lang = lang
def identify(self, input):
return input
"""
Parameters
----------
input
Returns
-------
"""
raise NotImplementedError
def parse_output(self, output):
pass
"""
Parse the output of the NER
Parameters
----------
output: obj
ner output
Returns
-------
2D numpy array
first column = token text, second column = tag
"""
raise NotImplementedError
def translate_tag(self, tag):
"""
Translate the NER tag into the unified tag set used in this module.
Parameters
----------
tag :str
tag
Returns
-------
str
transformed tag
"""
raise NotImplementedError
\ No newline at end of file
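A minimal sketch of the contract these abstract methods document, with a hypothetical RegexNER that is not part of the module:

import numpy as np
from strpython.nlp.ner.ner import NER

class RegexNER(NER):
    def identify(self, input):
        # toy NER: tag capitalised tokens as places
        return [(tok, "PLACE" if tok[:1].isupper() else "O")
                for tok in input.split()]

    def parse_output(self, output):
        # 2D numpy array: first column = token text, second column = tag
        return np.array([(tok, self.translate_tag(tag)) for tok, tag in output])

    def translate_tag(self, tag):
        return NER._unified_tag["place"] if tag == "PLACE" else "O"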