Commit 5e8fd7d7 authored by Pokiros

Modification on Stanford NER: annotate long texts in chunks across worker threads, make the pipeline's NER backend pluggable, and return entity counts and identified entities from Pipeline.build.

parent 207ddf1e
Showing with 97 additions and 25 deletions
# coding: utf-8
-import glob
+import glob, os, sys, json, re
import time
from progressbar import ProgressBar, Timer, Bar, ETA
@@ -159,7 +159,7 @@ parser.add_argument("graphs_dir")
parser.add_argument("metadata_fn")
parser.add_argument("-e","--evalEPI",action="store_true")
parser.add_argument("-a","--all",action="store_true")
parser.add_argument("-o","--output",help="Output Filename",default="GED")
parser.add_argument("-o","--output",help="Output Filename")
args = parser.parse_args()
@@ -255,6 +255,8 @@ with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] '
        pg.update(inc)
if not args.output:
    print("Saved in gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance))
    open("gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
else:
    print("Saved in {0}".format(args.output))
    open(args.output, 'w').write(json.dumps(final_data, indent=4))
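With the default removed from `--output`, the save step now branches on whether a filename was given. A minimal sketch that folds the two branches into one (a hypothetical refactor, not part of the commit):

out_fn = args.output or "gui_graph_viewer/evalTop10STR_{0}.json".format(args.distance)
print("Saved in {0}".format(out_fn))
with open(out_fn, 'w') as f:  # a context manager closes the file, unlike the bare open().write()
    f.write(json.dumps(final_data, indent=4))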
@@ -8,6 +8,9 @@ import time
from nlp.disambiguator.geodict_gaurav import *
from nlp.disambiguator.pagerank import *
from polyglot.detect import Detector
+from nlp.ner.by_dict import ByDict
+from nlp.ner.polyglot import Polyglot
+from nlp.pos_tagger.tagger import Tagger
from progressbar import ProgressBar, Timer, Bar, ETA, Counter
@@ -28,12 +31,12 @@ parser.add_argument("metadata_output_fn")
args = parser.parse_args()
start = time.time()
+class_ = StanfordNER
# Initialise Graphs Transformers
pipeline = {
-    "en":Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en")),
-    "fr":Pipeline(lang="french",tagger=Tagger(),ner=StanfordNER(lang="fr")),
-    "es":Pipeline(lang="espagnol",tagger=Tagger(),ner=StanfordNER(lang="es"))
+    "en":Pipeline(lang="english",tagger=Tagger(),ner=class_(lang="en")),
+    "fr":Pipeline(lang="french",tagger=Tagger(),ner=class_(lang="fr")),
+    "es":Pipeline(lang="espagnol",tagger=Tagger(),ner=class_(lang="es"))
}
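Binding the backend to `class_` makes the NER implementation a one-line swap. A minimal sketch, assuming the other wrappers imported above take the same `lang` keyword:

class_ = ByDict  # or Polyglot; hypothetical swap, the commit keeps StanfordNER
pipeline = {code: Pipeline(lang=name, tagger=Tagger(), ner=class_(lang=code))
            for code, name in [("en", "english"), ("fr", "french"), ("es", "espagnol")]}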
@@ -100,11 +103,11 @@ with ProgressBar(max_value=len(texts_),widgets=[' [', Timer(), '] ',Bar(),'(', C
        else:
            t=filter_nonprintable(texts_[id_doc])
        try:
-            a, b, c = pipeline[lang].parse(t)
-            list_gs.append(pipeline[lang].build(t).graph)
+            str_, count, se_identified = pipeline[lang].build(t)
+            list_gs.append(str_.graph)
            # Save Metadata
-            count_per_doc[id_doc] = a
-            associated_es[id_doc] = c
+            count_per_doc[id_doc] = count
+            associated_es[id_doc] = se_identified
        except: # NER Bug
            count_per_doc[id_doc] = {}
            associated_es[id_doc] = {}
......
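The bare `except:` keeps the batch alive through NER failures but hides their cause. A sketch of a more diagnosable handler (hypothetical, not in the commit):

        except Exception as e:  # NER bug: record which document failed and why
            print("NER failed on document {0}: {1}".format(id_doc, e))
            count_per_doc[id_doc] = {}
            associated_es[id_doc] = {}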
@@ -15,11 +15,14 @@ class ByDict(NER):
    def identify(self, text):
-        try:
-            pos_t = PolyText(text).pos_tags
-        except:
-            pos_t = PolyText(text, hint_language_code=self._lang).pos_tags # Error due to UTF8 invalid character pycld2
-        pos_t=np.array(pos_t)
+        if isinstance(text,str):
+            try:
+                pos_t = PolyText(text).pos_tags
+            except:
+                pos_t = PolyText(text, hint_language_code=self._lang).pos_tags # Error due to UTF8 invalid character pycld2
+            pos_t = np.array(pos_t)
+        else:
+            pos_t=np.array(text)
        mask = np.argwhere((pos_t[:, 1] == 'PROPN') | (pos_t[:, 1] == 'ADP') | (pos_t[:, 1] == 'DET') | (pos_t[:, 1] == 'ADJ'))
        # get terms
......
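The new `isinstance` branch lets callers hand in pre-tagged (token, POS) pairs and skip polyglot entirely. A self-contained sketch of the masking step, assuming the two-column shape implied by `pos_t[:, 1]`:

import numpy as np

pos_t = np.array([["the", "DET"], ["Eiffel", "PROPN"], ["Tower", "PROPN"], ["rises", "VERB"]])
mask = np.argwhere((pos_t[:, 1] == 'PROPN') | (pos_t[:, 1] == 'ADP') |
                   (pos_t[:, 1] == 'DET') | (pos_t[:, 1] == 'ADJ'))
print(pos_t[mask.flatten(), 0])  # ['the' 'Eiffel' 'Tower'] -- the candidate term span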
@@ -10,6 +10,6 @@ class NER:
    def __init__(self,lang):
        self._lang = lang

    def identify(self,input):
-        pass
+        return input

    def parse_output(self,output):
        pass
\ No newline at end of file
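With `identify` returning its input instead of `pass`, the base class behaves as an identity stage: a subclass only has to override what it changes. A minimal subclass sketch (hypothetical name, not in the commit):

class NoopNER(NER):
    """pass data through untouched; useful when annotations come from upstream"""
    def parse_output(self, output):
        return output  # identify() already returns its input unchanged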
@@ -5,9 +5,13 @@ from termcolor import colored
from pycorenlp import StanfordCoreNLP
from config.configuration import config
from threading import Thread
from nlp.exception.language import LanguageNotAvailable
from .ner import NER
from queue import Queue
import numpy as np
import json
_stanfordner_available_language = ["fr", "en","es"]
@@ -23,7 +27,20 @@ _tag_stanford = {
    "pers": "PERSON"
}
+class NERWorker(Thread):
+    def __init__(self,ner,queue,lang):
+        Thread.__init__(self)
+        self.ner=ner
+        self.outputs=[]
+        self.queue=queue
+        self._lang=lang
+
+    def run(self):
+        while 1:
+            id_,text=self.queue.get()
+            self.outputs.append((id_,self.ner.annotate(text, properties={'annotators': 'tokenize,ssplit,pos,ner',
+                                                                         'outputFormat': 'json',"tokenize.untokenizable": "noneDelete"})))
+            self.queue.task_done()
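Each worker pulls (index, chunk) pairs until the queue drains; keeping the index with each result lets the caller restore chunk order afterwards. A self-contained sketch of the same pattern without CoreNLP:

from queue import Queue
from threading import Thread

class EchoWorker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.outputs = []
        self.queue = queue
    def run(self):
        while 1:
            id_, text = self.queue.get()
            self.outputs.append((id_, text.upper()))  # stand-in for ner.annotate(...)
            self.queue.task_done()

queue = Queue()
for i, chunk in enumerate(["one", "two", "three"], start=1):
    queue.put((i, chunk))
workers = [EchoWorker(queue) for _ in range(4)]
for w in workers:
    w.daemon = True  # daemon threads die with the main program, since run() never returns
    w.start()
queue.join()         # blocks until task_done() has been called once per put()
print(sorted(sum((w.outputs for w in workers), [])))  # [(1, 'ONE'), (2, 'TWO'), (3, 'THREE')]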
class StanfordNER(NER):
    """
    Python wrapper for StanfordNER
@@ -44,23 +61,70 @@ class StanfordNER(NER):
        self.identified = None

+    def split_text(self,text,maxlen=50000):
+        texts=text.split(".")
+        phrases_given=[]
+        c=0
+        current_phrase=""
+        for t in texts:
+            if c + len(t) < maxlen:
+                current_phrase+="."+t
+                c+=len(t)
+            else:
+                # flush the full chunk, then start the next one with the sentence that overflowed
+                phrases_given.append(current_phrase)
+                current_phrase, c = "."+t, len(t)
+        if current_phrase:
+            phrases_given.append(current_phrase)  # keep the final partial chunk
+        return phrases_given
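A quick offline check of the chunker's contract (hedged: pycorenlp's constructor only records the server URL, so no CoreNLP server is needed to run this, and `split(".")` only sees plain periods as sentence boundaries):

ner = StanfordNER(lang="en")
chunks = ner.split_text("Paris is in France. " * 5000)  # ~100k characters, over the 50k ceiling
print(len(chunks), max(len(c) for c in chunks))
assert "France" in chunks[-1]  # the trailing partial chunk is flushed, not dropped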
    def identify(self, text=None):
        if not text:
            raise TypeError("No value found in `text` parameter.")
+        if len(text) < 50000:
+            output_=self._ner.annotate(text,properties={'annotators': 'tokenize,ssplit,pos,ner','outputFormat':'json',"tokenize.untokenizable":"noneDelete"})
+            if isinstance(output_, str):
+                output_ = json.loads(output_, strict=False)
+        else:
+            # annotate the first chunk synchronously, then fan the rest out to worker threads
+            texts=self.split_text(text)
+            output_ = self._ner.annotate(texts[0], properties={'annotators': 'tokenize,ssplit,pos,ner',
+                                                               'outputFormat': 'json',
+                                                               "tokenize.untokenizable": "noneDelete"})
+            if isinstance(output_, str):
+                output_ = json.loads(output_, strict=False)
+            queue=Queue()
+            for t in range(1,len(texts)):
+                queue.put((t,texts[t]))
+            list_worker=[]
+            for t in range(4):
+                worker=NERWorker(self._ner,queue,self._lang)
+                list_worker.append(worker)
+                list_worker[-1].daemon=True
+                list_worker[-1].start()
+            queue.join()
+            # reassemble the chunk outputs in their original order
+            outputs=["" for i in range(len(texts)-1)]
+            for worker in list_worker:
+                for id_,out in worker.outputs:
+                    outputs[id_-1]=out
+            for o in outputs:
+                try:
+                    if isinstance(o, str):
+                        o = json.loads(o, strict=False)
+                    output_["sentences"].extend(o["sentences"])
+                except:
+                    pass
-        output_=self._ner.annotate(text,properties={'annotators': 'tokenize,ssplit,pos,ner','pipelineLanguage':self._lang,'outputFormat':'json',"tokenize.untokenizable":"noneDelete"})
-        if isinstance(output_,str):
-            output_=json.loads(output_,strict=False)
        return self.parse_output(output_, [])
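End to end: a text under 50,000 characters goes to the server in one call; a longer one sends chunk 0 synchronously, fans the remaining chunks out to four workers, and splices the per-chunk "sentences" arrays back in order. A hedged usage sketch, assuming a CoreNLP server is reachable at the URL taken from config:

ner = StanfordNER(lang="en")
tagged = ner.identify("Emmanuel Macron visited Marseille in France.")
# expect token/tag pairs for the person and the two locations,
# with tags mapped through translate_tag()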
    def parse_output(self, output, pos_tags):
        # Pre-treatment on the output: keep only tokens carrying a known entity tag
        tagged_=[]
        _tag_entity = list(_tag_stanford.values())
        for sentence in output["sentences"]:
            for w in sentence["tokens"]:
                if w["ner"] in _tag_entity:
                    tagged_.append([w["originalText"],self.translate_tag(w["ner"])])
......
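`parse_output` only needs the "sentences"/"tokens"/"ner" shape of the CoreNLP JSON, so it can be exercised without a server. A minimal sketch with a hand-built payload (assuming "LOCATION" is among _tag_stanford's values; the truncated hunk presumably ends by returning tagged_):

fake_output = {"sentences": [{"tokens": [
    {"originalText": "Paris",  "ner": "LOCATION"},
    {"originalText": "sleeps", "ner": "O"},  # "O" marks a non-entity and is filtered out
]}]}
tagged = StanfordNER(lang="en").parse_output(fake_output, [])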
@@ -97,12 +97,12 @@ class Pipeline(object):
        adj = kwargs.get("adj", True)
        inc = kwargs.get("inc", True)
        if not se_identified:
-            _,output, se_identified = self.parse(text)
+            count,output, se_identified = self.parse(text)
        else:
-            _, output, tt = self.parse(text)
+            count, output, tt = self.parse(text)
        str_=STR(output,se_identified)
        str_.build(cooc=cooc,adj=adj,inc=inc)
-        return str_
+        return str_,count,se_identified
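`build` now returns the entity counts and identified entities alongside the STR, which is why the batch script above switched from `pipeline[lang].build(t).graph` to unpacking a 3-tuple. Callers that relied on the old single return value must be updated:

str_, count, se_identified = pipeline["en"].build(text)
list_gs.append(str_.graph)             # the graph, as before
count_per_doc[id_doc] = count          # per-entity counts from parse()
associated_es[id_doc] = se_identified  # identified entities, reusable on later calls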
    def buildSemSTR(self,text,win_size=5):
        """
......