Commit 957a7e13 authored by Pokiros

Modify tt4py + add str with semantics (v1) + modify pipeline

parent 082aee05
Showing with 644 additions and 21 deletions
# coding = utf-8
"""Weisfeiler_Lehman graph kernel.
Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by:
Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt
Mehlhorn, Karsten M. Borgwardt, JMLR, 2012.
http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html
Author : Sandro Vega-Pons, Emanuele Olivetti
"""
import numpy as np
import networkx as nx
import copy
class GK_WL():
"""
Weisfeiler-Lehman graph kernel.
"""
def compare_list(self, graph_list, h=1, node_label=True):
"""Compute the all-pairs kernel values for a list of graphs.
This function can be used to directly compute the kernel
matrix for a list of graphs. The direct computation of the
kernel matrix is faster than the computation of all individual
pairwise kernel values.
Parameters
----------
graph_list: list
A list of graphs (list of networkx graphs)
h : integer
Number of iterations.
node_label : boolean
Whether to use original node labels. True to use the node labels
stored in the node attribute 'label'. False to use the degree
of each node as its label.
Returns
-------
K: numpy.array, shape = (len(graph_list), len(graph_list))
The similarity matrix of all graphs in graph_list.
"""
self.graphs = graph_list
n = len(graph_list)
lists = [0] * n
k = [0] * (h + 1)
n_nodes = 0
n_max = 0
# Compute adjacency lists and n_nodes, the total number of
# nodes in the dataset.
for i in range(n):
lists[i] = graph_list[i].adjacency_list()
n_nodes = n_nodes + len(graph_list[i])
# Computing the maximum number of nodes in the graphs. It
# will be used in the computation of vectorial
# representation.
if(n_max < len(graph_list[i])):
n_max = len(graph_list[i])
phi = np.zeros((n_max, n), dtype=np.uint64)
# INITIALIZATION: initialize the nodes labels for each graph
# with their labels or with degrees (for unlabeled graphs)
labels = [0] * n
label_lookup = {}
label_counter = 0
# label_lookup is an associative array, which will contain the
# mapping from multiset labels (strings) to short labels
# (integers)
if node_label is True:
for i in range(n):
l_aux = nx.get_node_attributes(graph_list[i],
'label').values()
l_aux = list(l_aux)
# It is assumed that the graph has a node attribute
# 'label'
labels[i] = np.zeros(len(l_aux), dtype=np.int32)
for j in range(len(l_aux)):
if not (l_aux[j] in label_lookup):
label_lookup[l_aux[j]] = label_counter
labels[i][j] = label_counter
label_counter += 1
else:
labels[i][j] = label_lookup[l_aux[j]]
# labels are associated to a natural number
# starting with 0.
phi[labels[i][j], i] += 1
else:
for i in range(n):
labels[i] = np.array(list(graph_list[i].degree().values()))
for j in range(len(labels[i])):
phi[labels[i][j], i] += 1
# Simplified vectorial representation of graphs (just taking
# the vectors before the kernel iterations), i.e., it is just
# the original nodes degree.
self.vectors = np.copy(phi.transpose())
k = np.dot(phi.transpose(), phi)
# MAIN LOOP
it = 0
new_labels = copy.deepcopy(labels)
while it < h:
# create an empty lookup table
label_lookup = {}
label_counter = 0
phi = np.zeros((n_nodes, n), dtype=np.uint64)
for i in range(n):
for v in range(len(lists[i])):
# form a multiset label of the node v of the i'th graph
# and convert it to a string
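# e.g. a node currently labelled 2 whose neighbours carry labels 3 and 0
# yields long_label = [2 0 3]; its string form "[2 0 3]" is the lookup key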
long_label = np.concatenate((np.array([labels[i][v]]),
np.sort(labels[i]
[lists[i][v]])))
long_label_string = str(long_label)
# if the multiset label has not yet occurred, add it to the
# lookup table and assign a number to it
if not (long_label_string in label_lookup):
label_lookup[long_label_string] = label_counter
new_labels[i][v] = label_counter
label_counter += 1
else:
new_labels[i][v] = label_lookup[long_label_string]
# fill the column for i'th graph in phi
aux = np.bincount(new_labels[i])
phi[new_labels[i], i] += aux[new_labels[i]]
k += np.dot(phi.transpose(), phi)
labels = copy.deepcopy(new_labels)
it = it + 1
# Compute the normalized version of the kernel
k_norm = np.zeros(k.shape)
for i in range(k.shape[0]):
for j in range(k.shape[1]):
k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j])
return k_norm
def compare(self, g_1, g_2, h=1, node_label=True):
"""Compute the kernel value (similarity) between two graphs.
The kernel is normalized to [0,1] by the equation:
k_norm(g1, g2) = k(g1, g2) / sqrt(k(g1,g1) * k(g2,g2))
Parameters
----------
g_1 : networkx.Graph
First graph.
g_2 : networkx.Graph
Second graph.
h : integer
Number of iterations.
node_label : boolean
Whether to use the values stored in the node attribute 'label'
as node labels. If False, the degree of each node is used as
its label.
Returns
-------
k : The similarity value between g1 and g2.
"""
gl = [g_1, g_2]
return self.compare_list(gl, h, node_label)[0, 1]
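# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the commit): comparing two small
# labelled graphs with GK_WL. Assumes a networkx 1.x-style API, since
# compare_list relies on Graph.adjacency_list() and a dict-returning degree().
if __name__ == "__main__":
    g_a = nx.Graph([(0, 1), (1, 2)])
    g_b = nx.Graph([(0, 1), (0, 2), (1, 2)])
    for g in (g_a, g_b):
        # give every node the same label so the kernel mostly reflects structure
        nx.set_node_attributes(g, 'label', {node: 'x' for node in g.nodes()})
    wl = GK_WL()
    print(wl.compare(g_a, g_b, h=2, node_label=True))  # normalized similarity in [0, 1]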
@@ -180,13 +180,13 @@ if not os.path.exists(args.graphs_dir):
graphs={}
for t,text in texts.items():
if text:
-graphs[t]=pip.build(text).graph
+graphs[t]=pip.buildSemSTR(text,win_size=7).graph
else:
graphs[t]=nx.MultiDiGraph()
-os.mkdir(graphs_dir)
+os.mkdir(args.graphs_dir)
for t,g in graphs.items():
-nx.write_gexf(g,os.path.join(graphs_dir,"{0}.gexf".format(t)))
+print(t)
+nx.write_gexf(g,os.path.join(args.graphs_dir,"{0}.gexf".format(t)))
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open("associated_and_count.json"))
......
# coding: utf-8
from ner.gate_annie import GateAnnie
from ner.nltk import NLTK
from pipeline import *
from pos_tagger.tagger import Tagger
# Disa
from disambiguator.pagerank import *
from disambiguator.geodict_gaurav import *
# Graph Edit Distance Algorithm Import
from ged4py.algorithm import graph_edit_dist as ged
from ged4py.geo_ged import GeoGED
from ged4py.geo_hed import GeoHED
from ged4py.hausdorff_edit_distance import HED
from ged4py.bipartite_graph_matching_2 import BP_2
from ged4py.greedy_edit_distance import GreedyEditDistance
from ged4py.geo_bp2 import GeoBP2
from ged4py.exception import NotFoundDistance
import numpy as np
import os, re, glob, json, argparse
from progressbar import ProgressBar,Timer,Bar,ETA
# Similarity Function between graph and a set of graphs
def compareGED(id_,graphs):
g=graphs[id_]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
score=ged.compare(g,g2)
sc[id_]=score
return sc
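# Note on the compare* helpers: sc[j] ends up holding the distance between graphs[id_]
# and graphs[j]; the graph ids are assumed to be the integers 0..len(graphs)-1, since
# they are used directly as indices into the numpy score vector.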
def compareGEOGED(id1,graphs):
g=graphs[id1]
sc=np.zeros(len(graphs))
for id_,g2 in graphs.items():
try:
if len(g2) >1:
gg=GeoGED(g,g2)
score=gg.distance()
sc[id_] = score
else:
sc[id_]=np.inf
except:
sc[id_] = np.inf
return sc
def compareBP2(id_,graphs):
bp2=BP_2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareGEOBP2(id_,graphs):
bp2=GeoBP2()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = bp2.bp2(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareHED(id_,graphs):
h=HED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2) >0:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_]=np.inf
return sc
def compareGEOHED(id_,graphs):
h=GeoHED()
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
if len(g2)>1:
score = h.hed(g, g2)
sc[id_] = score
else:
sc[id_] = np.inf
return sc
def compareGreedy(id_,graphs):
g = graphs[id_]
sc = np.zeros(len(graphs))
for id_, g2 in graphs.items():
h = GreedyEditDistance(g,g2)
score = h.distance()
sc[id_] = score
return sc
funcDict={
"GED":compareGED,
"GEOGED":compareGEOGED,
"BP2":compareBP2,
"GEOBP2":compareGEOBP2,
"HED":compareHED,
"GEOHED":compareGEOHED,
"GREEDY":compareGreedy
}
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("distance")
parser.add_argument("graphs_dir")
parser.add_argument("--ignore",help="Ignore Output",action="store_true")
parser.add_argument("-o","--output",help="Output Filename",default="GED")
args = parser.parse_args()
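# Example invocation (script and directory names are hypothetical):
#   python compare_str_agritrop.py HED graphs_agritrop/
# args.distance selects the comparison function from funcDict; args.graphs_dir is the
# directory where the GEXF graphs are written and read back.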
if not args.distance in funcDict.keys():
raise NotFoundDistance(args.distance,funcDict)
# Initialize Pipeline for Spatial Entities extraction and STR construction
pip=Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en"))
# Load all the text from the corpus
def get_text_data(directory):
"""
Load the Epidemiology corpus
"""
files = glob.glob(os.path.join(directory,"*.txt"))
texts={}
for filepath in files:
id_doc=int(re.findall("\d+",filepath)[-1])
data=open(filepath).read()
texts[id_doc]=data
return texts
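# e.g. a file ".../text/123.txt" is stored as texts[123]: the document id is assumed
# to be the last number appearing in the file path (per re.findall above).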
texts=get_text_data("data/data_agritrop/text/")# Raw text
#print("TEST text 0 = ",texts[0])
# Extract All spatial entities
if not os.path.exists("associated_and_count_agritrop.json"):
pass
associated_es={}
count_per_doc={}
for id_,text in texts.items():
if text:
a,b,c=pip.parse(text)
count_per_doc[id_]=a
associated_es[id_]=c
else:
associated_es[id_]={}
count_per_doc[id_]={}
open("associated_and_count_agritrop.json",'w').write(json.dumps([associated_es,count_per_doc],indent=4))
if not os.path.exists(args.graphs_dir):
pass
import networkx as nx
graphs={}
for t,text in texts.items():
if text:
graphs[t]=pip.build(text).graph
else:
graphs[t]=nx.MultiDiGraph()
os.mkdir(args.graphs_dir)
for t,g in graphs.items():
print(t)
nx.write_gexf(g,os.path.join(args.graphs_dir,"{0}.gexf".format(t)))
# LOAD graph data and associated spatial entities of each graph
assC=json.load(open("associated_and_count_agritrop.json"))
associated_es,count_per_doc=assC[0],assC[1]
graphs={}
for file in glob.glob(args.graphs_dir.rstrip("/")+"/*.gexf"):
id=int(re.findall("\d+",file)[0])
graphs[id]=nx.read_gexf(file)
from ged4py.algorithm import graph_edit_dist as ged
from ged4py.bipartite_graph_matching_2 import BP_2
def getLocInfo(id_):
data=get_data(id_)
if 'coord' in data:
return [data["coord"]["lat"],data["coord"]["lon"]]
return [0,0]
def get_associated_es(associated_es_data):
new_={}
for id_ in associated_es_data:
new_[id_]={"label":associated_es_data[id_],"coord":getLocInfo(id_)}
return new_
def getEdges4Draw(associated_es,edges):
data={}
for es in associated_es:
data[es]=getLocInfo(es)
lines=[]
for ed in edges:
lines.append([data[ed[0]],data[ed[1]],ed[2]["color"]])
return lines
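# Each element returned by getEdges4Draw is [[lat1, lon1], [lat2, lon2], edge_color],
# i.e. a drawable line between the two spatial entities (assuming get_data exposes a
# 'coord' entry; entities without coordinates fall back to [0, 0]).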
# Generating Evaluation Output
top_ten_documents=[]
final_data={}
inv_table,j={},0
new_graphs={}
for i in graphs:
inv_table[j]=i
new_graphs[j]=graphs[i]
j+=1
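# inv_table maps the dense index j (0..n-1) used by the comparison functions back to
# the original document id, and new_graphs is the same graph collection re-keyed on j.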
with ProgressBar(max_value=len(texts.keys()),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
inc=0
for doc_s in inv_table.keys():
id_json=str(doc_s)
real_id=inv_table[doc_s]
r_id_str=str(real_id)
#print(id_json)
_score=funcDict[args.distance](doc_s, new_graphs)
top_4_docs_score= np.sort(_score)[1:4].astype(float)
top_4_docs= np.argsort(_score)[1:4].astype(int)
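# Note: despite the "top_10" naming below, only ranks 1-3 of the sorted scores are kept
# (slice [1:4]); rank 0 is skipped since it is normally the document compared with itself.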
#print(top_10_docs)
final_data[real_id]={
"sp_entities":get_associated_es(associated_es[str(real_id)]),
"text":texts[inv_table[doc_s]],
"edges":getEdges4Draw(associated_es[str(real_id)],graphs[real_id].edges(data=True))
}
final_data[real_id]["top_10"]=[]
for d in range(len(top_4_docs)):
doc_data={}
doc_data["score"]=top_4_docs_score[d]
doc_data["id_txt"]=inv_table[int(top_4_docs[d])]
doc_data["text"]=texts[doc_data["id_txt"]]
doc_data["sp_entities"]=get_associated_es(associated_es[str(doc_data["id_txt"])])
doc_data["edges"]=getEdges4Draw(associated_es[str(doc_data["id_txt"])],graphs[doc_data["id_txt"]].edges(data=True))
doc_data["relevant"]=None
final_data[real_id]["top_10"].append(doc_data)
inc+=1
pg.update(inc)
if not args.ignore:
open("graph_viewer/evalTopJP_10STR_{0}.json".format(args.distance),'w').write(json.dumps(final_data,indent=4))
@@ -16,14 +16,46 @@ import json
class STR_SEM(STR):
""""""
-def __init__(self, tagged_text, spatial_entities,diseases,species):
-super().__init__(tagged_text, spatial_entities,diseases,species)
-labels = json.load(open("/Users/jacquesfize/Downloads/labelsEN.json"))
+def __init__(self, tagged_text, spatial_entities):
+super().__init__(tagged_text, spatial_entities)
+self.labels = json.load(open("/Users/jacquesfize/Downloads/labelsEN.json"))
self.tags=None
-def build(self, verbose=False):
-super().build(cooc=True, inc=True, adj=True,verbose=verbose)
+def build(self, win_size = 5,verbose=False):
+super().build(cooc=False, inc=True, adj=True,verbose=verbose)
search_engine = tt4py.Text(self.tagged_text)
search_engine.transform_tagged()
search_engine.tag_item_in_thesaurus(self.labels,prefix_="agrovoc")
sps_inv = {}
for k, v in self.spatial_entities.items():
sps_inv[v.lower()] = k
cleaned_=search_engine.tagged_text
self.tags = cleaned_
linked_to = {}
w = 0
while w < len(cleaned_):
curr = cleaned_[w]
if "agrovoc" in curr[1] and not "LOC" in curr[1]:
window = np.array(cleaned_[w - win_size:w + win_size])
if not window.size:
w += 1
continue
for wo in window:
if "LOC" == wo[1] and wo[0] in sps_inv:
if not curr[0] in linked_to: linked_to[curr[0]] = set([])
linked_to[curr[0]].add(wo[0])
w += 1
edges = []
register = set([])
for l, v in linked_to.items():
for vi in v:
for vj in v:
if vj != vi and vj + "-" + vi not in register:
edges.append([sps_inv[vi], sps_inv[vj], {"label": str(l), "color": "cyan"}])
register.add(vi + "-" + vj)
self.graph.add_edges_from(edges)
# load spatial entities
# find the positions of each spatial entity
# for each spatial entity position, find the neighbouring words
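# Illustrative example of the linking step above (made-up tokens): if the agrovoc-tagged
# term "wheat" occurs within win_size tokens of the spatial entities "France" and
# "Italy", both end up in linked_to["wheat"] and a cyan edge labelled "wheat" is added
# between the France and Italy nodes of the STR.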
......
@@ -9,6 +9,7 @@ from pos_tagger.treetagger import TreeTagger
from ner.stanford_ner import *
from disambiguator.pagerank import PageRankDisambiguator
from models.str import STR
from models.str_with_semantic import STR_SEM
class Pipeline(object):
@@ -100,6 +101,18 @@ class Pipeline(object):
str_.build()
return str_
def buildSemSTR(self,text,win_size=5):
"""
Return the corresponding STR for a text.
:param text:
:return: STR
"""
_,output, se_identified = self.parse(text)
str_=STR_SEM(output,se_identified)
str_.build(win_size=win_size)
return str_
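# Usage sketch (illustrative, not part of the commit); `raw_text` is a placeholder:
#   pip = Pipeline(lang="english", tagger=Tagger(), ner=StanfordNER(lang="en"))
#   str_sem = pip.buildSemSTR(raw_text, win_size=7)
#   nx.write_gexf(str_sem.graph, "doc_0.gexf")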
def build_class_variation_str(self,text):
"""
Return the corresponding STR for a text.
......
@@ -4,6 +4,12 @@ from tt4py.helpers import *
import numpy as np
from enum import Enum
from termcolor import colored
from ner.ner import NER
from nltk.stem import WordNetLemmatizer, SnowballStemmer
_wn_lem =WordNetLemmatizer()
_snowball_stemmer = SnowballStemmer("english")
class TaggedType(Enum):
POS=2
@@ -16,12 +16,19 @@ class SearchFlag(Enum):
SP_WS = lambda x : x.split(" ") # split using whitespaces
SP_P = lambda x : x.split(".") # split using point
SP_D = lambda x : x.split("-") # split using dash
WN_LEM = lambda x : _wn_lem.lemmatize(x)
SNW_STEM = lambda x : _snowball_stemmer.stem(x)
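# Rough behaviour of the added flags (illustrative):
#   SearchFlag.SP_D("foot-and-mouth")  -> ["foot", "and", "mouth"]
#   SearchFlag.WN_LEM("diseases")      -> "disease"
#   SearchFlag.SNW_STEM("running")     -> "run"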
class TaggedInputError(Exception):
def __init__(self):
super(Exception, self).__init__(
colored("Wrong input : check your input data type or the size for each token data ", "red"))
class WrongThesaurusFormatError(Exception):
def __init__(self,var):
super(Exception, self).__init__(
colored("Wrong thesaurus format: use dict format instead of {0}. Ex. {'id_1':'label'}".format(str(type(var))), "red"))
class Text(object):
def __init__(self,tagged_text,type=TaggedType.MIX_POS_TAG):
#check if 'tagged_text' is an iterable object
@@ -32,6 +45,7 @@ class Text(object):
# Convert input into numpy array
self.tagged_text=tagged_text
if isinstance(tagged_text,dict):
self.tagged_text = dict_to_array(tagged_text)
elif isinstance(tagged_text,list):
@@ -45,19 +59,13 @@
if not type.value == self.tagged_text.shape[1]:
raise TaggedInputError
self.raw_text=" ".join(self.tagged_text[:,0])
self._original=self.tagged_text.copy()
self.flag_applied = []
def is_in_text(self,string,flags=[SearchFlag.NO_CASE]):
t_1,t_2=[string],self.raw_text
# Apply the necessary transformations for the string search
for flag in flags:
t_1,t_2=np.array([flag(i) for i in t_1]).flatten(),np.array([flag(i) for i in t_2]).flatten()
if not " {0} ".format(t_1) in t_2:
return False
return True
def transform_tagged(self,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]):
-tagged = self.tagged_text.copy().tolist()
+tagged = self._original.copy().tolist()
# Apply the necessary transformations for the string search
for flag in flags:
tagged_t=[]
@@ -71,9 +79,19 @@
else:
tagged_t.extend([[res_, token[1]]])
tagged=tagged_t
self.tagged_text = np.array(tagged)
self.flag_applied=flags
def hasSameFlags(self,flags):
for f in flags:
if not f in self.flag_applied:
return False
return True
def get_occurrences(self,string,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]):
if not self.hasSameFlags(flags):
self.transform_tagged(flags)
positions_list=[]
t_1 = [string]
@@ -111,15 +129,118 @@
pos2=pos1
return self.tagged_text[pos1-window_size:window_size+pos2]
-def extract_token_by_tag(self,tags):
+def extract_token_by_tag(self,*tags):
res,posis_=[],[]
for tag in tags:
posis_.extend(np.argwhere(self.tagged_text[:, -1] == tag).flatten())
posis_ = sorted(posis_)
for pos in posis_:
pp=self.tagged_text[pos].tolist()
pp.append(pos)
res.append(pp)
return res
def tag_item_in_thesaurus(self,thesaurus,flags = [SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D],prefix_="th_",stop_tag = ["LOC"]):
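# Sketch of what this method does (summary of the code below): for every thesaurus entry
# whose label occurs in the tokenised text, the matching token span gets its tag
# overwritten with prefix_ + id_, unless the span contains a stop_tag (e.g. "LOC") or
# fails the isWorthIt check; consecutive tokens sharing the same thesaurus tag are then
# merged back into a single multi-word token via reconstruct_str.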
if not self.hasSameFlags(flags):
self.transform_tagged(flags)
if not isinstance(thesaurus,dict):
raise WrongThesaurusFormatError(thesaurus)
t=" ".join(self.tagged_text[:,0].tolist())
for id_,element in thesaurus.items():
if element in t:
positions_ = self.get_occurrences(element)
for d_ in positions_:
f=True
x,y=d_[0],d_[1]
c=0
if not self.isWorthIt(x,y,prefix_):
break
for st in stop_tag:
if x != y and st in self.tagged_text[x:y][:,1]:
f=False
elif x == y and st in self.tagged_text[x][1]:
f=False
if f:
# rec_str_= self.tagged_text[x][0]
# if x != y:
# rec_str_ = self.reconstruct_str(self.tagged_text[x:y][:,0])
# if x - y > 1:
# self.tagged_text = np.delete(self.tagged_text,np.arange(x+1,y),0)
# else:
# self.tagged_text = np.delete(self.tagged_text,y,0)
if abs(x-y)> 0:
self.tagged_text[x:y][:,1] = prefix_ + id_
#print("AFTER",self.tagged_text[x:y],x,y)
else:
self.tagged_text[x][1] = prefix_ + id_
#print("AFTER", self.tagged_text[x], x)
new_tagged_= []
j=0
while j < len(self.tagged_text):
tag = self.tagged_text[j]
if prefix_ in tag[-1]:
curr=tag[-1]
t=1
while j+t < len(self.tagged_text):
if self.tagged_text[j+t][-1] != curr:
break
t+=1
#print(self.reconstruct_str(self.tagged_text[j:j+t][:,0]),self.tagged_text[j:j+t],j,t)
new_tagged_.append([self.reconstruct_str(self.tagged_text[j:j+t][:,0]),curr])
j+=t
else:
new_tagged_.append(tag.tolist())
j+=1
self.tagged_text=np.array(new_tagged_)
def reconstruct_str(self,list_):
res = ""
no_sp_char = ["-"]
no_sp_bf = [","]
for ch in list_:
if not ch in no_sp_char and res:
if res[-1] in no_sp_char or ch in no_sp_bf:
res+=ch
if not res:
res+=ch
else:
res+=" "+ch
return res
def isWorthIt(self, x, y,prefix):
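# Reading of the heuristic below: it counts how many tokens inside the candidate span
# [x, y], plus the immediately adjacent already-tagged tokens, already carry the
# thesaurus prefix; if the span is shorter than that count, tagging it again would only
# fragment an existing, longer annotation, so the match is rejected.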
taille = abs(x-y)
count=0
if x == y:
if prefix in self.tagged_text[x]:
count+=1
taille=1
else:
# c=None
for item in self.tagged_text[x:y]:
if prefix in item[-1]:
count+=1
# if not c: c=item[-1]
# elif c and item[-1] != c: ---> to be discussed
# return False
decx,decy=0,0
fx,fy=True,True
while fx or fy:
fx,fy=False,False
if x-(decx+1) >0:
if prefix in self.tagged_text[x-(decx+1)][-1] :
fx=True
decx += 1
if y + decy+1 < len(self.tagged_text):
if prefix in self.tagged_text[y + decy+1][-1] :
fy=True
decy += 1
#print(self.tagged_text[x:y],count,taille+decy+decx)
if taille < count+decx+decy:
return False
return True
\ No newline at end of file