From d17a7d0c21ce2cdcdb83c41b92c5e86acf819fe0 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 13 May 2019 13:49:57 +0200
Subject: [PATCH] Debug; add a test compared to a baseline; change candidate
 selection for disambiguation; add a new criterion

---
 .gitignore                                    |   3 +-
 auto_fill_annotation.py                       |  22 +--
 generate_str.py                               |  39 ++--
 run_automatic_annotation.py                   |   6 +-
 run_test.py                                   |  30 ++--
 run_test_comparedto.py                        | 166 ++++++++++++++++++
 run_test_disambiguisation.sh                  |  12 --
 setup.py                                      |  23 ---
 strpython/__init__.py                         |   5 +-
 strpython/eval/automatic_annotation.py        |  22 ++-
 strpython/helpers/geo_relation_database.py    |  22 ++-
 strpython/helpers/match_cache.py              |   4 +-
 strpython/helpers/sim_matrix.py               |   4 +-
 strpython/models/spatial_relation.py          |   2 +-
 strpython/models/str.py                       |   8 +-
 strpython/nlp/disambiguator/__init__.py       |   2 +-
 strpython/nlp/disambiguator/disambiguator.py  |  19 ++
 strpython/nlp/disambiguator/most_common.py    |   7 +-
 strpython/nlp/disambiguator/share_prop.py     |   6 +-
 strpython/nlp/disambiguator/wikipedia_cooc.py |   4 +-
 strpython/nlp/stop_words.py                   |   2 +
 strpython/pipeline.py                         |  35 ++--
 22 files changed, 314 insertions(+), 129 deletions(-)
 create mode 100644 run_test_comparedto.py
 delete mode 100755 run_test_disambiguisation.sh
 delete mode 100644 setup.py
 create mode 100644 strpython/nlp/stop_words.py

diff --git a/.gitignore b/.gitignore
index dd029ad..c83cd79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,4 +29,5 @@ __pycache__/
 *.gexf
 temp_cluster_2/
 agromada*
-output*
\ No newline at end of file
+output*
+.vscode
\ No newline at end of file
diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py
index 8ccf6f9..a06b2f2 100644
--- a/auto_fill_annotation.py
+++ b/auto_fill_annotation.py
@@ -23,13 +23,13 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc
     first_step_output = "output_first_step_{0}_{1}".format(dataset, threshold)
     last_step_output = "output_final_{0}_{1}".format(dataset, threshold)
     generate_annotation_dataframe(matrix_sim_dir, selected_graphs, first_step_output)
-    # size_str = extract_criteria_4_all(annotater, first_step_output, raw_graph_dir, dataset, threshold)
+    size_str = extract_criteria_4_all(annotater, first_step_output, raw_graph_dir, dataset, threshold)
 
     if not os.path.exists(last_step_output):
         os.makedirs(last_step_output)
 
-    # for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
-    #      annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str)
+    for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"):
+        annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)), size_str)
 
     min_carac_dict=None
     if min_carac_fn != "" and os.path.exists(min_carac_fn):
@@ -120,14 +120,14 @@ def extract_criteria_4_all(annotater, csv_input_dir, raw_graph_dir, dataset, thr
         try:
             return annotater.all(strs[int(x.G1)], strs[int(x.G2)], int(x.G1), int(x.G2))
         except KeyError as e:
-            annotater.matching_cache.add(int(x.G1), int(x.G2), *(0, 0, 0, 0,300000))
-            return [0, 0, 0, 0,300000,0]
+            annotater.matching_cache.add(int(x.G1), int(x.G2), *(0, 0, 0, 0,300000,0))
+            return [0, 0, 0, 0, 300000, 0]
 
     # Annotation Time
     print("Computing Criteria for each match")
     matching_dataframe["res"] = matching_dataframe.progress_apply(lambda x: annotate(x), axis=1)
-    matching_dataframe.res = matching_dataframe.res.apply(lambda x: [int(x[0]),int(x[1]),int(x[2]),int(x[3]),float(x[4])] if x else [])
-    for ix, col in enumerate("c1 c2 c3 c4 c5".split()):
+    matching_dataframe.res = matching_dataframe.res.apply(lambda x: [int(x[0]),int(x[1]),int(x[2]),int(x[3]),float(x[4]),float(x[5])] if x else [])
+    for ix, col in enumerate("c1 c2 c3 c4 c5 c6".split()):
         matching_dataframe[col] = matching_dataframe.res.apply(lambda x: x[ix] if len(x) > 0 else 0)
 
     del matching_dataframe["res"]
@@ -157,7 +157,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
         try:
             return annotater.all(None, None, x.G1, x.G2)
         except Exception as e:
-            return [0, 0, 0, 0,300000]
+            return [0, 0, 0, 0,300000,0]
 
     df["res"] = df.apply(lambda x: foo(x), axis=1)
     df.res = df.res.apply(lambda x: list(map(float, x)) if x else [])  # if bool
@@ -166,6 +166,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str):
     df[["c3"]] = df.res.apply(lambda x: x[2] if len(x) > 0 else 0)
     df[["c4"]] = df.res.apply(lambda x: x[3] if len(x) > 0 else 0)
     df[["c5"]] = df.res.apply(lambda x: x[4] if len(x) > 0 else 300000)
+    df[["c6"]] = df.res.apply(lambda x: x[5] if len(x) > 0 else 0)
     df["size_G1"] =df.apply(lambda x: size_str[x.G1] if x.G1 in size_str else 0, axis=1)
     df["size_G2"] = df.apply(lambda x: size_str[x.G2] if x.G2 in size_str else 0, axis=1)
     del df["res"]
@@ -213,14 +214,15 @@ def synthesize(last_step_output,output_filename,min_size_G1=None,min_size_G2=Non
 
         df = df.replace([np.inf, -np.inf], 300000)
         df["c5"] = 1 - (df.c5 - df.c5.min()) / (df.c5.max() - df.c5.min())
+        df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() - df.c6.min())
         if len(df) <1:
             continue
         mes = np.unique(df.sim_measure)[0]
         type_ = np.unique(df.type_str)[0]
-        val = df.groupby("G1").mean().mean()["c1 c2 c3 c4 c5".split()].values.tolist()
+        val = df.groupby("G1").mean().mean()["c1 c2 c3 c4 c5 c6".split()].values.tolist()
         val.insert(0, type_)
         val.insert(0, mes)
         data.append(val)
 
-    res = pd.DataFrame(data, columns="mesure type c1 c2 c3 c4 c5".split())
+    res = pd.DataFrame(data, columns="mesure type c1 c2 c3 c4 c5 c6".split())
     res.to_csv(output_filename)
\ No newline at end of file
diff --git a/generate_str.py b/generate_str.py
index 19d9bfd..c4689eb 100644
--- a/generate_str.py
+++ b/generate_str.py
@@ -27,6 +27,7 @@ from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as m
 from mytoolbox.text.clean import *
 from mytoolbox.exception.inline import safe_execute
 
+from thematic_str.helpers.terminology.matcher import matcher_agrovoc
 from stop_words import get_stop_words
 
 import logging
@@ -86,6 +87,7 @@ if not os.path.exists(args.input_pkl):
 
 df = pd.read_pickle(args.input_pkl)
 
+
 cols=set(df.columns)
 if not "filename" in cols or not "id_doc" in cols or not "content" in cols or not "lang" in cols:
     raise ValueError("Missing data column in input given")
@@ -97,37 +99,16 @@ pipelines={
     lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator]())
     for lang in tqdm(languages,desc="Load Pipelines model")
 }
-def matcher_agrovoc( lang):
-    """
-    Return a terminolgy matcher using the Agrovoc vocabulary.
-    
-    Parameters
-    ----------
-    nlp : spacy.lang.Language
-        model
-    lang : str
-        language of the terms
-    
-    Returns
-    -------
-    TerminologyMatcher
-        matcher
-    """
-    agrovoc_vocab = pd.read_csv("../thematic_str/data/terminology/agrovoc/agrovoc_cleaned.csv")
-    agrovoc_vocab["preferred_label_new"] = agrovoc_vocab["preferred_label_new"].apply(
-        lambda x: safe_execute({}, Exception, json.loads, x.replace("\'", "\"")))
-    agrovoc_vocab["label_lang"] = agrovoc_vocab["preferred_label_new"].apply(
-        lambda x: str(resolv_a(x[lang]) if lang in x else np.nan).strip().lower())
-    agrovoc_vocab=agrovoc_vocab[~pd.isna(agrovoc_vocab["label_lang"])]
-    return agrovoc_vocab["label_lang"].values.tolist()
+
 
 stopwords = {
-    lang:matcher_agrovoc(lang)
+    lang:matcher_agrovoc(lang).terminology_data
     for lang in tqdm(languages,desc="Load stopwords")
 }
 for lang in stopwords:
     stopwords[lang].extend(get_stop_words(lang))
 
+
 print("Clean input content ...")
 if not "entities" in df:
     df["content"]= df.content.progress_apply(lambda x :clean_text(x))
@@ -138,7 +119,7 @@ def build(pipelines,x):
     global count_error
     try:
         if "entities" in x:
-            return pipelines[x.lang].build(x.content,toponyms=x.entities,stop_words=stopwords[x.lang])
+            return pipelines[x.lang].build(x.content,stop_words=stopwords[x.lang])
     except Exception as e:
         print(e)
     
@@ -154,9 +135,11 @@ def build(pipelines,x):
             return STR.from_networkx_graph(nx.Graph())
 
 print("Transforming text to STR ...")
-
-df["str_object"]=df.progress_apply(lambda x: build(pipelines,x) if len(x.content) >0 else STR.from_networkx_graph(nx.Graph()) , axis = 1)
-df["str_object"]=df["str_object"].apply(lambda x: x[0] if isinstance(x,tuple) else x)
+df["str_object"] = STR.from_networkx_graph(nx.Graph())
+for lang in tqdm(languages,desc="Computing STR"):
+    corpus_ = df[df.lang == lang].content
+    df[df.lang == lang]["str_object"]=pipelines[lang].pipe_build(corpus_)#df.progress_apply(lambda x: build(pipelines,x) if len(x.content) >0 else STR.from_networkx_graph(nx.Graph()) , axis = 1)
+    df[df.lang == lang]["str_object"]=df["str_object"].apply(lambda x: x[0] if isinstance(x,tuple) else x)
 
 if "ext" in args.transform:
     print("Extending STR ...")
diff --git a/run_automatic_annotation.py b/run_automatic_annotation.py
index bf04d41..82bf704 100644
--- a/run_automatic_annotation.py
+++ b/run_automatic_annotation.py
@@ -24,9 +24,9 @@ parser.add_argument("-m", "--nb_car_doc1",type=int, default=0, help="Return eval
 parser.add_argument("-n", "--nb_car_doc2",type=int, default=0, help="Return evaluation results based on min size of associated text for G2")
 
 args = parser.parse_args()
-if os.path.exists("temp_cluster") and yes_or_no("Do you want to compute STR's clusters all over again ?"):
-    shutil.rmtree('temp_cluster', ignore_errors=True)
-    os.makedirs("temp_cluster")
+# if os.path.exists("temp_cluster") and yes_or_no("Do you want to compute STR's clusters all over again ?"):
+#     shutil.rmtree('temp_cluster', ignore_errors=True)
+#     os.makedirs("temp_cluster")
 
 
 
diff --git a/run_test.py b/run_test.py
index 11246ec..d200d18 100644
--- a/run_test.py
+++ b/run_test.py
@@ -39,25 +39,30 @@ if not os.path.exists(args.input):
     raise FileNotFoundError("{0} does not exists !".format(args.input))
 
 data = pd.read_csv(args.input, index_col=0)
+if len(data) == 0:
+    print("{0} is empty, nothing to evaluate.".format(args.input))
+    exit()
 data["mesure"] = data.mesure.apply(lambda x: "BOW" if x == "BagOfNodes" else x)
 data["sum"] = data["c1 c2 c3 c4 c5".split()].sum(axis=1)
 
 combination_pareto_criteria = [
-    ("c1_c2_c3_c4_c5", "c1 c2 c3 c4 c5".split()),
-    ("c1_c2_c5", "c1 c2 c5".split()),
+    ("c1_c2_c3_c4_c5_c6", "c1 c2 c3 c4 c5 c6".split()),
     ("c1_c2_c3", "c1 c2 c3".split()),
-    ("c3_c4", "c3 c4".split()),
-    ("c5", "c5".split()),
-    ("c2", "c2".split()),
+    ("c3_c4_c5", "c3 c4 c5".split()),
+    ("c2_c5_c6", "c2 c5 c6".split()),
+    ("c5_c6", "c5 c6".split()),
+    ("c2_c5", "c2 c5".split()),
+    ("c6", "c6".split())
 ]
 
 weight_criteria = [
-    ("all_0.2", [0.2, 0.2, 0.2, 0.2, 0.2]),
-    ("c1_0.5_c5_0.5", [0.5, 0., 0., 0., 0.5]),
-    ("c2_0.5_c5_0.5", [0., 0.5, 0., 0., 0.5]),
-    ("c1_0.33_c2_0.33_c3_0,33", [0.33, 0.33, 0.33, 0., 0.]),
-    ("c1_0.5_c2_0.5", [0.5, 0.5, 0., 0., 0.]),
-    ("c3_0.5_c4_0.5", [0., 0., 0.5, 0.5, 0.])
+    ("c1_c2_c3_c4_c5_c6", [0.16, 0.16, 0.16, 0.16, 0.16, 0.16]),
+    ("c1_c2_c3", [0.33, 0.33, 0.33, 0., 0., 0.]),
+    ("c3_c4_c5", [0., 0., 0.33, 0.33, 0.33, 0]),
+    ("c2_c5_c6", [0., 0., 0.33, 0., 0.33,0.33]),
+    ("c5_c6", [0., 0., 0., 0., 0.5, 0.5]),
+    ("c2_c5", [0., 0.5, 0., 0., 0.5, 0.]),
+    ("c6", [0., 0., 0., 0., 0.,1.])
 ]
 
 
@@ -84,6 +89,7 @@ def write_excel(writer, dataframe, title):
     dataframe.to_excel(writer, "result", index=False)
     number_of_rows=len(dataframe)
     worksheet = writer.sheets["result"]
+    worksheet.set_header(title)
     workbook = writer.book
     C_letter = 67
     I_letter= 73
@@ -125,4 +131,4 @@ for weight in tqdm(weight_criteria, desc="WSM computation"):
         result = pd.concat((result,dd),axis=0)
 
 
-write_excel(writer,result,args.output_fn.split("/")[-1])
\ No newline at end of file
+write_excel(writer,result,args.input.split("/")[-1])
\ No newline at end of file
diff --git a/run_test_comparedto.py b/run_test_comparedto.py
new file mode 100644
index 0000000..2b15ad6
--- /dev/null
+++ b/run_test_comparedto.py
@@ -0,0 +1,166 @@
+# coding = utf-8
+import argparse
+import os
+
+import pandas as pd
+import numpy as np
+
+from tqdm import tqdm
+from skcriteria.madm import closeness, simple
+from skcriteria import Data, MIN, MAX
+
+def identify_pareto(scores):
+    # Count number of items
+    population_size = scores.shape[0]
+    # Create a NumPy index for scores on the pareto front (zero indexed)
+    population_ids = np.arange(population_size)
+    # Create a starting list of items on the Pareto front
+    # All items start off labelled as being on the Pareto front
+    pareto_front = np.ones(population_size, dtype=bool)
+    # Loop through each item. This will then be compared with all other items
+    for i in range(population_size):
+        # Loop through all other items
+        for j in range(population_size):
+            # Check if our 'i' point is dominated by our 'j' point
+            if all(scores[j] >= scores[i]) and any(scores[j] > scores[i]):
+                # j dominates i. Label 'i' point as not on Pareto front
+                pareto_front[i] = 0
+                # Stop further comparisons with 'i' (no more comparisons needed)
+                break
+    # Return ids of scenarios on pareto front
+    return pareto_front,population_ids[pareto_front]
+
+def pareto_frontier_multi(myArray):
+    # Sort on first dimension
+    myArray = myArray[myArray[:, 0].argsort()]
+    # Add first row to pareto_frontier
+    pareto_frontier = myArray[0:1, :]
+    indices, i = [], 1
+    # Test next row against the last row in pareto_frontier
+    for row in myArray[1:, :]:
+        if sum([row[x] >= pareto_frontier[-1][x]
+                for x in range(len(row))]) == len(row):
+            # If it is at least as good on every feature, add the row to pareto_frontier
+            pareto_frontier = np.concatenate((pareto_frontier, [row]))
+            indices.append(i)
+        i += 1
+    return indices, pareto_frontier
+
+def evolution(dataframe,mesure,type_,col="c1 c2 c3 c4 c5 c6 sum mean".split()):
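+    # Express each configuration's criteria relative to a baseline: subtract the (mesure, type_) row's values from every row.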
+    dataframe2=dataframe.copy()
+    dataframe2.iloc[:,2:2+len(col)] = dataframe2[col].values - dataframe2[(dataframe2.mesure == mesure) & (dataframe2.type == type_)][col].values
+    return dataframe2
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input")
+parser.add_argument("output_fn")
+parser.add_argument("-n","--topn",type=int,default=5)
+parser.add_argument("-m","--mesure",type=str,default="BOW")
+parser.add_argument("-t","--type",type=str,default="str_object")
+args = parser.parse_args()
+
+writer = pd.ExcelWriter(args.output_fn, engine='xlsxwriter')
+
+if not os.path.exists(args.input):
+    raise FileNotFoundError("{0} does not exists !".format(args.input))
+
+data = pd.read_csv(args.input, index_col=0)
+data = data.fillna(0)
+
+if len(data) == 0:
+    print("{0} is empty, nothing to evaluate.".format(args.input))
+    exit()
+data["mesure"] = data.mesure.apply(lambda x: "BOW" if x == "BagOfNodes" else x)
+data["sum"] = data["c1 c2 c3 c4 c5 c6".split()].sum(axis=1)
+data["mean"] = data["c1 c2 c3 c4 c5 c6".split()].mean(axis=1)
+data = evolution(data, args.mesure, args.type)
+
+combination_pareto_criteria = [
+    ("c1_c2_c3_c4_c5_c6", "c1 c2 c3 c4 c5 c6".split()),
+    # ("c1_c2_c3", "c1 c2 c3".split()),
+    # ("c3_c4_c5", "c3 c4 c5".split()),
+    ("c2_c5_c6", "c2 c5 c6".split()),
+    ("c5_c6", "c5 c6".split()),
+    ("c2_c5", "c2 c5".split()),
+    ("c6", "c6".split()),
+    ("sum", "sum".split())
+]
+
+weight_criteria = [
+    ("c1_c2_c3_c4_c5_c6", [.16,.16, .16, .16, .16, .16]),
+    # ("c1_c2_c3", [0.33, 0.33, 0.33, 0., 0., 0.]),
+    # ("c3_c4_c5", [0., 0., 0.33, 0.33, 0.33, 0]),
+    ("c2_c5_c6", [0., .33, 0., 0.,.33,.33]),
+    ("c5_c6", [0., 0., 0., 0., .5,.5]),
+    ("c2_c5", [0., .5, 0., 0., .5, 0.]),
+    # ("c6", [0., 0., 0., 0., 0.,1.])
+]
+
+
+
+def get_top_combination_wsm(dataframe, weights,topn):
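+    # Weighted Sum Model: score each row as the dot product of its c1..c6 values with the weight vector, then keep the topn best-scoring rows.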
+    datas = (dataframe["c1 c2 c3 c4 c5 c6".split()].values)
+    #dd = Data(datas, criteria=[MAX, MAX, MAX, MAX, MAX, MAX], weights=weights[1])
+    index_max = np.argsort(np.dot(datas, weights[1]))[::-1][:topn]
+    df = dataframe.iloc[index_max]
+
+    df["name"]=weights[0]
+    df["type_score"] = "wsm"
+    return df
+
+
+def get_top_combination_pareto(dataframe, columns,topn):
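+    # Keep only the Pareto-optimal rows for the selected criteria, then rank them by their criteria sum and keep the topn best.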
+    index, data_pa = identify_pareto(dataframe[columns[1]].values)
+    df = dataframe.iloc[index]
+    df = df.sort_values(by = "sum",ascending=False).head(topn)
+    df["name"]=columns[0]
+    df["type_score"] = "pareto"
+    return df
+
+
+def write_excel(writer, dataframe, title):
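+    # Write the dataframe to the "result" sheet and highlight the best (green) and worst (red) value in each criterion column.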
+    dataframe.to_excel(writer, "result", index=False)
+    number_of_rows=len(dataframe)
+    worksheet = writer.sheets["result"]
+    worksheet.set_header(title)
+    workbook = writer.book
+    C_letter = 67
+    J_letter= 74
+
+    format1 = workbook.add_format({'bg_color': '#FFC7CE',
+                                   'font_color': '#9C0006'})
+
+    # Add a format. Green fill with dark green text.
+    format2 = workbook.add_format({'bg_color': '#C6EFCE',
+                                   'font_color': '#006100'})
+    for i in range(C_letter,J_letter):
+        begin=2
+
+        ch_=chr(i)
+        color_range = "{0}{1}:{0}{2}".format(ch_,begin,number_of_rows)
+        worksheet.conditional_format(color_range, {'type': 'bottom',
+                                                   'value': '1',
+                                                   'format': format1})
+
+        worksheet.conditional_format(color_range, {'type': 'top',
+                                                   'value': '1',
+                                                   'format': format2})
+    writer.save()
+
+result = None
+for comb_ in tqdm(combination_pareto_criteria, desc="Pareto computation"):
+    dd = get_top_combination_pareto(data, comb_,args.topn)
+    if not isinstance(result,pd.DataFrame):
+        result = dd
+    else:
+        result = pd.concat((result,dd),axis=0)
+
+for weight in tqdm(weight_criteria, desc="WSM computation"):
+    dd= get_top_combination_wsm(data, weight,args.topn)
+    if not isinstance(result,pd.DataFrame):
+        result = dd
+    else:
+        result = pd.concat((result,dd),axis=0)
+
+
+write_excel(writer,result,args.input.split("/")[-1])
\ No newline at end of file
diff --git a/run_test_disambiguisation.sh b/run_test_disambiguisation.sh
deleted file mode 100755
index 66ec009..0000000
--- a/run_test_disambiguisation.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-python3 eval_disambiguation.py padiweb accuracy > accuracy_res_padi.txt
-python3 eval_disambiguation.py agromada accuracy > accuracy_res_mada.txt
-
-python3 eval_disambiguation.py padiweb mean_distance_error > mean_distance_res_padi.txt
-python3 eval_disambiguation.py agromada mean_distance_error > mean_distance_res_mada.txt
-
-python3 eval_disambiguation.py padiweb accuracy_k -k=1 >>accuracyk1_res_padi.txt
-python3 eval_disambiguation.py padiweb accuracy_k -k=0.5 > accuracyk0-5_res_padi.txt
-
-python3 eval_disambiguation.py agromada accuracy_k -k=1 >> accuracyk1_res_mada.txt
-python3 eval_disambiguation.py agromada accuracy_k -k=0.5 > accuracyk0-5_res_mada.txt
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 4a646c0..0000000
--- a/setup.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from setuptools import setup
-import os,shutil
-from pathlib import Path
-
-setup(
-    name='strpython',
-    version='0.1',
-    packages=['strpython', 'strpython.nlp', 'strpython.nlp.ner', 'strpython.nlp.exception', 'strpython.nlp.pos_tagger',
-              'strpython.nlp.disambiguator', 'strpython.nlp.disambiguator.models',
-              'strpython.nlp.disambiguator.delozier', 'strpython.eval', 'strpython.config',
-              'strpython.models', 'strpython.models.transformation', 'strpython.helpers'],
-    url='',
-    license='MIT',
-    author='Jacques Fize',
-    author_email='jacques.fize@cirad.fr',
-    description="Module developed in the context of a thesis. This module comprise all implementation of algorithms, "
-                "model for text matching based on spatial features ", install_requires=['tqdm']
-)
-# Put default config file if not exists
-home = str(Path.home())
-if not os.path.exists(os.path.join(home,".strpython")): #or not os.path.exists(os.path.join(home,".strpython/config.json")):
-    os.makedirs(os.path.dirname(os.path.join(home,".strpython/config.json")), exist_ok=True)
-    shutil.copy2("strpython/config/config.json",os.path.join(home,".strpython/config.json"))
\ No newline at end of file
diff --git a/strpython/__init__.py b/strpython/__init__.py
index 0c14602..e5e68f6 100644
--- a/strpython/__init__.py
+++ b/strpython/__init__.py
@@ -1,3 +1,6 @@
 # coding = utf-8
 
-from .models.str import STR
\ No newline at end of file
+from .models.str import STR
+from .helpers.match_cache import MatchingCache
+from .helpers.collision import getGEO
+from .pipeline import Pipeline
diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py
index 1ff9150..f07b2cc 100644
--- a/strpython/eval/automatic_annotation.py
+++ b/strpython/eval/automatic_annotation.py
@@ -58,8 +58,13 @@ class AnnotationAutomatic(object):
             if found:
                 return list(value)
 
-        crit_ = [self.criterion1(str1, str2), self.criterion2(str1, str2), self.criterion3(str1, str2, id1, id2),
-                 self.criterion4(str1, str2, id1, id2),self.criteria5(str1, str2, id1, id2)]
+        crit_ = [self.criterion1(str1, str2),
+                 self.criterion2(str1, str2),
+                 self.criterion3(str1, str2, id1, id2),
+                 self.criterion4(str1, str2, id1, id2),
+                 self.criteria5(str1, str2, id1, id2),
+                 self.criterion6(str1, str2)]
+
         self.matching_cache.add(id1, id2, *crit_)
         return crit_
 
@@ -212,3 +217,16 @@ class AnnotationAutomatic(object):
 
         return np.mean(cdist(get_centroid_array(c1),get_centroid_array(c2), "euclidean").flatten())
 
+    def criterion6(self, str1, str2):
+        """
+        Return the number of spatial entities shared by both STRs.
+        Parameters
+        ----------
+        str1
+        str2
+
+        Returns
+        -------
+
+        """
+        return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys()))
\ No newline at end of file
diff --git a/strpython/helpers/geo_relation_database.py b/strpython/helpers/geo_relation_database.py
index a51cbff..aa43d34 100644
--- a/strpython/helpers/geo_relation_database.py
+++ b/strpython/helpers/geo_relation_database.py
@@ -27,7 +27,7 @@ class GeoRelationMatchingDatabase():
         (idse1 text, idse2 text, value integer)
         """
         matching_schema = """CREATE TABLE matching
-        (dataset text, g1 integer, g2 integer, c1 integer, c2 integer, c3 integer,c4 integer, c5 REAL )
+        (dataset text, g1 integer, g2 integer, c1 integer, c2 integer, c3 integer,c4 integer, c5 REAL, c6 REAL )
         """
         cursor.execute(inclusion_schema)
         cursor.execute(adjacency_schema)
@@ -74,7 +74,7 @@ class GeoRelationMatchingDatabase():
         self._db_connection.commit()
         cursor.close()
 
-    def add_matching(self, dataset: str, G1: int, G2: int, c1: bool, c2: bool, c3: bool, c4: bool,c5: float):
+    def add_matching(self, dataset: str, G1: int, G2: int, c1: bool, c2: bool, c3: bool, c4: bool,c5: float, c6: float):
         """
         Add a matching criteria result within the database
         Parameters
@@ -93,11 +93,15 @@ class GeoRelationMatchingDatabase():
             value of criterion 3
         c4 : bool
             value of criterion 4
+        c5 : float
+            value of criterion 5
+        c6 : float
+            value of criterion 6
 
         """
         cursor = self._db_connection.cursor()
-        cursor.execute('INSERT INTO matching VALUES(?,?,?,?,?,?,?,?)',
-                       (dataset, G1, G2, int(c1), int(c2), int(c3), int(c4),float(c5)))
+        cursor.execute('INSERT INTO matching VALUES(?,?,?,?,?,?,?,?,?)',
+                       (dataset, G1, G2, int(c1), int(c2), int(c3), int(c4),float(c5),float(c6)))
         self._db_connection.commit()
         cursor.close()
 
@@ -169,7 +173,7 @@ class GeoRelationMatchingDatabase():
         result_ = cursor.fetchone()
         cursor.close()
         if result_:
-            return True, tuple(map(float, result_[-5:]))
+            return True, tuple(map(float, result_[-6:]))
         return False, False
 
 
@@ -185,9 +189,9 @@ if __name__ == "__main__":
     assert g.get_inclusion("GD1", "GD2") == (True, True)
     assert g.get_inclusion("GD2", "GD1") == (False, False)
 
-    g.add_matching("test", 1, 2, True, True, False, True,0.)
-    g.add_matching("test2", 1, 2, True, False, False, True,0.)
-    assert g.get_matching(1, 2, "test") == (True, (True, True, False, True,0.))
-    assert g.get_matching(1, 2, "test2") != (True, (True, True, False, True,0.))
+    g.add_matching("test", 1, 2, True, True, False, True,0.,0.)
+    g.add_matching("test2", 1, 2, True, False, False, True,0.,0.)
+    assert g.get_matching(1, 2, "test") == (True, (True, True, False, True,0.,0.))
+    assert g.get_matching(1, 2, "test2") != (True, (True, True, False, True,0.,0.))
     print("Passed the tests !")
 
diff --git a/strpython/helpers/match_cache.py b/strpython/helpers/match_cache.py
index 7e79c28..8f5b483 100644
--- a/strpython/helpers/match_cache.py
+++ b/strpython/helpers/match_cache.py
@@ -12,6 +12,6 @@ class MatchingCache:
     def is_match(self, id_str1: int, id_str2: int):
         return self.db_rel_match.get_matching(id_str1, id_str2, self.dataset)
 
-    def add(self, id_str1: int, id_str2: int, c1: int, c2: int, c3: int, c4: int, c5: float):
+    def add(self, id_str1: int, id_str2: int, c1: int, c2: int, c3: int, c4: int, c5: float, c6: float):
         if not self.is_match(id_str1, id_str2)[0]:
-            self.db_rel_match.add_matching(self.dataset, id_str1, id_str2, c1, c2, c3, c4,c5)
+            self.db_rel_match.add_matching(self.dataset, id_str1, id_str2, c1, c2, c3, c4, c5, c6)
diff --git a/strpython/helpers/sim_matrix.py b/strpython/helpers/sim_matrix.py
index 1d57994..3750896 100644
--- a/strpython/helpers/sim_matrix.py
+++ b/strpython/helpers/sim_matrix.py
@@ -36,6 +36,6 @@ def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5):
         top_n = np.argsort(matrix[line])[::-1][1:n + 1]
         rank = 1
         for val in top_n:
-            tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0,300000])
+            tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0,300000,0])
             rank += 1
-    return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4 c5".split())
+    return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4 c5 c6".split())
diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py
index 8f3f26d..89e72d0 100644
--- a/strpython/models/spatial_relation.py
+++ b/strpython/models/spatial_relation.py
@@ -99,7 +99,7 @@ class RelationExtractor(MetaCollector):
          * Inclusion(Sa, Sb) = Within(Sa,Sb)
 
         """
-        if not self.all_geometry:
+        if len(self.all_geometry) == 0:
             raise ValueError("No geometry extracted. Check the `spatial_entities` arg during the initialization.")
 
         gdf_intersect = gpd.GeoDataFrame(self.all_geometry, columns="id geometry".split())
diff --git a/strpython/models/str.py b/strpython/models/str.py
index dfe994b..1666854 100644
--- a/strpython/models/str.py
+++ b/strpython/models/str.py
@@ -23,7 +23,7 @@ from ..helpers.geodict_helpers import gazetteer
 from ..helpers.relation_extraction import AdjacencyRelation, InclusionRelation
 
 from joblib import Parallel,delayed
-
+from ..helpers.collision import getGEO
 max_int = 1e6
 
 def get_inclusion_chain(id_, prop):
@@ -466,6 +466,12 @@ class STR(object):
         df["y"] = df.geometry.apply(lambda p: p.y)
         return df
 
+    def get_geo_dissolved(self):
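+        # Fetch the geometry of every spatial entity in the STR and dissolve them into a single merged geometry.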
+        es = [getGEO(en) for en in self.spatial_entities]
+        es = [[1, e.values[0][0]] if isinstance(e, gpd.GeoDataFrame) else [1, e.values[0]] for e in es if
+              isinstance(e, gpd.GeoDataFrame) or isinstance(e, gpd.GeoSeries)]
+        return gpd.GeoDataFrame(es, columns="dd geometry".split()).dissolve(by="dd")
+
     def get_cluster(self, id_=None):
         """
         Return the cluster detected using spatial entities position.
diff --git a/strpython/nlp/disambiguator/__init__.py b/strpython/nlp/disambiguator/__init__.py
index bceef44..d56a212 100644
--- a/strpython/nlp/disambiguator/__init__.py
+++ b/strpython/nlp/disambiguator/__init__.py
@@ -3,4 +3,4 @@
 from .most_common import MostCommonDisambiguator
 from .share_prop import ShareProp
 from .wikipedia_cooc import WikipediaDisambiguator
-from .disambiguator import Disambiguator
\ No newline at end of file
+from .disambiguator import Disambiguator
diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py
index 54defd1..d1d640d 100644
--- a/strpython/nlp/disambiguator/disambiguator.py
+++ b/strpython/nlp/disambiguator/disambiguator.py
@@ -4,9 +4,11 @@ import copy
 import string
 
 import numpy as np
+import pandas as pd
 
 from ..ner.ner import NER
 
+from ...helpers.geodict_helpers import gazetteer
 
 class Disambiguator(object):
 
@@ -69,5 +71,22 @@ class Disambiguator(object):
         """
         raise NotImplementedError
 
+    def get_candidates(self,label,lang):
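+        # Gather disambiguation candidates from the gazetteer: exact label matches, alias matches, and the closest similar label/alias, returned as a single dataframe.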
+
+        candidates=[]
+        candidates.extend(gazetteer.get_by_label(label,lang))
+        candidates.extend(gazetteer.get_by_alias(label, lang))
+        candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1))
+        candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1))
+        return pd.DataFrame([[
+            r.id,
+            "Paris",
+            r.label[lang],
+            r.score if "score" in r else -1,
+            r.coord if "coord" in r else {},
+            r] for r in candidates],
+                     columns="id toponym label score coord raw".split())
+
+
     def parse_ner_output(self,ner_output):
         return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]]
\ No newline at end of file
diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py
index be12646..ad3f46c 100644
--- a/strpython/nlp/disambiguator/most_common.py
+++ b/strpython/nlp/disambiguator/most_common.py
@@ -52,8 +52,9 @@ class MostCommonDisambiguator(Disambiguator):
             if plural.lower() in stop_words[lang]:
                 return 'O', -1
 
-        data=get_most_common_id_v3(label, lang)
+        data=self.get_candidates(label, lang).sort_values(by="score",ascending=False)
         id_, score=None,0
-        if data:
-            id_,score=data.id,data.score
+        if len(data)>0:
+            entry_selected= data.iloc[0]
+            id_,score=entry_selected.id,entry_selected.score
         return id_, score
diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py
index 001bc78..7637faf 100644
--- a/strpython/nlp/disambiguator/share_prop.py
+++ b/strpython/nlp/disambiguator/share_prop.py
@@ -150,13 +150,11 @@ class ShareProp(Disambiguator):
         fixed_entities = {}
         ambiguous_entities = {}
         for topo in toponyms:
-            request = gazetteer.get_by_label(topo, lang)
-            if len(request) == 0:
-                request = gazetteer.get_by_alias(topo, lang)
+            request = self.get_candidates(topo,lang)
             if len(request) > 1:
                 ambiguous_entities[topo] = request
             elif len(request) == 1:
-                fixed_entities[topo] = request[0]
+                fixed_entities[topo] = request.iloc[0].raw
 
         d_amb_results = {}
         for amb_ent in ambiguous_entities:
diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py
index a8dacfd..723423f 100644
--- a/strpython/nlp/disambiguator/wikipedia_cooc.py
+++ b/strpython/nlp/disambiguator/wikipedia_cooc.py
@@ -47,10 +47,8 @@ class WikipediaDisambiguator(Disambiguator):
         group_candidate = {} #candidates per toponym
 
         for toponym in toponyms_filtered:
-            candidates = get_top_candidate(toponym, lang, 5)
+            candidates = list(self.get_candidates(toponym, lang).itertuples())
             candidates = [c.id for c in candidates if c]
-            if not candidates:
-                candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c]
             group_candidate[toponym] = candidates
             betw_cand[toponym]=candidates
             for n in candidates:
diff --git a/strpython/nlp/stop_words.py b/strpython/nlp/stop_words.py
new file mode 100644
index 0000000..e5c6c3c
--- /dev/null
+++ b/strpython/nlp/stop_words.py
@@ -0,0 +1,2 @@
+# coding = utf-8
+
diff --git a/strpython/pipeline.py b/strpython/pipeline.py
index 6a905af..66b5d21 100644
--- a/strpython/pipeline.py
+++ b/strpython/pipeline.py
@@ -1,6 +1,8 @@
 # coding =utf-8
 import re
 
+import os, json
+
 from nltk import word_tokenize
 from joblib import Parallel, delayed
 
@@ -21,6 +23,7 @@ if in_notebook():
 else:
     from tqdm import tqdm
 
+from mytoolbox.env import yes_or_no
 
 class Pipeline(object):
     """
@@ -39,6 +42,9 @@ class Pipeline(object):
         self.ner = kwargs["ner"] if "ner" in kwargs else Spacy(lang=lang[:2])
         self.disambiguator=kwargs["disambiguator"] if "disambiguator" in kwargs else MostCommonDisambiguator()
 
+        self.dict_adj = kwargs.get("dict_adj",None)
+        self.dict_inc = kwargs.get("dict_inc",None)
+
     def parse(self,text,debug=False):
         """
 
@@ -90,25 +96,32 @@ class Pipeline(object):
         -------
 
         """
-        r = RelationExtractor(spatial_entities)
-        r.get_relation_geometry_based()
-        r.get_relation_meta_based()
-        df_adj, df_inc = r.fuse_meta_and_geom()
-        dict_adj = df_adj.to_dict()
-        dict_inc = df_inc.to_dict()
-        return dict_adj, dict_inc
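+        # Reuse previously computed relation dictionaries when available: optionally load the cached adj/inc JSON files, otherwise recompute them with RelationExtractor.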
+        if not self.dict_adj and not self.dict_inc:
+            if os.path.exists("adj_dict.json") and yes_or_no(question="Do you want to use previous adj file"):
+                self.dict_adj=json.load(open("adj_dict.json"))
+            if os.path.exists("inc_dict.json") and yes_or_no(question="Do you want to use previous inc file"):
+                self.dict_adj=json.load(open("inc_dict.json"))
+
+            if not self.dict_adj and not self.dict_inc:
+                r = RelationExtractor(spatial_entities)
+                r.get_relation_geometry_based()
+                r.get_relation_meta_based()
+                df_adj, df_inc = r.fuse_meta_and_geom()
+                self.dict_adj = df_adj.to_dict()
+                self.dict_inc = df_inc.to_dict()
+
 
     def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs):
 
-        text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts"))
+        text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")]
         sp_es= []
         for res in text_and_spatial_entities:
             sp_es.extend(list(res[1].values()))
         sp_es= [es for es in sp_es if es.startswith("GD")]
         print("Extract Spatial Relation for all identified spatial entities")
-        adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es)
+        self.extract_all_relation(sp_es)
 
-        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR"))
+        str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], self.dict_adj, self.dict_inc, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR"))
         return str_s
 
     def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs):
@@ -116,7 +129,7 @@ class Pipeline(object):
         return str_s
 
     def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc):
-        str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc)
+        str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=self.dict_adj,precomputed_inc=self.dict_inc)
         str_.build(adj=True, inc=True)
         return str_
 
-- 
GitLab