diff --git a/.gitignore b/.gitignore index dd029ad06e516449caf4624c57e728de671ba3f1..c83cd7923636bb9c62648bbfb1e4be2be47c9d2d 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,5 @@ __pycache__/ *.gexf temp_cluster_2/ agromada* -output* \ No newline at end of file +output* +.vscode \ No newline at end of file diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py index 8ccf6f917a0965b1bfa0e3e86a4911859fe31d4c..a06b2f2179e1c14e87ff2a0c64655a970464fc17 100644 --- a/auto_fill_annotation.py +++ b/auto_fill_annotation.py @@ -23,13 +23,13 @@ def main(dataset, matrix_sim_dir, raw_graph_dir, selected_graphs, threshold, inc first_step_output = "output_first_step_{0}_{1}".format(dataset, threshold) last_step_output = "output_final_{0}_{1}".format(dataset, threshold) generate_annotation_dataframe(matrix_sim_dir, selected_graphs, first_step_output) - # size_str = extract_criteria_4_all(annotater, first_step_output, raw_graph_dir, dataset, threshold) + size_str = extract_criteria_4_all(annotater, first_step_output, raw_graph_dir, dataset, threshold) if not os.path.exists(last_step_output): os.makedirs(last_step_output) - # for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"): - # annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) + for fn in tqdm(glob.glob(os.path.join(first_step_output,"*.csv")),desc="Annotate sample"): + annotate_eval_sample(annotater, fn, os.path.join(last_step_output, os.path.basename(fn)),size_str) min_carac_dict=None if min_carac_fn != "" and os.path.exists(min_carac_fn): @@ -120,14 +120,14 @@ def extract_criteria_4_all(annotater, csv_input_dir, raw_graph_dir, dataset, thr try: return annotater.all(strs[int(x.G1)], strs[int(x.G2)], int(x.G1), int(x.G2)) except KeyError as e: - annotater.matching_cache.add(int(x.G1), int(x.G2), *(0, 0, 0, 0,300000)) - return [0, 0, 0, 0,300000,0] + annotater.matching_cache.add(int(x.G1), int(x.G2), *(0, 0, 0, 0,300000,0)) + return [0, 0, 0, 0,300000,0,0] # Annotation Time print("Computing Criteria for each match") matching_dataframe["res"] = matching_dataframe.progress_apply(lambda x: annotate(x), axis=1) - matching_dataframe.res = matching_dataframe.res.apply(lambda x: [int(x[0]),int(x[1]),int(x[2]),int(x[3]),float(x[4])] if x else []) - for ix, col in enumerate("c1 c2 c3 c4 c5".split()): + matching_dataframe.res = matching_dataframe.res.apply(lambda x: [int(x[0]),int(x[1]),int(x[2]),int(x[3]),float(x[4]),float(x[5])] if x else []) + for ix, col in enumerate("c1 c2 c3 c4 c5 c6".split()): matching_dataframe[col] = matching_dataframe.res.apply(lambda x: x[ix] if len(x) > 0 else 0) del matching_dataframe["res"] @@ -157,7 +157,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str): try: return annotater.all(None, None, x.G1, x.G2) except Exception as e: - return [0, 0, 0, 0,300000] + return [0, 0, 0, 0,300000,0] df["res"] = df.apply(lambda x: foo(x), axis=1) df.res = df.res.apply(lambda x: list(map(float, x)) if x else []) # if bool @@ -166,6 +166,7 @@ def annotate_eval_sample(annotater, csv_file, output_file, size_str): df[["c3"]] = df.res.apply(lambda x: x[2] if len(x) > 0 else 0) df[["c4"]] = df.res.apply(lambda x: x[3] if len(x) > 0 else 0) df[["c5"]] = df.res.apply(lambda x: x[4] if len(x) > 0 else 300000) + df[["c6"]] = df.res.apply(lambda x: x[5] if len(x) > 0 else 0) df["size_G1"] =df.apply(lambda x: size_str[x.G1] if x.G1 in size_str else 0, axis=1) df["size_G2"] = df.apply(lambda x: size_str[x.G2] if x.G2 in size_str else 0, 
axis=1) del df["res"] @@ -213,14 +214,15 @@ def synthesize(last_step_output,output_filename,min_size_G1=None,min_size_G2=Non df = df.replace([np.inf, -np.inf], 300000) df["c5"] = 1 - (df.c5 - df.c5.min()) / (df.c5.max() - df.c5.min()) + df["c6"] = (df.c6 - df.c6.min()) / (df.c6.max() - df.c6.min()) if len(df) <1: continue mes = np.unique(df.sim_measure)[0] type_ = np.unique(df.type_str)[0] - val = df.groupby("G1").mean().mean()["c1 c2 c3 c4 c5".split()].values.tolist() + val = df.groupby("G1").mean().mean()["c1 c2 c3 c4 c5 c6".split()].values.tolist() val.insert(0, type_) val.insert(0, mes) data.append(val) - res = pd.DataFrame(data, columns="mesure type c1 c2 c3 c4 c5".split()) + res = pd.DataFrame(data, columns="mesure type c1 c2 c3 c4 c5 c6".split()) res.to_csv(output_filename) \ No newline at end of file diff --git a/generate_str.py b/generate_str.py index 19d9bfdf4c985f482e933a417d0e3706995d0edc..c4689ebe92d019460ea9855f7694e6f9b26bd0c7 100644 --- a/generate_str.py +++ b/generate_str.py @@ -27,6 +27,7 @@ from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator as m from mytoolbox.text.clean import * from mytoolbox.exception.inline import safe_execute +from thematic_str.helpers.terminology.matcher import matcher_agrovoc from stop_words import get_stop_words import logging @@ -86,6 +87,7 @@ if not os.path.exists(args.input_pkl): df = pd.read_pickle(args.input_pkl) + cols=set(df.columns) if not "filename" in cols or not "id_doc" in cols or not "content" in cols or not "lang" in cols: raise ValueError("Missing data column in input given") @@ -97,37 +99,16 @@ pipelines={ lang : Pipeline(lang=lang,ner=ner_dict[args.ner](lang=lang),tagger=Tagger(),disambiguator= disambiguator_dict[args.disambiguator]()) for lang in tqdm(languages,desc="Load Pipelines model") } -def matcher_agrovoc( lang): - """ - Return a terminolgy matcher using the Agrovoc vocabulary. 
- - Parameters - ---------- - nlp : spacy.lang.Language - model - lang : str - language of the terms - - Returns - ------- - TerminologyMatcher - matcher - """ - agrovoc_vocab = pd.read_csv("../thematic_str/data/terminology/agrovoc/agrovoc_cleaned.csv") - agrovoc_vocab["preferred_label_new"] = agrovoc_vocab["preferred_label_new"].apply( - lambda x: safe_execute({}, Exception, json.loads, x.replace("\'", "\""))) - agrovoc_vocab["label_lang"] = agrovoc_vocab["preferred_label_new"].apply( - lambda x: str(resolv_a(x[lang]) if lang in x else np.nan).strip().lower()) - agrovoc_vocab=agrovoc_vocab[~pd.isna(agrovoc_vocab["label_lang"])] - return agrovoc_vocab["label_lang"].values.tolist() + stopwords = { - lang:matcher_agrovoc(lang) + lang:matcher_agrovoc(lang).terminology_data for lang in tqdm(languages,desc="Load stopwords") } for lang in stopwords: stopwords[lang].extend(get_stop_words(lang)) + print("Clean input content ...") if not "entities" in df: df["content"]= df.content.progress_apply(lambda x :clean_text(x)) @@ -138,7 +119,7 @@ def build(pipelines,x): global count_error try: if "entities" in x: - return pipelines[x.lang].build(x.content,toponyms=x.entities,stop_words=stopwords[x.lang]) + return pipelines[x.lang].build(x.content,stop_words=stopwords[x.lang]) except Exception as e: print(e) @@ -154,9 +135,11 @@ def build(pipelines,x): return STR.from_networkx_graph(nx.Graph()) print("Transforming text to STR ...") - -df["str_object"]=df.progress_apply(lambda x: build(pipelines,x) if len(x.content) >0 else STR.from_networkx_graph(nx.Graph()) , axis = 1) -df["str_object"]=df["str_object"].apply(lambda x: x[0] if isinstance(x,tuple) else x) +df["str_object"] = STR.from_networkx_graph(nx.Graph()) +for lang in tqdm(languages,desc="Computing STR"): + corpus_ = df[df.lang == lang].content + df[df.lang == lang]["str_object"]=pipelines[lang].pipe_build(corpus_)#df.progress_apply(lambda x: build(pipelines,x) if len(x.content) >0 else STR.from_networkx_graph(nx.Graph()) , axis = 1) + df[df.lang == lang]["str_object"]=df["str_object"].apply(lambda x: x[0] if isinstance(x,tuple) else x) if "ext" in args.transform: print("Extending STR ...") diff --git a/run_automatic_annotation.py b/run_automatic_annotation.py index bf04d415750a94c6b84688df6d857f685997f7ea..82bf70475c6d504324b1534536bdf38f7cd0f2f2 100644 --- a/run_automatic_annotation.py +++ b/run_automatic_annotation.py @@ -24,9 +24,9 @@ parser.add_argument("-m", "--nb_car_doc1",type=int, default=0, help="Return eval parser.add_argument("-n", "--nb_car_doc2",type=int, default=0, help="Return evaluation results based on min size of associated text for G2") args = parser.parse_args() -if os.path.exists("temp_cluster") and yes_or_no("Do you want to compute STR's clusters all over again ?"): - shutil.rmtree('temp_cluster', ignore_errors=True) - os.makedirs("temp_cluster") +# if os.path.exists("temp_cluster") and yes_or_no("Do you want to compute STR's clusters all over again ?"): +# shutil.rmtree('temp_cluster', ignore_errors=True) +# os.makedirs("temp_cluster") diff --git a/run_test.py b/run_test.py index 11246ec8168050af89811ecd4272e1713051f4c8..d200d185183e0fd1dd451448d306528e11e4f050 100644 --- a/run_test.py +++ b/run_test.py @@ -39,25 +39,30 @@ if not os.path.exists(args.input): raise FileNotFoundError("{0} does not exists !".format(args.input)) data = pd.read_csv(args.input, index_col=0) +if len(data) == 0: + write_excel(writer, result, args.input.split("/")[-1]) + exit() data["mesure"] = data.mesure.apply(lambda x: "BOW" if x == "BagOfNodes" else 
x) data["sum"] = data["c1 c2 c3 c4 c5".split()].sum(axis=1) combination_pareto_criteria = [ - ("c1_c2_c3_c4_c5", "c1 c2 c3 c4 c5".split()), - ("c1_c2_c5", "c1 c2 c5".split()), + ("c1_c2_c3_c4_c5_c6", "c1 c2 c3 c4 c5 c6".split()), ("c1_c2_c3", "c1 c2 c3".split()), - ("c3_c4", "c3 c4".split()), - ("c5", "c5".split()), - ("c2", "c2".split()), + ("c3_c4_c5", "c3 c4 c5".split()), + ("c2_c5_c6", "c2 c5 c6".split()), + ("c5_c6", "c5 c6".split()), + ("c2_c5", "c2 c5".split()), + ("c6", "c6".split()) ] weight_criteria = [ - ("all_0.2", [0.2, 0.2, 0.2, 0.2, 0.2]), - ("c1_0.5_c5_0.5", [0.5, 0., 0., 0., 0.5]), - ("c2_0.5_c5_0.5", [0., 0.5, 0., 0., 0.5]), - ("c1_0.33_c2_0.33_c3_0,33", [0.33, 0.33, 0.33, 0., 0.]), - ("c1_0.5_c2_0.5", [0.5, 0.5, 0., 0., 0.]), - ("c3_0.5_c4_0.5", [0., 0., 0.5, 0.5, 0.]) + ("c1_c2_c3_c4_c5_c6", [0.16, 0.16, 0.16, 0.16, 0.16, 0.16]), + ("c1_c2_c3", [0.33, 0.33, 0.33, 0., 0., 0.]), + ("c3_c4_c5", [0., 0., 0.33, 0.33, 0.33, 0]), + ("c2_c5_c6", [0., 0., 0.33, 0., 0.33,0.33]), + ("c5_c6", [0., 0., 0., 0., 0.5, 0.5]), + ("c2_c5", [0., 0.5, 0., 0., 0.5, 0.]), + ("c6", [0., 0., 0., 0., 0.,1.]) ] @@ -84,6 +89,7 @@ def write_excel(writer, dataframe, title): dataframe.to_excel(writer, "result", index=False) number_of_rows=len(dataframe) worksheet = writer.sheets["result"] + worksheet.set_header(title) workbook = writer.book C_letter = 67 I_letter= 73 @@ -125,4 +131,4 @@ for weight in tqdm(weight_criteria, desc="WSM computation"): result = pd.concat((result,dd),axis=0) -write_excel(writer,result,args.output_fn.split("/")[-1]) \ No newline at end of file +write_excel(writer,result,args.input.split("/")[-1]) \ No newline at end of file diff --git a/run_test_comparedto.py b/run_test_comparedto.py new file mode 100644 index 0000000000000000000000000000000000000000..2b15ad649d5db41544fc7a9a85056b1261728650 --- /dev/null +++ b/run_test_comparedto.py @@ -0,0 +1,166 @@ +# coding = utf-8 +import argparse +import os + +import pandas as pd +import numpy as np + +from tqdm import tqdm +from skcriteria.madm import closeness, simple +from skcriteria import Data, MIN, MAX + +def identify_pareto(scores): + # Count number of items + population_size = scores.shape[0] + # Create a NumPy index for scores on the pareto front (zero indexed) + population_ids = np.arange(population_size) + # Create a starting list of items on the Pareto front + # All items start off as being labelled as on the Pareto front + pareto_front = np.ones(population_size, dtype=bool) + # Loop through each item. This will then be compared with all other items + for i in range(population_size): + # Loop through all other items + for j in range(population_size): + # Check if our 'i' point is dominated by our 'j' point + if all(scores[j] >= scores[i]) and any(scores[j] > scores[i]): + # j dominates i.
Label 'i' point as not on Pareto front + pareto_front[i] = 0 + # Stop further comparisons with 'i' (no more comparisons needed) + break + # Return ids of scenarios on pareto front + return pareto_front,population_ids[pareto_front] + +def pareto_frontier_multi(myArray): + # Sort on first dimension + myArray = myArray[myArray[:, 0].argsort()] + # Add first row to pareto_frontier + pareto_frontier = myArray[0:1, :] + indices, i = [], 1 + # Test next row against the last row in pareto_frontier + for row in myArray[1:, :]: + if sum([row[x] >= pareto_frontier[-1][x] + for x in range(len(row))]) == len(row): + # If it is better on all features add the row to pareto_frontier + pareto_frontier = np.concatenate((pareto_frontier, [row])) + indices.append(i) + i += 1 + return indices, pareto_frontier + +def evolution(dataframe,mesure,type_,col="c1 c2 c3 c4 c5 c6 sum mean".split()): + dataframe2=dataframe.copy() + dataframe2.iloc[:,2:2+len(col)] = dataframe2[col].values - dataframe2[(dataframe2.mesure == mesure) & (dataframe2.type == type_)][col].values + return dataframe2 + +parser = argparse.ArgumentParser() +parser.add_argument("input") +parser.add_argument("output_fn") +parser.add_argument("-n","--topn",type=int,default=5) +parser.add_argument("-m","--mesure",type=str,default="BOW") +parser.add_argument("-t","--type",type=str,default="str_object") +args = parser.parse_args() + +writer = pd.ExcelWriter(args.output_fn, engine='xlsxwriter') + +if not os.path.exists(args.input): + raise FileNotFoundError("{0} does not exists !".format(args.input)) + +data = pd.read_csv(args.input, index_col=0) +data = data.fillna(0) + +if len(data) == 0: + write_excel(writer, result, args.input.split("/")[-1]) + exit() +data["mesure"] = data.mesure.apply(lambda x: "BOW" if x == "BagOfNodes" else x) +data["sum"] = data["c1 c2 c3 c4 c5 c6".split()].sum(axis=1) +data["mean"] = data["c1 c2 c3 c4 c5 c6".split()].mean(axis=1) +data = evolution(data, args.mesure, args.type) + +combination_pareto_criteria = [ + ("c1_c2_c3_c4_c5_c6", "c1 c2 c3 c4 c5 c6".split()), + # ("c1_c2_c3", "c1 c2 c3".split()), + # ("c3_c4_c5", "c3 c4 c5".split()), + ("c2_c5_c6", "c2 c5 c6".split()), + ("c5_c6", "c5 c6".split()), + ("c2_c5", "c2 c5".split()), + ("c6", "c6".split()), + ("sum", "sum".split()) +] + +weight_criteria = [ + ("c1_c2_c3_c4_c5_c6", [.16,.16, .16, .16, .16, .16]), + # ("c1_c2_c3", [0.33, 0.33, 0.33, 0., 0., 0.]), + # ("c3_c4_c5", [0., 0., 0.33, 0.33, 0.33, 0]), + ("c2_c5_c6", [0., .33, 0., 0.,.33,.33]), + ("c5_c6", [0., 0., 0., 0., .5,.5]), + ("c2_c5", [0., .5, 0., 0., .5, 0.]), + # ("c6", [0., 0., 0., 0., 0.,1.]) +] + + + +def get_top_combination_wsm(dataframe, weights,topn): + datas = (dataframe["c1 c2 c3 c4 c5 c6".split()].values) + #dd = Data(datas, criteria=[MAX, MAX, MAX, MAX, MAX, MAX], weights=weights[1]) + index_max = np.argsort(np.dot(datas, weights[1]))[::-1][:topn] + df = dataframe.iloc[index_max] + + df["name"]=weights[0] + df["type_score"] = "wsm" + return df + + +def get_top_combination_pareto(dataframe, columns,topn): + index, data_pa = identify_pareto(dataframe[columns[1]].values) + df = dataframe.iloc[index] + df = df.sort_values(by = "sum",ascending=False).head(topn) + df["name"]=columns[0] + df["type_score"] = "pareto" + return df + + +def write_excel(writer, dataframe, title): + dataframe.to_excel(writer, "result", index=False) + number_of_rows=len(dataframe) + worksheet = writer.sheets["result"] + worksheet.set_header(title) + workbook = writer.book + C_letter = 67 + J_letter= 74 + + format1 = 
workbook.add_format({'bg_color': '#FFC7CE', + 'font_color': '#9C0006'}) + + # Add a format. Green fill with dark green text. + format2 = workbook.add_format({'bg_color': '#C6EFCE', + 'font_color': '#006100'}) + for i in range(C_letter,J_letter): + begin=2 + + ch_=chr(i) + color_range = "{0}{1}:{0}{2}".format(ch_,begin,number_of_rows) + worksheet.conditional_format(color_range, {'type': 'bottom', + 'value': '1', + 'format': format1}) + + worksheet.conditional_format(color_range, {'type': 'top', + 'value': '1', + 'format': format2}) + writer.save() + +result = None +for comb_ in tqdm(combination_pareto_criteria, desc="Pareto computation"): + dd = get_top_combination_pareto(data, comb_,args.topn) + if not isinstance(result,pd.DataFrame): + result = dd + else: + result = pd.concat((result,dd),axis=0) + +for weight in tqdm(weight_criteria, desc="WSM computation"): + dd= get_top_combination_wsm(data, weight,args.topn) + if not isinstance(result,pd.DataFrame): + result = dd + else: + result = pd.concat((result,dd),axis=0) + + +write_excel(writer,result,args.input.split("/")[-1]) \ No newline at end of file diff --git a/run_test_disambiguisation.sh b/run_test_disambiguisation.sh deleted file mode 100755 index 66ec00915e5c5acaa647c61e5a74819d08ab0c34..0000000000000000000000000000000000000000 --- a/run_test_disambiguisation.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -python3 eval_disambiguation.py padiweb accuracy > accuracy_res_padi.txt -python3 eval_disambiguation.py agromada accuracy > accuracy_res_mada.txt - -python3 eval_disambiguation.py padiweb mean_distance_error > mean_distance_res_padi.txt -python3 eval_disambiguation.py agromada mean_distance_error > mean_distance_res_mada.txt - -python3 eval_disambiguation.py padiweb accuracy_k -k=1 >>accuracyk1_res_padi.txt -python3 eval_disambiguation.py padiweb accuracy_k -k=0.5 > accuracyk0-5_res_padi.txt - -python3 eval_disambiguation.py agromada accuracy_k -k=1 >> accuracyk1_res_mada.txt -python3 eval_disambiguation.py agromada accuracy_k -k=0.5 > accuracyk0-5_res_mada.txt \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 4a646c0a6e6ebe062c25ef2dae72caf34aea8b4d..0000000000000000000000000000000000000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -from setuptools import setup -import os,shutil -from pathlib import Path - -setup( - name='strpython', - version='0.1', - packages=['strpython', 'strpython.nlp', 'strpython.nlp.ner', 'strpython.nlp.exception', 'strpython.nlp.pos_tagger', - 'strpython.nlp.disambiguator', 'strpython.nlp.disambiguator.models', - 'strpython.nlp.disambiguator.delozier', 'strpython.eval', 'strpython.config', - 'strpython.models', 'strpython.models.transformation', 'strpython.helpers'], - url='', - license='MIT', - author='Jacques Fize', - author_email='jacques.fize@cirad.fr', - description="Module developed in the context of a thesis. 
This module comprise all implementation of algorithms, " - "model for text matching based on spatial features ", install_requires=['tqdm'] -) -# Put default config file if not exists -home = str(Path.home()) -if not os.path.exists(os.path.join(home,".strpython")): #or not os.path.exists(os.path.join(home,".strpython/config.json")): - os.makedirs(os.path.dirname(os.path.join(home,".strpython/config.json")), exist_ok=True) - shutil.copy2("strpython/config/config.json",os.path.join(home,".strpython/config.json")) \ No newline at end of file diff --git a/strpython/__init__.py b/strpython/__init__.py index 0c14602ded0f9ca881bca5ccb2023e96e5eeb8c4..e5e68f6f34cd0b0de35d3161e4eb51be0e498a78 100644 --- a/strpython/__init__.py +++ b/strpython/__init__.py @@ -1,3 +1,6 @@ # coding = utf-8 -from .models.str import STR \ No newline at end of file +from .models.str import STR +from .helpers.match_cache import MatchingCache +from .helpers.collision import getGEO +from .pipeline import Pipeline diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py index 1ff91506b9f18b13bb97196163214a382c22c7b3..f07b2cc79b914ea3ff73e9549137b3e74fb08ff9 100644 --- a/strpython/eval/automatic_annotation.py +++ b/strpython/eval/automatic_annotation.py @@ -58,8 +58,13 @@ class AnnotationAutomatic(object): if found: return list(value) - crit_ = [self.criterion1(str1, str2), self.criterion2(str1, str2), self.criterion3(str1, str2, id1, id2), - self.criterion4(str1, str2, id1, id2),self.criteria5(str1, str2, id1, id2)] + crit_ = [self.criterion1(str1, str2), + self.criterion2(str1, str2), + self.criterion3(str1, str2, id1, id2), + self.criterion4(str1, str2, id1, id2), + self.criteria5(str1, str2, id1, id2), + self.criterion6(str1, str2)] + self.matching_cache.add(id1, id2, *crit_) return crit_ @@ -212,3 +217,16 @@ class AnnotationAutomatic(object): return np.mean(cdist(get_centroid_array(c1),get_centroid_array(c2), "euclidean").flatten()) + def criterion6(self, str1, str2): + """ + Return the number of spatial entities (graph nodes) shared by both STRs.
+ Parameters + ---------- + str1 + str2 + + Returns + ------- + + """ + return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) \ No newline at end of file diff --git a/strpython/helpers/geo_relation_database.py b/strpython/helpers/geo_relation_database.py index a51cbffa10619f97bd49abf1955374349592eec0..aa43d345d0e9fb020ce9ba180f6da482f67916bf 100644 --- a/strpython/helpers/geo_relation_database.py +++ b/strpython/helpers/geo_relation_database.py @@ -27,7 +27,7 @@ class GeoRelationMatchingDatabase(): (idse1 text, idse2 text, value integer) """ matching_schema = """CREATE TABLE matching - (dataset text, g1 integer, g2 integer, c1 integer, c2 integer, c3 integer,c4 integer, c5 REAL ) + (dataset text, g1 integer, g2 integer, c1 integer, c2 integer, c3 integer,c4 integer, c5 REAL, c6 REAL ) """ cursor.execute(inclusion_schema) cursor.execute(adjacency_schema) @@ -74,7 +74,7 @@ class GeoRelationMatchingDatabase(): self._db_connection.commit() cursor.close() - def add_matching(self, dataset: str, G1: int, G2: int, c1: bool, c2: bool, c3: bool, c4: bool,c5: float): + def add_matching(self, dataset: str, G1: int, G2: int, c1: bool, c2: bool, c3: bool, c4: bool,c5: float, c6: float): """ Add a matching criteria result within the database Parameters @@ -93,11 +93,15 @@ class GeoRelationMatchingDatabase(): value of criterion 3 c4 : bool value of criterion 4 + c5 : float + value of criterion 5 + c6 : float + value of criterion 6 """ cursor = self._db_connection.cursor() - cursor.execute('INSERT INTO matching VALUES(?,?,?,?,?,?,?,?)', - (dataset, G1, G2, int(c1), int(c2), int(c3), int(c4),float(c5))) + cursor.execute('INSERT INTO matching VALUES(?,?,?,?,?,?,?,?,?)', + (dataset, G1, G2, int(c1), int(c2), int(c3), int(c4),float(c5),float(c6))) self._db_connection.commit() cursor.close() @@ -169,7 +173,7 @@ class GeoRelationMatchingDatabase(): result_ = cursor.fetchone() cursor.close() if result_: - return True, tuple(map(float, result_[-5:])) + return True, tuple(map(float, result_[-6:])) return False, False @@ -185,9 +189,9 @@ if __name__ == "__main__": assert g.get_inclusion("GD1", "GD2") == (True, True) assert g.get_inclusion("GD2", "GD1") == (False, False) - g.add_matching("test", 1, 2, True, True, False, True,0.) - g.add_matching("test2", 1, 2, True, False, False, True,0.) - assert g.get_matching(1, 2, "test") == (True, (True, True, False, True,0.)) - assert g.get_matching(1, 2, "test2") != (True, (True, True, False, True,0.)) + g.add_matching("test", 1, 2, True, True, False, True,0.,0.) + g.add_matching("test2", 1, 2, True, False, False, True,0.,0.) 
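# NOTE: illustrative sketch, not part of the patch. The `matching` schema, the
# INSERT placeholder count in add_matching(), and the `result_[-6:]` slice in
# get_matching() all have to change together when the c6 column is added. A
# minimal stand-alone sqlite3 round-trip of the widened row, with an in-memory
# database standing in for the project's GeoRelationMatchingDatabase:
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("""CREATE TABLE matching
               (dataset text, g1 integer, g2 integer,
                c1 integer, c2 integer, c3 integer, c4 integer,
                c5 REAL, c6 REAL)""")
con.execute("INSERT INTO matching VALUES(?,?,?,?,?,?,?,?,?)",
            ("test", 1, 2, 1, 1, 0, 1, 0.0, 3.0))
row = con.execute("SELECT * FROM matching WHERE dataset=? AND g1=? AND g2=?",
                  ("test", 1, 2)).fetchone()
criteria = tuple(map(float, row[-6:]))  # c1..c6, mirroring get_matching()
assert criteria == (1.0, 1.0, 0.0, 1.0, 0.0, 3.0)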
+ assert g.get_matching(1, 2, "test") == (True, (True, True, False, True,0.,0.)) + assert g.get_matching(1, 2, "test2") != (True, (True, True, False, True,0.,0.)) print("Passed the tests !") diff --git a/strpython/helpers/match_cache.py b/strpython/helpers/match_cache.py index 7e79c28ecf0bee7b37eb51c40cdabf096a491911..8f5b48390ac8f787af3c797dbc344b762aa0a4d6 100644 --- a/strpython/helpers/match_cache.py +++ b/strpython/helpers/match_cache.py @@ -12,6 +12,6 @@ class MatchingCache: def is_match(self, id_str1: int, id_str2: int): return self.db_rel_match.get_matching(id_str1, id_str2, self.dataset) - def add(self, id_str1: int, id_str2: int, c1: int, c2: int, c3: int, c4: int, c5: float): + def add(self, id_str1: int, id_str2: int, c1: int, c2: int, c3: int, c4: int, c5: float, c6: float): if not self.is_match(id_str1, id_str2)[0]: - self.db_rel_match.add_matching(self.dataset, id_str1, id_str2, c1, c2, c3, c4,c5) + self.db_rel_match.add_matching(self.dataset, id_str1, id_str2, c1, c2, c3, c4, c5, c6) diff --git a/strpython/helpers/sim_matrix.py b/strpython/helpers/sim_matrix.py index 1d57994345950e68021711f00d0b981ab05a2f23..3750896edc5f7c45be5e2a300e36dd4f878f2b3a 100644 --- a/strpython/helpers/sim_matrix.py +++ b/strpython/helpers/sim_matrix.py @@ -36,6 +36,6 @@ def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5): top_n = np.argsort(matrix[line])[::-1][1:n + 1] rank = 1 for val in top_n: - tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0,300000]) + tab_array.append([line, val, sim, type_, rank, 0, 0, 0, 0,300000,0]) rank += 1 - return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4 c5".split()) + return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4 c5 c6".split()) diff --git a/strpython/models/spatial_relation.py b/strpython/models/spatial_relation.py index 8f3f26d9edad345b266cb6b18ab022eeecc7365e..89e72d0f569b4c4f1790fc77671eea19143db009 100644 --- a/strpython/models/spatial_relation.py +++ b/strpython/models/spatial_relation.py @@ -99,7 +99,7 @@ class RelationExtractor(MetaCollector): * Inclusion(Sa, Sb) = Within(Sa,Sb) """ - if not self.all_geometry: + if len(self.all_geometry)<0: raise ValueError("No geometry extracted. Check the `spatial_entities` arg during the initialization.") gdf_intersect = gpd.GeoDataFrame(self.all_geometry, columns="id geometry".split()) diff --git a/strpython/models/str.py b/strpython/models/str.py index dfe994b1129ae48831f05dd4ef11b0a7968275e3..1666854ee34df9350f715f4f961e77a6598653a1 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -23,7 +23,7 @@ from ..helpers.geodict_helpers import gazetteer from ..helpers.relation_extraction import AdjacencyRelation, InclusionRelation from joblib import Parallel,delayed - +from strpython.helpers.collision import getGEO max_int = 1e6 def get_inclusion_chain(id_, prop): @@ -466,6 +466,12 @@ class STR(object): df["y"] = df.geometry.apply(lambda p: p.y) return df + def get_geo_dissolved(self): + es = [getGEO(en) for en in self.spatial_entities] + es = [[1, e.values[0][0]] if isinstance(e, gpd.GeoDataFrame) else [1, e.values[0]] for e in es if + isinstance(e, gpd.GeoDataFrame) or isinstance(e, gpd.GeoSeries)] + return gpd.GeoDataFrame(es, columns="dd geometry".split()).dissolve(by="dd") + def get_cluster(self, id_=None): """ Return the cluster detected using spatial entities position. 
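# NOTE: illustrative sketch, not part of the patch. get_geo_dissolved() unions
# the geometries of a STR's spatial entities by placing them in a GeoDataFrame
# with a constant key column ("dd") and calling dissolve(by="dd"). The same
# trick with made-up shapely points standing in for the project's getGEO()
# lookups (assumes geopandas and shapely are installed). Unrelated review
# remark: in the spatial_relation.py hunk above, `if len(self.all_geometry)<0:`
# can never be true; `== 0` was presumably intended.
import geopandas as gpd
from shapely.geometry import Point

geoms = [Point(3.0, 43.6), Point(2.3, 48.8), Point(5.4, 43.3)]
gdf = gpd.GeoDataFrame([[1, g] for g in geoms],
                       columns=["dd", "geometry"], geometry="geometry")
dissolved = gdf.dissolve(by="dd")            # one row holding the merged geometry
print(dissolved.geometry.iloc[0].geom_type)  # -> "MultiPoint"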
diff --git a/strpython/nlp/disambiguator/__init__.py b/strpython/nlp/disambiguator/__init__.py index bceef44db7ff1990b2a2da22a1b4a2b73f6dfe7d..d56a212f111e39c5e7f76505b99a77503b4a8094 100644 --- a/strpython/nlp/disambiguator/__init__.py +++ b/strpython/nlp/disambiguator/__init__.py @@ -3,4 +3,4 @@ from .most_common import MostCommonDisambiguator from .share_prop import ShareProp from .wikipedia_cooc import WikipediaDisambiguator -from .disambiguator import Disambiguator \ No newline at end of file +from .disambiguator import Disambiguator diff --git a/strpython/nlp/disambiguator/disambiguator.py b/strpython/nlp/disambiguator/disambiguator.py index 54defd1eb08c163a558d69dda93c81af28a9b396..d1d640dcf700d003f2d503973d66f01ec151da79 100644 --- a/strpython/nlp/disambiguator/disambiguator.py +++ b/strpython/nlp/disambiguator/disambiguator.py @@ -4,9 +4,11 @@ import copy import string import numpy as np +import pandas as pd from ..ner.ner import NER +from ...helpers.geodict_helpers import gazetteer class Disambiguator(object): @@ -69,5 +71,22 @@ class Disambiguator(object): """ raise NotImplementedError + def get_candidates(self,label,lang): + + candidates=[] + candidates.extend(gazetteer.get_by_label(label,lang)) + candidates.extend(gazetteer.get_by_alias(label, lang)) + candidates.extend(gazetteer.get_n_label_similar(label,lang, n=1)) + candidates.extend(gazetteer.get_n_alias_similar(label, lang, n=1)) + return pd.DataFrame([[ + r.id, + "Paris", + r.label[lang], + r.score if "score" in r else -1, + r.coord if "coord" in r else {}, + r] for r in candidates], + columns="id toponym label score coord raw".split()) + + def parse_ner_output(self,ner_output): return [toponym[0] if isinstance(toponym[0],str) else " ".join(toponym[0]) for toponym in ner_output[ner_output[:,1] == NER._unified_tag["place"]]] \ No newline at end of file diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py index be12646104817bbd04f5d4f1c93f9af722847ae4..ad3f46c555ca5b2ad70e8450e684c1139bff8961 100644 --- a/strpython/nlp/disambiguator/most_common.py +++ b/strpython/nlp/disambiguator/most_common.py @@ -52,8 +52,9 @@ class MostCommonDisambiguator(Disambiguator): if plural.lower() in stop_words[lang]: return 'O', -1 - data=get_most_common_id_v3(label, lang) + data=self.get_candidates(label, lang).sort_values(by="score",ascending=False) id_, score=None,0 - if data: - id_,score=data.id,data.score + if len(data)>0: + entry_selected= data.iloc[0] + id_,score=entry_selected.id,entry_selected.score return id_, score diff --git a/strpython/nlp/disambiguator/share_prop.py b/strpython/nlp/disambiguator/share_prop.py index 001bc78182f237108b0511b9d15536458bf4911b..7637fafdf87cf9de84dfb36ef7747c970e019731 100644 --- a/strpython/nlp/disambiguator/share_prop.py +++ b/strpython/nlp/disambiguator/share_prop.py @@ -150,13 +150,11 @@ class ShareProp(Disambiguator): fixed_entities = {} ambiguous_entities = {} for topo in toponyms: - request = gazetteer.get_by_label(topo, lang) - if len(request) == 0: - request = gazetteer.get_by_alias(topo, lang) + request = self.get_candidates(topo,lang) if len(request) > 1: ambiguous_entities[topo] = request elif len(request) == 1: - fixed_entities[topo] = request[0] + fixed_entities[topo] = request.iloc[0].raw d_amb_results = {} for amb_ent in ambiguous_entities: diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py index a8dacfde5df8a3eac5d25ecf4160b0fbc048205e..723423fd0c1ae1ea6e45da4f82181d7556b28875 100644 
--- a/strpython/nlp/disambiguator/wikipedia_cooc.py +++ b/strpython/nlp/disambiguator/wikipedia_cooc.py @@ -47,10 +47,8 @@ class WikipediaDisambiguator(Disambiguator): group_candidate = {} #candidates per toponym for toponym in toponyms_filtered: - candidates = get_top_candidate(toponym, lang, 5) + candidates = self.get_candidates(toponym, lang) candidates = [c.id for c in candidates if c] - if not candidates: - candidates = [c.id for c in gazetteer.get_n_label_similar(toponym,lang,5) if c] group_candidate[toponym] = candidates betw_cand[toponym]=candidates for n in candidates: diff --git a/strpython/nlp/stop_words.py b/strpython/nlp/stop_words.py new file mode 100644 index 0000000000000000000000000000000000000000..e5c6c3cf41f050a3a264593b6ecb526318bf65a2 --- /dev/null +++ b/strpython/nlp/stop_words.py @@ -0,0 +1,2 @@ +# coding = utf-8 + diff --git a/strpython/pipeline.py b/strpython/pipeline.py index 6a905af458c960dbea22c12770b1fedf24af0c55..66b5d21b51c98fac1d586910dd31c4b1a5b86792 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -1,6 +1,8 @@ # coding =utf-8 import re +import os, json, re + from nltk import word_tokenize from joblib import Parallel, delayed @@ -21,6 +23,7 @@ if in_notebook(): else: from tqdm import tqdm +from mytoolbox.env import yes_or_no class Pipeline(object): """ @@ -39,6 +42,9 @@ class Pipeline(object): self.ner = kwargs["ner"] if "ner" in kwargs else Spacy(lang=lang[:2]) self.disambiguator=kwargs["disambiguator"] if "disambiguator" in kwargs else MostCommonDisambiguator() + self.dict_adj = kwargs.get("dict_adj",None) + self.dict_inc = kwargs.get("dict_inc",None) + def parse(self,text,debug=False): """ @@ -90,25 +96,32 @@ class Pipeline(object): ------- """ - r = RelationExtractor(spatial_entities) - r.get_relation_geometry_based() - r.get_relation_meta_based() - df_adj, df_inc = r.fuse_meta_and_geom() - dict_adj = df_adj.to_dict() - dict_inc = df_inc.to_dict() - return dict_adj, dict_inc + if not self.dict_adj and not self.dict_inc: + if os.path.exists("adj_dict.json") and yes_or_no(question="Do you want to use previous adj file"): + self.dict_adj=json.load(open("adj_dict.json")) + if os.path.exists("inc_dict.json") and yes_or_no(question="Do you want to use previous inc file"): + self.dict_adj=json.load(open("inc_dict.json")) + + if not self.dict_adj and not self.dict_inc: + r = RelationExtractor(spatial_entities) + r.get_relation_geometry_based() + r.get_relation_meta_based() + df_adj, df_inc = r.fuse_meta_and_geom() + self.dict_adj = df_adj.to_dict() + self.dict_inc = df_inc.to_dict() + def pipe_build(self,texts, cpu_count=cpu_count(),**kwargs): - text_and_spatial_entities = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.parse)(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")) + text_and_spatial_entities = [self.parse(text) for text in tqdm(texts,desc="Extract spatial entities from the texts")] sp_es= [] for res in text_and_spatial_entities: sp_es.extend(list(res[1].values())) sp_es= [es for es in sp_es if es.startswith("GD")] print("Extract Spatial Relation for all identified spatial entities") - adj_rel_dict, inc_rel_dict = self.extract_all_relation(sp_es) + self.extract_all_relation(sp_es) - str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], adj_rel_dict, inc_rel_dict, **kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR")) + str_s = Parallel(n_jobs=cpu_count,backend="threading")(delayed(self.build)(ext[0], ext[1], self.dict_adj, self.dict_inc, 
**kwargs) for ext in tqdm(text_and_spatial_entities, desc="Build STR")) return str_s def pipe_transform(self,strs_, cpu_count=cpu_count(),**kwargs): @@ -116,7 +129,7 @@ class Pipeline(object): return str_s def build(self, text_input, spatial_entities_identified, prec_adj, prec_inc): - str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=prec_adj,precomputed_inc=prec_inc) + str_ = STR(word_tokenize(text_input), spatial_entities_identified, toponym_first=True,precomputed_adj=self.dict_adj,precomputed_inc=self.dict_inc) str_.build(adj=True, inc=True) return str_
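# NOTE: illustrative sketch, not part of the patch. extract_all_relation() now
# caches the adjacency/inclusion dictionaries on the Pipeline and can reload
# them from adj_dict.json / inc_dict.json instead of recomputing them (in the
# hunk above, the inc_dict.json branch assigns to self.dict_adj, where
# self.dict_inc was presumably intended, and build() now ignores its
# prec_adj/prec_inc arguments in favour of the cached dicts). A minimal
# compute-once-or-reload cache in the same spirit; load_or_compute() and
# compute_relations() are hypothetical names, not part of strpython:
import json
import os

def load_or_compute(path, compute):
    """Reload a cached dict from JSON if the file exists, else compute and save it."""
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    data = compute()
    with open(path, "w") as f:
        json.dump(data, f)
    return data

if __name__ == "__main__":
    compute_relations = lambda: {"GD1": {"GD2": 1}}  # placeholder for RelationExtractor output
    dict_adj = load_or_compute("adj_dict.json", compute_relations)
    dict_inc = load_or_compute("inc_dict.json", compute_relations)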