diff --git a/.gitignore b/.gitignore
index 0e7aa1d08cda06ed1bba177262ea792e02b5673f..4a97748395d8d5b4ee3b6fd348a997ed58c6a753 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,7 @@ __pycache__/
 !/tests/
 .DS_Store
 .Rproj.user
-.vscode/*
+.vscode/
 data/
 csv_results
 depreciated
diff --git a/README.md b/README.md
index 64fd0dea1431c013b12fde4a8887ba0a178a65de..207a27090dcdf0dd72b93f19531217b8cadcd7de 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-#STR
+# STR
 
 This repository contains all the work on STR or Spatial Textual Representation.
 The file hierarchy is divided in multiple modules such as :
diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py
index 52b49c85898725f0cb52056ad57a44dddc71c055..abcddfc4360113ace74adbd1c394de06ef2ca487 100644
--- a/auto_fill_annotation.py
+++ b/auto_fill_annotation.py
@@ -38,17 +38,17 @@ for file in glob.glob(os.path.join(str_graph_path, "*.gexf")):
 
 def foo(x):
     try:
         return annotater.all(strs[x.G1], strs[x.G2],x.G1, x.G2)
-    except Exception as e:
+    except KeyError as e:
         print(e)
         return [0, 0, 0, 0]
 
 df["res"] = df.progress_apply(lambda x: foo(x), axis=1)
 
-df.res=df.res.apply(lambda x :list(map(int,x)))
-df[["c1"]] = df.res.apply(lambda x: x[0])
-df[["c2"]] = df.res.apply(lambda x: x[1])
-df[["c3"]] = df.res.apply(lambda x: x[2])
-df[["c4"]] = df.res.apply(lambda x: x[3])
+df.res=df.res.apply(lambda x :list(map(int,x)) if x else [])
+df[["c1"]] = df.res.apply(lambda x: x[0] if len(x)>0 else 0)
+df[["c2"]] = df.res.apply(lambda x: x[1] if len(x)>0 else 0)
+df[["c3"]] = df.res.apply(lambda x: x[2] if len(x)>0 else 0)
+df[["c4"]] = df.res.apply(lambda x: x[3] if len(x)>0 else 0)
 
 del df["res"]
 save_cache()
diff --git a/generate_annotation_file.py b/generate_annotation_file.py
index ec00d2ae907ca6ec8aa4e47553b1a34e37540c93..d18abc9fb23dcda85f217a1160cb181397950562 100644
--- a/generate_annotation_file.py
+++ b/generate_annotation_file.py
@@ -33,10 +33,7 @@ selected = json.load(open(args.selectedFile))
 
 for fn in matrix_fns:
     measure = os.path.basename(fn).split("_")[0]
-    if os.path.basename(fn).split("_")[-2] in ["extension","gen"] or os.path.basename(fn).split("_")[-1].replace(".npy.bz2", "") in ["window"]:
-        type_ = "_".join(os.path.basename(fn).split("_")[-2:]).replace(".npy.bz2", "")
-    else:
-        type_ = "_".join(os.path.basename(fn).split("_")[-1:]).replace(".npy.bz2", "")
+    type_= "_".join(fn.split("_")[1:]).replace(".npy.bz2","")
     print("Proceeding...",measure, type_)
 
     df = matrix_to_pandas_dataframe(np.nan_to_num(read_bz2_matrix(fn)),
                                     selected,
diff --git a/generate_selected_document.py b/generate_selected_document.py
index 2a5762352eb5adfbb1ec4df4f022754083a4edd2..cfbc56b61313878e85a39ff9f9799b32379e85c0 100644
--- a/generate_selected_document.py
+++ b/generate_selected_document.py
@@ -8,7 +8,7 @@ parser.add_argument("graph_input_dir")
 args=parser.parse_args()
 
 graphs={}
-for file in glob.glob(args.graph_input_dir+"/normal/*.gexf"):
+for file in glob.glob(args.graph_input_dir+"/*.gexf"):
     id=int(re.findall("\d+",file)[-1])
     graphs[id]=nx.read_gexf(file)
 
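[Two review notes on the script hunks above. In generate_annotation_file.py, the simplified `type_` now splits the full path rather than `os.path.basename(fn)`, so an underscore anywhere in a parent directory leaks into `type_`; splitting the basename would be safer. In auto_fill_annotation.py, the `res` column is walked four times and assigned through `df[["c1"]]`, a one-column frame target; a single-pass alternative is sketched below. The column names and the zero fallback come from the hunk; the toy frame is illustrative only.]

    import pandas as pd

    df = pd.DataFrame({"res": [[1, 0, 1, 0], [], [1, 1, 0, 1]]})  # toy stand-in

    cols = ["c1", "c2", "c3", "c4"]
    # Expand the list column in one pass, padding empty results with zeros.
    df[cols] = pd.DataFrame(
        [r if r else [0, 0, 0, 0] for r in df.res],
        columns=cols, index=df.index,
    )
    df = df.drop(columns="res")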
"2018-09-28T05:03:07.327486Z", @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-09-28T05:03:09.093753Z", @@ -36,7 +36,7 @@ }, "outputs": [], "source": [ - "data=pd.read_csv(\"../../result_debug.csv\",index_col=0)\n", + "data=pd.read_csv(\"../../final_test.csv\",index_col=0)\n", "data=data[data.mesure != \"BP\"]\n", "data[\"mean\"]=np.mean(data[\"c1 c2 c3 c4\".split()].values,axis=1)\n", "data[\"sum\"]=np.sum(data[\"c1 c2 c3 c4\".split()].values,axis=1)\n", @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-09-26T12:55:10.491478Z", @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-09-26T12:55:10.899176Z", @@ -134,148 +134,121 @@ "data": { "text/html": [ "<style type=\"text/css\" >\n", - " #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col0 {\n", + " #T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col3 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col2 {\n", - " background-color: yellow;\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col0 {\n", " : ;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col3 {\n", - " background-color: yellow;\n", - " : ;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col0 {\n", + " background-color: #d64541;\n", + " color: white;\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col1 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col1 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col2 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col2 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col3 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col5 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col5 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col5 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col5 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col1 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col0 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col3 {\n", + " } #T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col1 {\n", + " background-color: yellow;\n", " : ;\n", - " background-color: #d64541;\n", - " color: white;\n", - " }</style><table id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >c1</th> <th class=\"col_heading level0 col1\" >c2</th> <th class=\"col_heading level0 col2\" >c3</th> <th class=\"col_heading level0 col3\" >c4</th> <th class=\"col_heading level0 col4\" >mean</th> <th class=\"col_heading level0 col5\" >sum</th> </tr> <tr> <th class=\"index_name level0\" >mesure</th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> </tr></thead><tbody>\n", + " } 
#T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col2 {\n", + " background-color: yellow;\n", + " : ;\n", + " }</style><table id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >c1</th> <th class=\"col_heading level0 col1\" >c2</th> <th class=\"col_heading level0 col2\" >c3</th> <th class=\"col_heading level0 col3\" >c4</th> <th class=\"col_heading level0 col4\" >mean</th> <th class=\"col_heading level0 col5\" >sum</th> </tr> <tr> <th class=\"index_name level0\" >mesure</th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> <th class=\"blank\" ></th> </tr></thead><tbody>\n", " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row0\" class=\"row_heading level0 row0\" >BOW</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col0\" class=\"data row0 col0\" >0.953636</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col1\" class=\"data row0 col1\" >0.26</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col2\" class=\"data row0 col2\" >0.926</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col3\" class=\"data row0 col3\" >0.473091</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col4\" class=\"data row0 col4\" >0.653182</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row0_col5\" class=\"data row0 col5\" >2.61273</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row1\" class=\"row_heading level0 row1\" >BagOfCliques</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col0\" class=\"data row1 col0\" >0.8932</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col1\" class=\"data row1 col1\" >0.3072</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col2\" class=\"data row1 col2\" >0.8188</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col3\" class=\"data row1 col3\" >0.3532</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col4\" class=\"data row1 col4\" >0.5931</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row1_col5\" class=\"data row1 col5\" >2.3724</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row2\" class=\"row_heading level0 row2\" >GraphEditDistance</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col0\" class=\"data row2 col0\" >0.918909</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col1\" class=\"data row2 col1\" >0.227091</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col2\" class=\"data row2 col2\" >0.891818</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col3\" class=\"data row2 col3\" >0.458182</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col4\" class=\"data row2 col4\" >0.624</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row2_col5\" class=\"data row2 col5\" >2.496</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row3\" class=\"row_heading level0 row3\" >GraphEditDistanceW</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col0\" class=\"data row3 col0\" >0.926333</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col1\" class=\"data row3 col1\" >0.2</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col2\" class=\"data row3 col2\" >0.901</td>\n", - " <td 
id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col3\" class=\"data row3 col3\" >0.469333</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col4\" class=\"data row3 col4\" >0.624167</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row3_col5\" class=\"data row3 col5\" >2.49667</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row4\" class=\"row_heading level0 row4\" >GreedyEditDistance</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col0\" class=\"data row4 col0\" >0.0315385</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col1\" class=\"data row4 col1\" >0.0472308</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col2\" class=\"data row4 col2\" >0.0723077</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col3\" class=\"data row4 col3\" >0.0124615</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col4\" class=\"data row4 col4\" >0.0408846</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row4_col5\" class=\"data row4 col5\" >0.163538</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row5\" class=\"row_heading level0 row5\" >HED</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col0\" class=\"data row5 col0\" >0.887636</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col1\" class=\"data row5 col1\" >0.237091</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col2\" class=\"data row5 col2\" >0.826</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col3\" class=\"data row5 col3\" >0.363636</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col4\" class=\"data row5 col4\" >0.578591</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row5_col5\" class=\"data row5 col5\" >2.31436</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row6\" class=\"row_heading level0 row6\" >Jaccard</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col0\" class=\"data row6 col0\" >0.938</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col1\" class=\"data row6 col1\" >0.4326</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col2\" class=\"data row6 col2\" >0.9052</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col3\" class=\"data row6 col3\" >0.2934</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col4\" class=\"data row6 col4\" >0.6423</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row6_col5\" class=\"data row6 col5\" >2.5692</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row7\" class=\"row_heading level0 row7\" >MCS</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col0\" class=\"data row7 col0\" >0.9432</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col1\" class=\"data row7 col1\" >0.4278</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col2\" class=\"data row7 col2\" >0.9068</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col3\" class=\"data row7 col3\" >0.3686</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col4\" class=\"data row7 col4\" >0.6616</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row7_col5\" class=\"data row7 col5\" >2.6464</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row8\" class=\"row_heading level0 
row8\" >PolyIntersect</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col0\" class=\"data row8 col0\" >0.584</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col1\" class=\"data row8 col1\" >0.4744</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col2\" class=\"data row8 col2\" >0.6972</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col3\" class=\"data row8 col3\" >0.1276</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col4\" class=\"data row8 col4\" >0.4708</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row8_col5\" class=\"data row8 col5\" >1.8832</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row9\" class=\"row_heading level0 row9\" >VertexEdgeOverlap</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col0\" class=\"data row9 col0\" >0.9458</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col1\" class=\"data row9 col1\" >0.3588</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col2\" class=\"data row9 col2\" >0.8928</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col3\" class=\"data row9 col3\" >0.3574</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col4\" class=\"data row9 col4\" >0.6387</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row9_col5\" class=\"data row9 col5\" >2.5548</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820level0_row10\" class=\"row_heading level0 row10\" >WeisfeleirLehmanKernel</th>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col0\" class=\"data row10 col0\" >0.594167</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col1\" class=\"data row10 col1\" >0.762667</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col2\" class=\"data row10 col2\" >0.831333</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col3\" class=\"data row10 col3\" >0.004</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col4\" class=\"data row10 col4\" >0.548042</td>\n", - " <td id=\"T_d79cca2c_2f61_11e9_a1d4_6a0002e84820row10_col5\" class=\"data row10 col5\" >2.19217</td>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row0\" class=\"row_heading level0 row0\" >BOW</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col0\" class=\"data row0 col0\" >0.158</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col1\" class=\"data row0 col1\" >0.088</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col2\" class=\"data row0 col2\" >0.15925</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col3\" class=\"data row0 col3\" >0.064</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col4\" class=\"data row0 col4\" >0.117313</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row0_col5\" class=\"data row0 col5\" >0.46925</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row1\" class=\"row_heading level0 row1\" >DeepWalk</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col0\" class=\"data row1 col0\" >0.1425</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col1\" class=\"data row1 col1\" >0.12375</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col2\" class=\"data row1 col2\" >0.15875</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col3\" class=\"data row1 col3\" 
>0.02</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col4\" class=\"data row1 col4\" >0.11125</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row1_col5\" class=\"data row1 col5\" >0.445</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row2\" class=\"row_heading level0 row2\" >Graph2Vec</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col0\" class=\"data row2 col0\" >0.00775</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col1\" class=\"data row2 col1\" >0.00775</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col2\" class=\"data row2 col2\" >0.00875</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col3\" class=\"data row2 col3\" >0.0005</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col4\" class=\"data row2 col4\" >0.0061875</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row2_col5\" class=\"data row2 col5\" >0.02475</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row3\" class=\"row_heading level0 row3\" >GraphEditDistance</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col0\" class=\"data row3 col0\" >0.141333</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col1\" class=\"data row3 col1\" >0.0846667</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col2\" class=\"data row3 col2\" >0.158667</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col3\" class=\"data row3 col3\" >0.06</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col4\" class=\"data row3 col4\" >0.111167</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row3_col5\" class=\"data row3 col5\" >0.444667</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row4\" class=\"row_heading level0 row4\" >Jaccard</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col0\" class=\"data row4 col0\" >0.156</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col1\" class=\"data row4 col1\" >0.091</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col2\" class=\"data row4 col2\" >0.156</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col3\" class=\"data row4 col3\" >0.0575</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col4\" class=\"data row4 col4\" >0.115125</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row4_col5\" class=\"data row4 col5\" >0.4605</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row5\" class=\"row_heading level0 row5\" >MCS</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col0\" class=\"data row5 col0\" >0.156889</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col1\" class=\"data row5 col1\" >0.0911111</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col2\" class=\"data row5 col2\" >0.158889</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col3\" class=\"data row5 col3\" >0.0626667</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col4\" class=\"data row5 col4\" >0.117389</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row5_col5\" class=\"data row5 col5\" >0.469556</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row6\" class=\"row_heading level0 row6\" >VertexEdgeOverlap</th>\n", + " <td 
id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col0\" class=\"data row6 col0\" >0.15925</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col1\" class=\"data row6 col1\" >0.089</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col2\" class=\"data row6 col2\" >0.15925</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col3\" class=\"data row6 col3\" >0.0615</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col4\" class=\"data row6 col4\" >0.11725</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row6_col5\" class=\"data row6 col5\" >0.469</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820level0_row7\" class=\"row_heading level0 row7\" >WeisfeleirLehmanKernel</th>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col0\" class=\"data row7 col0\" >0.142</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col1\" class=\"data row7 col1\" >0.1595</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col2\" class=\"data row7 col2\" >0.16</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col3\" class=\"data row7 col3\" >0.004</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col4\" class=\"data row7 col4\" >0.116375</td>\n", + " <td id=\"T_fa7389a2_43d2_11e9_991b_6a0002e84820row7_col5\" class=\"data row7 col5\" >0.4655</td>\n", " </tr>\n", " </tbody></table>" ], "text/plain": [ - "<pandas.io.formats.style.Styler at 0x12a4146d8>" + "<pandas.io.formats.style.Styler at 0x12c4d2518>" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -327,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-09-26T12:55:10.937714Z", @@ -339,93 +312,176 @@ "data": { "text/html": [ "<style type=\"text/css\" >\n", - " #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col2 {\n", + " #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col2 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col3 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col4 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col4 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col3 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col7 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col4 {\n", " background-color: yellow;\n", " : ;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col2 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col5 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col3 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col7 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col7 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col4 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col3 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col4 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col5 {\n", + " : ;\n", + " background-color: #d64541;\n", + " color: white;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col7 {\n", + " background-color: yellow;\n", + " : 
;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col5 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col2 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col4 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col5 {\n", + " background-color: yellow;\n", + " : ;\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col4 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col5 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col2 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col7 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col3 {\n", " : ;\n", " background-color: #d64541;\n", " color: white;\n", - " } #T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col5 {\n", + " } #T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col4 {\n", " background-color: yellow;\n", " : ;\n", - " }</style><table id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >mesure</th> <th class=\"col_heading level0 col1\" >type</th> <th class=\"col_heading level0 col2\" >c1</th> <th class=\"col_heading level0 col3\" >c2</th> <th class=\"col_heading level0 col4\" >c3</th> <th class=\"col_heading level0 col5\" >c4</th> <th class=\"col_heading level0 col6\" >mean</th> <th class=\"col_heading level0 col7\" >sum</th> </tr></thead><tbody>\n", + " }</style><table id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >mesure</th> <th class=\"col_heading level0 col1\" >type</th> <th class=\"col_heading level0 col2\" >c1</th> <th class=\"col_heading level0 col3\" >c2</th> <th class=\"col_heading level0 col4\" >c3</th> <th class=\"col_heading level0 col5\" >c4</th> <th class=\"col_heading level0 col6\" >mean</th> <th class=\"col_heading level0 col7\" >sum</th> </tr></thead><tbody>\n", " <tr>\n", - " <th id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820level0_row0\" class=\"row_heading level0 row0\" >6</th>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col0\" class=\"data row0 col0\" >MCS</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col1\" class=\"data row0 col1\" >extension_2</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col2\" class=\"data row0 col2\" >0.962</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col3\" class=\"data row0 col3\" >0.426</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col4\" class=\"data row0 col4\" >0.912</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col5\" class=\"data row0 col5\" >0.38</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col6\" class=\"data row0 col6\" >0.67</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row0_col7\" class=\"data row0 col7\" >2.68</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820level0_row1\" class=\"row_heading level0 row1\" >21</th>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col0\" class=\"data row1 col0\" >GreedyEditDistance</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col1\" class=\"data row1 col1\" >biotexlda_window</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col2\" class=\"data row1 col2\" 
>0.078</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col3\" class=\"data row1 col3\" >0.1</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col4\" class=\"data row1 col4\" >0.142</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col5\" class=\"data row1 col5\" >0.018</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col6\" class=\"data row1 col6\" >0.0845</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row1_col7\" class=\"data row1 col7\" >0.338</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820level0_row2\" class=\"row_heading level0 row2\" >22</th>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col0\" class=\"data row2 col0\" >HED</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col1\" class=\"data row2 col1\" >devdu_window</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col2\" class=\"data row2 col2\" >0.914</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col3\" class=\"data row2 col3\" >0.192</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col4\" class=\"data row2 col4\" >0.826</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col5\" class=\"data row2 col5\" >0.366</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col6\" class=\"data row2 col6\" >0.5745</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row2_col7\" class=\"data row2 col7\" >2.298</td>\n", - " </tr>\n", - " <tr>\n", - " <th id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820level0_row3\" class=\"row_heading level0 row3\" >49</th>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col0\" class=\"data row3 col0\" >HED</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col1\" class=\"data row3 col1\" >all</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col2\" class=\"data row3 col2\" >0.924</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col3\" class=\"data row3 col3\" >0.2</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col4\" class=\"data row3 col4\" >0.828</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col5\" class=\"data row3 col5\" >0.384</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col6\" class=\"data row3 col6\" >0.584</td>\n", - " <td id=\"T_d92e5b58_2f61_11e9_ba72_6a0002e84820row3_col7\" class=\"data row3 col7\" >2.336</td>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row0\" class=\"row_heading level0 row0\" >1</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col0\" class=\"data row0 col0\" >VertexEdgeOverlap</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col1\" class=\"data row0 col1\" >gen_country</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col2\" class=\"data row0 col2\" >0.158</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col3\" class=\"data row0 col3\" >0.092</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col4\" class=\"data row0 col4\" >0.158</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col5\" class=\"data row0 col5\" >0.06</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col6\" class=\"data row0 col6\" >0.117</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row0_col7\" class=\"data row0 col7\" >0.468</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row1\" class=\"row_heading 
level0 row1\" >2</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col0\" class=\"data row1 col0\" >MCS</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col1\" class=\"data row1 col1\" >object</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col2\" class=\"data row1 col2\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col3\" class=\"data row1 col3\" >0.088</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col4\" class=\"data row1 col4\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col5\" class=\"data row1 col5\" >0.056</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col6\" class=\"data row1 col6\" >0.116</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row1_col7\" class=\"data row1 col7\" >0.464</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row2\" class=\"row_heading level0 row2\" >3</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col0\" class=\"data row2 col0\" >WeisfeleirLehmanKernel</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col1\" class=\"data row2 col1\" >gen_country</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col2\" class=\"data row2 col2\" >0.154</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col3\" class=\"data row2 col3\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col4\" class=\"data row2 col4\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col5\" class=\"data row2 col5\" >0.006</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col6\" class=\"data row2 col6\" >0.12</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row2_col7\" class=\"data row2 col7\" >0.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row3\" class=\"row_heading level0 row3\" >7</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col0\" class=\"data row3 col0\" >DeepWalk</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col1\" class=\"data row3 col1\" >ext_1</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col2\" class=\"data row3 col2\" >0.136</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col3\" class=\"data row3 col3\" >0.11</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col4\" class=\"data row3 col4\" >0.158</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col5\" class=\"data row3 col5\" >0.018</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col6\" class=\"data row3 col6\" >0.1055</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row3_col7\" class=\"data row3 col7\" >0.422</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row4\" class=\"row_heading level0 row4\" >8</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col0\" class=\"data row4 col0\" >WeisfeleirLehmanKernel</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col1\" class=\"data row4 col1\" >gen_region</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col2\" class=\"data row4 col2\" >0.154</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col3\" class=\"data row4 col3\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col4\" class=\"data row4 col4\" >0.16</td>\n", + " <td 
id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col5\" class=\"data row4 col5\" >0.006</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col6\" class=\"data row4 col6\" >0.12</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row4_col7\" class=\"data row4 col7\" >0.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row5\" class=\"row_heading level0 row5\" >10</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col0\" class=\"data row5 col0\" >MCS</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col1\" class=\"data row5 col1\" >bvlac</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col2\" class=\"data row5 col2\" >0.154</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col3\" class=\"data row5 col3\" >0.092</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col4\" class=\"data row5 col4\" >0.158</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col5\" class=\"data row5 col5\" >0.066</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col6\" class=\"data row5 col6\" >0.1175</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row5_col7\" class=\"data row5 col7\" >0.47</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row6\" class=\"row_heading level0 row6\" >12</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col0\" class=\"data row6 col0\" >BOW</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col1\" class=\"data row6 col1\" >ext_1</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col2\" class=\"data row6 col2\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col3\" class=\"data row6 col3\" >0.084</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col4\" class=\"data row6 col4\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col5\" class=\"data row6 col5\" >0.066</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col6\" class=\"data row6 col6\" >0.1175</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row6_col7\" class=\"data row6 col7\" >0.47</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row7\" class=\"row_heading level0 row7\" >13</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col0\" class=\"data row7 col0\" >Jaccard</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col1\" class=\"data row7 col1\" >object</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col2\" class=\"data row7 col2\" >0.152</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col3\" class=\"data row7 col3\" >0.09</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col4\" class=\"data row7 col4\" >0.152</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col5\" class=\"data row7 col5\" >0.054</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col6\" class=\"data row7 col6\" >0.112</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row7_col7\" class=\"data row7 col7\" >0.448</td>\n", + " </tr>\n", + " <tr>\n", + " <th id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820level0_row8\" class=\"row_heading level0 row8\" >26</th>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col0\" class=\"data row8 col0\" >GraphEditDistance</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col1\" class=\"data row8 col1\" 
>inra</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col2\" class=\"data row8 col2\" >0.132</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col3\" class=\"data row8 col3\" >0.078</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col4\" class=\"data row8 col4\" >0.16</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col5\" class=\"data row8 col5\" >0.06</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col6\" class=\"data row8 col6\" >0.1075</td>\n", + " <td id=\"T_06bfa4d4_43d3_11e9_a30b_6a0002e84820row8_col7\" class=\"data row8 col7\" >0.43</td>\n", " </tr>\n", " </tbody></table>" ], "text/plain": [ - "<pandas.io.formats.style.Styler at 0x12a4142e8>" + "<pandas.io.formats.style.Styler at 0x102c12cf8>" ] }, - "execution_count": 12, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } diff --git a/strpython/models/node2vec.py b/strpython/models/node2vec.py deleted file mode 100644 index b36e7e5ffc62f489bd434c82828bcdf76f7c5c5b..0000000000000000000000000000000000000000 --- a/strpython/models/node2vec.py +++ /dev/null @@ -1,191 +0,0 @@ -import random - -import numpy as np -from gensim.models import Word2Vec - - -class Graph(): - def __init__(self, nx_G, is_directed, p, q): - self.G = nx_G - self.is_directed = is_directed - self.p = p - self.q = q - - def node2vec_walk(self, walk_length, start_node): - ''' - Simulate a random walk starting from start node. - ''' - G = self.G - alias_nodes = self.alias_nodes - alias_edges = self.alias_edges - - walk = [start_node] - - while len(walk) < walk_length: - cur = walk[-1] - cur_nbrs = sorted(G.neighbors(cur)) - if len(cur_nbrs) > 0: - if len(walk) == 1: - walk.append( - cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) - else: - prev = walk[-2] - next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], - alias_edges[(prev, cur)][1])] - walk.append(next) - else: - break - - return walk - - def simulate_walks(self, num_walks, walk_length): - ''' - Repeatedly simulate random walks from each node. - ''' - # sys.stdout.write("\r") - G = self.G - walks = [] - nodes = list(G.nodes) - for walk_iter in range(num_walks): - # sys.stdout.write( - # '\rWalk iteration: {0}/{1}'.format(walk_iter + 1, num_walks)) - random.shuffle(nodes) - for node in nodes: - walks.append(self.node2vec_walk( - walk_length=walk_length, start_node=node)) - - return walks - - def get_alias_edge(self, src, dst): - ''' - Get the alias edge setup lists for a given edge. - ''' - G = self.G - p = self.p - q = self.q - - unnormalized_probs = [] - for dst_nbr in sorted(G.neighbors(dst)): - if dst_nbr == src: - unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) - elif G.has_edge(dst_nbr, src): - unnormalized_probs.append(G[dst][dst_nbr]['weight']) - else: - unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) - norm_const = sum(unnormalized_probs) - normalized_probs = [ - float(u_prob) / norm_const for u_prob in unnormalized_probs] - - return alias_setup(normalized_probs) - - def preprocess_transition_probs(self): - ''' - Preprocessing of transition probabilities for guiding the random walks. 
- ''' - G = self.G - is_directed = self.is_directed - - alias_nodes = {} - for node in list(G.nodes): - unnormalized_probs = [G[node][nbr]['weight'] - for nbr in sorted(G.neighbors(node))] - norm_const = sum(unnormalized_probs) - normalized_probs = [ - float(u_prob) / norm_const for u_prob in unnormalized_probs] - alias_nodes[node] = alias_setup(normalized_probs) - - alias_edges = {} - triads = {} - - if is_directed: - for edge in list(G.edges()): - alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) - else: - for edge in list(G.edges()): - alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) - alias_edges[(edge[1], edge[0])] = self.get_alias_edge( - edge[1], edge[0]) - - self.alias_nodes = alias_nodes - self.alias_edges = alias_edges - - return - - -def alias_setup(probs): - ''' - Compute utility lists for non-uniform sampling from discrete distributions. - Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ - for details - ''' - K = len(probs) - q = np.zeros(K) - J = np.zeros(K, dtype=np.int) - - smaller = [] - larger = [] - for kk, prob in enumerate(probs): - q[kk] = K * prob - if q[kk] < 1.0: - smaller.append(kk) - else: - larger.append(kk) - - while len(smaller) > 0 and len(larger) > 0: - small = smaller.pop() - large = larger.pop() - - J[small] = large - q[large] = q[large] + q[small] - 1.0 - if q[large] < 1.0: - smaller.append(large) - else: - larger.append(large) - - return J, q - - -def alias_draw(J, q): - ''' - Draw sample from a non-uniform discrete distribution using alias sampling. - ''' - K = len(J) - - kk = int(np.floor(np.random.rand() * K)) - if np.random.rand() < q[kk]: - return kk - else: - return J[kk] - - -def learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter): - ''' - Learn embeddings by optimizing the Skipgram objective using SGD. - ''' - walks_ = [list(map(str, walk)) for walk in walks] - model = Word2Vec(walks_, size=dimensions, window=window_size, - min_count=0, sg=1, workers=nb_workers, iter=nb_iter) - return model - - -def compute_graph_model(nx_graph, **kwargs): - ''' - Pipeline for representational learning for all nodes in a graph. - @param nx_graph - @kwarg p: int - @kwarg q: int - ''' - p = kwargs.get("p", 1) - q = kwargs.get("q", 1) - dimensions = kwargs.get("dimensions", 128) - window_size = kwargs.get("window_size", 10) - nb_workers = kwargs.get("nb_workers", 8) - nb_iter = kwargs.get("nb_iter", 1) - num_walks = kwargs.get("num_walks", 10) - walk_length = kwargs.get("walk_length", 80) - directed = kwargs.get("directed", False) - - G = Graph(nx_graph, directed, p, q) - G.preprocess_transition_probs() - walks = G.simulate_walks(num_walks, walk_length) - return learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter) diff --git a/strpython/models/str.py b/strpython/models/str.py index 9e14fba7f2811d7444ab282c8f228cdb978f381d..6bfbed91c620be11f8d1e0ab7c3a256bdbca449b 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -5,7 +5,6 @@ import os import time import warnings -from tqdm import tqdm import folium import geopandas as gpd import networkx as nx @@ -22,7 +21,6 @@ import numpy as np # logging.basicConfig(filename=config.log_file,level=logging.INFO) - def get_inclusion_chain(id_, prop): """ For an entity return it geographical inclusion tree using a property. 
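[Before the STR class hunks, a note on the node2vec.py module deleted above: its alias_setup/alias_draw pair is Walker's alias method, which pays O(n) setup per node or edge so that every weighted neighbor draw during a walk costs O(1). Below is a trimmed, self-contained restatement of the deleted logic with a quick sanity check; same algorithm, minus the deprecated np.int, and the example distribution is arbitrary.]

    import numpy as np

    def alias_setup(probs):
        # Build the alias table: q holds scaled probabilities, J the aliases.
        K = len(probs)
        q = np.zeros(K)
        J = np.zeros(K, dtype=int)
        smaller, larger = [], []
        for kk, prob in enumerate(probs):
            q[kk] = K * prob
            (smaller if q[kk] < 1.0 else larger).append(kk)
        while smaller and larger:
            small, large = smaller.pop(), larger.pop()
            J[small] = large                  # the "small" cell is topped up by "large"
            q[large] += q[small] - 1.0
            (smaller if q[large] < 1.0 else larger).append(large)
        return J, q

    def alias_draw(J, q):
        kk = np.random.randint(len(J))        # pick a cell uniformly...
        return kk if np.random.rand() < q[kk] else J[kk]   # ...then keep it or take its alias

    J, q = alias_setup([0.5, 0.3, 0.2])
    draws = [alias_draw(J, q) for _ in range(100000)]
    print(np.bincount(draws) / len(draws))    # approximately [0.5, 0.3, 0.2]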
@@ -42,28 +40,10 @@ class STR(object): """ Str basic structure """ - __cache_inclusion = {} # Store inclusion relations found between spaital entities - __cache_adjacency = {} # Store adjacency relations found between spaital entities - __cache_entity_data = {} # Store data about entity requested - + __cache_inclusion = {} def __init__(self, tagged_text, spatial_entities): - """ - Constructir - - Parameters - ---------- - tagged_text : list - Text in forms of token associated with tag (2D array 2*t where t == |tokens| ) - spatial_entities : dict - spatial entities associated with a text. Follow this structure {"<id>: <label>"} - - """ - self.tagged_text = tagged_text self.spatial_entities = spatial_entities - for k in list(spatial_entities.keys()): - if not k[:2] == "GD": - del spatial_entities[k] self.adjacency_relationships = {} self.inclusion_relationships = {} @@ -71,21 +51,11 @@ class STR(object): @staticmethod def from_networkx_graph(g: nx.Graph, tagged_: list = []): """ - Build a STR based on networkx graph - - Parameters - ---------- - g : nx.Graph - input graph - tagged_ : list, optional - tagged text (the default is []). A 2D array 2*t where t == |tokens|. - - Returns - ------- - STR - resulting STR + Return a STR built from a Networkx imported graph + :param g: + :param tagged_: + :return: """ - sp_en = {} for nod in g: try: @@ -93,7 +63,7 @@ class STR(object): except KeyError: # If no label found, grab one from the geo-database data = gazetteer.get_by_id(nod) if data: - sp_en[nod] = data[0].name + sp_en[nod] = data[0].label str_ = STR(tagged_, sp_en) str_.set_graph(g) @@ -102,19 +72,10 @@ class STR(object): @staticmethod def from_dict(spat_ent: dict, tagged_: list = []): """ - Build a STR based on networkx graph - - Parameters - ---------- - spat_ent : dict - Dict of patial entities associated with a text. Follow this structure {"<id>: <label>"} - tagged_ : list, optional - tagged text (the default is []). A 2D array 2*t where t == |tokens|. - - Returns - ------- - STR - resulting STR + Return a STR built from a Networkx imported graph + :param g: + :param tagged_: + :return: """ sp_en = {} for id_, label in spat_ent.items(): @@ -126,59 +87,16 @@ class STR(object): @staticmethod def from_pandas(dataf: pd.DataFrame, tagged: list = []): - """ - Build a STR from a Pandas Dataframe with two column : id and label. - - Parameters - ---------- - dataf : pd.DataFrame - dataframe containing the spatial entities - tagged : list, optional - tagged text (the default is []). A 2D array 2*t where t == |tokens|. - - Returns - ------- - STR - resulting STR - """ - return STR.from_dict(pd.Series(dataf.label.values, index=dataf.id).to_dict(), tagged) - def set_graph(self, g): - """ - Apply changes to the current STR based on Networkx Graph. 
- - Parameters - ---------- - g : networkx.Graph - input graph - - """ - - self.graph = g - rel_ = self.graph.edges(data=True) - for edge in rel_: - id1, id2 = edge[0], edge[1] - if edge[2]["color"] == "green": - self.add_adjacency_rel(edge[0],edge[1]) - self.add_cache__adjacency(id1, id2,True) - elif edge[2]["color"] == "red": - self.add_inclusion_rel(edge[0], edge[1]) - self.add_cache_inclusion(id1,id2,True) - def add_spatial_entity(self, id, label=None, v=True): """ - Add a spatial entity to the current STR - - Parameters - ---------- - id : str - identifier of the spatial entity in Geodict - label : str, optional - if not available in Geodict (the default is None) - + Adding a spatial entity to the current STR + :param id: + :param label: + :return: """ - data_ = self.get_data(id) + data_ = gazetteer.get_by_id(id) if not data_: warnings.warn("{0} wasn't found in Geo-Database".format(id)) return False @@ -192,14 +110,9 @@ class STR(object): def add_spatial_entities(self, ids: list, labels: list = []): """ Add spatial entities to the current STR - - Parameters - ---------- - ids : list - list of identifiers of each spatial entity - labels : list, optional - list of labels of each spatial entity - + :param ids: + :param label: + :return: """ if not labels: warnings.warn("Labels list is empty. @en labels from Geo-Database will be used by default") @@ -212,121 +125,27 @@ class STR(object): self.add_spatial_entity(id, label, False) # print(self.graph.nodes(data=True)) - def add_adjacency_rel(self, se1, se2): - """ - Add a adjacency relationship to the current STR. - - Parameters - ---------- - se1 : str - Identifier of the first spatial entity - se2 : str - Identifier of the second spatial entity - - """ - - if not se1 in self.adjacency_relationships: self.adjacency_relationships[se1] = {} - if not se2 in self.adjacency_relationships: self.adjacency_relationships[se2] = {} - self.adjacency_relationships[se1][se2],self.adjacency_relationships[se2][se1] = True, True - self.add_cache__adjacency(se1,se2,True) + def add_adjacency_rel(self, se1, se2,v=True): + if not se1 in self.adjacency_relationships: + self.adjacency_relationships[se1] = {} + self.adjacency_relationships[se1][se2]=v - def add_inclusion_rel(self, se1, se2): - """ - Add a inclusion relationship to the current STR. - - Parameters - ---------- - se1 : str - Identifier of the first spatial entity - se2 : str - Identifier of the second spatial entity - - """ + def add_inclusion_rel(self, se1, se2,v=True): if not se1 in self.inclusion_relationships: self.inclusion_relationships[se1] = {} - self.inclusion_relationships[se1][se2]=True - self.add_cache_inclusion(se1,se2,True) + self.inclusion_relationships[se1][se2]=v - def add_cache_inclusion(self,id1, id2, v=True): + def transform_spatial_entities(self, transform_map): """ - Add a relation of inclusion in a cache variable - - Parameters - ---------- - id1 : str - id of the first spatial entity - id2 : str - id of the second spatial entity - v : bool, optional - if the relation exists between the two spatial entities. 
Default is True - + Apply transformation to a STR + :param transform_map: + :return: """ - - if not id1 in STR.__cache_inclusion: - STR.__cache_inclusion[id1] = {} - STR.__cache_inclusion[id1][id2] = v - - def add_cache__adjacency(self,se1,se2,v=True): - """ - Add a relation of adjacency in a cache variable - - Parameters - ---------- - id1 : str - id of the first spatial entity - id2 : str - id of the second spatial entity - v : bool, optional - if the relation exists between the two spatial entities. Default is True - - """ - if not se1 in STR.__cache_adjacency: - STR.__cache_adjacency[se1] = {} - if not se2 in STR.__cache_adjacency: - STR.__cache_adjacency[se2] = {} - STR.__cache_adjacency[se1][se2]=v - STR.__cache_adjacency[se2][se1]=v - - def get_data(self,id_se): - """ - Return an gazpy.Element object containing information about a spatial entity. - - Parameters - ---------- - id_se : str - Identifier of the spatial entity - - Returns - ------- - gazpy.Element - data - """ - - if id_se in STR.__cache_entity_data: - return STR.__cache_entity_data[id_se] - data=gazetteer.get_by_id(id_se) - if len(data) > 0: - STR.__cache_entity_data[id_se]= data[0] - - - def transform_spatial_entities(self, transform_map : dict): - """ - Replace or delete certain spatial entities based on a transformation map - - Parameters - ---------- - transform_map : dict - New mapping for the spatial entities in the current STR. Format required : {"<id of the old spatial entity>":"<id of the new spatial entity>"} - - """ - final_transform_map = {} # Erase old spatial entities new_label = {} - to_del=set([]) for old_se, new_se in transform_map.items(): - data = self.get_data(new_se) - to_del.add(old_se) + data = gazetteer.get_by_id(new_se) if data: data = data[0] final_transform_map[old_se] = new_se @@ -334,70 +153,59 @@ class STR(object): self.add_spatial_entity(new_se, data.label.en) del self.spatial_entities[old_se] - new_label[new_se] = data.label.en else: warnings.warn("{0} doesn't exists in the geo database!".format(new_se)) - self.graph = nx.relabel_nodes(self.graph, final_transform_map) - - for es in to_del: - if es in self.graph._node: - self.graph.remove_node(es) - for se_ in new_label: self.graph.nodes[se_]["label"] = new_label[se_] def update(self): """ - Update the relationship between spatial entities in the STR. Used when transforming the STR. + Method for updating links between spatial entities + :return: """ - nodes = copy.deepcopy(self.graph.nodes(data=True)) self.graph.clear() self.graph.add_nodes_from(nodes) + print("inclusion") self.get_inclusion_relationships() for se1 in self.inclusion_relationships: for se2 in self.inclusion_relationships[se1]: - if not se1 in self.graph.nodes or not se2 in self.graph.nodes: - continue if self.inclusion_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="red") - + print("adjacency") self.get_adjacency_relationships() for se1 in self.adjacency_relationships: for se2 in self.adjacency_relationships[se1]: - if not se1 in self.graph.nodes or not se2 in self.graph.nodes: - continue if self.adjacency_relationships[se1][se2]: self.graph.add_edge(se1, se2, key=0, color="green") + print("fin adj") - + + def add_cache_inclusion(self,id1, id2): + if not id1 in STR.__cache_inclusion: + STR.__cache_inclusion[id1] = set([]) + STR.__cache_inclusion[id1].add(id2) def is_included_in(self, se1_id, se2_id): + global __cache_inclusion """ - Return True if a spatial entity is included within another one. 
- - Parameters - ---------- - se1_id : str - id of the contained entity - se2_id : str - id of the entity container - - Returns - ------- - bool - if se1 included in se2 + Return true if the two spatial entities identified by @se1_id and @se2_id share an inclusion relationship + :param se1_id: + :param se2_id: + :return: """ - if se1_id in self.inclusion_relationships: if se2_id in self.inclusion_relationships[se1_id]: return self.inclusion_relationships[se1_id][se2_id] + if se1_id in STR.__cache_inclusion: + if se2_id in STR.__cache_inclusion[se1_id]: + return True inc_chain_P131 = get_inclusion_chain(se1_id, "P131") inc_chain_P706 = get_inclusion_chain(se1_id, "P706") @@ -405,120 +213,18 @@ class STR(object): inc_chain.extend(inc_chain_P706) inc_chain = set(inc_chain) if se2_id in inc_chain: - self.add_cache_inclusion(se1_id,se2_id,True) - return True - - return False - - def is_adjacent_cache(self,se1,se2): - """ - Return true if two spatial entities were found adjacent previously. - - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - - Returns - ------- - bool - if se1 adjacent to se2 - """ - - if se1 in STR.__cache_adjacency: - if se2 in STR.__cache_adjacency[se1]: - return STR.__cache_adjacency[se1][se2] - if se2 in STR.__cache_adjacency: - if se1 in STR.__cache_adjacency[se2]: - return STR.__cache_adjacency[se2][se1] - return False - - def is_included_cache(self,se1,se2): - """ - Return true if a spatial entity were found included previously in an other one. - - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - - Returns - ------- - bool - if se1 included to se2 - """ - if se1 in STR.__cache_inclusion: - if se2 in STR.__cache_inclusion[se1]: - return STR.__cache_inclusion[se1][se2] - return False - - def is_adjacent(self,se1,se2,datase1=None,datase2=None): - """ - Return true if se1 is adjacent to se2. 
- - Parameters - ---------- - se1 : str - id of the first spatial entity - se2 : str - id of the second spatial entity - datase1 : gazpy.Element, optional - if given cached data concerning the spatial entity with id = se1 (the default is None) - datase2 : gazpy.Element, optional - if given cached data concerning the spatial entity with id = se2 (the default is None) - - Returns - ------- - bool - true if adjacent - """ - - stop_class = set(["A-PCLI", "A-ADM1"]) - - def get_p47_adjacency_data(self, data): - p47se1 = [] - for el in data.other.P47: - d = gazetteer.get_by_other_id(el,"wikidata") - if not d:continue - p47se1.append(d[0].id) - return p47se1 - - if self.is_adjacent_cache(se1,se2): - return False - - if self.is_included_in(se1, se2) or self.is_included_in(se2, se1): - return False - - data_se1, data_se2 = self.get_data(se1), self.get_data(se2) - - if "P47" in data_se2 and se1 in self.get_p47_adjacency_data(data_se2): - return True - # print("P47") - elif "P47" in data_se1 and se2 in self.get_p47_adjacency_data(data_se1): - return True - # print("P47") - - if collisionTwoSEBoundaries(se1, se2): + self.add_cache_inclusion(se1_id,se2_id) return True - if "coord" in data_se1 and "coord" in data_se2: - if Point(data_se1.coord.lon, data_se1.coord.lat).distance( - Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( - set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: - return True return False def get_inclusion_relationships(self): """ - Find all the inclusion relationships between the spatial entities declared in the current STR. - + Return all the inclusion relationships between all the spatial entities in the STR. + :return: """ - - for se_ in tqdm(self.spatial_entities,desc="Extract Inclusion"): + inclusions_ = [] + for se_ in self.spatial_entities: inc_chain_P131 = get_inclusion_chain(se_, "P131") inc_chain_P706 = get_inclusion_chain(se_, "P706") @@ -529,19 +235,61 @@ class STR(object): for se2_ in self.spatial_entities: if se2_ in inc_chain: self.add_inclusion_rel(se_,se2_) + return inclusions_ + + def getP47AdjacencyData(self, data): + p47se1 = [] + for el in data.other.P47: + d = gazetteer.get_by_other_id(el,"wikidata") + if not d:continue + p47se1.append(d[0].id) + return p47se1 + + def is_adjacent(self,se1,se2,datase1=None,datase2=None): + f = False + stop_class = set(["A-PCLI", "A-ADM1"]) + if self.is_included_in(se1, se2): + return f + + elif self.is_included_in(se2, se1): + return f + + data_se1 = gazetteer.get_by_id(se1)[0] if not datase1 else datase1 # Évite de recharger à chaque fois -_- + data_se2 = gazetteer.get_by_id(se2)[0] if not datase2 else datase2 + + # print("testP47") + if "P47" in data_se2.other: + if se1 in self.getP47AdjacencyData(data_se2): + return True + # print("P47") + if not f: + if "P47" in data_se1.other: + if se2 in self.getP47AdjacencyData(data_se1): + return True + # print("P47") + if not f: + # print("test collision") + if collisionTwoSEBoundaries(se1, se2): + return True + if not f: + if "coord" in data_se1.other and "coord" in data_se2.other: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: + return True + return f - def get_adjacency_relationships(self): """ - Find all the adjacency relationships between the spatial entities declared in the current STR. + Return all the adjacency relationships between all the spatial entities in the STR. 
@@ -556,22 +304,11 @@ class STR(object):
     def build(self, inc=True, adj=True, verbose=False):
         """
         Build the STR
-
-        Parameters
-        ----------
-        inc : bool, optional
-            if inclusion relationship have to be included in the STR (the default is True)
-        adj : bool, optional
-            if adjacency relationship have to be included in the STR (the default is True)
-        verbose : bool, optional
-            Verbose mode activated (the default is False)
-
-        Returns
-        -------
-        networkx.Graph
-            graph representing the STR
+        :param inc: if True, add inclusion relationships to the STR (default True)
+        :param adj: if True, add adjacency relationships to the STR (default True)
+        :param verbose: verbose mode (default False)
+        :return: networkx.Graph representing the STR
         """
-
         nodes = []
         for k, v in self.spatial_entities.items():
             nodes.append((k, {"label": v}))
@@ -588,7 +325,7 @@ class STR(object):
                     graph.add_edge(se1,se2, key=0, color="green")
                     graph.add_edge(se2, se1, key=0, color="green")
-
+            logging.info("Extract Adjacency Rel\t{0}".format(time.time()-debut))
         if inc:
             debut=time.time()
             self.get_inclusion_relationships()
             for se1 in self.inclusion_relationships:
                 for se2 in self.inclusion_relationships[se1]:
                     if self.inclusion_relationships[se1][se2]:
                         graph.add_edge(se1,se2, key=0, color="red")
-
+            logging.info("Extract Inclusion Rel\t{0}".format(time.time() - debut))
         self.graph = graph
         return graph
 
     def save_graph_fig(self, output_fn, format="svg"):
         """
-        Save the graphiz reprensentation of the STR graph.
+        Save the graphviz representation of the STR graph.
 
         Parameters
         ----------
         output_fn : string
             Output filename
-        format : str
-            Output format (svg or pdf)
         """
 
         try:
@@ -622,33 +357,28 @@ class STR(object):
             print("Error while saving STR to {0}".format(format))
 
     def getUndirected(self):
-        """
-        Return the Undirected form of a STR graph.
-
-        Returns
-        -------
-        networkx.Graph
-            unidirected graph
-        """
-
         return nx.Graph(self.graph)
 
-    def get_geo_data_of_se(self):
-        """
-        Return Geographical information for each spatial entities in the STR
-
-        Returns
-        -------
-        geopandas.GeoDataFrame
-            dataframe containing geographical information of each entity in the STR
-        """
+    def set_graph(self, g):
+        self.graph = g
+        rel_ = self.graph.edges(data=True)
+        for edge in rel_:
+            id1, id2 = edge[0], edge[1]
+            if edge[2]["color"] == "green":
+                self.add_adjacency_rel(edge[0],edge[1])
+                add_cache_adjacency(id1, id2)
+            elif edge[2]["color"] == "red":
+                self.add_inclusion_rel(edge[0], edge[1])
+                self.add_cache_inclusion(id1,id2)
+
+    def get_geo_data_of_se(self):
         points,label,class_ = [], [], []
         for se in self.spatial_entities:
             data = gazetteer.get_by_id(se)[0]
             try:
                 points.append(Point(data.coord.lon, data.coord.lat))
-                label.append(data.name)
+                label.append(data.label)
                 # class_.append(most_common(data["class"]))
             except KeyError:
                 pass
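get_geo_data_of_se above collects one point and one label per resolvable entity into a GeoDataFrame. A sketch of the equivalent construction, with (label, lon, lat) tuples standing in for gazetteer records; geo_frame is a hypothetical helper, not part of the class:

# Sketch of the frame get_geo_data_of_se assembles (illustrative helper).
import geopandas as gpd
from shapely.geometry import Point

def geo_frame(entities):
    # One row per entity: a label column plus an active geometry column.
    labels = [lab for lab, _, _ in entities]
    geoms = [Point(lon, lat) for _, lon, lat in entities]
    return gpd.GeoDataFrame({"label": labels}, geometry=geoms)

# e.g. geo_frame([("Paris", 2.35, 48.85), ("Lyon", 4.84, 45.76)])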
@@ -659,21 +389,7 @@ class STR(object):
         return df
 
     def get_cluster(self,id_=None):
-        """
-        Return the cluster detected using spatial entities position.
-
-        Parameters
-        ----------
-        id_ : temp_file_id, optional
-            if cached version of geoinfo (the default is None)
-
-        Returns
-        -------
-        gpd.GeoDataFrame
-            cluster geometry
-        """
-
-        if os.path.exists("./temp_cluster/{0}.geojson".format(id_)):
+        if id_ and os.path.exists("./temp_cluster/{0}.geojson".format(id_)):
             return gpd.read_file("./temp_cluster/{0}.geojson".format(id_))
         data=self.get_geo_data_of_se()
@@ -689,6 +405,22 @@ class STR(object):
         samples,labels=dbscan(X)
         data["cluster"] = labels
+        """
+
+        # second clustering pass
+        c=data['cluster'].value_counts().idxmax()
+        X=data[data["cluster"] == c]
+        X=X[["x","y"]]
+        bandwidth = estimate_bandwidth(X.values)
+        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+        ms.fit(X.values)
+        X["cluster"]=ms.labels_+(data['cluster'].max()+1)
+        lab=ms.labels_
+        lab+=data['cluster'].max()+1
+
+        data["cluster"][data["cluster"] == c]=X["cluster"]
+        """
+
         geo = data.groupby("cluster").apply(to_Polygon)
         cluster_polybuff = gpd.GeoDataFrame(geometry=geo)
         if id_:
@@ -697,15 +429,6 @@ class STR(object):
 
     def to_folium(self):
-        """
-        Use the folium package to project the STR on a map
-
-        Returns
-        -------
-        folium.Map
-            folium map instance
-        """
-
         points = []
         for se in self.spatial_entities:
             data = gazetteer.get_by_id(se)[0]
@@ -755,20 +478,6 @@ class STR(object):
 
     def map_projection(self,plt=False):
-        """
-        Return a matplotlib figure of the STR
-
-        Parameters
-        ----------
-        plt : bool, optional
-            if the user wish to use the plt.show() (the default is False)
-
-        Returns
-        -------
-        plt.Figure
-            Matplotlib figure instance
-        """
-
         import matplotlib.pyplot as plt
         world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
         base = world.plot(color='white', edgecolor='black', figsize=(16, 9))
@@ -811,39 +520,11 @@ class STR(object):
         plt.show()
 
-# def to_Multipoints(x):
-#     """
-#     Return a polygon buffered representation for a set of point
-
-#     Parameters
-#     ----------
-#     x : pandas.Series
-#         coordinates columns
-
-#     Returns
-#     -------
-#     shapely.geometry.Polygon
-#         polygon
-#     """
-
-#     #print(x[["x","y"]].values)
-#     return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1)
+def to_Multipoints(x):
+    #print(x[["x","y"]].values)
+    return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1)
 
 def to_Polygon(x):
-    """
-    Return a polygon buffered representation for a set of points.
- - Parameters - ---------- - x : pandas.Series - coordinates columns - - Returns - ------- - shapely.geometry.Polygon - polygon - """ - points = [Point(z) for z in x[["x","y"]].values] if len(points) > 2: coords = [p.coords[:][0] for p in points] diff --git a/strpython/models/word2vec.py b/strpython/models/word2vec.py deleted file mode 100644 index c0904f7514f64eec46f32f1371c64b5bf878ead2..0000000000000000000000000000000000000000 --- a/strpython/models/word2vec.py +++ /dev/null @@ -1,178 +0,0 @@ -# coding = utf-8 - -# -*- coding: utf-8 -*- - -from glob import glob -from tqdm import tqdm -import numpy as np -from gensim.models.word2vec import Word2Vec -from polyglot.text import Text -from pycorenlp import StanfordCoreNLP - - -def getDependant(output_core_nlp, deps_list=["compound", "amod", "advmod"]): - - """ - Filter dependencies from Stanford NLP output - :param output_core_nlp: output of stanford nlp request - :param deps_list: list of tags that correspond to wanted dependencies - :return: - """ - dependencies = [] - i = 0 - for s in output_core_nlp["sentences"]: - for dep in s["basicDependencies"]: - if dep["dep"] in deps_list: - dependencies.append([dep["governor"], dep["dependent"], i]) - i += 1 - return dependencies - - -def filter_dependenciesV1(dependencies): - """ - Filter Dependencies to be sure to get compound words ! - :param dependencies: getDependant() output - :return: - """ - d_temp = {} - for d in dependencies: - if not d[-1] in d_temp: d_temp[d[-1]] = {} - if not d[0] in d_temp[d[-1]]: d_temp[d[-1]][d[0]] = set([]) - d_temp[d[-1]][d[0]].add(d[1]) - to_del = [] - for d_1 in d_temp: - for d_2 in d_temp[d_1]: - d_temp[d_1][d_2] = sorted(d_temp[d_1][d_2]) - if len(d_temp[d_1][d_2]) < 2: - continue - sorted_ = d_temp[d_1][d_2] - s_ = 0 - for i in range(len(sorted_)): - if not i + 1 == len(sorted_): - s_ += abs(sorted_[i] - sorted_[i + 1]) - if not s_ == len(sorted_) - 1: - to_del.append([d_1, d_2]) - - for d in to_del: del d_temp[d[0]][d[1]] - return d_temp - - -def filter_dependencies(dependencies): - """ - Same as filter_dependenciesV1(), except we fuse dependencies of compound word(two dependencies relation close to each other) - :param dependencies: - :return: - """ - new_d = [] - d_temp = {} - for d in dependencies: - if not d[-1] in d_temp: d_temp[d[-1]] = {} - if not d[0] in d_temp[d[-1]]: d_temp[d[-1]][d[0]] = set([]) - d_temp[d[-1]][d[0]].add(d[1]) - to_del = [] - for d_1 in d_temp: - for d_2 in d_temp[d_1]: - d_temp[d_1][d_2] = sorted(d_temp[d_1][d_2]) - if len(d_temp[d_1][d_2]) < 2: - continue - sorted_ = d_temp[d_1][d_2] - s_ = 0 - for i in range(len(sorted_)): - if not i + 1 == len(sorted_): - s_ += abs(sorted_[i] - sorted_[i + 1]) - if not s_ == len(sorted_) - 1: - to_del.append([d_1, d_2]) - - for d in to_del: del d_temp[d[0]][d[1]] - to_del = [] - for d_1 in d_temp: - for d_2 in d_temp[d_1]: - _depend = d_temp[d_1][d_2] - for k in d_temp[d_1]: - _depend2 = d_temp[d_1][k] - if k == _depend[0]: - for d in _depend2: _depend.insert(0, d) - d_temp[d_1][d_2] = _depend - to_del.append([d_1, k]) - elif k == _depend[-1]: - _depend.extend(_depend2) - d_temp[d_1][d_2] = _depend - to_del.append([d_1, k]) - for d in to_del: del d_temp[d[0]][d[1]] - - return d_temp - - -def transformed_sentences(output_core_nlp, dependencies): - """ - Transform tokenized version to adapt word2vec input model - :param output_core_nlp: - :param dependencies: - :return: - """ - sentences = [] - j = 0 - for s in output_core_nlp["sentences"]: - tokens = [t["originalText"].lower() for t in 
s["tokens"]] - # print(tokens) - if j in dependencies: - # print(dependencies[j]) - to_tuple = [] - to_del = [] - for k, v in dependencies[j].items(): - tuple = list(v) - tuple.append(k) - to_tuple.append(tuple) - if tuple[0] - 1 in to_del: - set_ = set(to_del) - set_.remove(tuple[0] - 1) - to_del = list(set_) - to_del.extend((np.array(tuple[1:]) - 1).tolist()) - for tup in to_tuple: - tokens[tup[0] - 1] = "_".join([tokens[t - 1] for t in tup]) - k = 0 - for d in to_del: - del tokens[d - k] - k += 1 - sentences.append(tokens) - j += 1 - return np.array(sentences) - -if __name__ == "__main__": - - - - files = glob("data/EPI_ELENA/raw_text/*.txt") - nlp = StanfordCoreNLP("http://localhost:9000") - texts = [open(f).read() for f in files] - sentences = [] - # Classic tokenization of sentences - for f in tqdm(texts): - text = f - if not text: continue - try: - text = Text(text) - - for s in text.sentences: - tokens = [] - for t in s.tokens: tokens.append(t.lower()) - sentences.append(tokens) - except: - pass - - # Add compound word version of sentences - for t in tqdm(texts): - if not t: continue - try: - nlp_o = nlp.annotate(t, properties={'annotators': 'tokenize,ssplit,depparse', 'outputFormat': 'json'}) - dependenc = filter_dependencies(getDependant(nlp_o)) - dependenc2 = filter_dependenciesV1(getDependant(nlp_o)) - sentences.extend(transformed_sentences(nlp_o, dependenc)) # extend compound word - sentences.extend(transformed_sentences(nlp_o, dependenc2)) # classic compound word - - except expression as identifier: - pass - - model = Word2Vec(sentences, min_count=10) - model.save("w2v_model_epi.w2v") - diff --git a/strpython/nlp/disambiguator/delozier/__init__.py b/strpython/nlp/disambiguator/delozier/__init__.py deleted file mode 100644 index ec03d569f9032a54c2eb5a05fba3e6d671e7adca..0000000000000000000000000000000000000000 --- a/strpython/nlp/disambiguator/delozier/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# coding = utf-8 diff --git a/strpython/nlp/disambiguator/delozier/grid.py b/strpython/nlp/disambiguator/delozier/grid.py deleted file mode 100644 index 1d17059860e6ec9f918afb3eabed0c8fab978117..0000000000000000000000000000000000000000 --- a/strpython/nlp/disambiguator/delozier/grid.py +++ /dev/null @@ -1,78 +0,0 @@ -# coding = utf-8 - -import geopandas as gpd -import numpy as np -from progressbar import ProgressBar -from shapely.geometry import Point, Polygon - - -class GeoGrid(): - """""" - - def __init__(self, upper_lat=70, bottom_lat=-70, step=2, square_size=0.25): - """Constructor for GoeGrid""" - self.h, self.w = (upper_lat * step) - (bottom_lat * step), 2 * 180 * step - self.grid_points = np.indices((self.h, self.w))[1] - # matrice avec indice de ligne - - self.upper_lat = upper_lat - self.step = step - - self.world_borders_data = gpd.read_file("./world_borders.shp") - - def inside(self, point): - poly_from_point = Polygon(self.create_square(point)) - for id_, row in self.world_borders_data.iterrows(): - if poly_from_point.within(row["geometry"]): - return True - return False - - def create_square(self, p1, size=0.25): - x, y = p1.x, p1.y - r = size / 2 - return [[x - r, y + r], [x + r, y + r], [x + r, y - r], [x - r, y - r]] - - def point_within(self, j, i): - p = Point(-180 + (j * (1 / self.step)), self.upper_lat - i * (1 / self.step)) - if self.inside(p): - return np.int(0) - else: - return np.int(-1) - - def createGeoGrid(self): - with ProgressBar(max_value=len(self.grid_points)) as bar: - for i in range(len(self.grid_points)): - self.grid_points[i] = 
np.apply_along_axis(self.point_within, 0, self.grid_points[i].reshape(1, -1), i) - bar.update(i) - bar.finish() - print("Geogrid Created") - - def loadGeoGrid(self): - self.grid_points = np.load("./resources/grid_GEO.npy") - # avoid frontier problem - mask = np.arange(-1, 2) - for i in range(len(self.grid_points)): - for j in range(len(self.grid_points[i])): - if self.grid_points[i][j] == -1: - if i - 1 > 0 and i + 1 < len(self.grid_points) and j - 1 > 0 and j + 1 < len(self.grid_points[i]): - sub = np.abs(self.grid_points[np.ix_(mask + i, mask + j)]) - if np.sum(sub) < 5: - self.grid_points[i][j] = 0 - - def get_points_coordinates(self, step=None): - if not step: - step = self.step - coordinates = [] - for i in range(len(self.grid_points)): - for j in range(len(self.grid_points[i])): - if self.grid_points[i][j] == 0: - p = [-180 + (j * (1 / step)), self.upper_lat - i * (1 / step)] - coordinates.append(p) - coordinates = np.array(coordinates) - return coordinates - - def draw_grid(self, step=None): - import matplotlib.pyplot as plt - coordinates = self.get_points_coordinates() - plt.scatter(coordinates[:, 0], coordinates[:, 1], s=0.5) - plt.show() diff --git a/strpython/nlp/disambiguator/delozier/utils.py b/strpython/nlp/disambiguator/delozier/utils.py deleted file mode 100644 index 15c2f4f79da30a1e512c673c959941f605b5ab6c..0000000000000000000000000000000000000000 --- a/strpython/nlp/disambiguator/delozier/utils.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding = utf-8 - -def weight_i_j(i, j): - dist = i.distance(j) - if dist <= 1: - return .75 * (1 - (dist)) - return False - - -def Gi_star(x): - pass diff --git a/strpython/tt4py/__init__.py b/strpython/tt4py/__init__.py deleted file mode 100644 index fec16bd317775c509c627a98d1a397f17029446c..0000000000000000000000000000000000000000 --- a/strpython/tt4py/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# coding = utf-8 - -""" -tt4t is a Tagged Texy Manipulation module for Python. - -tt4t was conceived in order to search through tokenized/pos-tagged version of texts. 
-""" \ No newline at end of file diff --git a/strpython/tt4py/helpers.py b/strpython/tt4py/helpers.py deleted file mode 100644 index 0e739d4a86327f6643ed94afb89e35f113332ede..0000000000000000000000000000000000000000 --- a/strpython/tt4py/helpers.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding = utf-8 - -import numpy as np - - -def dict_to_array(var): - res = [] - for i, j in var.items(): - res.append([i, j]) - return np.array(res) - - -def list_to_array(var): - return np.array(var) - - -def flattern(A): - rt = [] - for i in A: - if isinstance(i, list): - rt.extend(flattern(i)) - elif isinstance(i, np.ndarray): - rt.extend(flattern(i.tolist())) - else: - rt.append(i) - return rt - - -def flatten(var): - return flattern(var) diff --git a/strpython/tt4py/tt4py.py b/strpython/tt4py/tt4py.py deleted file mode 100644 index 889a2bf62beec6b445ca10bf51134aaab442a49c..0000000000000000000000000000000000000000 --- a/strpython/tt4py/tt4py.py +++ /dev/null @@ -1,239 +0,0 @@ -# coding = utf-8 - -from enum import Enum - -from nltk.stem import WordNetLemmatizer, SnowballStemmer -from termcolor import colored - -from .helpers import * - -_wn_lem = WordNetLemmatizer() -_snowball_stemmer = SnowballStemmer("english") - - -class TaggedType(Enum): - TOK = 2 - POS = 2 - POS_LEM = 3 - POS_TAG = 3 # length of each token data - MIX_POS_TAG = 2 - - -class SearchFlag(Enum): - NO_CASE = lambda x: x.lower() - SP_WS = lambda x: x.split(" ") # split using whitespaces - SP_P = lambda x: x.split(".") # split using point - SP_D = lambda x: x.split("-") # split using dash - WN_LEM = lambda x: _wn_lem.lemmatize(x) - SNW_STEM = lambda x: _snowball_stemmer.stem(x) - - -class TaggedInputError(Exception): - def __init__(self): - super(Exception, self).__init__( - colored("Wrong input : check your input data type or the size for each token data ", "red")) - - -class WrongThesaurusFormatError(Exception): - def __init__(self, var): - super(Exception, self).__init__( - colored( - "Wrong thesaurus format: use dict format instead of {0}. Ex. 
{'id_1':'label'}".format(str(type(var))), - "red")) - - -class Text(object): - def __init__(self, tagged_text, type=TaggedType.MIX_POS_TAG): - # check if 'tagged_text' is an iterable object - try: - some_object_iterator = iter(tagged_text) - except TypeError: - raise TaggedInputError - - # Convert input into numpy array - self.tagged_text = tagged_text - - if isinstance(tagged_text, dict): - self.tagged_text = dict_to_array(tagged_text) - elif isinstance(tagged_text, list): - self.tagged_text = list_to_array(tagged_text) - else: - try: - self.tagged_text = np.array(list(tagged_text)) - except: - print("Can't convert iterable given into a np array") - - if not type.value == self.tagged_text.shape[1]: - raise TaggedInputError - - self._original = self.tagged_text.copy() - self.flag_applied = [] - - def transform_tagged(self, flags=[SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]): - tagged = self._original.copy().tolist() - # Apply necessary for string search - for flag in flags: - tagged_t = [] - for token in tagged: - res_ = flag(token[0]) - if len(res_) > 1 and not isinstance(res_, str): - res_ = [] - tagged_t.extend([[j, token[1]] for j in res_]) - elif isinstance(res_, list): - tagged_t.extend([[res_[-1], token[1]]]) - else: - tagged_t.extend([[res_, token[1]]]) - tagged = tagged_t - - self.tagged_text = np.array(tagged) - self.flag_applied = flags - - def hasSameFlags(self, flags): - for f in flags: - if not f in self.flag_applied: - return False - return True - - def get_occurrences(self, string, flags=[SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D]): - if not self.hasSameFlags(flags): - self.transform_tagged(flags) - - positions_list = [] - t_1 = [string] - tagged = self.tagged_text[:, 0].copy() - # Apply necessary for string search - for flag in flags: - t_1 = flatten([flag(i) for i in t_1]) - gram_1 = (True if len(t_1) == 1 else False) - t = 0 - while t < (len(tagged)): - token = tagged[t] - if token == t_1[0] or token == t_1[0].rstrip("s") + "s": - if gram_1: - positions_list.append([t, t]) - t += 1 - else: - j, f = 0, True - while t + j < len(tagged) and j < len(t_1): - if not tagged[t + j] == t_1[j]: - f = False - break - j += 1 - if f: - positions_list.append([t, t + j]) - t += j - else: - t += 1 - else: - t += 1 - - return positions_list - - def get_neighbor_words(self, window_size, pos1, pos2=None): - if not pos2: - pos2 = pos1 - return self.tagged_text[pos1 - window_size:window_size + pos2] - - def extract_token_by_tag(self, *tags): - res, posis_ = [], [] - for tag in tags: - posis_.extend(np.argwhere(self.tagged_text[:, -1] == tag).flatten()) - posis_ = sorted(posis_) - - for pos in posis_: - pp = self.tagged_text[pos].tolist() - pp.append(pos) - res.append(pp) - return res - - def tag_item_in_thesaurus(self, thesaurus, flags=[SearchFlag.NO_CASE, SearchFlag.SP_WS, SearchFlag.SP_D], - prefix_="th_", stop_tag=["BEG-LOC", "LOC", "END-LOC"]): - if not self.hasSameFlags(flags): - self.transform_tagged(flags) - - if not isinstance(thesaurus, dict): - raise WrongThesaurusFormatError(thesaurus) - - t = " ".join(self.tagged_text[:, 0].tolist()) - for id_, element in thesaurus.items(): - if element.lower() in t: - positions_ = self.get_occurrences(element) - for d_ in positions_: - f = True - x, y = d_[0], d_[1] - c = 0 - if not self.isWorthIt(x, y, prefix_): - break - for st in stop_tag: - if x != y and st in self.tagged_text[x:y][:, 1]: - f = False - elif x == y and st in self.tagged_text[x][1]: - f = False - if f: - if abs(x - y) > 0: - self.tagged_text[x:y][:, 1] = 
prefix_ # prefix_ + id_ - else: - self.tagged_text[x][1] = prefix_ # prefix_ + id_ - new_tagged_ = [] - j = 0 - while j < len(self.tagged_text): - tag = self.tagged_text[j] - if prefix_ in tag[-1]: - curr = tag[-1] - t = 1 - while j + t < len(self.tagged_text): - if self.tagged_text[j + t][-1] != curr: - break - t += 1 - - new_tagged_.append([self.reconstruct_str(self.tagged_text[j:j + t][:, 0]), curr]) - j += t - else: - new_tagged_.append(tag.tolist()) - j += 1 - self.tagged_text = np.array(new_tagged_) - - def reconstruct_str(self, list_): - res = "" - no_sp_char = ["-"] - no_sp_bf = [","] - for ch in list_: - if not ch in no_sp_char and res: - if res[-1] in no_sp_char or ch in no_sp_bf: - res += ch - if not res: - res += ch - else: - res += " " + ch - - return res - - def isWorthIt(self, x, y, prefix): - taille = abs(x - y) - count = 0 - if x == y: - if prefix in self.tagged_text[x]: - count += 1 - taille = 1 - else: - # c=None - for item in self.tagged_text[x:y]: - if prefix in item[-1]: - count += 1 - - decx, decy = 0, 0 - fx, fy = True, True - while fx or fy: - fx, fy = False, False - if x - (decx + 1) > 0: - if prefix in self.tagged_text[x - (decx + 1)][-1]: - fx = True - decx += 1 - if y + decy + 1 < len(self.tagged_text): - if prefix in self.tagged_text[y + decy + 1][-1]: - fy = True - decy += 1 - - if taille < count + decx + decy: - return False - return True
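The removed tt4py module's get_occurrences and tag_item_in_thesaurus scanned tokenised, tagged text for multi-word thesaurus terms. A compact sketch of that core matching idea; the function below is illustrative, not the removed API:

# Sketch of multi-word term matching over a token list (illustrative, not tt4py).
def find_term(tokens, term):
    """Return [start, end) index spans where the term's tokens occur in order."""
    needle = term.lower().split()
    hay = [t.lower() for t in tokens]
    return [(i, i + len(needle))
            for i in range(len(hay) - len(needle) + 1)
            if hay[i:i + len(needle)] == needle]

# e.g. find_term(["The", "New", "York", "Times"], "new york") -> [(1, 3)]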