diff --git a/auto_fill_annotation.py b/auto_fill_annotation.py index f5c77f626242b1e6f39ad818ad64e4309065677e..52b49c85898725f0cb52056ad57a44dddc71c055 100644 --- a/auto_fill_annotation.py +++ b/auto_fill_annotation.py @@ -5,8 +5,12 @@ import argparse, os, re, json, glob import pandas as pd import networkx as nx -from strpython.eval.automatic_annotation import AnnotationAutomatic +from strpython.eval.automatic_annotation import AnnotationAutomatic,save_cache from strpython.models.str import STR +from tqdm import tqdm,TqdmSynchronisationWarning +import warnings +warnings.simplefilter("ignore", TqdmSynchronisationWarning) +tqdm.pandas() annotater = AnnotationAutomatic() @@ -33,12 +37,13 @@ for file in glob.glob(os.path.join(str_graph_path, "*.gexf")): #print(strs) def foo(x): try: - return annotater.all(strs[x.G1], strs[x.G2]) + return annotater.all(strs[x.G1], strs[x.G2],x.G1, x.G2) except Exception as e: + print(e) return [0, 0, 0, 0] -df["res"] = df.apply(lambda x: foo(x), axis=1) +df["res"] = df.progress_apply(lambda x: foo(x), axis=1) df.res=df.res.apply(lambda x :list(map(int,x))) df[["c1"]] = df.res.apply(lambda x: x[0]) df[["c2"]] = df.res.apply(lambda x: x[1]) @@ -46,5 +51,5 @@ df[["c3"]] = df.res.apply(lambda x: x[2]) df[["c4"]] = df.res.apply(lambda x: x[3]) del df["res"] - +save_cache() df.to_csv(args.output_file) diff --git a/eval_disambiguation.py b/eval_disambiguation.py index 69968a9518e0af564fae8a282afc8c918c7f0659..4d5ca5c31513b9f6e3343e30f7b183704141e00d 100644 --- a/eval_disambiguation.py +++ b/eval_disambiguation.py @@ -35,14 +35,11 @@ i=0 for fn in corpus_files: i+=1 id_=int(re.findall(r"\d+",fn)[-1]) - try: - df=pd.read_csv(fn) - lang=data_lang[id_] - acc_MC.append(efficiencyMostCommon(df,lang,args.measure,args.k)) - acc_GEO.append(efficiencyGeodict(df,lang,args.measure,args.k)) - acc_wiki.append(efficiencyWiki(df,lang,args.measure,args.k)) - except Exception as e: - print(e) + df=pd.read_csv(fn) + lang=data_lang[id_] + acc_MC.append(efficiencyMostCommon(df,lang,args.measure,args.k)) + acc_GEO.append(efficiencyGeodict(df,lang,args.measure,args.k)) + acc_wiki.append(efficiencyWiki(df,lang,args.measure,args.k)) acc_GEO=np.array(acc_GEO) acc_GEO[acc_GEO == inf] = 0 acc_GEO=acc_GEO.tolist() diff --git a/notebooks/Eval.ipynb b/notebooks/Eval.ipynb index d0a24d70435bb8ce1948dacef73ae12fa18395d9..ac3953f738351fe6c783bdad8afb15c425473d2f 100644 --- a/notebooks/Eval.ipynb +++ b/notebooks/Eval.ipynb @@ -1779,11 +1779,63 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2018-04-03T06:07:00.601741Z", - "start_time": "2018-04-03T06:07:00.533744Z" + "end_time": "2018-09-17T09:16:49.505096Z", + "start_time": "2018-09-17T09:16:49.400580Z" + } + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'gen_country' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-1-5aee3be2a109>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscore_per_mesure\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgen_country\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"id_g1\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"mesureL\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mscore_gen_country\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscore_per_mesure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"mesureL\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mscore_gen_country\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstyle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhighlight_max\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msubset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolorized_subset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhighlight_min\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msubset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolorized_subset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_html\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'gen_country' is not defined" + ] + } + ], + "source": [ + "score_per_mesure=gen_country.groupby([\"id_g1\",\"mesureL\"], as_index=False).mean()\n", + "score_gen_country=score_per_mesure.groupby([\"mesureL\"],as_index=False).mean()\n", + "score_gen_country.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset).to_html()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-22T11:59:10.969020Z", + "start_time": "2018-03-22T11:59:10.461280Z" + } + }, + "outputs": [], + "source": [ + "score_gen_country[[\"c1_val\",\"c2_val\",\"c3_val\",\"c4_val\"]].plot.bar(x=score_gen_country[\"mesure\"].unique())\n", + "plt.show()\n", + "score_gen_country[[\"c1*c2\",\"c1+c2\",\"c1*c2*c3\",\"c1+c3\"]].plot.bar(x=score_gen_country[\"mesure\"].unique())\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generalisation region" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-03T06:07:48.680877Z", + "start_time": "2018-04-03T06:07:48.613450Z" } }, "outputs": [ @@ -1794,7 +1846,39 @@ " <style type=\"text/css\" >\n", " \n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow2_col11 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col12 {\n", + " \n", + " background-color: yellow;\n", + " \n", + " : ;\n", + " \n", + " }\n", + " \n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col5 {\n", + " \n", + " background-color: yellow;\n", + " \n", + " : ;\n", + " \n", + " }\n", + " \n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col12 {\n", + " \n", + " background-color: yellow;\n", + " \n", + " : ;\n", + " \n", + " }\n", + " \n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col15 {\n", + " \n", + " background-color: yellow;\n", + " \n", + " : ;\n", + " \n", + " }\n", + " \n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col5 {\n", " \n", " : ;\n", " \n", @@ -1804,7 +1888,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col5 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col15 {\n", " \n", " : ;\n", " \n", @@ -1814,7 +1898,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col6 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col6 {\n", " \n", " : ;\n", " \n", @@ -1824,7 +1908,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col7 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col7 {\n", " \n", " : ;\n", " \n", @@ -1834,7 +1918,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col8 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col8 {\n", " \n", " : ;\n", " \n", @@ -1844,7 +1928,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col12 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col11 {\n", " \n", " : ;\n", " \n", @@ -1854,7 +1938,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col13 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col12 {\n", " \n", " : ;\n", " \n", @@ -1864,7 +1948,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col14 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col13 {\n", " \n", " : ;\n", " \n", @@ -1874,7 +1958,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col15 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col14 {\n", " \n", " : ;\n", " \n", @@ -1884,7 +1968,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow4_col16 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col16 {\n", " \n", " : ;\n", " \n", @@ -1894,15 +1978,17 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col5 {\n", - " \n", - " background-color: yellow;\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col15 {\n", " \n", " : ;\n", " \n", + " background-color: red;\n", + " \n", + " color: white;\n", + " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col6 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col6 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1910,7 +1996,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col7 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col7 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1918,7 +2004,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col8 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col8 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1926,7 +2012,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col11 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col11 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1934,7 +2020,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col12 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col13 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1942,7 +2028,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col13 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col14 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1950,7 +2036,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col14 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col16 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1958,7 +2044,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col15 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col5 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1966,7 +2052,7 @@ " \n", " }\n", " \n", - " #T_390431e8_3705_11e8_8728_4c327598678brow6_col16 {\n", + " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col15 {\n", " \n", " background-color: yellow;\n", " \n", @@ -1976,7 +2062,7 @@ " \n", " </style>\n", "\n", - " <table id=\"T_390431e8_3705_11e8_8728_4c327598678b\" None>\n", + " <table id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\" None>\n", " \n", "\n", " <thead>\n", @@ -2091,121 +2177,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row0\" rowspan=1>\n", " 0\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col0\"\n", " class=\"data row0 col0\" >\n", " BOC\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col1\"\n", " class=\"data row0 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col2\"\n", " class=\"data row0 col2\" >\n", - " 268.84\n", + " 255.513\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col3\"\n", " class=\"data row0 col3\" >\n", " 9\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col4\"\n", " class=\"data row0 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col5\"\n", " class=\"data row0 col5\" >\n", - " 0.713333\n", + " 0.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col6\"\n", " class=\"data row0 col6\" >\n", - " 0.866667\n", + " 0.773333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col7\"\n", " class=\"data row0 col7\" >\n", - " 0.48\n", + " 0.466667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col8\"\n", " class=\"data row0 col8\" >\n", - " 0.36\n", + " 0.406667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col9\"\n", " class=\"data row0 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col10\"\n", " class=\"data row0 col10\" >\n", - " 4.44\n", + " 4.14\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col11\"\n", " class=\"data row0 col11\" >\n", - " 0.666667\n", + " 0.733333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col12\"\n", " class=\"data row0 col12\" >\n", - " 0.913333\n", + " 0.96\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col13\"\n", " class=\"data row0 col13\" >\n", - " 0.406667\n", + " 0.44\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col14\"\n", " class=\"data row0 col14\" >\n", - " 0.42\n", + " 0.46\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col15\"\n", " class=\"data row0 col15\" >\n", - " 0.773333\n", + " 0.926667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col16\"\n", " class=\"data row0 col16\" >\n", - " 0.466667\n", + " 0.446667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col17\"\n", " class=\"data row0 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow0_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col18\"\n", " class=\"data row0 col18\" >\n", " 70.32\n", " \n", @@ -2215,121 +2301,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row1\" rowspan=1>\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col0\"\n", " class=\"data row1 col0\" >\n", " BOWSE\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col1\"\n", " class=\"data row1 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col2\"\n", " class=\"data row1 col2\" >\n", - " 264.607\n", + " 259.64\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col3\"\n", " class=\"data row1 col3\" >\n", " 8\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col4\"\n", " class=\"data row1 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col5\"\n", " class=\"data row1 col5\" >\n", - " 0.753333\n", + " 0.926667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col6\"\n", " class=\"data row1 col6\" >\n", - " 0.893333\n", + " 0.833333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col7\"\n", " class=\"data row1 col7\" >\n", - " 0.52\n", + " 0.513333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col8\"\n", " class=\"data row1 col8\" >\n", - " 0.386667\n", + " 0.433333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col9\"\n", " class=\"data row1 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col10\"\n", " class=\"data row1 col10\" >\n", - " 4.5\n", + " 4.38667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col11\"\n", " class=\"data row1 col11\" >\n", - " 0.7\n", + " 0.8\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col12\"\n", " class=\"data row1 col12\" >\n", - " 0.946667\n", + " 0.96\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col13\"\n", " class=\"data row1 col13\" >\n", - " 0.433333\n", + " 0.473333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col14\"\n", " class=\"data row1 col14\" >\n", - " 0.46\n", + " 0.506667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col15\"\n", " class=\"data row1 col15\" >\n", - " 0.813333\n", + " 0.933333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col16\"\n", " class=\"data row1 col16\" >\n", - " 0.493333\n", + " 0.48\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col17\"\n", " class=\"data row1 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow1_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col18\"\n", " class=\"data row1 col18\" >\n", " 70.32\n", " \n", @@ -2339,121 +2425,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row2\" rowspan=1>\n", " 2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col0\"\n", " class=\"data row2 col0\" >\n", " GREEDY\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col1\"\n", " class=\"data row2 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col2\"\n", " class=\"data row2 col2\" >\n", - " 235.86\n", + " 184.693\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col3\"\n", " class=\"data row2 col3\" >\n", " 6\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col4\"\n", " class=\"data row2 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col5\"\n", " class=\"data row2 col5\" >\n", - " 0.526667\n", + " 0.52\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col6\"\n", " class=\"data row2 col6\" >\n", - " 0.626667\n", + " 0.5\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col7\"\n", " class=\"data row2 col7\" >\n", - " 0.34\n", + " 0.293333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col8\"\n", " class=\"data row2 col8\" >\n", - " 0.246667\n", + " 0.24\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col9\"\n", " class=\"data row2 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col10\"\n", " class=\"data row2 col10\" >\n", - " 3.94667\n", + " 3.57333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col11\"\n", " class=\"data row2 col11\" >\n", - " 0.433333\n", + " 0.446667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col12\"\n", " class=\"data row2 col12\" >\n", - " 0.72\n", + " 0.573333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col13\"\n", " class=\"data row2 col13\" >\n", - " 0.293333\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col14\"\n", " class=\"data row2 col14\" >\n", - " 0.306667\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col15\"\n", " class=\"data row2 col15\" >\n", - " 0.56\n", + " 0.533333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col16\"\n", " class=\"data row2 col16\" >\n", - " 0.326667\n", + " 0.293333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col17\"\n", " class=\"data row2 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow2_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col18\"\n", " class=\"data row2 col18\" >\n", " 70.32\n", " \n", @@ -2463,121 +2549,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row3\" rowspan=1>\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col0\"\n", " class=\"data row3 col0\" >\n", " HED\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col1\"\n", " class=\"data row3 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col2\"\n", " class=\"data row3 col2\" >\n", - " 366.993\n", + " 597.18\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col3\"\n", " class=\"data row3 col3\" >\n", " 5\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col4\"\n", " class=\"data row3 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col5\"\n", " class=\"data row3 col5\" >\n", - " 0.713333\n", + " 0.54\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col6\"\n", " class=\"data row3 col6\" >\n", - " 0.753333\n", + " 0.406667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col7\"\n", " class=\"data row3 col7\" >\n", - " 0.446667\n", + " 0.2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col8\"\n", " class=\"data row3 col8\" >\n", - " 0.346667\n", + " 0.153333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col9\"\n", " class=\"data row3 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col10\"\n", " class=\"data row3 col10\" >\n", - " 3.36667\n", + " 1.68667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col11\"\n", " class=\"data row3 col11\" >\n", - " 0.64\n", + " 0.406667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col12\"\n", " class=\"data row3 col12\" >\n", - " 0.826667\n", + " 0.54\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col13\"\n", " class=\"data row3 col13\" >\n", - " 0.386667\n", + " 0.2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col14\"\n", " class=\"data row3 col14\" >\n", - " 0.4\n", + " 0.2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col15\"\n", " class=\"data row3 col15\" >\n", - " 0.76\n", + " 0.54\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col16\"\n", " class=\"data row3 col16\" >\n", - " 0.433333\n", + " 0.2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col17\"\n", " class=\"data row3 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow3_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col18\"\n", " class=\"data row3 col18\" >\n", " 70.32\n", " \n", @@ -2587,121 +2673,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row4\" rowspan=1>\n", " 4\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col0\"\n", " class=\"data row4 col0\" >\n", " JACCARD\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col1\"\n", " class=\"data row4 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col2\"\n", " class=\"data row4 col2\" >\n", - " 308.413\n", + " 288.467\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col3\"\n", " class=\"data row4 col3\" >\n", " 10\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col4\"\n", " class=\"data row4 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col5\"\n", " class=\"data row4 col5\" >\n", - " 0.44\n", + " 0.533333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col6\"\n", " class=\"data row4 col6\" >\n", - " 0.48\n", + " 0.526667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col7\"\n", " class=\"data row4 col7\" >\n", - " 0.22\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col8\"\n", " class=\"data row4 col8\" >\n", - " 0.193333\n", + " 0.273333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col9\"\n", " class=\"data row4 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col10\"\n", " class=\"data row4 col10\" >\n", - " 5.37333\n", + " 5.18\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col11\"\n", " class=\"data row4 col11\" >\n", - " 0.44\n", + " 0.513333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col12\"\n", " class=\"data row4 col12\" >\n", - " 0.48\n", + " 0.546667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col13\"\n", " class=\"data row4 col13\" >\n", - " 0.213333\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col14\"\n", " class=\"data row4 col14\" >\n", - " 0.213333\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col15\"\n", " class=\"data row4 col15\" >\n", - " 0.446667\n", + " 0.533333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col16\"\n", " class=\"data row4 col16\" >\n", - " 0.22\n", + " 0.28\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col17\"\n", " class=\"data row4 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow4_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col18\"\n", " class=\"data row4 col18\" >\n", " 70.32\n", " \n", @@ -2711,121 +2797,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row5\" rowspan=1>\n", " 5\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col0\"\n", " class=\"data row5 col0\" >\n", " MCS\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col1\"\n", " class=\"data row5 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col2\"\n", " class=\"data row5 col2\" >\n", - " 256.973\n", + " 266.933\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col3\"\n", " class=\"data row5 col3\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col4\"\n", " class=\"data row5 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col5\"\n", " class=\"data row5 col5\" >\n", - " 0.793333\n", + " 0.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col6\"\n", " class=\"data row5 col6\" >\n", - " 0.926667\n", + " 0.873333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col7\"\n", " class=\"data row5 col7\" >\n", - " 0.506667\n", + " 0.52\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col8\"\n", " class=\"data row5 col8\" >\n", - " 0.42\n", + " 0.446667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col9\"\n", " class=\"data row5 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col10\"\n", " class=\"data row5 col10\" >\n", - " 5.28\n", + " 4.78\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col11\"\n", " class=\"data row5 col11\" >\n", - " 0.746667\n", + " 0.846667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col12\"\n", " class=\"data row5 col12\" >\n", - " 0.973333\n", + " 0.946667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col13\"\n", " class=\"data row5 col13\" >\n", - " 0.453333\n", + " 0.486667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col14\"\n", " class=\"data row5 col14\" >\n", - " 0.473333\n", + " 0.513333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col15\"\n", " class=\"data row5 col15\" >\n", - " 0.826667\n", + " 0.926667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col16\"\n", " class=\"data row5 col16\" >\n", - " 0.486667\n", + " 0.493333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col17\"\n", " class=\"data row5 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow5_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col18\"\n", " class=\"data row5 col18\" >\n", " 70.32\n", " \n", @@ -2835,121 +2921,121 @@ " <tr>\n", " \n", " \n", - " <th id=\"T_390431e8_3705_11e8_8728_4c327598678b\"\n", + " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", " class=\"row_heading level0 row6\" rowspan=1>\n", " 6\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col0\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col0\"\n", " class=\"data row6 col0\" >\n", " VEO\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col1\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col1\"\n", " class=\"data row6 col1\" >\n", " 241.38\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col2\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col2\"\n", " class=\"data row6 col2\" >\n", - " 258.473\n", + " 259.227\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col3\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col3\"\n", " class=\"data row6 col3\" >\n", " 2\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col4\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col4\"\n", " class=\"data row6 col4\" >\n", " 3\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col5\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col5\"\n", " class=\"data row6 col5\" >\n", - " 0.82\n", + " 0.926667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col6\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col6\"\n", " class=\"data row6 col6\" >\n", - " 0.926667\n", + " 0.84\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col7\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col7\"\n", " class=\"data row6 col7\" >\n", - " 0.533333\n", + " 0.506667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col8\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col8\"\n", " class=\"data row6 col8\" >\n", " 0.44\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col9\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col9\"\n", " class=\"data row6 col9\" >\n", " 4.92\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col10\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col10\"\n", " class=\"data row6 col10\" >\n", - " 5.1\n", + " 4.66\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col11\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col11\"\n", " class=\"data row6 col11\" >\n", - " 0.773333\n", + " 0.82\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col12\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col12\"\n", " class=\"data row6 col12\" >\n", - " 0.973333\n", + " 0.946667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col13\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col13\"\n", " class=\"data row6 col13\" >\n", " 0.466667\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col14\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col14\"\n", " class=\"data row6 col14\" >\n", - " 0.486667\n", + " 0.5\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col15\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col15\"\n", " class=\"data row6 col15\" >\n", - " 0.866667\n", + " 0.933333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col16\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col16\"\n", " class=\"data row6 col16\" >\n", - " 0.513333\n", + " 0.473333\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col17\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col17\"\n", " class=\"data row6 col17\" >\n", " 1\n", " \n", " \n", " \n", - " <td id=\"T_390431e8_3705_11e8_8728_4c327598678brow6_col18\"\n", + " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col18\"\n", " class=\"data row6 col18\" >\n", " 70.32\n", " \n", @@ -2961,34 +3047,35 @@ " " ], "text/plain": [ - "<pandas.formats.style.Styler at 0x10e48a588>" + "<pandas.formats.style.Styler at 0x10e4b4208>" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "score_per_mesure=gen_country.groupby([\"id_g1\",\"mesureL\"], as_index=False).mean()\n", - "score_gen_country=score_per_mesure.groupby([\"mesureL\"],as_index=False).mean()\n", - "score_gen_country.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)" + "score_per_mesure=gen_region.groupby([\"id_g1\",\"mesureL\"], as_index=False).mean()\n", + "score_gen_region=score_per_mesure.groupby([\"mesureL\"],as_index=False).mean()\n", + "score_gen_region.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 39, "metadata": { "ExecuteTime": { - "end_time": "2018-03-22T11:59:10.969020Z", - "start_time": "2018-03-22T11:59:10.461280Z" - } + "end_time": "2018-03-22T11:59:19.770228Z", + "start_time": "2018-03-22T11:59:19.440447Z" + }, + "format": "row" }, "outputs": [], "source": [ - "score_gen_country[[\"c1_val\",\"c2_val\",\"c3_val\",\"c4_val\"]].plot.bar(x=score_gen_country[\"mesure\"].unique())\n", + "score_gen_region[[\"c1_val\",\"c2_val\",\"c3_val\",\"c4_val\"]].plot.bar(x=score_gen_region[\"mesure\"].unique())\n", "plt.show()\n", - "score_gen_country[[\"c1*c2\",\"c1+c2\",\"c1*c2*c3\",\"c1+c3\"]].plot.bar(x=score_gen_country[\"mesure\"].unique())\n", + "score_gen_region[[\"c1*c2\",\"c1+c2\",\"c1*c2*c3\",\"c1+c3\"]].plot.bar(x=score_gen_region[\"mesure\"].unique())\n", "plt.show()" ] }, @@ -2996,16 +3083,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Generalisation region" + "## Extension 1" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2018-04-03T06:07:48.680877Z", - "start_time": "2018-04-03T06:07:48.613450Z" + "end_time": "2018-04-03T06:07:37.954797Z", + "start_time": "2018-04-03T06:07:37.893571Z" } }, "outputs": [ @@ -3016,15 +3103,7 @@ " <style type=\"text/css\" >\n", " \n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col12 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col5 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col5 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3032,7 +3111,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col12 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col12 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3040,7 +3119,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col15 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col15 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3048,17 +3127,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col5 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col15 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col5 {\n", " \n", " : ;\n", " \n", @@ -3068,7 +3137,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col6 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col6 {\n", " \n", " : ;\n", " \n", @@ -3078,7 +3147,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col7 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col7 {\n", " \n", " : ;\n", " \n", @@ -3088,7 +3157,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col8 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col8 {\n", " \n", " : ;\n", " \n", @@ -3098,7 +3167,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col11 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col11 {\n", " \n", " : ;\n", " \n", @@ -3108,7 +3177,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col12 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col12 {\n", " \n", " : ;\n", " \n", @@ -3118,7 +3187,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col13 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col13 {\n", " \n", " : ;\n", " \n", @@ -3128,7 +3197,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col14 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col14 {\n", " \n", " : ;\n", " \n", @@ -3138,7 +3207,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col16 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col15 {\n", " \n", " : ;\n", " \n", @@ -3148,7 +3217,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col15 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col16 {\n", " \n", " : ;\n", " \n", @@ -3158,7 +3227,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col6 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col5 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3166,7 +3235,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col7 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col6 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3174,7 +3243,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col8 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col7 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3182,7 +3251,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col11 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col11 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3190,7 +3259,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col13 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col13 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3198,7 +3267,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col14 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col14 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3206,7 +3275,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col16 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col15 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3214,7 +3283,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col5 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col16 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3222,7 +3291,7 @@ " \n", " }\n", " \n", - " #T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col15 {\n", + " #T_4f4867d0_3705_11e8_89af_4c327598678brow4_col8 {\n", " \n", " background-color: yellow;\n", " \n", @@ -3232,1246 +3301,7 @@ " \n", " </style>\n", "\n", - " <table id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\" None>\n", - " \n", - "\n", - " <thead>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th class=\"blank level0\" >\n", - " \n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col0\" colspan=1>\n", - " mesureL\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col1\" colspan=1>\n", - " id_g1\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col2\" colspan=1>\n", - " id_g2\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col3\" colspan=1>\n", - " mesure\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col4\" colspan=1>\n", - " id_user\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col5\" colspan=1>\n", - " c1_val\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col6\" colspan=1>\n", - " c2_val\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col7\" colspan=1>\n", - " c3_val\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col8\" colspan=1>\n", - " c4_val\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col9\" colspan=1>\n", - " g1_size\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col10\" colspan=1>\n", - " g2_size\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col11\" colspan=1>\n", - " c1*c2\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col12\" colspan=1>\n", - " c1+c2\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col13\" colspan=1>\n", - " c1*c2*c3\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col14\" colspan=1>\n", - " c1*c3\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col15\" colspan=1>\n", - " c1+c3\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col16\" colspan=1>\n", - " c2*c3\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col17\" colspan=1>\n", - " for_c\n", - " \n", - " \n", - " \n", - " <th class=\"col_heading level0 col18\" colspan=1>\n", - " es_in_common\n", - " \n", - " \n", - " </tr>\n", - " \n", - " </thead>\n", - " <tbody>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row0\" rowspan=1>\n", - " 0\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col0\"\n", - " class=\"data row0 col0\" >\n", - " BOC\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col1\"\n", - " class=\"data row0 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col2\"\n", - " class=\"data row0 col2\" >\n", - " 255.513\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col3\"\n", - " class=\"data row0 col3\" >\n", - " 9\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col4\"\n", - " class=\"data row0 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col5\"\n", - " class=\"data row0 col5\" >\n", - " 0.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col6\"\n", - " class=\"data row0 col6\" >\n", - " 0.773333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col7\"\n", - " class=\"data row0 col7\" >\n", - " 0.466667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col8\"\n", - " class=\"data row0 col8\" >\n", - " 0.406667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col9\"\n", - " class=\"data row0 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col10\"\n", - " class=\"data row0 col10\" >\n", - " 4.14\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col11\"\n", - " class=\"data row0 col11\" >\n", - " 0.733333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col12\"\n", - " class=\"data row0 col12\" >\n", - " 0.96\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col13\"\n", - " class=\"data row0 col13\" >\n", - " 0.44\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col14\"\n", - " class=\"data row0 col14\" >\n", - " 0.46\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col15\"\n", - " class=\"data row0 col15\" >\n", - " 0.926667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col16\"\n", - " class=\"data row0 col16\" >\n", - " 0.446667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col17\"\n", - " class=\"data row0 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow0_col18\"\n", - " class=\"data row0 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row1\" rowspan=1>\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col0\"\n", - " class=\"data row1 col0\" >\n", - " BOWSE\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col1\"\n", - " class=\"data row1 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col2\"\n", - " class=\"data row1 col2\" >\n", - " 259.64\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col3\"\n", - " class=\"data row1 col3\" >\n", - " 8\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col4\"\n", - " class=\"data row1 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col5\"\n", - " class=\"data row1 col5\" >\n", - " 0.926667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col6\"\n", - " class=\"data row1 col6\" >\n", - " 0.833333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col7\"\n", - " class=\"data row1 col7\" >\n", - " 0.513333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col8\"\n", - " class=\"data row1 col8\" >\n", - " 0.433333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col9\"\n", - " class=\"data row1 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col10\"\n", - " class=\"data row1 col10\" >\n", - " 4.38667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col11\"\n", - " class=\"data row1 col11\" >\n", - " 0.8\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col12\"\n", - " class=\"data row1 col12\" >\n", - " 0.96\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col13\"\n", - " class=\"data row1 col13\" >\n", - " 0.473333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col14\"\n", - " class=\"data row1 col14\" >\n", - " 0.506667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col15\"\n", - " class=\"data row1 col15\" >\n", - " 0.933333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col16\"\n", - " class=\"data row1 col16\" >\n", - " 0.48\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col17\"\n", - " class=\"data row1 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow1_col18\"\n", - " class=\"data row1 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row2\" rowspan=1>\n", - " 2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col0\"\n", - " class=\"data row2 col0\" >\n", - " GREEDY\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col1\"\n", - " class=\"data row2 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col2\"\n", - " class=\"data row2 col2\" >\n", - " 184.693\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col3\"\n", - " class=\"data row2 col3\" >\n", - " 6\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col4\"\n", - " class=\"data row2 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col5\"\n", - " class=\"data row2 col5\" >\n", - " 0.52\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col6\"\n", - " class=\"data row2 col6\" >\n", - " 0.5\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col7\"\n", - " class=\"data row2 col7\" >\n", - " 0.293333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col8\"\n", - " class=\"data row2 col8\" >\n", - " 0.24\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col9\"\n", - " class=\"data row2 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col10\"\n", - " class=\"data row2 col10\" >\n", - " 3.57333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col11\"\n", - " class=\"data row2 col11\" >\n", - " 0.446667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col12\"\n", - " class=\"data row2 col12\" >\n", - " 0.573333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col13\"\n", - " class=\"data row2 col13\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col14\"\n", - " class=\"data row2 col14\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col15\"\n", - " class=\"data row2 col15\" >\n", - " 0.533333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col16\"\n", - " class=\"data row2 col16\" >\n", - " 0.293333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col17\"\n", - " class=\"data row2 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow2_col18\"\n", - " class=\"data row2 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row3\" rowspan=1>\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col0\"\n", - " class=\"data row3 col0\" >\n", - " HED\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col1\"\n", - " class=\"data row3 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col2\"\n", - " class=\"data row3 col2\" >\n", - " 597.18\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col3\"\n", - " class=\"data row3 col3\" >\n", - " 5\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col4\"\n", - " class=\"data row3 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col5\"\n", - " class=\"data row3 col5\" >\n", - " 0.54\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col6\"\n", - " class=\"data row3 col6\" >\n", - " 0.406667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col7\"\n", - " class=\"data row3 col7\" >\n", - " 0.2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col8\"\n", - " class=\"data row3 col8\" >\n", - " 0.153333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col9\"\n", - " class=\"data row3 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col10\"\n", - " class=\"data row3 col10\" >\n", - " 1.68667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col11\"\n", - " class=\"data row3 col11\" >\n", - " 0.406667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col12\"\n", - " class=\"data row3 col12\" >\n", - " 0.54\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col13\"\n", - " class=\"data row3 col13\" >\n", - " 0.2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col14\"\n", - " class=\"data row3 col14\" >\n", - " 0.2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col15\"\n", - " class=\"data row3 col15\" >\n", - " 0.54\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col16\"\n", - " class=\"data row3 col16\" >\n", - " 0.2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col17\"\n", - " class=\"data row3 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow3_col18\"\n", - " class=\"data row3 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row4\" rowspan=1>\n", - " 4\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col0\"\n", - " class=\"data row4 col0\" >\n", - " JACCARD\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col1\"\n", - " class=\"data row4 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col2\"\n", - " class=\"data row4 col2\" >\n", - " 288.467\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col3\"\n", - " class=\"data row4 col3\" >\n", - " 10\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col4\"\n", - " class=\"data row4 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col5\"\n", - " class=\"data row4 col5\" >\n", - " 0.533333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col6\"\n", - " class=\"data row4 col6\" >\n", - " 0.526667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col7\"\n", - " class=\"data row4 col7\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col8\"\n", - " class=\"data row4 col8\" >\n", - " 0.273333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col9\"\n", - " class=\"data row4 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col10\"\n", - " class=\"data row4 col10\" >\n", - " 5.18\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col11\"\n", - " class=\"data row4 col11\" >\n", - " 0.513333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col12\"\n", - " class=\"data row4 col12\" >\n", - " 0.546667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col13\"\n", - " class=\"data row4 col13\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col14\"\n", - " class=\"data row4 col14\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col15\"\n", - " class=\"data row4 col15\" >\n", - " 0.533333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col16\"\n", - " class=\"data row4 col16\" >\n", - " 0.28\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col17\"\n", - " class=\"data row4 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow4_col18\"\n", - " class=\"data row4 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row5\" rowspan=1>\n", - " 5\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col0\"\n", - " class=\"data row5 col0\" >\n", - " MCS\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col1\"\n", - " class=\"data row5 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col2\"\n", - " class=\"data row5 col2\" >\n", - " 266.933\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col3\"\n", - " class=\"data row5 col3\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col4\"\n", - " class=\"data row5 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col5\"\n", - " class=\"data row5 col5\" >\n", - " 0.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col6\"\n", - " class=\"data row5 col6\" >\n", - " 0.873333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col7\"\n", - " class=\"data row5 col7\" >\n", - " 0.52\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col8\"\n", - " class=\"data row5 col8\" >\n", - " 0.446667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col9\"\n", - " class=\"data row5 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col10\"\n", - " class=\"data row5 col10\" >\n", - " 4.78\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col11\"\n", - " class=\"data row5 col11\" >\n", - " 0.846667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col12\"\n", - " class=\"data row5 col12\" >\n", - " 0.946667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col13\"\n", - " class=\"data row5 col13\" >\n", - " 0.486667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col14\"\n", - " class=\"data row5 col14\" >\n", - " 0.513333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col15\"\n", - " class=\"data row5 col15\" >\n", - " 0.926667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col16\"\n", - " class=\"data row5 col16\" >\n", - " 0.493333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col17\"\n", - " class=\"data row5 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow5_col18\"\n", - " class=\"data row5 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " <tr>\n", - " \n", - " \n", - " <th id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678b\"\n", - " class=\"row_heading level0 row6\" rowspan=1>\n", - " 6\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col0\"\n", - " class=\"data row6 col0\" >\n", - " VEO\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col1\"\n", - " class=\"data row6 col1\" >\n", - " 241.38\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col2\"\n", - " class=\"data row6 col2\" >\n", - " 259.227\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col3\"\n", - " class=\"data row6 col3\" >\n", - " 2\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col4\"\n", - " class=\"data row6 col4\" >\n", - " 3\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col5\"\n", - " class=\"data row6 col5\" >\n", - " 0.926667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col6\"\n", - " class=\"data row6 col6\" >\n", - " 0.84\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col7\"\n", - " class=\"data row6 col7\" >\n", - " 0.506667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col8\"\n", - " class=\"data row6 col8\" >\n", - " 0.44\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col9\"\n", - " class=\"data row6 col9\" >\n", - " 4.92\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col10\"\n", - " class=\"data row6 col10\" >\n", - " 4.66\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col11\"\n", - " class=\"data row6 col11\" >\n", - " 0.82\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col12\"\n", - " class=\"data row6 col12\" >\n", - " 0.946667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col13\"\n", - " class=\"data row6 col13\" >\n", - " 0.466667\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col14\"\n", - " class=\"data row6 col14\" >\n", - " 0.5\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col15\"\n", - " class=\"data row6 col15\" >\n", - " 0.933333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col16\"\n", - " class=\"data row6 col16\" >\n", - " 0.473333\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col17\"\n", - " class=\"data row6 col17\" >\n", - " 1\n", - " \n", - " \n", - " \n", - " <td id=\"T_55ac6cde_3705_11e8_8b8a_4c327598678brow6_col18\"\n", - " class=\"data row6 col18\" >\n", - " 70.32\n", - " \n", - " \n", - " </tr>\n", - " \n", - " </tbody>\n", - " </table>\n", - " " - ], - "text/plain": [ - "<pandas.formats.style.Styler at 0x10e4b4208>" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "score_per_mesure=gen_region.groupby([\"id_g1\",\"mesureL\"], as_index=False).mean()\n", - "score_gen_region=score_per_mesure.groupby([\"mesureL\"],as_index=False).mean()\n", - "score_gen_region.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2018-03-22T11:59:19.770228Z", - "start_time": "2018-03-22T11:59:19.440447Z" - }, - "format": "row" - }, - "outputs": [], - "source": [ - "score_gen_region[[\"c1_val\",\"c2_val\",\"c3_val\",\"c4_val\"]].plot.bar(x=score_gen_region[\"mesure\"].unique())\n", - "plt.show()\n", - "score_gen_region[[\"c1*c2\",\"c1+c2\",\"c1*c2*c3\",\"c1+c3\"]].plot.bar(x=score_gen_region[\"mesure\"].unique())\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Extension 1" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2018-04-03T06:07:37.954797Z", - "start_time": "2018-04-03T06:07:37.893571Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " <style type=\"text/css\" >\n", - " \n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col5 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col12 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow1_col15 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col5 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col6 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col7 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col8 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col11 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col12 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col13 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col14 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col15 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow2_col16 {\n", - " \n", - " : ;\n", - " \n", - " background-color: red;\n", - " \n", - " color: white;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col5 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col6 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col7 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col11 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col13 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col14 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col15 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow3_col16 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " #T_4f4867d0_3705_11e8_89af_4c327598678brow4_col8 {\n", - " \n", - " background-color: yellow;\n", - " \n", - " : ;\n", - " \n", - " }\n", - " \n", - " </style>\n", - "\n", - " <table id=\"T_4f4867d0_3705_11e8_89af_4c327598678b\" None>\n", + " <table id=\"T_4f4867d0_3705_11e8_89af_4c327598678b\" None>\n", " \n", "\n", " <thead>\n", @@ -78343,6 +77173,7 @@ { "ename": "KeyError", "evalue": "('n', 'occurred at index 0')", + "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", @@ -78369,8 +77200,7 @@ "\u001b[0;32mpandas/src/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/src/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: ('n', 'occurred at index 0')" - ], - "output_type": "error" + ] } ], "source": [ @@ -79683,9 +78513,9 @@ }, "varInspector": { "cols": { - "lenName": 16.0, - "lenType": 16.0, - "lenVar": 40.0 + "lenName": 16, + "lenType": 16, + "lenVar": 40 }, "kernels_config": { "python": { diff --git a/run_automatic_annotation.py b/run_automatic_annotation.py index 22632827084bc298328a26471a7e33604501a132..7661aabce183aac41273ed289e6ac80ee4d4a8d2 100644 --- a/run_automatic_annotation.py +++ b/run_automatic_annotation.py @@ -1,6 +1,6 @@ # coding = utf-8 -import os, re, argparse, json,sys, subprocess, glob +import os, re, argparse, json,sys, subprocess, glob, shutil import logging for _ in ("boto", "elasticsearch", "urllib3", "sklearn"): logging.getLogger(_).setLevel(logging.CRITICAL) @@ -23,12 +23,22 @@ if process.returncode == 1: fns=glob.glob(os.path.join(args.outputAnnotation_dir,"*.csv")) +if not os.path.exists("./temp_cluster/"): + os.makedirs("./temp_cluster/") +if not os.path.exists("cache.json"): + json.dump({},open("cache.json",'w')) if not os.path.exists(args.outputAnnotation2_dir): os.makedirs(args.outputAnnotation2_dir) +i,n=0,len(fns) for fn in fns: - print("Processing {0}...".format(fn)) - print(["python3","auto_fill_annotation.py",fn,os.path.join(args.graphDataDir),os.path.join(args.outputAnnotation2_dir,os.path.basename(fn))]) + i+=1 + print("Processing({1}/{2} {0}...".format(fn,i,n)) + #print(["python3","auto_fill_annotation.py",fn,os.path.join(args.graphDataDir),os.path.join(args.outputAnnotation2_dir,os.path.basename(fn))]) process=subprocess.run(["python3","auto_fill_annotation.py",fn,os.path.join(args.graphDataDir),os.path.join(args.outputAnnotation2_dir,os.path.basename(fn))]) if process.returncode == 1: - raise subprocess.CalledProcessError(process,"The process did not end well !") \ No newline at end of file + raise subprocess.CalledProcessError(process,"The process did not end well !") + + +shutil.rmtree("./temp_cluster/") +os.remove("cache.json") diff --git a/run_test_disambiguisation.sh b/run_test_disambiguisation.sh index 59cba8bdcb6d3e5875a7794f2e4c32cad21e7523..66ec00915e5c5acaa647c61e5a74819d08ab0c34 100755 --- a/run_test_disambiguisation.sh +++ b/run_test_disambiguisation.sh @@ -5,7 +5,7 @@ python3 eval_disambiguation.py agromada accuracy > accuracy_res_mada.txt python3 eval_disambiguation.py padiweb mean_distance_error > mean_distance_res_padi.txt python3 eval_disambiguation.py agromada mean_distance_error > mean_distance_res_mada.txt -python3 eval_disambiguation.py padiweb accuracy_k -k=1 >>accuracyk1y_res_padi.txt +python3 eval_disambiguation.py padiweb accuracy_k -k=1 >>accuracyk1_res_padi.txt python3 eval_disambiguation.py padiweb accuracy_k -k=0.5 > accuracyk0-5_res_padi.txt python3 eval_disambiguation.py agromada accuracy_k -k=1 >> accuracyk1_res_mada.txt diff --git a/strpython/config/config.json b/strpython/config/config.json index 50c7d66c903142cb984b995e572b6e7986051f09..2a86282993a809e8174fff29d63dc9559b70fc6c 100644 --- a/strpython/config/config.json +++ b/strpython/config/config.json @@ -11,5 +11,6 @@ "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/coocurrence_wiki.pkl", "count":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/count_wiki.pkl" }, - "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources" + "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources", + "gazetteer":"geodict" } \ No newline at end of file diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py index 74099153ad9afda59ceedf76df1d70d1b39565fe..a2a9baa9c3c2712fc2f203dfce420cbddb5f464e 100644 --- a/strpython/eval/automatic_annotation.py +++ b/strpython/eval/automatic_annotation.py @@ -1,4 +1,6 @@ # coding = utf-8 +import json +import os from strpython.models.str import STR import networkx as nx @@ -6,6 +8,41 @@ import numpy as np import geopandas as gpd from shapely.geometry import MultiPoint,Polygon,Point,LineString + +def jsonKeys2int(x): + if isinstance(x, dict): + return {int(k):jsonKeys2int(v) for k,v in x.items() } + return x + +__cache__crit={} +if os.path.exists("cache.json"): + try: + __cache__crit=json.load(open("cache.json")) + __cache__crit=jsonKeys2int(__cache__crit) + except Exception as e: + print(e) + +def save_cache(): + global __cache__crit + open("cache.json", 'w').write(json.dumps(__cache__crit)) + +def get_from_cache(id1,id2): + global __cache__crit + # try: + if id1 in __cache__crit: + if id2 in __cache__crit[id1]: + return __cache__crit[id1][id2] + elif id2 in __cache__crit: + if id1 in __cache__crit[id2]: + return __cache__crit[id2][id1] + return None + +def add_cache(id1,id2,data): + global __cache__crit + if not id1 in __cache__crit: + __cache__crit[id1] = {} + __cache__crit[id1][id2] = data + class AnnotationAutomatic(object): """ @@ -15,8 +52,15 @@ class AnnotationAutomatic(object): def __init__(self): pass - def all(self,str1,str2): - return [self.criterion1(str1,str2),self.criterion2(str1,str2),self.criterion3(str1,str2),self.criterion4(str1,str2)] + def all(self,str1,str2,id1=None,id2=None): + cache_data=get_from_cache(id1,id2) + if not cache_data: + crit_ = [self.criterion1(str1, str2), self.criterion2(str1, str2), + self.criterion3(str1, str2, id1, id2), + self.criterion4(str1, str2, id1, id2)] + add_cache(id1,id2,crit_) + return crit_ + return cache_data def criterion1(self,str1,str2): """ @@ -25,7 +69,7 @@ class AnnotationAutomatic(object): :param str2: STR :return: """ - return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) > 0 + return int(len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) > 0) def criterion2(self,str1 : STR,str2 : STR): """ @@ -39,12 +83,12 @@ class AnnotationAutomatic(object): for es2 in str2.spatial_entities: if not es in stop_en and not es2 in stop_en: if str1.is_included_in(es,es2): - return True + return 1 if str1.is_adjacent(es,es2): - return True - return False + return 1 + return 0 - def criterion3(self, str1 :STR , str2: STR,th=0.2): + def criterion3(self, str1 :STR , str2: STR,id1=None,id2=None,th=0.3): """ Return True if one or multiple cluster of spatial entities have been found in both STR. Cluster are constructed based on low distance between spatial entities. The clustering method used is Mean-Shift as @@ -53,37 +97,46 @@ class AnnotationAutomatic(object): :param str2: :return: """ + + try: + c1=str1.get_cluster(id1) + except: + c1 = str1.get_cluster() ## Feignasse !!!! try: - c1=str1.get_cluster() - c2=str2.get_cluster() - c1["area"]=c1.area - c2["area"] = c2.area - c1=c1.sort_values(by="area",ascending=False) - c2=c2.sort_values(by="area",ascending=False) - for ind,rows in c1.iterrows(): - for ind2,rows2 in c2.iterrows(): - if rows.geometry.intersects(rows2.geometry): - #print(gpd.GeoDataFrame(geometry=[rows.geometry])) - inter = gpd.overlay( - gpd.GeoDataFrame(geometry=[rows.geometry]), - gpd.GeoDataFrame(geometry=[rows2.geometry]), - how="intersection", - use_sindex=False - ) - a1,a2=c1.area.sum(),c2.area.sum() - ia=inter.area.sum() - if a1 < a2 and ia/a1 >= th: - return True - elif a1 < a2 and ia/a2 >= th: - return True - - return False - - except Exception as e: - print(e) - return False - - def criterion4(self, str1, str2): + c2=str2.get_cluster(id2) + except: + c2 = str2.get_cluster() + + if not "geometry" in c1 or (not "geometry" in c2): + return 0 + c1["area"]=c1.area + c2["area"] = c2.area + c1=c1.sort_values(by="area",ascending=False) + c2=c2.sort_values(by="area",ascending=False) + for ind,rows in c1.iterrows(): + for ind2,rows2 in c2.iterrows(): + if rows.geometry.intersects(rows2.geometry): + return 1 + #print(gpd.GeoDataFrame(geometry=[rows.geometry])) + # inter = gpd.overlay( + # gpd.GeoDataFrame(geometry=[rows.geometry]), + # gpd.GeoDataFrame(geometry=[rows2.geometry]), + # how="intersection", + # use_sindex=False + # ) + # a1,a2=c1.area.sum(),c2.area.sum() + # if "geometry" in inter: + # ia=inter.area.sum() + # if a1 < a2 and ia/a1 >= th: + # return 1 + # elif a1 > a2 and ia/a2 >= th: + # return 1 + + return 0 + + + + def criterion4(self, str1, str2,id1=None,id2=None,): """ Return True if both str share the same clusters. Using the same clustering methods as in criterion3(). :param str1: @@ -91,7 +144,14 @@ class AnnotationAutomatic(object): :return: """ try: - return str1.get_cluster().intersects(str2.get_cluster()).all() + c1=str1.get_cluster(id1) + except: + c1 = str1.get_cluster() ## Feignasse !!!! + try: + c2=str2.get_cluster(id2) except: - return False + c2 = str2.get_cluster() + if not "geometry" in c1 or (not "geometry" in c2): + return 0 + return int(c1.intersects(c2).all()) diff --git a/strpython/eval/disambiguation.py b/strpython/eval/disambiguation.py index 1536fe7ea12a3f50c4f27e65a3f2a3b6ad2d72e5..eeed6c5ced7487aa768ba834e27a254180623013 100644 --- a/strpython/eval/disambiguation.py +++ b/strpython/eval/disambiguation.py @@ -1,7 +1,6 @@ # coding = utf-8 from shapely.geometry import Point -from ..helpers.geodict_helpers import * from ..nlp.disambiguator.geodict_gaurav import GauravGeodict from ..nlp.disambiguator.most_common import MostCommonDisambiguator from ..nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator @@ -14,12 +13,12 @@ import glob, re, sys disMost_common = MostCommonDisambiguator() disGaurav = GauravGeodict() disWiki = WikipediaDisambiguator() - +from ..helpers.geodict_helpers import gazetteer def get_coord(id): try: - c = get_data(id).coord - return Point(c["lon"], c["lat"]) + c = gazetteer.get_by_id(id)[0].coord + return Point(c.lon, c.lat) except Exception as e: return None @@ -82,7 +81,7 @@ def parse_file_EPI(fn, path_rawtext): return df["text"] = df["content"].apply(lambda x: re.sub(r"\s+", " ", x.strip())) df["geoname"] = df["info"].apply(lambda x: x["id"]) - df["GID"] = df["geoname"].apply(lambda x: get_data_by_geonames_id(x).id) + df["GID"] = df["geoname"].apply(lambda x: gazetteer.get_by_other_id(x,"geonames").id) df = df[df["geoname"] != -111111] return df, lang diff --git a/strpython/eval/stats.py b/strpython/eval/stats.py index 40cc4bd321e4eb963accce9366b5465de04a391b..aa550e5840b5e4b42153bce7af1a407de2095caa 100644 --- a/strpython/eval/stats.py +++ b/strpython/eval/stats.py @@ -1,6 +1,6 @@ # coding = utf-8 -from ..helpers.geodict_helpers import get_data +from ..helpers.geodict_helpers import gazetteer import numpy as np @@ -17,6 +17,8 @@ def flattern(A): def most_common(lst): + if not lst: + return "P-PPL" if len(list(set(lst))) > 1 and "P-PPL" in set(lst): lst = [x for x in lst if x != "PPL"] return max(set(lst), key=lst.count) @@ -28,7 +30,7 @@ def granularity(graph): :param graph: :return: """ - class_list = flattern([get_data(n)["class"] for n in list(graph.nodes())]) + class_list = flattern([gazetteer.get_by_id(n)[0].class_ for n in list(graph.nodes())]) if not class_list: return "P-PPL" return most_common(class_list) diff --git a/strpython/helpers/bow_polyglot.py b/strpython/helpers/bow_polyglot.py deleted file mode 100644 index af54e2262b9081322b67d6d3dffa67e037542112..0000000000000000000000000000000000000000 --- a/strpython/helpers/bow_polyglot.py +++ /dev/null @@ -1,89 +0,0 @@ -# coding = utf-8 - -import numpy as np -from nltk.stem import WordNetLemmatizer -from polyglot.text import Text -from scipy.sparse import csc_matrix - - -def get_vocabulary(corpus): - """ - Return the vocabulary of a corpus, a list of documents. Each document is represented - using a list of tokens. - :param corpus: list or array-like - :return: - """ - vocabulary=set([]) - for text_tagged in corpus: - for token in text_tagged: - vocabulary.add(token) - return list(vocabulary) - -def lemmatize(corpus,stopwords): - """ - Lemmatize a corpus, a list of documents. Each document is represented - using a list of tokens. - :param corpus: list or array-like - :param stopwords: list or array-like - :return: - """ - pos_tag_corp=[] - lemmatizer = WordNetLemmatizer() - for text in corpus: - pos_t=[] - try: - if not text: - pos_tag_corp.append(pos_t) - continue - for tag in Text(text).pos_tags: - if not tag[0].lower().strip() in stopwords: - if tag[1] == 'VERB': - pos_t.append(lemmatizer.lemmatize(tag[0].lower(),'v')) - else: - pos_t.append(lemmatizer.lemmatize(tag[0].lower())) - - except: - pass - pos_tag_corp.append(pos_t) - - return pos_tag_corp - -def populate_bow(bow,voc_asso,corpus_tagged): - """ - Populate the Bag of words representation for a vocabulary and a corpus. - :param bow: - :param voc_asso: - :param corpus_tagged: - :return: - """ - for t in range(len(corpus_tagged)): - text=corpus_tagged[t] - for token in text: - if token in voc_asso: - try: - bow[t][voc_asso[token]]+=1 - except: - pass - return bow - -def create_bow(corpus,stopwords): - """ - Return a Bag of words representation of a corpus, a lists of document. Each document is a list of tokens. - :param corpus: - :param stopwords: - :return: - """ - stopwords=set(stopwords) - post_tag_corp = lemmatize(corpus,stopwords) - voc = get_vocabulary(post_tag_corp) - voc_association={} - for v in range(len(voc)): - voc_association[voc[v]]=v - bow=np.zeros((len(post_tag_corp)+1,len(voc)+1), dtype=np.int16) - bow=populate_bow(bow,voc_association,post_tag_corp) - return bow - - - - - diff --git a/strpython/helpers/collision.py b/strpython/helpers/collision.py index b46158e6d893e3cbd864dbf6b8e35330387e9067..ac37a1b5ebcb1ca1247df00d90e97bcbda2ee588 100644 --- a/strpython/helpers/collision.py +++ b/strpython/helpers/collision.py @@ -5,7 +5,7 @@ import warnings from shapely.geometry import Point from ..config.configuration import config -from .geodict_helpers import get_data +from .geodict_helpers import gazetteer import geopandas as gpd __cache = {} @@ -74,15 +74,38 @@ def getGEO(id_se): :param id_se: id of the spatial entity :return: geopandas.GeoSeries """ - data = get_data(id_se) + data = gazetteer.get_by_id(id_se) + if not data: + return None + + data=data[0] if "path" in data: - return explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data["path"]))).convex_hull + return explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"]))).convex_hull elif "coord" in data: - return gpd.GeoDataFrame(gpd.GeoSeries([Point(data["coord"]["lon"], data["coord"]["lat"]).buffer(1.0)])).rename( + return gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename( columns={0: 'geometry'}) return None +def getGEO2(id_se): + """ + Get the geofootprint of a spatial entity. If found, this geofootprint is a shape extracted from OSM. If not, + coordinates are used. + :param id_se: id of the spatial entity + :return: geopandas.GeoSeries + """ + data = gazetteer.get_by_id(id_se) + if not data: + return None + + data=data[0] + if "path" in data: + return "P",explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data.other["path"]))).convex_hull + elif "coord" in data: + return "C",gpd.GeoDataFrame(gpd.GeoSeries([Point(data.coord.lon, data.coord.lat).buffer(1.0)])).rename( + columns={0: 'geometry'}) + return None + def collide(se1, se2): """ Return true, if two entities convex hull intersects. diff --git a/strpython/helpers/deprecated.py b/strpython/helpers/deprecated.py deleted file mode 100644 index 5b42a712d5a3b13fbd6381f56ab1321c21f01477..0000000000000000000000000000000000000000 --- a/strpython/helpers/deprecated.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding = utf-8 -import functools -import inspect -import warnings - -string_types = (type(b''), type(u'')) - - -def deprecated(reason): - """ - This is a decorator which can be used to mark functions - as deprecated. It will result in a warning being emitted - when the function is used. - """ - - if isinstance(reason, string_types): - - # The @deprecated is used with a 'reason'. - # - # .. code-block:: python - # - # @deprecated("please, use another function") - # def old_function(x, y): - # pass - - def decorator(func1): - - if inspect.isclass(func1): - fmt1 = "Call to deprecated class {name} ({reason})." - else: - fmt1 = "Call to deprecated function {name} ({reason})." - - @functools.wraps(func1) - def new_func1(*args, **kwargs): - warnings.simplefilter('always', DeprecationWarning) - warnings.warn( - fmt1.format(name=func1.__name__, reason=reason), - category=DeprecationWarning, - stacklevel=2 - ) - warnings.simplefilter('default', DeprecationWarning) - return func1(*args, **kwargs) - - return new_func1 - - return decorator - - elif inspect.isclass(reason) or inspect.isfunction(reason): - - # The @deprecated is used without any 'reason'. - # - # .. code-block:: python - # - # @deprecated - # def old_function(x, y): - # pass - - func2 = reason - - if inspect.isclass(func2): - fmt2 = "Call to deprecated class {name}." - else: - fmt2 = "Call to deprecated function {name}." - - @functools.wraps(func2) - def new_func2(*args, **kwargs): - warnings.simplefilter('always', DeprecationWarning) - warnings.warn( - fmt2.format(name=func2.__name__), - category=DeprecationWarning, - stacklevel=2 - ) - warnings.simplefilter('default', DeprecationWarning) - return func2(*args, **kwargs) - - return new_func2 - - else: - raise TypeError(repr(type(reason))) diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py index 0a91f973285086104e2d093f981fc8413827e578..9ac66f95646a3937745e3c71b882015939f13c87 100644 --- a/strpython/helpers/geodict_helpers.py +++ b/strpython/helpers/geodict_helpers.py @@ -7,212 +7,10 @@ from ..config.configuration import config import pandas as pd from ..helpers.objectify import objectify -es = Elasticsearch(config.es_server) - -geo_term={ - "fr":open(config.language_resources_path.rstrip("/")+"/geo_term_fr").read().lower().strip().split("\n"), - "en":open(config.language_resources_path.rstrip("/")+"/geo_term_en").read().strip().split("\n") -} - -def convert_es_to_pandas(es_query_results): - """ - Return a `pandas.Dataframe` object built from the elasticsearch query results - - Parameters - ---------- - es_query_results : dict - elasticsearch.search() result - - Returns - ------- - pandas.DataFrame - Dataframe of the elasticsearch query results - """ - if es_query_results["hits"]["total"] == 0: - return None - df = pd.DataFrame([g["_source"] for g in es_query_results["hits"]["hits"]]) - if "score" in df: - df["score"] = df["score"].apply(lambda x: float(x)) - else: - df["score"] = df.apply(lambda x: 0) - df["score"].fillna(-1, inplace=True) - return df - - -def parse_score(score): - if math.isnan(score): - return -1 - else: - return score - -def parse_label2(label : str,lang): - if not lang in geo_term: - return parse_label(label) - - label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip())) - label = label.strip("'").strip("’") - - parts=label.split(" ") - # f=False - # for part in parts: - # if part.lower() in geo_term[lang]: - # f=True - # if not f: - # return parse_label(label) - new_labels=[] - for part in parts: - if not part.lower() in geo_term[lang]: - new_labels.append(parse_label(part).strip("/?")+"+") - else: - new_labels.append(parse_label(part).strip("/")) - return "/"+"[ ]?".join(new_labels)+"/" - - - - -def parse_label(label: str): - """ - Parse label/toponym to a specific regular expression that allows dissimilarity with the official toponyms/aliases. - - Parameters - ---------- - label : str - toponym - Returns - ------- - str - regular expression built from the toponym - """ - label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip())) - label = label.strip("'").strip("’") - new_label = "" - for c in label: - if c.isupper(): - close_par = ")" if not (new_label.endswith(")") or new_label.endswith("?")) and new_label != "" else "" - # if new_label.endswith("]"): - # new_label = new_label[:-1] + "({0}{1}]".format(c.lower(), c) - # else: - new_label += close_par + "([{0}{1}]".format(c.lower(), c) - # print("upper", new_label) - elif c == " ": - new_label += ")?[ ]?" - # print("espace", new_label) - elif c == "'" or c == "’": - new_label += c + ")?" - # print("apostrophe", new_label) - else: - - new_label += ("(" if new_label == "" else "") + ("(" if new_label.endswith("?") else "") + c - # print("else", new_label) - new_label = "/" + new_label + ")?/" - return new_label - - -def most_common_label(toponym: str, lang: str): - """ - - - Parameters - ---------- - toponym : str - toponym - lang : str - toponym language - Returns - ------- - - """ - res = es.search("gazetteer", "place", - body={ "query": {"query_string": {"query": "\"{0}\"".format(toponym), "analyze_wildcard": False}}, - "from": 0, - "size": 50, - "sort": [{'score': "desc"}]}) - res = convert_es_to_pandas(res) - if not isinstance(res, pd.DataFrame): - return None, 0 - return res.iloc[0].id, res.iloc[0].score - - -def most_common_alias(toponym: str, lang: str): - """ - Return most common spatial entity by itsje - - Parameters - ---------- - toponym : str - toponym - lang : str - toponym language - Returns - ------- - - """ - res = es.search("gazetteer", "place", - body={"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {lang: toponym}}], "must_not": [], "should": []}}}) - - res = convert_es_to_pandas(res) - if not isinstance(res, pd.DataFrame): - return None, 0 - return res.iloc[0].id, res.iloc[0].score - - -def n_label_similar(toponym, lang, n=5, score=True): - body = { - "query": { - "query_string": { - "default_field": lang, - "query": parse_label2(toponym,lang) - } - }, - "from": 0, - "size": n - } - if score: - body["sort"] = [ - { - 'score': "desc" - } - ] - try: - res = es.search("gazetteer", "place", - body=body) - except: - return None - res = convert_es_to_pandas(res) - if not isinstance(res, pd.DataFrame): - return None - return res - - -def n_alias_similar(toponym, lang, n=5, score=True): - body = {"query": {"nested": {"path": "aliases", - "query": - { - "query_string": { - "default_field": "aliases.{0}".format(lang), - "query": parse_label2(toponym,lang) - } - } - }}, - "from": 0, - "size": n} - if score: - body["sort"] = [ - { - 'score': "desc" - } - ] - try: - res = es.search("gazetteer", "place", - body=body) - except: - return None - - res = convert_es_to_pandas(res) - if not isinstance(res, pd.DataFrame): - return None, 0 - return res.iloc[0].id, res.iloc[0].score +import gazpy as ga +gazetteer = ga.Geodict(es_client=Elasticsearch(config.es_server)) if config.gazetteer == "geodict" else ga.Geonames( + Elasticsearch(config.es_server)) def get_most_common_id_v3(label, lang='fr'): @@ -225,171 +23,38 @@ def get_most_common_id_v3(label, lang='fr'): :param lang: :return: """ - id_, score = most_common_label(label, lang) - if id_: - # China case - id_2, score2 = most_common_alias(label, lang) - if id_2 and score2 > score: - id_, score = id_2, score2 - simi=n_label_similar(label, lang) - if isinstance(simi,pd.DataFrame): - id_3, score3 = simi.iloc[0].id,simi.iloc[0].score - if id_2 and score2 > score: + id_,score=None,-1 + data = gazetteer.get_by_label(label, lang) + if data: + id_, score = data[0].id,data[0].score + data2=gazetteer.get_by_alias(label, lang) + if data2 and data2[0].score > data[0].score: + data2=data2[0] + id_, score = data2.id, data2.score + simi = gazetteer.get_n_label_similar(label, lang, n=5) + if simi: + id_3, score3 = simi[0].id, simi[0].score + if id_3 and score3 > score: id_, score = id_3, score3 - return id_, score - - # if nothing found in english, search in aliases - id_, score = most_common_alias(label, lang) - if id_: - return id_, score - - similar_label=n_label_similar(label,lang) - if isinstance(similar_label,pd.DataFrame): - return similar_label.iloc[0].id, similar_label.iloc[0].score - - similar_alias = n_alias_similar(label, lang) - if isinstance(similar_alias,pd.DataFrame): - return similar_alias.iloc[0].id, similar_alias.iloc[0].score - - return None, -1 - - -def get_most_common_id_alias_v2(alias, lang="fr"): - res = es.search("gazetteer", "place", - body={"query": {"nested": {"path": "aliases", - "query": - { - "query_string": { - "default_field": "aliases.{0}".format(lang), - "query": parse_label(alias) - } - } - }}, - "sort": [{"score": "desc"}]}) - - res = convert_es_to_pandas(res) - if not isinstance(res, pd.DataFrame): - return None, 0 - return res.iloc[0].id, res.iloc[0].score + return gazetteer.get_by_id(id_)[0] -def get_data(id): - """ - Return the data asssociated to an id in Geodict - :param id: - :return: - """ - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {"id": id}}], "must_not": [], "should": []}}, "from": 0, - "size": 10, "sort": [], "aggs": {}}) - if res["hits"]["total"] > 0: - res = res["hits"]["hits"][0]["_source"] - return objectify(res) - return None - - -def get_data_by_wikidata_id(id): - """ - Return the data asssociated to a wikidata id in Geodict - :param id: - :return: - """ - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {"wikidataID": id}}], "must_not": [], "should": []}}, - "from": 0, - "size": 10, "sort": [], "aggs": {}}) - if res["hits"]["total"] > 0: - res = res["hits"]["hits"][0]["_source"] - return objectify(res) - return None - - -def get_data_by_geonames_id(id): - """ - Return the data asssociated to a geonames id in Geodict - :param id: - :return: - """ - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {"geonameID": id}}], "must_not": [], "should": []}}, - "from": 0, - "size": 10, "sort": [], "aggs": {}}) - if res["hits"]["total"] > 0: - res = res["hits"]["hits"][0]["_source"] - return objectify(res) - return None - + # if nothing found in english, search in aliases + data = gazetteer.get_by_alias(label, lang) + if data: + return data[0] #data[0].id, data[0].score -def get_by_label(label, lang): - """ - A Supprimer - :param label: - :param lang: - :return: - """ - query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "size": 50} - response = es.search('gazetteer', 'place', body=query) - if 'hits' in response['hits']: - return objectify(response['hits']['hits']) - return None + similar_label = gazetteer.get_n_label_similar(label, lang, n=5) + if similar_label: + return similar_label[0]#similar_label[0].id, similar_label[0].score + similar_alias = gazetteer.get_n_alias_similar(label, lang, n=5) + if similar_alias: + return similar_alias[0]#similar_alias[0].id, similar_alias[0].score -def get_by_alias(alias, lang): - """ - A supprimer - :param alias: - :param lang: - :return: - """ - query = { - "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}} - response = es.search('gazetteer', 'place', body=query) - if 'hits' in response['hits']: - return objectify(response['hits']['hits']) return None -def label_exists(label, lang): - """ - Return True if a spatial entity exists with a specific label in a specific language. - :param label: str - :param lang: str - :return: bool - """ - query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} - response = es.count('gazetteer', 'place', body=query) - if response["count"] > 0: - return True - return False - - -def alias_exists(alias, lang): - """ - Return True if a spatial entity exists with a specific alias in a specific language. - :param alias: str - :param lang: str - :return: bool - """ - query = { - "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}} - response = es.count('gazetteer', 'place', body=query) - if response["count"] > 0: - return True - return False - - -def count_of_se(label, lang): - """ - Return the number of spatial entities associated with a specific label in a specific language. - :param label: str - :param lang: str - :return: int - """ - query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} - response = es.count('gazetteer', 'place', body=query) - return response["count"] - - def get_top_candidate(label, lang, n=5): """ Return the 5-top candidates for a designated label in a specific language. @@ -397,15 +62,11 @@ def get_top_candidate(label, lang, n=5): :param lang: str :return: list """ - if n<4: - n=4 - query={"size": n-3, "sort": [{"score": {"order": "desc"}}],"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} - query2={"size": 1, "sort": [{"score": {"order": "desc"}}], - "query": {"query_string": {"query": "\"{0}\"".format(label), "analyze_wildcard": False}}} - query3 = {"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {"en": "\"{0}\"".format(label)}}], "must_not": [], "should": []}}} - response = es.search('gazetteer', 'place', body=query) - res=[] - if 'hits' in response['hits']: - res=[x["_source"]["id"] for x in response['hits']['hits']] - res.extend([get_most_common_id_v3(label,lang)[0]]) + if n < 4: + n = 4 + res=gazetteer.get_by_label(label,lang,size=n-3,score=False) + res.extend(gazetteer.get_n_label_similar(label,lang,n=1)) + res.extend(gazetteer.get_n_alias_similar(label, lang, n=1)) + res.append(get_most_common_id_v3(label, lang)) + return res diff --git a/strpython/helpers/geodict_helpers_old.py b/strpython/helpers/geodict_helpers_old.py new file mode 100644 index 0000000000000000000000000000000000000000..7da757d5f8c95c3350d8544e65d8b11428415d88 --- /dev/null +++ b/strpython/helpers/geodict_helpers_old.py @@ -0,0 +1,394 @@ +# coding=utf-8 +import math +import re + +from elasticsearch import Elasticsearch +from ..config.configuration import config +import pandas as pd +from ..helpers.objectify import objectify + +es = Elasticsearch(config.es_server) + +geo_term={ + "fr":open(config.language_resources_path.rstrip("/")+"/geo_term_fr").read().lower().strip().split("\n"), + "en":open(config.language_resources_path.rstrip("/")+"/geo_term_en").read().strip().split("\n") +} + +def convert_es_to_pandas(es_query_results): + """ + Return a `pandas.Dataframe` object built from the elasticsearch query results + + Parameters + ---------- + es_query_results : dict + elasticsearch.search() result + + Returns + ------- + pandas.DataFrame + Dataframe of the elasticsearch query results + """ + if es_query_results["hits"]["total"] == 0: + return None + df = pd.DataFrame([g["_source"] for g in es_query_results["hits"]["hits"]]) + if "score" in df: + df["score"] = df["score"].apply(lambda x: float(x)) + else: + df["score"] = df.apply(lambda x: 0) + df["score"].fillna(-1, inplace=True) + return df + + +def parse_score(score): + if math.isnan(score): + return -1 + else: + return score + +def parse_label2(label : str,lang): + if not lang in geo_term: + return parse_label(label) + + label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip())) + label = label.strip("'").strip("’") + + parts=label.split(" ") + # f=False + # for part in parts: + # if part.lower() in geo_term[lang]: + # f=True + # if not f: + # return parse_label(label) + new_labels=[] + for part in parts: + if not part.lower() in geo_term[lang]: + new_labels.append(parse_label(part).strip("/?")+"+") + else: + new_labels.append(parse_label(part).strip("/")) + return "/"+"[ ]?".join(new_labels)+"/" + + + + +def parse_label(label: str): + """ + Parse label/toponym to a specific regular expression that allows dissimilarity with the official toponyms/aliases. + + Parameters + ---------- + label : str + toponym + Returns + ------- + str + regular expression built from the toponym + """ + label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip())) + label = label.strip("'").strip("’") + new_label = "" + for c in label: + if c.isupper(): + close_par = ")" if not (new_label.endswith(")") or new_label.endswith("?")) and new_label != "" else "" + # if new_label.endswith("]"): + # new_label = new_label[:-1] + "({0}{1}]".format(c.lower(), c) + # else: + new_label += close_par + "([{0}{1}]".format(c.lower(), c) + # print("upper", new_label) + elif c == " ": + new_label += ")?[ ]?" + # print("espace", new_label) + elif c == "'" or c == "’": + new_label += c + ")?" + # print("apostrophe", new_label) + else: + + new_label += ("(" if new_label == "" else "") + ("(" if new_label.endswith("?") else "") + c + # print("else", new_label) + new_label = "/" + new_label + ")?/" + return new_label + + +def most_common_label(toponym: str, lang: str): + """ + + + Parameters + ---------- + toponym : str + toponym + lang : str + toponym language + Returns + ------- + + """ + res = es.search("gazetteer", "place", + body={ "query": {"query_string": {"query": "\"{0}\"".format(toponym), "analyze_wildcard": False}}, + "from": 0, + "size": 50, + "sort": [{'score': "desc"}]}) + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def most_common_alias(toponym: str, lang: str): + """ + Return most common spatial entity by itsje + + Parameters + ---------- + toponym : str + toponym + lang : str + toponym language + Returns + ------- + + """ + res = es.search("gazetteer", "place", + body={"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {lang: toponym}}], "must_not": [], "should": []}}}) + + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def n_label_similar(toponym, lang, n=5, score=True): + body = { + "query": { + "query_string": { + "default_field": lang, + "query": parse_label2(toponym,lang) + } + }, + "from": 0, + "size": n + } + if score: + body["sort"] = [ + { + 'score': "desc" + } + ] + try: + res = es.search("gazetteer", "place", + body=body) + except: + return None + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None + return res + + +def n_alias_similar(toponym, lang, n=5, score=True): + body = {"query": {"nested": {"path": "aliases", + "query": + { + "query_string": { + "default_field": "aliases.{0}".format(lang), + "query": parse_label2(toponym,lang) + } + } + }}, + "from": 0, + "size": n} + if score: + body["sort"] = [ + { + 'score': "desc" + } + ] + try: + res = es.search("gazetteer", "place", + body=body) + except: + return None + + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + + +def get_most_common_id_v3(label, lang='fr'): + """ + Return the spatial entity and its score, based on a specific label and language that obtains the highest score. + The difference with the V2 is that it takes special cases: + * english placenames in a french text + * alias like China which designated also a spatial entity + :param label: + :param lang: + :return: + """ + id_, score = most_common_label(label, lang) + if id_: + # China case + id_2, score2 = most_common_alias(label, lang) + if id_2 and score2 > score: + id_, score = id_2, score2 + simi=n_label_similar(label, lang) + if isinstance(simi,pd.DataFrame): + id_3, score3 = simi.iloc[0].id,simi.iloc[0].score + if id_2 and score2 > score: + id_, score = id_3, score3 + return id_, score + + # if nothing found in english, search in aliases + id_, score = most_common_alias(label, lang) + if id_: + return id_, score + + similar_label=n_label_similar(label,lang) + if isinstance(similar_label,pd.DataFrame): + return similar_label.iloc[0].id, similar_label.iloc[0].score + + similar_alias = n_alias_similar(label, lang) + if isinstance(similar_alias,pd.DataFrame): + return similar_alias.iloc[0].id, similar_alias.iloc[0].score + + return None, -1 + + + + +def get_data(id): + """ + Return the data asssociated to an id in Geodict + :param id: + :return: + """ + res = es.search("gazetteer", "place", + body={"query": {"bool": {"must": [{"term": {"id": id}}], "must_not": [], "should": []}}, "from": 0, + "size": 10, "sort": [], "aggs": {}}) + if res["hits"]["total"] > 0: + res = res["hits"]["hits"][0]["_source"] + return objectify(res) + return None + + +def get_data_by_wikidata_id(id): + """ + Return the data asssociated to a wikidata id in Geodict + :param id: + :return: + """ + res = es.search("gazetteer", "place", + body={"query": {"bool": {"must": [{"term": {"wikidataID": id}}], "must_not": [], "should": []}}, + "from": 0, + "size": 10, "sort": [], "aggs": {}}) + if res["hits"]["total"] > 0: + res = res["hits"]["hits"][0]["_source"] + return objectify(res) + return None + + +def get_data_by_geonames_id(id): + """ + Return the data asssociated to a geonames id in Geodict + :param id: + :return: + """ + res = es.search("gazetteer", "place", + body={"query": {"bool": {"must": [{"term": {"geonameID": id}}], "must_not": [], "should": []}}, + "from": 0, + "size": 10, "sort": [], "aggs": {}}) + if res["hits"]["total"] > 0: + res = res["hits"]["hits"][0]["_source"] + return objectify(res) + return None + + +def get_by_label(label, lang): + """ + A Supprimer + :param label: + :param lang: + :return: + """ + query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "size": 50} + response = es.search('gazetteer', 'place', body=query) + if 'hits' in response['hits']: + return objectify(response['hits']['hits']) + return None + + +def get_by_alias(alias, lang): + """ + A supprimer + :param alias: + :param lang: + :return: + """ + query = { + "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}} + response = es.search('gazetteer', 'place', body=query) + if 'hits' in response['hits']: + return objectify(response['hits']['hits']) + return None + + +def label_exists(label, lang): + """ + Return True if a spatial entity exists with a specific label in a specific language. + :param label: str + :param lang: str + :return: bool + """ + query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} + response = es.count('gazetteer', 'place', body=query) + if response["count"] > 0: + return True + return False + + +def alias_exists(alias, lang): + """ + Return True if a spatial entity exists with a specific alias in a specific language. + :param alias: str + :param lang: str + :return: bool + """ + query = { + "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}} + response = es.count('gazetteer', 'place', body=query) + if response["count"] > 0: + return True + return False + + +def count_of_se(label, lang): + """ + Return the number of spatial entities associated with a specific label in a specific language. + :param label: str + :param lang: str + :return: int + """ + query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} + response = es.count('gazetteer', 'place', body=query) + return response["count"] + + +def get_top_candidate(label, lang, n=5): + """ + Return the 5-top candidates for a designated label in a specific language. + :param label: str + :param lang: str + :return: list + """ + if n<4: + n=4 + query={"size": n-3, "sort": [{"score": {"order": "desc"}}],"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} + query2={"size": 1, "sort": [{"score": {"order": "desc"}}], + "query": {"query_string": {"query": "\"{0}\"".format(label), "analyze_wildcard": False}}} + query3 = {"size": 1, "sort": [{"score": {"order": "desc", "unmapped_type": "boolean"}}],"query": {"bool": {"must": [{"term": {"en": "\"{0}\"".format(label)}}], "must_not": [], "should": []}}} + response = es.search('gazetteer', 'place', body=query) + res=[] + if 'hits' in response['hits']: + res=[x["_source"]["id"] for x in response['hits']['hits']] + res.extend([get_most_common_id_v3(label,lang)[0]]) + return res diff --git a/strpython/models/str.py b/strpython/models/str.py index 5cde4ffad66f6db27ba5c3496be1b085e8f9f7e7..3523dbb29f7484c015bc04a9fd21223743a60266 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -1,20 +1,22 @@ # coding = utf-8 import copy import logging +import os import time import warnings +import folium import geopandas as gpd import networkx as nx import pandas as pd from shapely.geometry import MultiPoint,Polygon,Point,LineString from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency -from ..helpers.geodict_helpers import get_data, get_data_by_wikidata_id +from ..helpers.geodict_helpers import gazetteer from ..eval.stats import most_common from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans, dbscan - +import numpy as np # logging.basicConfig(filename=config.log_file,level=logging.INFO) @@ -24,11 +26,11 @@ def get_inclusion_chain(id_, prop): For an entity return it geographical inclusion tree using a property. """ arr__ = [] - current_entity = get_data(id_) - if "inc_" + prop in current_entity: - arr__ = current_entity["inc_" + prop] - elif "inc_geoname" in current_entity: - arr__ = current_entity["inc_geoname"] + current_entity = gazetteer.get_by_id(id_)[0] + if "inc_" + prop in current_entity.other: + arr__ = current_entity.other["inc_" + prop] + elif "inc_geoname" in current_entity.other: + arr__ = current_entity.other.inc_geoname if isinstance(arr__, str): arr__ = [arr__] return arr__ @@ -59,9 +61,9 @@ class STR(object): try: sp_en[nod] = g.nodes[nod]["label"] except KeyError: # If no label found, grab one from the geo-database - data = get_data(nod) + data = gazetteer.get_by_id(nod) if data: - sp_en[nod] = data["en"] + sp_en[nod] = data[0].name str_ = STR(tagged_, sp_en) str_.set_graph(g) @@ -94,10 +96,11 @@ class STR(object): :param label: :return: """ - data_ = get_data(id) + data_ = gazetteer.get_by_id(id) if not data_: warnings.warn("{0} wasn't found in Geo-Database".format(id)) return False + data_=data_[0] if not label and v == True: warnings.warn("Label empty. @en label from Geo-Database will be used.") label = data_["en"] @@ -142,14 +145,15 @@ class STR(object): # Erase old spatial entities new_label = {} for old_se, new_se in transform_map.items(): - data = get_data(new_se) + data = gazetteer.get_by_id(new_se) if data: + data = data[0] final_transform_map[old_se] = new_se if not new_se in self.spatial_entities: - self.add_spatial_entity(new_se, data["en"]) + self.add_spatial_entity(new_se, data.label.en) del self.spatial_entities[old_se] - new_label[new_se] = data["en"] + new_label[new_se] = data.label.en else: warnings.warn("{0} doesn't exists in the geo database!".format(new_se)) self.graph = nx.relabel_nodes(self.graph, final_transform_map) @@ -235,12 +239,10 @@ class STR(object): def getP47AdjacencyData(self, data): p47se1 = [] - for el in data["P47"]: - d = get_data_by_wikidata_id(el) - if not d: - continue - if "id" in d: - p47se1.append(d["id"]) + for el in data.other.P47: + d = gazetteer.get_by_other_id(el,"wikidata") + if not d:continue + p47se1.append(d[0].id) return p47se1 def is_adjacent(self,se1,se2,datase1=None,datase2=None): @@ -252,8 +254,8 @@ class STR(object): elif self.is_included_in(se2, se1): return f - data_se1 = get_data(se1) if not datase1 else datase1 # Évite de recharger à chaque fois -_- - data_se2 = get_data(se2) if not datase2 else datase2 + data_se1 = gazetteer.get_by_id(se1)[0] if not datase1 else datase1 # Évite de recharger à chaque fois -_- + data_se2 = gazetteer.get_by_id(se2)[0] if not datase2 else datase2 # print("testP47") if "P47" in data_se2: @@ -271,9 +273,9 @@ class STR(object): return True if not f: if "coord" in data_se1 and "coord" in data_se2: - if Point(data_se1["coord"]["lon"], data_se1["coord"]["lat"]).distance( - Point(data_se2["coord"]["lon"], data_se2["coord"]["lat"])) < 1 and len( - set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1: + if Point(data_se1.coord.lon, data_se1.coord.lat).distance( + Point(data_se2.coord.lon, data_se2.coord.lat)) < 1 and len( + set(data_se1.class_) & stop_class) < 1 and len(set(data_se2.class_) & stop_class) < 1: return True return f @@ -282,7 +284,7 @@ class STR(object): Return all the adjacency relationships between all the spatial entities in the STR. :return: """ - data={se:get_data(se)for se in self.spatial_entities} + data={se:gazetteer.get_by_id(se)[0] for se in self.spatial_entities} for se1 in self.spatial_entities: data_se1 = data[se1] for se2 in self.spatial_entities: @@ -373,21 +375,27 @@ class STR(object): def get_geo_data_of_se(self): points,label,class_ = [], [], [] for se in self.spatial_entities: - data = get_data(se) + data = gazetteer.get_by_id(se)[0] try: - points.append(Point(data["coord"]["lon"], data["coord"]["lat"])) - label.append(data["en"]) - class_.append(most_common(data["class"])) - except: - class_.append("P-PPL") - df=gpd.GeoDataFrame({"geometry":points,"label":label,"classe":class_}) + points.append(Point(data.coord.lon, data.coord.lat)) + label.append(data.name) + # class_.append(most_common(data["class"])) + except KeyError: + pass + # print(len(points),len(label),len(class_)) + df=gpd.GeoDataFrame({"geometry":points,"label":label}) df["x"]=df.geometry.apply(lambda p: p.x) df["y"] = df.geometry.apply(lambda p: p.y) return df - def get_cluster(self): + def get_cluster(self,id_=None): + if os.path.exists("./temp_cluster/{0}.geojson".format(id_)): + return gpd.read_file("./temp_cluster/{0}.geojson".format(id_)) + data=self.get_geo_data_of_se() X=data[["x", "y"]].values + if len(X) ==0: # if zero samples return Empty GeoDataFrame + return gpd.GeoDataFrame() try: bandwidth = estimate_bandwidth(X) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) @@ -415,40 +423,91 @@ class STR(object): geo = data.groupby("cluster").apply(to_Polygon) cluster_polybuff = gpd.GeoDataFrame(geometry=geo) + if id_: + cluster_polybuff.to_file("./temp_cluster/{0}.geojson".format(id_)) return cluster_polybuff + def to_folium(self): + points = [] + for se in self.spatial_entities: + data = gazetteer.get_by_id(se)[0] + try: + points.append(Point(data.coord.lon, data.coord.lat)) + except: + pass + + lines_adj = [] + for se1 in self.adjacency_relationships: + data_se1 = gazetteer.get_by_id(se1)[0] + for se2 in self.adjacency_relationships[se1]: + data_se2 = gazetteer.get_by_id(se2)[0] + if self.adjacency_relationships[se1][se2]: + lines_adj.append( + LineString([(data_se1.coord.lon, data_se1.coord.lat), + (data_se2.coord.lon, data_se2.coord.lat)]) + ) + lines_inc = [] + for se1 in self.inclusion_relationships: + data_se1 = data_se1=gazetteer.get_by_id(se1)[0] + for se2 in self.inclusion_relationships[se1]: + if self.inclusion_relationships[se1][se2]: + data_se2 = data_se1=gazetteer.get_by_id(se2)[0] + lines_inc.append( + LineString([ + (data_se1.coord.lon, data_se1.coord.lat), + (data_se2.coord.lon, data_se2.coord.lat)] + ) + ) + + def to_fol(seris,color="#ff0000"): + df=gpd.GeoDataFrame(geometry=seris.values) + df.crs={'init' :'epsg:4326'} + return folium.features.GeoJson(df.to_json(),style_function=lambda x: {'color':color}) + + gjson1 = to_fol(gpd.GeoSeries(points)) + gjson2 = to_fol(gpd.GeoSeries(lines_adj),color='#00ff00') + gjson3 = to_fol(gpd.GeoSeries(lines_inc)) + + map=folium.Map() + map.add_child(gjson1) + map.add_child(gjson2) + map.add_child(gjson3) + + return map + + def map_projection(self,plt=False): import matplotlib.pyplot as plt world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) base = world.plot(color='white', edgecolor='black', figsize=(16, 9)) points=[] for se in self.spatial_entities: - data=get_data(se) + data=gazetteer.get_by_id(se)[0] try: - points.append(Point(data["coord"]["lon"],data["coord"]["lat"])) + points.append(Point(data.coord.lon,data.coord.lat)) except: pass lines_adj=[] for se1 in self.adjacency_relationships: - data_se1=get_data(se1) + data_se1=gazetteer.get_by_id(se1)[0] for se2 in self.adjacency_relationships[se1]: - data_se2 = get_data(se2) + data_se2 = gazetteer.get_by_id(se2)[0] if self.adjacency_relationships[se1][se2]: lines_adj.append( - LineString([(data_se1["coord"]["lon"],data_se1["coord"]["lat"]),(data_se2["coord"]["lon"], data_se2["coord"]["lat"])]) + LineString([(data_se1.coord.lon,data_se1.coord.lat),(data_se2.coord.lon, data_se2.coord.lat)]) ) lines_inc=[] for se1 in self.inclusion_relationships: - data_se1=get_data(se1) + data_se1 = gazetteer.get_by_id(se1)[0] for se2 in self.inclusion_relationships[se1]: if self.inclusion_relationships[se1][se2]: - data_se2 = get_data(se2) + data_se2 = gazetteer.get_by_id(se2)[0] lines_inc.append( LineString([ - (data_se1["coord"]["lon"], data_se1["coord"]["lat"]), - (data_se2["coord"]["lon"], data_se2["coord"]["lat"])] + (data_se1.coord.lon, data_se1.coord.lat), + (data_se2.coord.lon, data_se2.coord.lat)] ) ) diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py index f97e123873503ea82df1398ca11d0e0f73c8429a..294c75e1ee274cc1193606ef7ca2a8931604c2e1 100644 --- a/strpython/models/transformation/transform.py +++ b/strpython/models/transformation/transform.py @@ -7,7 +7,7 @@ import numpy as np from elasticsearch import Elasticsearch from ...config.configuration import config -from ...helpers.geodict_helpers import get_data +from ...helpers.geodict_helpers import gazetteer from ..str import STR, get_inclusion_chain client = Elasticsearch(config.es_server) @@ -72,14 +72,16 @@ class Generalisation(Transformation): for node in graph.nodes(): if not node in inclusion_dictionnary: inc_list = [] - data = get_data(node) + data = gazetteer.get_by_id(node) + if not data:continue + data=data[0] try: - inc_list = data["inc_P131"] + inc_list = data[0].other.inc_P131 except: pass if not inc_list: - if "inc_geoname" in data: - inc_list = data["inc_geoname"] + if "inc_geoname" in data.other: + inc_list = data.other.inc_geoname if inc_list: inc_list = inc_list if isinstance(inc_list, list) else [inc_list] @@ -94,7 +96,7 @@ class Generalisation(Transformation): for it in list_: if not it in associated_classes: try: - classes_list = get_data(it)["class"] + classes_list = gazetteer.get_by_id(it)[0].class_ except: print("No class found for {0}".format(it)) continue @@ -142,10 +144,10 @@ class Generalisation(Transformation): inc_chain = inclusion_dict[node] if len(inc_chain) < h: transform_map[node] = inc_chain[-1] # if h superior to chain lenght - new_label[inc_chain[-1]] = get_data(inc_chain[-1])["en"] + new_label[inc_chain[-1]] = gazetteer.get_by_id(inc_chain[-1])[0].label.en else: transform_map[node] = inc_chain[h - 1] - new_label[inc_chain[h - 1]] = get_data(inc_chain[h - 1])["en"] + new_label[inc_chain[h - 1]] = gazetteer.get_by_id(inc_chain[h - 1])[0].label.en if cp: copy_ = copy.deepcopy(str_) copy_.transform_spatial_entities(transform_map) @@ -196,9 +198,9 @@ class Expansion(Transformation): es = np.array(list(graph.nodes)) score = [-1 for i in range(len(es))] for e in range(len(es)): - data = get_data(es[e]) - if "score" in data: - score[e] = float(data["score"]) + data = gazetteer.get_by_id(es[e]) + if data: + score[e] = float(data[0].score) return np.median(score), es[score < np.median(score)] def transform(self, str_: STR, **kwargs): @@ -215,18 +217,17 @@ class Expansion(Transformation): median, selected_se = self.select_es(graph) data_se, scores_ = {}, [] for node in selected_se: - data_se[node] = get_data(node) - if "score" in data_se[node]: - scores_.append(float(data_se[node]["score"])) - else: - scores_.append(-1) + d_=gazetteer.get_by_id(node) + if d_: data_se[node] = d_[0] + scores_.append(d_.score) + new_nodes = [] labels = [] for node in selected_se: data_ = data_se[node] - if (not "P-PPL" in data_["class"]) and (not "A-ADM4" in data_["class"]): + if (not "P-PPL" in data_.class_) and (not "A-ADM4" in data_.class_): continue - if not "country" in data_: + if not "country" in data_.other: continue neighbor = self.getAroundEntities(data_, median, distance, unit, n) if not neighbor: @@ -234,7 +235,7 @@ class Expansion(Transformation): neighbor = [get_inclusion_chain(node, "P131")[0]] except: neighbor = [] - labels.extend([get_data(n)["en"] for n in neighbor]) + labels.extend([gazetteer.get_by_id(n)[0].label.en for n in neighbor]) new_nodes.extend(neighbor) new_nodes = list(set(new_nodes)) diff --git a/strpython/nlp/disambiguator/geodict_gaurav.py b/strpython/nlp/disambiguator/geodict_gaurav.py index bbfb37b8b7b4229a94a26caa5036c7df94503a45..3d59912885e7aee5a63b09d0b6805cce9da97d37 100644 --- a/strpython/nlp/disambiguator/geodict_gaurav.py +++ b/strpython/nlp/disambiguator/geodict_gaurav.py @@ -2,6 +2,7 @@ import math from ...helpers.collision import * +#from ...helpers.geodict_helpers_old import * from ...helpers.geodict_helpers import * from .disambiguator import Disambiguator @@ -29,13 +30,13 @@ class GauravGeodict(Disambiguator): For an entity return it geographical inclusion tree using a property. """ arr = [] - current_entity = get_data(id_) + current_entity = gazetteer.get_by_id(id_)[0] while True: if prop in current_entity: arr.append(current_entity[prop][0]) - current_entity = get_data_by_wikidata_id(current_entity[prop][0]) + current_entity = gazetteer.get_by_other_id(current_entity[prop][0],"wikidata") else: - arr.append('Q2') # Earth ID + arr.append(gazetteer.get_by_label("Earth","en")[0].id) # Earth ID break return arr @@ -50,9 +51,9 @@ class GauravGeodict(Disambiguator): return self.inclusion_log(interP131) + self.inclusion_log(interP706) def Adjacency_P47(self, id1, id2): - data_1, data_2 = get_data(id1), get_data(id2) + data_1, data_2 = gazetteer.get_by_id(id1)[0], gazetteer.get_by_id(id2)[0] if "P47" in data_1 and "P47" in data_2: - if id1 in data_2["P47"] or id2 in data_1["P47"]: + if id1 in data_2.other.P47 or id2 in data_1.other.P47: return True return False @@ -63,10 +64,10 @@ class GauravGeodict(Disambiguator): score_dc = {} for cand in spat_candidates: - id_cand = cand["id"] + id_cand = cand.id score_dc[id_cand] = 0 for fixed in fixed_entities: - id_fixed = fixed_entities[fixed]["id"] + id_fixed = fixed_entities[fixed].id if self.Adjacency_P47(id_cand, id_fixed): score_dc[id_cand] += 3 elif self.Adjacency_Hull(id_cand, id_fixed): @@ -76,63 +77,34 @@ class GauravGeodict(Disambiguator): if score_dc[m] < 4: return None for cand in spat_candidates: - if cand["id"] == m: - return cand["id"] + if cand.id == m: + return cand.id - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - selected_en = {} - - fixed_entities = {} - ambiguous_entities = {} - for en in se_: - request = get_top_candidate(en, lang) - if len(request) == 0: - request = n_label_similar(en, lang) - - if len(request) > 1: - ambiguous_entities[en] = [r["_source"] for r in request] - elif len(request) == 1: - fixed_entities[en] = request[0]["_source"] - - d_amb_results = {} - for amb_ent in ambiguous_entities: - d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) - if not d: - d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang)[0] - else: - d_amb_results[amb_ent] = d - for k, v in fixed_entities.items(): - fixed_entities[k] = v["id"] - for k, v in d_amb_results.items(): - fixed_entities[k] = v - - return count, fixed_entities def eval(self,se_,lang): selected_en = {} - fixed_entities = {} ambiguous_entities = {} for en in se_: - request = get_by_label(en, lang) + request = gazetteer.get_by_label(en, lang) if len(request) == 0: - request = get_by_alias(en, lang) + request = gazetteer.get_by_alias(en, lang) if len(request) > 1: - ambiguous_entities[en] = [r["_source"] for r in request] + ambiguous_entities[en] = request elif len(request) == 1: - fixed_entities[en] = request[0]["_source"] + fixed_entities[en] = request[0] d_amb_results = {} for amb_ent in ambiguous_entities: d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) if not d: - d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang)[0] + d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang).id else: d_amb_results[amb_ent] = d + #print(fixed_entities) for k, v in fixed_entities.items(): - fixed_entities[k] = v["id"] + fixed_entities[k] = v.id for k, v in d_amb_results.items(): fixed_entities[k] = v diff --git a/strpython/nlp/disambiguator/models/bigram.py b/strpython/nlp/disambiguator/models/bigram.py index 9441041fc75f5377532d2339c5544d7350c0cf8d..ec146b47694f05f59d20429e208102dcb0cad7dc 100644 --- a/strpython/nlp/disambiguator/models/bigram.py +++ b/strpython/nlp/disambiguator/models/bigram.py @@ -1,5 +1,4 @@ # coding = utf-8 -from strpython.helpers.geodict_helpers import get_data class BigramModel: diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py index 12e448912f613c3b12b44c5ee08df0da5b4532ae..178468c4fedb12074e1ae7cd085629958ae89467 100644 --- a/strpython/nlp/disambiguator/most_common.py +++ b/strpython/nlp/disambiguator/most_common.py @@ -1,7 +1,8 @@ # coding = utf-8 -from ...helpers.geodict_helpers import * + +from ...helpers.geodict_helpers import * from .disambiguator import Disambiguator import re, json, os from ...config.configuration import config @@ -35,7 +36,7 @@ class MostCommonDisambiguator(Disambiguator): selected_en = {} for en in se_: id_,score=self.disambiguate_(en,lang) - if not id_ =="O" and id_: + if not id_ == "O" and id_: selected_en[id_] = en new_count[id_] = count[en] @@ -55,15 +56,8 @@ class MostCommonDisambiguator(Disambiguator): if plural.lower() in stop_words[lang]:# or plural.lower() in common_words[lang]: return 'O', -1 - id_, score = most_common_label(label, lang) - if id_: - id_en, score_en = get_most_common_id_v3(label, "en") - if id_en and score_en: - if score_en > score: - id_, score = id_en, score_en - id_alias, score_alias = most_common_alias(label, lang) - if id_alias and score_alias: - if score_alias > score: - id_, score = id_alias, score_alias - #print(label,id_,score) + data=get_most_common_id_v3(label, lang) + id_, score=None,0 + if data: + id_,score=data.id,data.score return id_, score diff --git a/strpython/nlp/disambiguator/pagerank.py b/strpython/nlp/disambiguator/pagerank.py deleted file mode 100644 index 25eb02eb7edb9cdc37cf918a70f8690339f7f19f..0000000000000000000000000000000000000000 --- a/strpython/nlp/disambiguator/pagerank.py +++ /dev/null @@ -1,29 +0,0 @@ -# coding = utf-8 - -from ...helpers.geodict_helpers import * -from .disambiguator import Disambiguator - - -class PageRankDisambiguator(Disambiguator): - def __init__(self): - Disambiguator.__init__(self) - - def disambiguate(self, ner_result, lang="en"): - count, se_ = self.extract_se_entities(ner_result) - new_count = {} - selected_en = {} - for en in se_: - en_most_common, score_en = get_most_common_id_v3(en, "en") - if label_exists(en, lang): - id_, score = get_most_common_id_v3(en, lang) - elif alias_exists(en, lang): - id_, score = (en, lang) - - if en_most_common and score_en > score: - selected_en[en_most_common] = en - new_count[en_most_common] = count[en] - else: - selected_en[en_most_common] = en - new_count[en_most_common] = count[en] - - return new_count, selected_en diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py index 9d605d3af58ca54e40af2d4b71818600659d02cc..c550904d3568e949c1629ab66895f9c9fcd677fd 100644 --- a/strpython/nlp/disambiguator/wikipedia_cooc.py +++ b/strpython/nlp/disambiguator/wikipedia_cooc.py @@ -5,6 +5,7 @@ from .disambiguator import Disambiguator from .models.bigram import BigramModel import pickle from ...config.configuration import config +#from ...helpers.geodict_helpers_old import * from ...helpers.geodict_helpers import * from .most_common import stop_words,common_words import networkx as nx @@ -38,11 +39,11 @@ class WikipediaDisambiguator(Disambiguator): for e in entities: if re.match("^\d+$", e): continue - if e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: + if lang in stop_words and e.lower().rstrip("s") in stop_words[lang]:# or e.lower().rstrip("s") in common_words[lang]: continue plural = e.rstrip("s") + "s" - if plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: + if lang in stop_words and plural.lower() in stop_words[lang]: #or plural.lower() in common_words[lang]: continue spat_en.append(e) spat_en=list(set(spat_en)) @@ -54,15 +55,9 @@ class WikipediaDisambiguator(Disambiguator): for e in spat_en: cand = get_top_candidate(e, lang, 5)#get_top_candidate(e, lang,4) - if cand[0] == None: - cand=[] + cand = [c.id for c in cand if c] if not cand: - cand=n_label_similar(e,lang,5) - if isinstance(cand,pd.DataFrame): - cand = cand["id"].values - else: - cand=[] - + cand = [c.id for c in gazetteer.get_n_label_similar(e,lang,5) if c] group_candidate[e] = cand betw_cand[e]=cand for n in cand: @@ -70,17 +65,16 @@ class WikipediaDisambiguator(Disambiguator): possible_candidates.extend(cand) for cand in possible_candidates: - g.add_node(cand, label=get_data(cand)[lang]) + g.add_node(cand, label=gazetteer.get_by_id(cand)[0].label[lang]) - data_candidate={ca :get_data(ca) for ca in possible_candidates} + data_candidate={ca :gazetteer.get_by_id(ca)[0] for ca in possible_candidates} for cand in possible_candidates: for cand2 in possible_candidates: # Get PageRank score d = data_candidate[cand] sc = 1 - if "score" in d: - sc = float(d["score"]) + sc=d.score # Compute probability prob = self.model.get_coocurence_probability(sc, cand, cand2) @@ -108,7 +102,7 @@ class WikipediaDisambiguator(Disambiguator): else:# degree by default selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight')) #print(1) - except: - selected[gr]=get_most_common_id_v3(gr,lang)[0] + except Exception as e: + selected[gr]=get_most_common_id_v3(gr,lang) return selected diff --git a/strpython/nlp/ner/by_dict.py b/strpython/nlp/ner/by_dict.py index ec7ed54fc0c51d685f69f1b92ebe5027584ec0b0..a5ee4af28a07b424f2ef4250727d0fc479d16f2a 100644 --- a/strpython/nlp/ner/by_dict.py +++ b/strpython/nlp/ner/by_dict.py @@ -3,7 +3,7 @@ import numpy as np from polyglot.text import Text as PolyText from .ner import NER -from ...helpers import geodict_helpers +from ...helpers.geodict_helpers import gazetteer,get_most_common_id_v3 class ByDict(NER): @@ -37,9 +37,9 @@ class ByDict(NER): cur = f.tolist() for t in terms: - GID = geodict_helpers.get_most_common_id_v3(" ".join(pos_t[:, 0][t]), lang=self._lang)[0] + GID = get_most_common_id_v3(" ".join(pos_t[:, 0][t]), lang=self._lang)[0] if GID: - data = geodict_helpers.get_data(GID) + data = gazetteer.get_by_id(GID)[0] if "score" in data: if not float(data["score"]) > self.threshold: continue diff --git a/strpython/pipeline.py b/strpython/pipeline.py index c2d5feb71956894c83d87f2f5cce666e62e6d836..7d8e9b281e0f90022ef05a19f8924cdc3571b571 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -4,7 +4,6 @@ from strpython.models.str import STR from .models.transformation.transform import Generalisation, Expansion from .nlp.disambiguator.disambiguator import Disambiguator from .nlp.disambiguator.most_common import MostCommonDisambiguator -from .nlp.disambiguator.pagerank import PageRankDisambiguator from .nlp.exception.disambiguator import NotADisambiguatorInstance from .nlp.exception.ner import NotANERInstance from .nlp.exception.tagger import NotATaggerInstance diff --git a/synthesize_result.py b/synthesize_result.py index 1f871273586602060cd4ba93851c50ed3a88fd48..a0d58d6e8ccc11752816ae5e39d57ea0230bccf2 100644 --- a/synthesize_result.py +++ b/synthesize_result.py @@ -1,9 +1,18 @@ # coding = utf-8 +import os + import pandas as pd import numpy as np import glob,argparse -fns=glob.glob("data/agromada_annotation_data_final/*") + +parser =argparse.ArgumentParser() +parser.add_argument("annotated_dir") +parser.add_argument("output") +args=parser.parse_args() + + +fns=glob.glob(os.path.join(args.annotated_dir,"*")) data=[] for fn in fns: @@ -17,4 +26,4 @@ for fn in fns: data pd.DataFrame(data,columns="mesure type c1 c2 c3 c4".split()) res=pd.DataFrame(data,columns="mesure type c1 c2 c3 c4".split()) -res.to_csv('result_mada.csv') \ No newline at end of file +res.to_csv(args.output) \ No newline at end of file