diff --git a/.gitignore b/.gitignore index 572be86ffe526155cc18737764a5f524cc854654..46d7054aaed84961ad64817eacf37927ec6fe18f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ __pycache__/ /UNHCR_en.txt !/tests/ .DS_Store +.Rproj.user diff --git a/data/graph_exp_july_19/selected.json b/data/graph_exp_july_19/selected.json new file mode 100644 index 0000000000000000000000000000000000000000..dc133d29641c276240579a7d46b016cca3ec5aee --- /dev/null +++ b/data/graph_exp_july_19/selected.json @@ -0,0 +1 @@ +[254, 566, 594, 642, 877, 887, 988, 1072, 1210, 1308, 1315, 1335, 1356, 1416, 1548, 1571, 1587, 1683, 1685, 1958, 1960, 1961, 1968, 2034, 2047, 2182, 2194, 2345, 2422, 2528, 2560, 2734, 3306, 3606, 3682, 3718, 3864, 4092, 4119, 4392, 4432, 4789, 5020, 5244, 5704, 5847, 5967, 6031, 6265, 6815, 6922, 7261, 7285, 7303, 7394, 7441, 7498, 7546, 7564, 7570, 7573, 7772, 7776, 9138, 12078, 12216, 12270, 12462, 12813, 12871, 12918, 13009, 13068, 13223, 13408, 13695, 13708, 13727, 13771, 13937, 14179, 14218, 14250, 14295, 14346, 14417, 14507, 14615, 14731, 14748, 14899, 14938, 15008, 15154, 15224, 15236, 15435, 15534, 15628, 15633] \ No newline at end of file diff --git a/exp_22_may.sh b/exp_22_may.sh index 1ef7e4447cf659950fa98bcfce3a103ec2d3f4ab..e583b5ac38ce6a28bd2c062fa9e652a4b2391538 100755 --- a/exp_22_may.sh +++ b/exp_22_may.sh @@ -2,22 +2,22 @@ path_csv=/Users/jacquesfize/LOD_DATASETS/disambiguation path_texts=/Users/jacquesfize/LOD_DATASETS/raw_bvlac/ -output_dir=data/graph_exp_may_25 +output_dir=data/graph_exp_july_19 if [ "$1" == "generate" ]; then - mkdir -p $output_dir/normal - #python3 generate_data_csv.py $path_csv $output_dir/normal asso.json normal; + #mkdir -p $output_dir/normal + #python3 -W ignore generate_data_csv.py $path_csv $output_dir/normal asso.json normal; python3 generate_transform.py $output_dir/normal $output_dir/extension_1 extension -a 1; python3 generate_transform.py $output_dir/normal $output_dir/extension_2 extension -a 2; #python3 generate_transform.py $output_dir/normal $output_dir/extension_3 extension -a 3; - #python3 generate_transform.py $output_dir/normal $output_dir/gen_all_1 generalisation -t all -n 1; + python3 generate_transform.py $output_dir/normal $output_dir/gen_all_1 generalisation -t all -n 1; #python3 generate_transform.py $output_dir/normal $output_dir/gen_all_2 generalisation -t all -n 2; python3 generate_transform.py $output_dir/normal $output_dir/gen_region generalisation -t bounded -b region; - #python3 generate_transform.py $output_dir/normal $output_dir/gen_capital generalisation -t bounded -b capital; + python3 generate_transform.py $output_dir/normal $output_dir/gen_capital generalisation -t bounded -b capital; python3 generate_transform.py $output_dir/normal $output_dir/gen_country generalisation -t bounded -b country; fi diff --git a/generate_data_csv.py b/generate_data_csv.py index 3a1ea4f61b3b2c8e3df639bf82f4537879bb7f62..28127de14def73f900ccfbff5f42d1c33900a199 100644 --- a/generate_data_csv.py +++ b/generate_data_csv.py @@ -6,6 +6,7 @@ import argparse,glob, string,time,re from progressbar import ProgressBar, Timer, Bar, ETA, Counter from strpython.helpers.boundary import get_all_shapes +from strpython.models.str import STR from strpython.nlp.disambiguator.geodict_gaurav import * from strpython.pipeline import * import pandas as pd @@ -70,7 +71,7 @@ start = time.time() associated_es={} count_per_doc={} i=0 -logging.info("Get associated spatial entities and ") +#logging.info("Get associated spatial entities and ") with 
ProgressBar(max_value=len(files_glob),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg: for fn in files_glob: @@ -85,14 +86,14 @@ with ProgressBar(max_value=len(files_glob),widgets=[' [', Timer(), '] ',Bar(),'( associated_es[id_]={} pg.update(i) i+=1 -logging.info("Fetch list of spatial entities available !") +#logging.info("Fetch list of spatial entities available !") all_es=set([]) for k,v in associated_es.items(): for k2 in v: all_es.add(k2) -logging.info("Get All Shapes from Database for all ES") -all_shapes=get_all_shapes(list(all_es)) +#logging.info("Get All Shapes from Database for all ES") +#all_shapes=get_all_shapes(list(all_es)) i=0 def foo_(x): diff --git a/generate_similarity_matrix.py b/generate_similarity_matrix.py index a54738062a25f24d8e0fff9d2898543f9e11e4b7..4c35fdfc6ae2564c00fb4dc366cf9e0f05872afe 100644 --- a/generate_similarity_matrix.py +++ b/generate_similarity_matrix.py @@ -1,7 +1,7 @@ # coding = utf-8 import glob - -from gmatch4py.bag_of_cliques import BagOfCliques +# from gmatch4py.bag_of_cliques import BagOfCliques +from gmatch4py.helpers.reader import import_dir from gmatch4py.base import Base from gmatch4py.ged.graph_edit_dist import GraphEditDistance from gmatch4py.ged.bipartite_graph_matching_2 import BP_2 @@ -11,13 +11,20 @@ from gmatch4py.jaccard import Jaccard from gmatch4py.kernels.weisfeiler_lehman import * from gmatch4py.mcs import MCS from gmatch4py.vertex_edge_overlap import VertexEdgeOverlap +import argparse, os, sys, re, json, logging +import datetime -import argparse, os, sys, re, json +logging.basicConfig( + filename="{0}.csv".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")), + format="%(message)s,%(asctime)s", + level=logging.DEBUG +) parser = argparse.ArgumentParser() parser.add_argument("graphs_input_dir") parser.add_argument("matrix_output_dir") parser.add_argument("-d", action="store_true", help="Return distance matrix") +parser.add_argument("-s", action="store_true", help="Selected graph ?") args = parser.parse_args() if not os.path.exists(args.graphs_input_dir): @@ -30,24 +37,17 @@ if not os.path.exists(args.matrix_output_dir): os.makedirs(args.matrix_output_dir) print("Directory created") -graphs = [] -mapping_files_to_graphs = {} - -# Loading graphs -fns = glob.glob(args.graphs_input_dir.rstrip("/") + "/*.gexf") -if not fns: - print("Input dir empty! 
Not .gexf file found!") - -i = 0 -for fn in fns: - graphs.append(nx.read_gexf(fn)) - mapping_files_to_graphs[i] = fn - -#print(graphs) - +logging.info(msg="L_G,BEGIN,\"\"") +graphs = import_dir(args.graphs_input_dir) +logging.info(msg="L_G,DONE,\"\"") +# print(graphs) +selected = None +if args.s: + selected = json.load(open("selected.json")) # Compute matrices -for class_ in [BagOfCliques, GraphEditDistance, BP_2, GreedyEditDistance, HED, Jaccard, WeisfeleirLehmanKernel, MCS, +for class_ in [GraphEditDistance, BP_2, GreedyEditDistance, HED, Jaccard, MCS, VertexEdgeOverlap]: + logging.info(msg="C_S,BEG,\"{0}\"".format(class_.__name__)) print("Computing the Similarity Matrix for {0}".format(class_.__name__)) if class_ in (GraphEditDistance, BP_2, GreedyEditDistance, HED): @@ -55,20 +55,22 @@ for class_ in [BagOfCliques, GraphEditDistance, BP_2, GreedyEditDistance, HED, J elif class_ == WeisfeleirLehmanKernel: comparator = class_(h=2) else: - comparator=class_() - matrix = comparator.compare(graphs, None) + comparator = class_() + matrix = comparator.compare(graphs, selected) if not args.d: matrix = comparator.similarity(matrix) else: - matrix= comparator.distance(matrix) - print("Matrix ready. Saving ...") - output_fn="{0}/{1}_{2}.npy".format( + matrix = comparator.distance(matrix) + logging.info(msg="C_S,DONE,\"{0}\"".format(class_.__name__)) + output_fn = "{0}/{1}_{2}.npy".format( args.matrix_output_dir.rstrip("/"), class_.__name__, - os.path.dirname(args.graphs_input_dir).replace("/","_") + os.path.dirname(args.graphs_input_dir).replace("/", "_") ) - np.save(output_fn,matrix) + logging.info(msg="M_S,BEG,\"{0}\"".format(class_.__name__)) + np.save(output_fn, matrix) + logging.info(msg="M_S,DONE,\"{0}\"".format(class_.__name__)) print("Matrix Saved") -json.dump(mapping_files_to_graphs,open("{0}/{1}".format(args.matrix_output_dir.rstrip("/"),"metadata.json"))) -print("Done") \ No newline at end of file +# json.dump(mapping_files_to_graphs,open("{0}/{1}".format(args.matrix_output_dir.rstrip("/"),"metadata.json"))) +print("Done") diff --git a/generate_transform.py b/generate_transform.py index bcf949b5cb521416701ed13a4c89ee8eac0d2998..0e8cd2d799dbff51f00f2e52cef6885823e04593 100644 --- a/generate_transform.py +++ b/generate_transform.py @@ -7,9 +7,11 @@ import logging import time from concurrent.futures import ThreadPoolExecutor +import networkx as nx from progressbar import ProgressBar, Timer, Bar, ETA, Counter from strpython.helpers.boundary import get_all_shapes +from strpython.models.str import STR from strpython.nlp.disambiguator.geodict_gaurav import * from strpython.pipeline import * @@ -90,11 +92,7 @@ for k,v in associated_es.items(): for k2 in v: all_es.add(k2) -logging.info("Get All Shapes from Database for all ES") -all_shapes=get_all_shapes(list(all_es)) -for id_ in graphs_: - graphs_[id].set_all_shapes(all_shapes) def workSTR(id_doc,g,list_gs,pg,argu): global i @@ -103,6 +101,7 @@ def workSTR(id_doc,g,list_gs,pg,argu): # Save Metadata # Save Graph structure + print("savegraph") nx.write_gexf(list_gs[-1], argu.graphs_output_dir + "/{0}.gexf".format(id_doc)) i+=1 pg.update(i) diff --git a/notebooks/EvalDesambiguisationMada.ipynb b/notebooks/EvalDesambiguisationMada.ipynb index 9045fca33ba116b5dddf9bf768cabee270bcc841..3d58d2ac2442e408c4fb916f4cae1f1451ddd073 100644 --- a/notebooks/EvalDesambiguisationMada.ipynb +++ b/notebooks/EvalDesambiguisationMada.ipynb @@ -5,13 +5,14 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:09:12.991345Z", - 
"start_time": "2018-06-19T13:09:12.578369Z" + "end_time": "2018-08-24T14:18:40.551515Z", + "start_time": "2018-08-24T14:18:40.137529Z" } }, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "%load_ext autoreload" ] }, { @@ -19,8 +20,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:09:13.002216Z", - "start_time": "2018-06-19T13:09:12.998336Z" + "end_time": "2018-08-24T14:18:40.558929Z", + "start_time": "2018-08-24T14:18:40.553463Z" } }, "outputs": [ @@ -41,8 +42,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:09:14.674713Z", - "start_time": "2018-06-19T13:09:14.668234Z" + "end_time": "2018-08-24T14:18:40.565725Z", + "start_time": "2018-08-24T14:18:40.560729Z" } }, "outputs": [], @@ -57,8 +58,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:09:14.912185Z", - "start_time": "2018-06-19T13:09:14.895298Z" + "end_time": "2018-08-24T14:18:40.582053Z", + "start_time": "2018-08-24T14:18:40.567425Z" } }, "outputs": [], @@ -73,24 +74,16 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:09:20.638699Z", - "start_time": "2018-06-19T13:09:17.343687Z" + "end_time": "2018-08-24T14:18:43.957963Z", + "start_time": "2018-08-24T14:18:40.585425Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:root:Line magic function `%autoreload` not found.\n" - ] - } - ], + "outputs": [], "source": [ "%autoreload\n", - "from nlp.disambiguator.geodict_gaurav import GauravGeodict\n", - "from nlp.disambiguator.most_common import MostCommonDisambiguator\n", - "from nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n", + "from strpython.nlp.disambiguator.geodict_gaurav import GauravGeodict\n", + "from strpython.nlp.disambiguator.most_common import MostCommonDisambiguator\n", + "from strpython.nlp.disambiguator.wikipedia_cooc import WikipediaDisambiguator\n", "disMost_common=MostCommonDisambiguator()\n", "disGaurav=GauravGeodict()\n", "disWiki=WikipediaDisambiguator()" @@ -98,11 +91,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:10:29.965681Z", - "start_time": "2018-06-19T13:10:29.952223Z" + "end_time": "2018-08-24T14:18:44.015575Z", + "start_time": "2018-08-24T14:18:43.960053Z" } }, "outputs": [], @@ -117,25 +110,34 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2018-06-03T19:13:08.776780Z", - "start_time": "2018-06-03T19:13:08.752046Z" + "end_time": "2018-08-24T14:18:44.023135Z", + "start_time": "2018-08-24T14:18:44.017778Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-06-03T19:13:13.030925Z", - "start_time": "2018-06-03T19:13:13.028591Z" + "end_time": "2018-08-24T14:18:44.027539Z", + "start_time": "2018-08-24T14:18:44.024973Z" } }, "outputs": [], @@ -145,11 +147,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2018-06-03T19:13:13.238647Z", - "start_time": "2018-06-03T19:13:13.212601Z" + "end_time": "2018-08-24T14:18:44.061164Z", + "start_time": "2018-08-24T14:18:44.029278Z" } }, "outputs": [], @@ -171,11 +173,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2018-06-03T19:43:28.769834Z", - "start_time": "2018-06-03T19:15:06.598715Z" + "end_time": "2018-08-24T14:42:35.179291Z", + "start_time": "2018-08-24T14:18:44.063336Z" } }, "outputs": [ @@ -186,6 +188,20 @@ "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:12: RuntimeWarning: invalid value encountered in long_scalars\n", " if sys.path[0] == '':\n" ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-10-f81592812190>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0macc_wiki\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccuracyWiki\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdata_lang\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;31m#acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m#acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m<ipython-input-9-7d392d282df9>\u001b[0m in \u001b[0;36maccuracyWiki\u001b[0;34m(df, lang)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0maccuracyWiki\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m 
\u001b[0mdf2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"GID\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"O\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"NR\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"o\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"text\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"GID\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mres_dis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdisWiki\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdisambiguate_wiki\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"text\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"disambiguation\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mres_dis\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres_dis\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGID\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdisambiguation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/nas_cloud/Code/str-python/strpython/nlp/disambiguator/wikipedia_cooc.py\u001b[0m in \u001b[0;36mdisambiguate_wiki\u001b[0;34m(self, entities, lang)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcand\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcand2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;31m# take the lowest co-occurrency between two candidates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcand2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcand\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 82\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcand2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcand\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"weight\"\u001b[0m\u001b[0;34m]\u001b[0m 
\u001b[0;34m<\u001b[0m \u001b[0mprob\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/site-packages/networkx/classes/reportviews.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1025\u001b[0m \u001b[0mseen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1026\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbrs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nodes_nbrs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1027\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mnbr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnbrs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1028\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnbr\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mseen\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1029\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] } ], "source": [ @@ -195,32 +211,21 @@ " \n", " df=pd.read_csv(fn)\n", " acc_wiki.append(accuracyWiki(df,data_lang[id_]))\n", - " #acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", - " #acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", + " acc_MC.append(accuracyMostCommon(df,data_lang[id_]))\n", + " acc_GEO.append(accuracyGeodict(df,data_lang[id_]))\n", " " ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:22:15.574548Z", - "start_time": "2018-05-17T01:22:15.567387Z" + "end_time": "2018-08-24T14:42:35.180200Z", + "start_time": "2018-08-24T14:18:40.127Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.6118508350166977" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import numpy as np\n", "np.mean(np.nan_to_num(acc_GEO))" @@ -228,50 +233,28 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2018-05-17T01:22:15.618633Z", - "start_time": "2018-05-17T01:22:15.612431Z" + "end_time": "2018-08-24T14:42:35.181124Z", + "start_time": "2018-08-24T14:18:40.128Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7694373020389706" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "np.mean(np.nan_to_num(acc_MC))" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2018-06-03T19:44:42.307528Z", - "start_time": "2018-06-03T19:44:42.295687Z" + "end_time": "2018-08-24T14:42:35.182157Z", + "start_time": "2018-08-24T14:18:40.130Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.740705700091002" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import numpy as np\n", "np.mean(np.nan_to_num(acc_wiki))" @@ -279,25 +262,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { 
"ExecuteTime": { - "end_time": "2018-06-19T13:12:33.632268Z", - "start_time": "2018-06-19T13:12:26.349957Z" + "end_time": "2018-08-24T14:42:35.182992Z", + "start_time": "2018-08-24T14:18:40.131Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "57451 9248\n", - "6.212262110726644\n" - ] - } - ], + "outputs": [], "source": [ - "from helpers.gazeteer_helpers import count_of_se\n", + "from strpython.helpers.gazeteer_helpers import count_of_se\n", "sum_,count=0,0\n", "for fn in fns:\n", " try:\n", @@ -315,650 +289,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2018-06-19T13:10:32.794585Z", - "start_time": "2018-06-19T13:10:32.759937Z" + "end_time": "2018-08-24T14:42:35.184004Z", + "start_time": "2018-08-24T14:18:40.133Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Unnamed: 0</th>\n", - " <th>Unnamed: 0.1</th>\n", - " <th>Unnamed: 0.1.1</th>\n", - " <th>Unnamed: 0.1.1.1</th>\n", - " <th>diff2</th>\n", - " <th>text</th>\n", - " <th>pos_</th>\n", - " <th>ent_type_</th>\n", - " <th>GID</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0.0</td>\n", - " <td>Réunion</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1.0</td>\n", - " <td>Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>2.0</td>\n", - " <td>Sud</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>3</td>\n", - " <td>3</td>\n", - " <td>3</td>\n", - " <td>3</td>\n", - " <td>3.0</td>\n", - " <td>Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>4</td>\n", - " <td>4</td>\n", - " <td>4</td>\n", - " <td>4</td>\n", - " <td>4.0</td>\n", - " <td>BV Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>5</td>\n", - " <td>5</td>\n", - " <td>5</td>\n", - " <td>5</td>\n", - " <td>5.0</td>\n", - " <td>BV Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " <td>6.0</td>\n", - " <td>Madagascar</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3404996</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>7</td>\n", - " <td>7</td>\n", - " <td>7</td>\n", - " <td>7</td>\n", - " <td>7.0</td>\n", - " <td>Madagascar</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3404996</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>8</td>\n", - " <td>8</td>\n", - " <td>8</td>\n", 
- " <td>8</td>\n", - " <td>8.0</td>\n", - " <td>–</td>\n", - " <td>PUNCT</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>9</td>\n", - " <td>9</td>\n", - " <td>9</td>\n", - " <td>9</td>\n", - " <td>9.0</td>\n", - " <td>Etat</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>10</th>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " <td>10.0</td>\n", - " <td>Madagascar</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3404996</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>11</td>\n", - " <td>11</td>\n", - " <td>11</td>\n", - " <td>11</td>\n", - " <td>11.0</td>\n", - " <td>Lac 2</td>\n", - " <td>SPACE</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>12</th>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12</td>\n", - " <td>12.0</td>\n", - " <td>Madagascar</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3404996</td>\n", - " </tr>\n", - " <tr>\n", - " <th>13</th>\n", - " <td>13</td>\n", - " <td>13</td>\n", - " <td>13</td>\n", - " <td>13</td>\n", - " <td>13.0</td>\n", - " <td>Madagascar</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3404996</td>\n", - " </tr>\n", - " <tr>\n", - " <th>14</th>\n", - " <td>14</td>\n", - " <td>14</td>\n", - " <td>14</td>\n", - " <td>14</td>\n", - " <td>14.0</td>\n", - " <td>Directeur</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>15</th>\n", - " <td>15</td>\n", - " <td>15</td>\n", - " <td>15</td>\n", - " <td>15</td>\n", - " <td>15.0</td>\n", - " <td>Lac</td>\n", - " <td>SPACE</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>16</th>\n", - " <td>16</td>\n", - " <td>16</td>\n", - " <td>16</td>\n", - " <td>16</td>\n", - " <td>16.0</td>\n", - " <td>Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>17</th>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17</td>\n", - " <td>17.0</td>\n", - " <td>Paris</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD5400765</td>\n", - " </tr>\n", - " <tr>\n", - " <th>18</th>\n", - " <td>18</td>\n", - " <td>18</td>\n", - " <td>18</td>\n", - " <td>18</td>\n", - " <td>18.0</td>\n", - " <td>Antananarivo</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3682867</td>\n", - " </tr>\n", - " <tr>\n", - " <th>19</th>\n", - " <td>19</td>\n", - " <td>19</td>\n", - " <td>19</td>\n", - " <td>19</td>\n", - " <td>19.0</td>\n", - " <td>Directions Régionales</td>\n", - " <td>SPACE</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>20</th>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " <td>20.0</td>\n", - " <td>Centres</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>21</th>\n", - " <td>21</td>\n", - " <td>21</td>\n", - " <td>21</td>\n", - " <td>21</td>\n", - " <td>21.0</td>\n", - " <td>Services Agricoles</td>\n", - " <td>SPACE</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>22</th>\n", - " <td>22</td>\n", - " <td>22</td>\n", - " <td>22</td>\n", - " <td>22</td>\n", - " <td>22.0</td>\n", - " <td>BV Lac</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>23</th>\n", - " 
<td>23</td>\n", - " <td>23</td>\n", - " <td>23</td>\n", - " <td>23</td>\n", - " <td>23.0</td>\n", - " <td>jusqu’</td>\n", - " <td>VERB</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>24</th>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24</td>\n", - " <td>24.0</td>\n", - " <td>Antananarivo</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3682867</td>\n", - " </tr>\n", - " <tr>\n", - " <th>25</th>\n", - " <td>25</td>\n", - " <td>25</td>\n", - " <td>25</td>\n", - " <td>25</td>\n", - " <td>25.0</td>\n", - " <td>Suivi</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>26</th>\n", - " <td>26</td>\n", - " <td>26</td>\n", - " <td>26</td>\n", - " <td>26</td>\n", - " <td>26.0</td>\n", - " <td>Ambositra</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD6124882</td>\n", - " </tr>\n", - " <tr>\n", - " <th>27</th>\n", - " <td>27</td>\n", - " <td>27</td>\n", - " <td>27</td>\n", - " <td>27</td>\n", - " <td>27.0</td>\n", - " <td>Farafangana</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD2452325</td>\n", - " </tr>\n", - " <tr>\n", - " <th>28</th>\n", - " <td>28</td>\n", - " <td>28</td>\n", - " <td>28</td>\n", - " <td>28</td>\n", - " <td>28.0</td>\n", - " <td>du Sud</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>29</th>\n", - " <td>29</td>\n", - " <td>29</td>\n", - " <td>29</td>\n", - " <td>29</td>\n", - " <td>29.0</td>\n", - " <td>Est</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>30</th>\n", - " <td>30</td>\n", - " <td>30</td>\n", - " <td>30</td>\n", - " <td>30</td>\n", - " <td>30.0</td>\n", - " <td>seuil</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>31</th>\n", - " <td>31</td>\n", - " <td>31</td>\n", - " <td>31</td>\n", - " <td>31</td>\n", - " <td>31.0</td>\n", - " <td>BV Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>32</th>\n", - " <td>32</td>\n", - " <td>32</td>\n", - " <td>32</td>\n", - " <td>32</td>\n", - " <td>32.0</td>\n", - " <td>jusqu’</td>\n", - " <td>VERB</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>33</th>\n", - " <td>33</td>\n", - " <td>33</td>\n", - " <td>33</td>\n", - " <td>33</td>\n", - " <td>33.0</td>\n", - " <td>BV Lac 2</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>34</th>\n", - " <td>34</td>\n", - " <td>34</td>\n", - " <td>34</td>\n", - " <td>34</td>\n", - " <td>34.0</td>\n", - " <td>Secrétaire</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>35</th>\n", - " <td>35</td>\n", - " <td>35</td>\n", - " <td>35</td>\n", - " <td>35</td>\n", - " <td>35.0</td>\n", - " <td>Alaotra</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>36</th>\n", - " <td>36</td>\n", - " <td>36</td>\n", - " <td>36</td>\n", - " <td>36</td>\n", - " <td>36.0</td>\n", - " <td>Mangoro</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3574285</td>\n", - " </tr>\n", - " <tr>\n", - " <th>37</th>\n", - " <td>37</td>\n", - " <td>37</td>\n", - " <td>37</td>\n", - " <td>37</td>\n", - " <td>37.0</td>\n", - " <td>Directeur</td>\n", - " <td>NOUN</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " 
<tr>\n", - " <th>38</th>\n", - " <td>38</td>\n", - " <td>38</td>\n", - " <td>38</td>\n", - " <td>38</td>\n", - " <td>38.0</td>\n", - " <td>Lac 2 et</td>\n", - " <td>SPACE</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>39</th>\n", - " <td>39</td>\n", - " <td>39</td>\n", - " <td>39</td>\n", - " <td>39</td>\n", - " <td>39.0</td>\n", - " <td>Sous réserve</td>\n", - " <td>VERB</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " <tr>\n", - " <th>40</th>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " <td>40.0</td>\n", - " <td>Grandjean</td>\n", - " <td>PROPN</td>\n", - " <td>LOC</td>\n", - " <td>GD3254594</td>\n", - " </tr>\n", - " <tr>\n", - " <th>41</th>\n", - " <td>41</td>\n", - " <td>41</td>\n", - " <td>41</td>\n", - " <td>41</td>\n", - " <td>41.0</td>\n", - " <td>jusqu’</td>\n", - " <td>VERB</td>\n", - " <td>LOC</td>\n", - " <td>O</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 diff2 \\\n", - "0 0 0 0 0 0.0 \n", - "1 1 1 1 1 1.0 \n", - "2 2 2 2 2 2.0 \n", - "3 3 3 3 3 3.0 \n", - "4 4 4 4 4 4.0 \n", - "5 5 5 5 5 5.0 \n", - "6 6 6 6 6 6.0 \n", - "7 7 7 7 7 7.0 \n", - "8 8 8 8 8 8.0 \n", - "9 9 9 9 9 9.0 \n", - "10 10 10 10 10 10.0 \n", - "11 11 11 11 11 11.0 \n", - "12 12 12 12 12 12.0 \n", - "13 13 13 13 13 13.0 \n", - "14 14 14 14 14 14.0 \n", - "15 15 15 15 15 15.0 \n", - "16 16 16 16 16 16.0 \n", - "17 17 17 17 17 17.0 \n", - "18 18 18 18 18 18.0 \n", - "19 19 19 19 19 19.0 \n", - "20 20 20 20 20 20.0 \n", - "21 21 21 21 21 21.0 \n", - "22 22 22 22 22 22.0 \n", - "23 23 23 23 23 23.0 \n", - "24 24 24 24 24 24.0 \n", - "25 25 25 25 25 25.0 \n", - "26 26 26 26 26 26.0 \n", - "27 27 27 27 27 27.0 \n", - "28 28 28 28 28 28.0 \n", - "29 29 29 29 29 29.0 \n", - "30 30 30 30 30 30.0 \n", - "31 31 31 31 31 31.0 \n", - "32 32 32 32 32 32.0 \n", - "33 33 33 33 33 33.0 \n", - "34 34 34 34 34 34.0 \n", - "35 35 35 35 35 35.0 \n", - "36 36 36 36 36 36.0 \n", - "37 37 37 37 37 37.0 \n", - "38 38 38 38 38 38.0 \n", - "39 39 39 39 39 39.0 \n", - "40 40 40 40 40 40.0 \n", - "41 41 41 41 41 41.0 \n", - "\n", - " text pos_ ent_type_ GID \n", - "0 Réunion NOUN LOC O \n", - "1 Lac 2 PROPN LOC O \n", - "2 Sud PROPN LOC O \n", - "3 Lac 2 PROPN LOC O \n", - "4 BV Lac 2 PROPN LOC O \n", - "5 BV Lac 2 PROPN LOC O \n", - "6 Madagascar PROPN LOC GD3404996 \n", - "7 Madagascar PROPN LOC GD3404996 \n", - "8 – PUNCT LOC O \n", - "9 Etat NOUN LOC O \n", - "10 Madagascar PROPN LOC GD3404996 \n", - "11 Lac 2 SPACE LOC O \n", - "12 Madagascar PROPN LOC GD3404996 \n", - "13 Madagascar PROPN LOC GD3404996 \n", - "14 Directeur NOUN LOC O \n", - "15 Lac SPACE LOC O \n", - "16 Lac 2 PROPN LOC O \n", - "17 Paris PROPN LOC GD5400765 \n", - "18 Antananarivo PROPN LOC GD3682867 \n", - "19 Directions Régionales SPACE LOC O \n", - "20 Centres PROPN LOC O \n", - "21 Services Agricoles SPACE LOC O \n", - "22 BV Lac PROPN LOC O \n", - "23 jusqu’ VERB LOC O \n", - "24 Antananarivo PROPN LOC GD3682867 \n", - "25 Suivi PROPN LOC O \n", - "26 Ambositra PROPN LOC GD6124882 \n", - "27 Farafangana PROPN LOC GD2452325 \n", - "28 du Sud PROPN LOC O \n", - "29 Est NOUN LOC O \n", - "30 seuil NOUN LOC O \n", - "31 BV Lac 2 PROPN LOC O \n", - "32 jusqu’ VERB LOC O \n", - "33 BV Lac 2 PROPN LOC O \n", - "34 Secrétaire NOUN LOC O \n", - "35 Alaotra PROPN LOC O \n", - "36 Mangoro PROPN LOC GD3574285 \n", - "37 Directeur NOUN LOC O \n", - "38 Lac 2 et SPACE LOC 
O  \n",
-       "39 Sous réserve VERB LOC O \n",
-       "40 Grandjean PROPN LOC GD3254594 \n",
-       "41 jusqu’ VERB LOC O "
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "df"
    ]
diff --git a/setup.py b/setup.py
index 0ab6becda299b213e3cf59d92d09c893b36896db..d1fb08b79346d9e3388d01743b55eab4439e2649 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,6 @@ setup(
 )
 # Put default config file if not exists
 home = str(Path.home())
-if not os.path.exists(os.path.join(home,".strpython")) or not os.path.exists(os.path.join(home,".strpython/config.json")):
+if not os.path.exists(os.path.join(home,".strpython")): #or not os.path.exists(os.path.join(home,".strpython/config.json")):
     os.makedirs(os.path.dirname(os.path.join(home,".strpython/config.json")), exist_ok=True)
     shutil.copy2("strpython/config/config.json",os.path.join(home,".strpython/config.json"))
\ No newline at end of file
diff --git a/strpython/config/config.json b/strpython/config/config.json
index be883496a87684dab8bd417288950d2bb21dc98b..50c7d66c903142cb984b995e572b6e7986051f09 100644
--- a/strpython/config/config.json
+++ b/strpython/config/config.json
@@ -8,8 +8,8 @@
   "database_json":"resources/database_exp_25_may.db",
   "log_file":"extract_log",
   "wiki_cooc_dis":{
-    "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/resources/coocurrence_wiki.pkl",
-    "count":"/Users/jacquesfize/nas_cloud/Code/str-python/resources/count_wiki.pkl"
+    "cooc_freq":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/coocurrence_wiki.pkl",
+    "count":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/count_wiki.pkl"
   },
-  "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/resources/language_resources"
+  "language_resources_path":"/Users/jacquesfize/nas_cloud/Code/str-python/strpython/resources/language_resources"
 }
\ No newline at end of file
diff --git a/strpython/config/configuration.py b/strpython/config/configuration.py
index 3a018e7831a45640f783967174dc006ec978736d..5e33da1cf7b8402fd5b32cb4342cf0a0d7677a3a 100644
--- a/strpython/config/configuration.py
+++ b/strpython/config/configuration.py
@@ -1,18 +1,32 @@
 # coding = utf-8
-import json,os
+import json
+import os
 from pathlib import Path
 
 
 class Configuration(object):
+    """
+    Define the `Configuration` class. A `Configuration` object contains all the
+    shared variables of strpython, such as the georeferential server address, the Stanford NER address, etc.
+    """
    def __init__(self, data):
-        self.__dict__=data
+        """
+        Constructor. :param data: dict that contains all the configuration variables. In the module, these variables
+        are stored in a file at `~/.strpython/config.json`.
+        """
+        self.__dict__ = data
         for d in self.__dict__:
-            if isinstance(self.__dict__[d],dict):
-                self.__dict__[d]=Configuration(self.__dict__[d])
+            if isinstance(self.__dict__[d], dict):
+                self.__dict__[d] = Configuration(self.__dict__[d])
+
     def __getitem__(self, item):
         return self.__dict__[item]
 
-home = str(Path.home())
-config = Configuration(json.load(open(os.path.join(home,".strpython/config.json")))))
+"""
+Initialise the config variable.
+Access this variable using `from strpython.config.configuration import config`
+"""
+home = str(Path.home())
+config = Configuration(json.load(open(os.path.join(home, ".strpython/config.json"))))
diff --git a/strpython/config/stopwords_en.txt b/strpython/config/stopwords_en.txt
deleted file mode 100644
index 6e190b70f008f497b37cdc652e89f0f19b9d90df..0000000000000000000000000000000000000000
--- a/strpython/config/stopwords_en.txt
+++ /dev/null
@@ -1,173 +0,0 @@
-a
-about
-above
-after
-again
-against
-all
-am
-an
-and
-any
-are
-aren't
-as
-at
-be
-because
-been
-before
-being
-below
-between
-both
-but
-by
-can't
-cannot
-could
-couldn't
-did
-didn't
-do
-does
-doesn't
-doing
-don't
-down
-during
-each
-few
-for
-from
-further
-had
-hadn't
-has
-hasn't
-have
-haven't
-having
-he
-he'd
-he'll
-he's
-her
-here
-here's
-hers
-herself
-him
-himself
-his
-how
-how's
-i
-i'd
-i'll
-i'm
-i've
-if
-in
-into
-is
-isn't
-it
-it's
-its
-itself
-let's
-me
-more
-most
-mustn't
-my
-myself
-no
-nor
-not
-of
-off
-on
-once
-only
-or
-other
-ought
-our
-ours ourselves
-out
-over
-own
-same
-shan't
-she
-she'd
-she'll
-she's
-should
-shouldn't
-so
-some
-such
-than
-that
-that's
-the
-their
-theirs
-them
-themselves
-then
-there
-there's
-these
-they
-they'd
-they'll
-they're
-they've
-this
-those
-through
-to
-too
-under
-until
-up
-very
-was
-wasn't
-we
-we'd
-we'll
-we're
-we've
-were
-weren't
-what
-what's
-when
-when's
-where
-where's
-which
-while
-who
-who's
-whom
-why
-why's
-with
-won't
-would
-wouldn't
-you
-you'd
-you'll
-you're
-you've
-your
-yours
-yourself
-yourselves
\ No newline at end of file
diff --git a/strpython/config/world_borders.shp b/strpython/config/world_borders.shp
deleted file mode 100755
index 7cd47e9ad5edeadbce0edecd3a488a133fa7dfa8..0000000000000000000000000000000000000000
Binary files a/strpython/config/world_borders.shp and /dev/null differ
diff --git a/strpython/eval/automatic_annotation.py b/strpython/eval/automatic_annotation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c681bcde9a1ebd7f9b3734ccff49db34f6b202c
--- /dev/null
+++ b/strpython/eval/automatic_annotation.py
@@ -0,0 +1,72 @@
+# coding = utf-8
+
+from strpython.models.str import STR
+import networkx as nx
+import numpy as np
+import geopandas as gpd
+from shapely.geometry import MultiPoint,Polygon,Point,LineString
+
+class AnnotationAutomatic(object):
+    """
+
+    To facilitate the annotation, this class proposes an automatic annotation.
+    Author : Jacques Fize
+    """
+    def __init__(self):
+        pass
+
+    def all(self,str1,str2):
+        return [self.criterion1(str1,str2),self.criterion2(str1,str2),self.criterion3(str1,str2),self.criterion4(str1,str2)]
+
+    def criterion1(self,str1,str2):
+        """
+        Return True if both STRs contain common spatial entities.
+        :param str1: STR
+        :param str2: STR
+        :return:
+        """
+        return len(set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())) > 0
+
+    def criterion2(self,str1 : STR,str2 : STR):
+        """
+        Return True if the two STRs contain non-shared spatial entities that are adjacent or included in one another.
+        :param str1: STR
+        :param str2: STR
+        :return:
+        """
+        stop_en=set(str1.graph.nodes.keys()) & set(str2.graph.nodes.keys())
+        for es in str1.spatial_entities:
+            for es2 in str2.spatial_entities:
+                if not es in stop_en and not es2 in stop_en:
+                    if str1.is_included_in(es,es2):
+                        return True
+                    if str1.is_adjacent(es,es2):
+                        return True
+        return False
+
+    def criterion3(self, str1 :STR , str2: STR):
+        """
+        Return True if one or multiple clusters of spatial entities are found in both STRs. Clusters
+        are constructed based on low distances between spatial entities. The clustering method used is Mean-Shift, as
+        implemented in the scikit-learn module.
+        :param str1:
+        :param str2:
+        :return:
+        """
+        try:
+            return str1.get_cluster().intersects(str2.get_cluster()).any()
+        except:
+            return False
+
+    def criterion4(self, str1, str2):
+        """
+        Return True if both STRs share the same clusters, using the same clustering method as in criterion3().
+        :param str1:
+        :param str2:
+        :return:
+        """
+        try:
+            return str1.get_cluster().intersects(str2.get_cluster()).all()
+        except:
+            return False
+
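A minimal usage sketch (not part of the patch) for the `AnnotationAutomatic` class added above. It assumes two `STR` instances built elsewhere by the strpython pipeline, and relies only on what the new file itself defines: `all()` returning the four criteria as booleans.

```python
# Hypothetical illustration only: str1 and str2 are assumed to be
# strpython.models.str.STR objects produced beforehand by the pipeline.
from strpython.eval.automatic_annotation import AnnotationAutomatic
from strpython.models.str import STR


def annotate_pair(str1: STR, str2: STR) -> dict:
    """Label a pair of STRs with the four automatic criteria."""
    annotator = AnnotationAutomatic()
    c1, c2, c3, c4 = annotator.all(str1, str2)
    return {
        "common_entities": c1,      # criterion1: shared spatial entities
        "adjacency_inclusion": c2,  # criterion2: non-shared entities adjacent/nested
        "clusters_intersect": c3,   # criterion3: Mean-Shift clusters overlap
        "clusters_identical": c4,   # criterion4: same clusters on both sides
    }
```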
diff --git a/strpython/eval/stats.py b/strpython/eval/stats.py
index 915c554d3e7c648ffd787f7041071fd5c4dec8c2..86e92f19f734827f1d60eb81dec9f17bd17f1be3 100644
--- a/strpython/eval/stats.py
+++ b/strpython/eval/stats.py
@@ -1,8 +1,9 @@
 # coding = utf-8
 
-from ..helpers.gazeteer_helpers import get_data
+from ..helpers.geodict_helpers import get_data
 import numpy as np
 
+
 def flattern(A):
     rt = []
     for i in A:
@@ -14,18 +15,20 @@
             rt.append(i)
     return rt
 
+
 def most_common(lst):
-    if len(list(set(lst))) >1 and "P-PPL" in set(lst):
-        lst=[x for x in lst if x != "PPL"]
+    if len(list(set(lst))) > 1 and "P-PPL" in set(lst):
+        lst = [x for x in lst if x != "PPL"]
     return max(set(lst), key=lst.count)
 
+
 def granularity(graph):
     """
     Return the granularity of a STR
     :param graph:
     :return:
     """
-    class_list=flattern([get_data(n)["class"] for n in list(graph.nodes())])
+    class_list = flattern([get_data(n)["class"] for n in list(graph.nodes())])
     if not class_list:
         return []
-    return most_common(class_list)
\ No newline at end of file
+    return most_common(class_list)
diff --git a/strpython/helpers/boundary.py b/strpython/helpers/boundary.py
deleted file mode 100644
index 975e87fab5a3bfb5f4cfe03d0be17bef95aa22d1..0000000000000000000000000000000000000000
--- a/strpython/helpers/boundary.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# coding = utf-8
-
-import json
-import os
-from warnings import warn
-
-from .gazeteer_helpers import get_data
-from scipy.spatial import ConvexHull
-from shapely.geometry import Polygon, Point, shape
-from ..config.configuration import config
-from .collision import collide
-
-__collisions={}
-
-def parseHull(hull_object,points):
-    hull=[]
-    for simplex in hull_object.simplices:
-        hull.append(points[simplex[0]])
-        hull.append(points[simplex[1]])
-    return hull
-
-def getConvexHull(path):
-    data=json.load(open(os.path.join(config.osm_boundaries_directory,path)))
-    boundaries=data["geometry"]["coordinates"]
-    if data["geometry"]["type"]== "Polygon":
-        hull = parseHull(ConvexHull(boundaries[-1]),boundaries[-1])
-        return [hull]
-    else:
-        hull=[]
-        for bound in boundaries[-1]:
-            hull.append(parseHull(ConvexHull(bound),bound))
-        return hull
-
-def get_all_shapes(ids_list):
-    shapes = {}
-    for p in ids_list:
-        d = get_data(p)
-        #print(d["path"])
-        if "path" in d:
-            shapes[p] = getConvexHull(config.osm_boundaries_directory + "/" + d["path"])[0]
-        elif "coord" in d:
-            shapes[p] = [[d["coord"]["lat"], d["coord"]["lon"]]]
-    return shapes
-
-def get_adjacency_relationships(shapes):
-    collisions={}
-    for s in shapes:
-        for s2 in shapes:
-            if s != s2:
-                if not s in collisions and s2 in collisions:
-                    if not s in collisions[s2]:
-                        collisions[s2][s]=collide(shapes[s],shapes[s2])
-                elif not s2 in collisions and s in collisions:
-                    if not s2 in collisions[s]:
-                        collisions[s][s2]=collide(shapes[s],shapes[s2])
-    return collisions
-
-def is_intersect(id1,id2,shapes):
-    global __collisions
-    if id1 in __collisions:
-        if id2 in __collisions[id1]:
-            return __collisions[id1][id2]
-    elif id2 in __collisions:
-        if id1 in __collisions[id2]:
-            return __collisions[id2][id1]
-
-
-    if id1 in shapes and id2 in shapes:
-        if not id1 in __collisions:__collisions[id1]={}
-        if not id2 in __collisions: __collisions[id2] = {}
-        __collisions[id1][id2]=collide(shapes[id1],shapes[id2])
-        __collisions[id2][id1]=__collisions[id1][id2]
-        return __collisions[id1][id2]
-    else:
-        warn("{0} or {1} wasn't found in given shapes !".format(id1,id2))
-        return False
diff --git a/strpython/helpers/bow_polyglot.py b/strpython/helpers/bow_polyglot.py
index 94c60e8510b0852bd9f722f2bd2ba3cad20ee06e..af54e2262b9081322b67d6d3dffa67e037542112 100644
--- a/strpython/helpers/bow_polyglot.py
+++ b/strpython/helpers/bow_polyglot.py
@@ -7,6 +7,12 @@ from scipy.sparse import csc_matrix
 
 
 def get_vocabulary(corpus):
+    """
+    Return the vocabulary of a corpus, a list of documents. Each document is represented
+    using a list of tokens.
+    :param corpus: list or array-like
+    :return:
+    """
     vocabulary=set([])
     for text_tagged in corpus:
         for token in text_tagged:
@@ -14,6 +20,13 @@
     return list(vocabulary)
 
 def lemmatize(corpus,stopwords):
+    """
+    Lemmatize a corpus, a list of documents. Each document is represented
+    using a list of tokens.
+    :param corpus: list or array-like
+    :param stopwords: list or array-like
+    :return:
+    """
     pos_tag_corp=[]
     lemmatizer = WordNetLemmatizer()
     for text in corpus:
@@ -36,6 +49,13 @@
     return pos_tag_corp
 
 def populate_bow(bow,voc_asso,corpus_tagged):
+    """
+    Populate the Bag of words representation for a vocabulary and a corpus.
+    :param bow:
+    :param voc_asso:
+    :param corpus_tagged:
+    :return:
+    """
     for t in range(len(corpus_tagged)):
         text=corpus_tagged[t]
         for token in text:
@@ -47,6 +67,12 @@
     return bow
 
 def create_bow(corpus,stopwords):
+    """
+    Return a Bag of words representation of a corpus, a list of documents. Each document is a list of tokens.
+    :param corpus:
+    :param stopwords:
+    :return:
+    """
     stopwords=set(stopwords)
     post_tag_corp = lemmatize(corpus,stopwords)
     voc = get_vocabulary(post_tag_corp)
diff --git a/strpython/helpers/collision.py b/strpython/helpers/collision.py
index 60240f83bc92ba434410ba370da006825a204748..b46158e6d893e3cbd864dbf6b8e35330387e9067 100644
--- a/strpython/helpers/collision.py
+++ b/strpython/helpers/collision.py
@@ -1,103 +1,142 @@
-import numpy as np
+import json
+import os
+import warnings
 
-"""
-Source : https://hackmd.io/s/ryFmIZrsl#
-"""
-def is_separating_axis(o, p1, p2):
-    """
-    Return True and the push vector if o is a separating axis of p1 and p2.
-    Otherwise, return False and None.
-    """
-    min1, max1 = float('+inf'), float('-inf')
-    min2, max2 = float('+inf'), float('-inf')
-
-    for v in p1:
-        projection = np.dot(v, o)
-
-        min1 = min(min1, projection)
-        max1 = max(max1, projection)
-
-    for v in p2:
-        projection = np.dot(v, o)
+from shapely.geometry import Point
 
-        min2 = min(min2, projection)
-        max2 = max(max2, projection)
+from ..config.configuration import config
+from .geodict_helpers import get_data
+import geopandas as gpd
 
-    if max1 >= min2 and max2 >= min1:
-        d = min(max2 - min1, max1 - min2)
-        # push a bit more than needed so the shapes do not overlap in future
-        # tests due to float precision
-        d_over_o_squared = d/np.dot(o, o) + 1e-10
-        pv = d_over_o_squared*o
-        return False, pv
-    else:
-        return True, None
+__cache = {}
+__cache_adjacency = {}
+__cache_frequency = {}
+__limit_cache = 2000
 
 
-def edges_of(vertices):
+def add_cache(id_, hull):
     """
-    Return the vectors for the edges of the polygon p.
-
-    p is a polygon.
+    Add the extracted data to a cache instance. This process manages the cache according to the limit defined in `__limit_cache`.
+    If the limit is reached, the least frequently used entry is removed from the cache.
+    :param id_: id of the spatial entity
+    :param hull: geometry data to cache
+    :return:
     """
-    edges = []
-    N = len(vertices)
-
-    for i in range(N):
-        edge = vertices[(i + 1)%N] - vertices[i]
-        edges.append(edge)
+    global __cache, __limit_cache, __cache_frequency
+    if len(__cache) > __limit_cache:
+        warnings.warn("Limit broken")
+        del __cache[min(__cache_frequency, key=__cache_frequency.get)]
+    __cache[id_] = hull
+    if not id_ in __cache_frequency: __cache_frequency[id_] = 0
+    __cache_frequency[id_] += 1
 
-    return edges
 
-def orthogonal(v):
+def add_cache_adjacency(id_se1, id_se2):
     """
-    Return a 90 degree clockwise rotation of the vector v.
+    Record the adjacency between two spatial entities in a cache variable.
+    :param id_se1: id of the first spatial entity
+    :param id_se2: id of the second spatial entity
+    :return:
     """
-    return np.array([-v[1], v[0]])
-
-
-def collide(p1, p2):
-    '''
-    Return True and the MPV if the shapes collide. Otherwise, return False and
-    None.
+    global __cache_adjacency
+    if not id_se1 in __cache_adjacency:
+        __cache_adjacency[id_se1] = {}
+    __cache_adjacency[id_se1][id_se2] = True
 
-    p1 and p2 are lists of ordered pairs, the vertices of the polygons in the
-    counterclockwise direction.
-    '''
-    p1 = [np.array(v, 'float64') for v in p1]
-    p2 = [np.array(v, 'float64') for v in p2]
+
+def explode(gdf):
+    """
+    Explodes a geodataframe
 
-    edges = edges_of(p1)
-    edges += edges_of(p2)
-    orthogonals = [orthogonal(e) for e in edges]
+    Will explode multi-part geometries into single geometries. Original index is
+    stored in column level_0 and zero-based count of geometries per multi-
+    geometry is stored in level_1
 
-    push_vectors = []
-    for o in orthogonals:
-        separates, pv = is_separating_axis(o, p1, p2)
+    Args:
+        gdf (gpd.GeoDataFrame) : input geodataframe with multi-geometries
 
-        if separates:
-            # they do not collide and there is no push vector
-            return False, None
-        else:
-            push_vectors.append(pv)
+    Returns:
+        gdf (gpd.GeoDataFrame) : exploded geodataframe with a new index
+        and two new columns: level_0 and level_1
 
-    # they do collide and the push_vector with the smallest length is the MPV
-    mpv = min(push_vectors, key=(lambda v: np.dot(v, v)))
+    """
+    gs = gdf.explode()
+    gdf2 = gs.reset_index().rename(columns={0: 'geometry'})
+    gdf_out = gdf2.merge(gdf.drop('geometry', axis=1), left_on='level_0', right_index=True)
+    gdf_out = gdf_out.set_index(['level_0', 'level_1']).set_geometry('geometry')
+    gdf_out.crs = gdf.crs
+    return gdf_out.reset_index(level=[0, 1])
 
-    # assert mpv pushes p1 away from p2
-    d = centers_displacement(p1, p2) # direction from p1 to p2
-    if np.dot(d, mpv) > 0: # if it's the same direction, then invert
-        mpv = -mpv
-    return True, mpv
 
+def getGEO(id_se):
+    """
+    Get the geofootprint of a spatial entity. If found, this geofootprint is a shape extracted from OSM. If not,
+    coordinates are used.
+    :param id_se: id of the spatial entity
+    :return: geopandas.GeoSeries or geopandas.GeoDataFrame
+    """
+    data = get_data(id_se)
+    if "path" in data:
+        return explode(gpd.read_file(os.path.join(config.osm_boundaries_directory, data["path"]))).convex_hull
+    elif "coord" in data:
+        return gpd.GeoDataFrame(gpd.GeoSeries([Point(data["coord"]["lon"], data["coord"]["lat"]).buffer(1.0)])).rename(
+            columns={0: 'geometry'})
+    return None
 
-def centers_displacement(p1, p2):
+
+def collide(se1, se2):
+    """
+    Return True if the convex hulls of the two spatial entities intersect.
+    :param se1: id of the first spatial entity
+    :param se2: id of the second spatial entity
+    :return:
+    """
+    try:
+        if se1 in __cache:
+            data_se1 = __cache[se1]
+            __cache_frequency[se1] += 1
+        else:
+            data_se1 = getGEO(se1)
+            add_cache(se1, data_se1)
+        if se2 in __cache:
+            data_se2 = __cache[se2]
+            __cache_frequency[se2] += 1
+        else:
+            data_se2 = getGEO(se2)
+            add_cache(se2, data_se2)
+    except:
+        return False
+    if not type(data_se1) == gpd.GeoDataFrame or not type(data_se2) == gpd.GeoDataFrame:
+        return False
+    try:
+        if data_se1.intersects(data_se2):
+            return True
+    except:
+        if data_se1.intersects(data_se2).any():
+            return True
+    return False
+
+
+def collisionTwoSEBoundaries(id_se1, id_se2):
     """
-    Return the displacement between the geometric center of p1 and p2.
+    Return True if two spatial entities are adjacent.
+ :param id_se1: id of the first spatial entity + :param id_se2: id of the second spatial entity + :return: """ - # geometric center - c1 = np.mean(np.array(p1), axis=0) - c2 = np.mean(np.array(p2), axis=0) - return c2 - c1 \ No newline at end of file + global __cache, __cache_adjacency + if id_se1 in __cache_adjacency: + if id_se2 in __cache_adjacency[id_se1]: + return __cache_adjacency[id_se1][id_se2] + elif id_se2 in __cache_adjacency: + if id_se1 in __cache_adjacency[id_se2]: + return __cache_adjacency[id_se2][id_se1] + + if not id_se1 in __cache_adjacency: + __cache_adjacency[id_se1] = {} + + if collide(id_se1, id_se2): # and not include_in(h1,h2): + __cache_adjacency[id_se1][id_se2] = True + return True + __cache_adjacency[id_se1][id_se2] = False + return False diff --git a/strpython/helpers/collision_c.pyx b/strpython/helpers/collision_c.pyx deleted file mode 100644 index 8df40d56ccd97f1475f59addfef03c1aec0d66d7..0000000000000000000000000000000000000000 --- a/strpython/helpers/collision_c.pyx +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -cimport numpy as np - -ctypedef np.ndarray numpy_array # for return np array -""" -Source : https://hackmd.io/s/ryFmIZrsl# -""" -cdef is_separating_axis(o, list p1, list p2): - """ - Return True and the push vector if o is a separating axis of p1 and p2. - Otherwise, return False and None. - """ - cdef float min1,max1,min2,max2 - cdef np.float_t projection - min1, max1 = float('+inf'), float('-inf') - min2, max2 = float('+inf'), float('-inf') - - for v in p1: - projection = np.dot(v, o) - - min1 = min(min1, projection) - max1 = max(max1, projection) - - for v in p2: - projection = np.dot(v, o) - - min2 = min(min2, projection) - max2 = max(max2, projection) - - if max1 >= min2 and max2 >= min1: - d = min(max2 - min1, max1 - min2) - # push a bit more than needed so the shapes do not overlap in future - # tests due to float precision - d_over_o_squared = d/np.dot(o, o) + 1e-10 - pv = d_over_o_squared*o - return False, pv - else: - return True, None - - -cdef list edges_of(list vertices): - """ - Return the vectors for the edges of the polygon p. - - p is a polygon. - """ - cdef list edges = [] - cdef int N = len(vertices) - - for i in range(N): - edge = vertices[(i + 1)%N] - vertices[i] - edges.append(edge) - - return edges - -cdef numpy_array orthogonal(v): - """ - Return a 90 degree clockwise rotation of the vector v. - """ - return np.array([-v[1], v[0]]) - - -def collide(p1, p2): - ''' - Return True and the MPV if the shapes collide. Otherwise, return False and - None. - - p1 and p2 are lists of ordered pairs, the vertices of the polygons in the - counterclockwise direction. - ''' - - p1 = [np.array(v, 'float64') for v in p1] - p2 = [np.array(v, 'float64') for v in p2] - cdef list edges - edges = edges_of(p1) - edges += edges_of(p2) - orthogonals = [orthogonal(e) for e in edges] - - cdef push_vectors = [] - for o in orthogonals: - separates, pv = is_separating_axis(o, p1, p2) - - if separates: - # they do not collide and there is no push vector - return False - else: - push_vectors.append(pv) - - return True - - -cdef float centers_displacement(p1, p2): - """ - Return the displacement between the geometric center of p1 and p2. 
- """ - # geometric center - c1 = np.mean(np.array(p1), axis=0) - c2 = np.mean(np.array(p2), axis=0) - return c2 - c1 \ No newline at end of file diff --git a/strpython/helpers/collision_with_gazetteer_data.py b/strpython/helpers/collision_with_gazetteer_data.py deleted file mode 100644 index 4e0665d489596f7220e1ecd694e4cc9c5ef30955..0000000000000000000000000000000000000000 --- a/strpython/helpers/collision_with_gazetteer_data.py +++ /dev/null @@ -1,79 +0,0 @@ -import json -import os - -import shapely -from scipy.spatial import ConvexHull -from shapely.geometry import Polygon, Point, shape - - -from ..config.configuration import config -from .gazeteer_helpers import get_data -#from .collision import collide -import geopandas as gpd - -__cache={} -__cache_adjacency={} -__limit_cache=400 - -def add_cache(id_,hull): - global __cache,__limit_cache - if len(__cache) > __limit_cache: - __cache={} - __cache[id_]=hull - -def getGEO(id_se): - data=get_data(id_se) - if "path" in data: - return gpd.read_file(os.path.join(config.osm_boundaries_directory, data["path"])).geometry - elif "coord" in data: - return Point(data["coord"]["lon"],data["coord"]["lat"]) - return None -def collide(se1,se2): - try: - if se1 in __cache: - data_se1=__cache[se1] - else: - data_se1 = gpd.GeoSeries(list(getGEO(se1).values[0])) - add_cache(se1,data_se1) - if se2 in __cache: - data_se2=__cache[se2] - else: - data_se2 = gpd.GeoSeries(list(getGEO(se2).values[0])) - add_cache(se2, data_se2) - except: - return False - - if type(data_se1) != type(data_se2): - if type(data_se1) == gpd.geoseries.GeoSeries: - return data_se1.intersects(data_se2).any() - else: - return data_se2.intersects(data_se1).any() - try: - if data_se1.intersects(data_se2): - return True - except: - if data_se1.intersects(data_se2).any(): - return True - return False - - - - - -def collisionTwoSEBoundaries(id_SE1,id_SE2): - global __cache,__cache_adjacency - if id_SE1 in __cache_adjacency: - if id_SE2 in __cache_adjacency[id_SE1]: - return __cache_adjacency[id_SE1][id_SE2] - elif id_SE2 in __cache_adjacency: - if id_SE1 in __cache_adjacency[id_SE2]: - return __cache_adjacency[id_SE2][id_SE1] - - if not id_SE1 in __cache_adjacency: - __cache_adjacency[id_SE1]={} - - if collide(id_SE1,id_SE2): #and not include_in(h1,h2): - __cache_adjacency[id_SE1][id_SE2] = True - return True - __cache_adjacency[id_SE1][id_SE2]=False - return False diff --git a/strpython/helpers/gazeteer_helpers.py b/strpython/helpers/gazeteer_helpers.py deleted file mode 100644 index cd806ad9dc17380095a19c20187c42f476020f9b..0000000000000000000000000000000000000000 --- a/strpython/helpers/gazeteer_helpers.py +++ /dev/null @@ -1,141 +0,0 @@ -# coding=utf-8 - -from elasticsearch import Elasticsearch -from ..config.configuration import config - -es = Elasticsearch(config.es_server) - - -def get_most_common_id_v2(label, lang="fr"): - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, - "from": 0, - "size": 50, "sort": [{'score': "desc"}], "aggs": {}}) - if res["hits"]["total"] > 0: - if res["hits"]["total"] > 1: - max_id, max_sc = 0, 0 - i = 0 - for hit in res["hits"]["hits"]: - if 'score' in hit['_source']: - if float(hit['_source']["score"]) > max_sc: max_id, max_sc = i, float(hit['_source']["score"]) - i += 1 - res = [res["hits"]["hits"][max_id]] - else: - res = res["hits"]["hits"] - if not "score" in res[0]["_source"]: - return res[0]["_source"]["id"], -1 - return res[0]["_source"]["id"], 
float(res[0]["_source"]["score"]) - return None, 0 - -def get_most_common_id_v3(label, lang='fr'): - id_, score = get_most_common_id_v2(label, lang) - if id_: - return id_, score - if not id_ and lang != 'en': - id_, score = get_most_common_id_v2(label, 'en') - if id_: - return id_, score - id_, score = get_most_common_id_alias_v2(label, lang) - if not id_ and lang != 'en': - id_, score = get_most_common_id_v2(label, 'en') - if id_: - return id_, score - return None, -1 - -def get_most_common_id_alias_v2(alias, lang="fr"): - res = es.search("gazetteer", "place", - body={"query": {"nested": {"path": "aliases", - "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}}) - if res["hits"]["total"] > 0: - if res["hits"]["total"] > 1: - max_id, max_sc = 0, 0 - i = 0 - for hit in res["hits"]["hits"]: - if 'score' in hit['_source']: - if float(hit['_source']["score"]) > max_sc: max_id, max_sc = i, float(hit['_source']["score"]) - i += 1 - res = [res["hits"]["hits"][max_id]] - else: - res = res["hits"]["hits"] - if not "score" in res[0]["_source"]: - return res[0]["_source"]["id"], -1 - return res[0]["_source"]["id"], float(res[0]["_source"]["score"]) - return None, -1 - - -def get_data(id): - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {"id": id}}], "must_not": [], "should": []}}, "from": 0, - "size": 10, "sort": [], "aggs": {}}) - if res["hits"]["total"] > 0: - res = res["hits"]["hits"][0]["_source"] - return res - - -def get_data_by_wikidata_id(id): - res = es.search("gazetteer", "place", - body={"query": {"bool": {"must": [{"term": {"wikidataID": id}}], "must_not": [], "should": []}}, - "from": 0, - "size": 10, "sort": [], "aggs": {}}) - if res["hits"]["total"] > 0: - res = res["hits"]["hits"][0]["_source"] - - return res - - - -def get_by_label(label, lang): - query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "size": 50} - response = es.search('gazetteer', 'place', body=query) - if 'hits' in response['hits']: - return response['hits']['hits'] - return None - - -def get_by_alias(alias, lang): - query = { - "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}} - response = es.search('gazetteer', 'place', body=query) - if 'hits' in response['hits']: - return response['hits']['hits'] - return None - -def label_exists(label, lang): - query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}} - response = es.count('gazetteer', 'place', body=query) - if response["count"] > 0: - return True - return False - -def alias_exists(alias, lang): - query = { - "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." 
+ lang: alias}}]}}}}}
-    response = es.count('gazetteer', 'place', body=query)
-    if response["count"] > 0:
-        return True
-    return False
-
-
-
-
-
-
-
-def count_of_se(label, lang):
-    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
-    response = es.count('gazetteer', 'place', body=query)
-    return response["count"]
-
-
-def get_top_candidate(label, lang):
-    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "sort": [
-        {
-            "score": {
-                "order": "desc"
-            }
-        }
-    ], "size": 5}
-    response = es.search('gazetteer', 'place', body=query)
-    if 'hits' in response['hits']:
-        return [x["_source"]["id"] for x in response['hits']['hits']]
-    return None
diff --git a/strpython/helpers/geodict_helpers.py b/strpython/helpers/geodict_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..91d04087865e7f90c41a09de1e6cd850fe742877 --- /dev/null +++ b/strpython/helpers/geodict_helpers.py @@ -0,0 +1,437 @@
+# coding=utf-8
+import math
+import re
+
+from elasticsearch import Elasticsearch
+from ..config.configuration import config
+import pandas as pd
+from ..helpers.objectify import objectify
+
+es = Elasticsearch(config.es_server)
+
+geo_term={
+    "fr":open(config.language_resources_path.rstrip("/")+"/geo_term_fr").read().lower().strip().split("\n"),
+    "en":open(config.language_resources_path.rstrip("/")+"/geo_term_en").read().strip().split("\n")
+}
+
+def convert_es_to_pandas(es_query_results):
+    """
+    Return a `pandas.DataFrame` object built from the elasticsearch query results
+
+    Parameters
+    ----------
+    es_query_results : dict
+        elasticsearch.search() result
+
+    Returns
+    -------
+    pandas.DataFrame
+        Dataframe of the elasticsearch query results
+    """
+    if es_query_results["hits"]["total"] == 0:
+        return None
+    df = pd.DataFrame([g["_source"] for g in es_query_results["hits"]["hits"]])
+    if "score" in df:
+        df["score"] = df["score"].apply(lambda x: float(x))
+    else:
+        df["score"] = -1  # no score field returned by the index
+    df["score"].fillna(-1, inplace=True)
+    return df
+
+
+def parse_score(score):
+    if math.isnan(score):
+        return -1
+    else:
+        return score
+
+def parse_label2(label : str, lang):
+    # Like parse_label(), except that generic geographic terms (geo_term) must
+    # match exactly while the remaining words stay fuzzy.
+    if not lang in geo_term:
+        return parse_label(label)
+
+    label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip()))
+    label = label.strip("'").strip("’")
+
+    parts=label.split(" ")
+    # f=False
+    # for part in parts:
+    #     if part.lower() in geo_term[lang]:
+    #         f=True
+    # if not f:
+    #     return parse_label(label)
+    new_labels=[]
+    for part in parts:
+        if not part.lower() in geo_term[lang]:
+            new_labels.append(parse_label(part).strip("/?")+"+")
+        else:
+            new_labels.append(parse_label(part).strip("/"))
+    return "/"+"[ ]?".join(new_labels)+"/"
+
+
+
+
+def parse_label(label: str):
+    """
+    Turn a label/toponym into a regular expression that tolerates small variations of the
+    official toponyms/aliases (case, elision, missing word endings).
+
+    Parameters
+    ----------
+    label : str
+        toponym
+    Returns
+    -------
+    str
+        regular expression built from the toponym
+    """
+    label = re.sub("[ ]+", " ", re.sub('[(?)]+', "", label.strip()))
+    label = label.strip("'").strip("’")
+    new_label = ""
+    for c in label:
+        if c.isupper():
+            close_par = ")" if not (new_label.endswith(")") or new_label.endswith("?")) and new_label != "" else ""
+            # if new_label.endswith("]"):
+            #     new_label = new_label[:-1] + "({0}{1}]".format(c.lower(), c)
+            # else:
+            new_label += close_par + "([{0}{1}]".format(c.lower(), c)
+            # print("upper", new_label)
+        elif c == " ":
+            new_label += ")?[ ]?"
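+            # [Editor's note] Worked example of the pattern this loop builds:
+            #     parse_label("Le Havre") -> "/([lL]e)?[ ]?([hH]avre)?/"
+            # i.e. each word becomes an optional, case-tolerant group joined by
+            # an optional space, so lower-cased or truncated variants still match.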
+ # print("espace", new_label) + elif c == "'" or c == "’": + new_label += c + ")?" + # print("apostrophe", new_label) + else: + + new_label += ("(" if new_label == "" else "") + ("(" if new_label.endswith("?") else "") + c + # print("else", new_label) + new_label = "/" + new_label + ")?/" + return new_label + + +def most_common_label(toponym: str, lang: str): + """ + + + Parameters + ---------- + toponym : str + toponym + lang : str + toponym language + Returns + ------- + + """ + res = es.search("gazetteer", "place", + body={"query": + {"bool": + {"must": [{"term": {lang: toponym}}], "must_not": [], "should": []} + }, + "from": 0, + "size": 50, "sort": [{'score': "desc"}], "aggs": {}}) + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def most_common_alias(toponym: str, lang: str): + """ + Return most common spatial entity by itsje + + Parameters + ---------- + toponym : str + toponym + lang : str + toponym language + Returns + ------- + + """ + res = es.search("gazetteer", "place", + body={"query": {"nested": {"path": "aliases", + "query": + {"bool": + {"must": [{"term": {"aliases.{0}".format(lang): toponym}}], "must_not": [], "should": []} + } + }}, + "sort": [{"score": "desc"}]}) + + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def n_label_similar(toponym, lang, n=5, score=True): + body = { + "query": { + "query_string": { + "default_field": lang, + "query": parse_label2(toponym,lang) + } + }, + "from": 0, + "size": n + } + if score: + body["sort"] = [ + { + 'score': "desc" + } + ] + + res = es.search("gazetteer", "place", + body=body) + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None + return res + + +def n_alias_similar(toponym, lang, n=5, score=True): + body = {"query": {"nested": {"path": "aliases", + "query": + { + "query_string": { + "default_field": "aliases.{0}".format(lang), + "query": parse_label2(toponym,lang) + } + } + }}, + "from": 0, + "size": n} + if score: + body["sort"] = [ + { + 'score': "desc" + } + ] + res = es.search("gazetteer", "place", + body=body) + + res = convert_es_to_pandas(res) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def get_most_common_id_v2(label, lang="fr"): + """ + Return the spatial entity and its score, based on a specific label and language that obtains the highest score. + :param label: str + :param lang: str + :return: str, float + """ + query_2 = {"query_string": { + "default_field": lang, + "query": parse_label(label), + + }} + res = es.search("gazetteer", "place", + body={"query": + {"bool": + {"must": [{"term": {lang: label}}], "must_not": [], "should": []} + }, + "from": 0, + "size": 50, "sort": [{'score': "desc"}], "aggs": {}}) + res = convert_es_to_pandas(res) + + if not isinstance(res, pd.DataFrame): + if not res: + res = convert_es_to_pandas(es.search("gazetteer", "place", + body={"query": query_2})) + if not isinstance(res, pd.DataFrame): + return None, 0 + return res.iloc[0].id, res.iloc[0].score + + +def get_most_common_id_v3(label, lang='fr'): + """ + Return the spatial entity and its score, based on a specific label and language that obtains the highest score. 
+
+
+def get_most_common_id_v3(label, lang='fr'):
+    """
+    Return the id and score of the highest-scored spatial entity for a given label and language.
+    The difference with v2 is that it handles special cases:
+     * English placenames in a French text
+     * aliases, such as China, that also designate a spatial entity
+    :param label:
+    :param lang:
+    :return:
+    """
+    id_, score = most_common_label(label, lang)
+    if id_:
+        # China case
+        id_2, score2 = most_common_alias(label, lang)
+        if id_2 and score2 > score:
+            return id_2, score2
+        return id_, score
+
+    # if nothing was found by label, search the aliases
+    id_, score = most_common_alias(label, lang)
+    if id_:
+        return id_, score
+
+    similar_label = n_label_similar(label, lang)
+    if isinstance(similar_label, pd.DataFrame):
+        return similar_label.iloc[0].id, similar_label.iloc[0].score
+
+    similar_alias = n_alias_similar(label, lang)
+    if isinstance(similar_alias, pd.DataFrame):
+        return similar_alias.iloc[0].id, similar_alias.iloc[0].score
+
+    return None, -1
+
+
+def get_most_common_id_alias_v2(alias, lang="fr"):
+    res = es.search("gazetteer", "place",
+                    body={"query": {"nested": {"path": "aliases",
+                                               "query":
+                                                   {
+                                                       "query_string": {
+                                                           "default_field": "aliases.{0}".format(lang),
+                                                           "query": parse_label(alias)
+                                                       }
+                                                   }
+                                               }},
+                          "sort": [{"score": "desc"}]})
+
+    res = convert_es_to_pandas(res)
+    if not isinstance(res, pd.DataFrame):
+        return None, 0
+    return res.iloc[0].id, res.iloc[0].score
+
+
+def get_data(id):
+    """
+    Return the data associated to an id in Geodict
+    :param id:
+    :return:
+    """
+    res = es.search("gazetteer", "place",
+                    body={"query": {"bool": {"must": [{"term": {"id": id}}], "must_not": [], "should": []}}, "from": 0,
+                          "size": 10, "sort": [], "aggs": {}})
+    if res["hits"]["total"] > 0:
+        res = res["hits"]["hits"][0]["_source"]
+        return objectify(res)
+    return None
+
+
+def get_data_by_wikidata_id(id):
+    """
+    Return the data associated to a wikidata id in Geodict
+    :param id:
+    :return:
+    """
+    res = es.search("gazetteer", "place",
+                    body={"query": {"bool": {"must": [{"term": {"wikidataID": id}}], "must_not": [], "should": []}},
+                          "from": 0,
+                          "size": 10, "sort": [], "aggs": {}})
+    if res["hits"]["total"] > 0:
+        res = res["hits"]["hits"][0]["_source"]
+        return objectify(res)
+    return None
+
+
+def get_data_by_geonames_id(id):
+    """
+    Return the data associated to a geonames id in Geodict
+    :param id:
+    :return:
+    """
+    res = es.search("gazetteer", "place",
+                    body={"query": {"bool": {"must": [{"term": {"geonameID": id}}], "must_not": [], "should": []}},
+                          "from": 0,
+                          "size": 10, "sort": [], "aggs": {}})
+    if res["hits"]["total"] > 0:
+        res = res["hits"]["hits"][0]["_source"]
+        return objectify(res)
+    return None
+
+
+def get_by_label(label, lang):
+    """
+    To be removed.
+    :param label:
+    :param lang:
+    :return:
+    """
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "size": 50}
+    response = es.search('gazetteer', 'place', body=query)
+    if 'hits' in response['hits']:
+        return objectify(response['hits']['hits'])
+    return None
+
+
+def get_by_alias(alias, lang):
+    """
+    To be removed.
+    :param alias:
+    :param lang:
+    :return:
+    """
+    query = {
+        "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}}
+    response = es.search('gazetteer', 'place', body=query)
+    if 'hits' in response['hits']:
+        return objectify(response['hits']['hits'])
+    return None
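[Editor's sketch] The lookup cascade in get_most_common_id_v3 above (exact label, then exact alias, then fuzzy label, then fuzzy alias) follows a simple "first strategy that answers wins" pattern. A self-contained illustration of that pattern with stub lookups (all names and values here are invented):

    def resolve(toponym, lang, strategies):
        # Try each (name, lookup) pair in order; keep the first real hit.
        for name, lookup in strategies:
            id_, score = lookup(toponym, lang)
            if id_ is not None:
                return id_, score, name
        return None, -1, "unresolved"

    exact = lambda t, l: ("GD42", 12.0) if t == "Montpellier" else (None, 0)
    fuzzy = lambda t, l: ("GD42", 3.0)  # would use parse_label()'s tolerant regex
    print(resolve("Montpelier", "fr", [("exact", exact), ("fuzzy", fuzzy)]))
    # -> ('GD42', 3.0, 'fuzzy')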
+
+
+def label_exists(label, lang):
+    """
+    Return True if a spatial entity exists with a specific label in a specific language.
+    :param label: str
+    :param lang: str
+    :return: bool
+    """
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
+    response = es.count('gazetteer', 'place', body=query)
+    if response["count"] > 0:
+        return True
+    return False
+
+
+def alias_exists(alias, lang):
+    """
+    Return True if a spatial entity exists with a specific alias in a specific language.
+    :param alias: str
+    :param lang: str
+    :return: bool
+    """
+    query = {
+        "query": {"nested": {"path": "aliases", "query": {"bool": {"must": [{"match": {"aliases." + lang: alias}}]}}}}}
+    response = es.count('gazetteer', 'place', body=query)
+    if response["count"] > 0:
+        return True
+    return False
+
+
+def count_of_se(label, lang):
+    """
+    Return the number of spatial entities associated with a specific label in a specific language.
+    :param label: str
+    :param lang: str
+    :return: int
+    """
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}}
+    response = es.count('gazetteer', 'place', body=query)
+    return response["count"]
+
+
+def get_top_candidate(label, lang, n=5):
+    """
+    Return the top-n candidates for a designated label in a specific language.
+    :param label: str
+    :param lang: str
+    :return: list
+    """
+    query = {"query": {"bool": {"must": [{"term": {lang: label}}], "must_not": [], "should": []}}, "sort": [
+        {
+            "score": {
+                "order": "desc"
+            }
+        }
+    ], "size": n}
+    response = es.search('gazetteer', 'place', body=query)
+    if 'hits' in response['hits']:
+        return [x["_source"]["id"] for x in response['hits']['hits']]
+    return []
diff --git a/strpython/helpers/objectify.py b/strpython/helpers/objectify.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf4780e1a05caba16ed5336f2be2e34098aa1cc --- /dev/null +++ b/strpython/helpers/objectify.py @@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+
+"""Scrap module.
+
+Just tiny bits & bolts.
+
+.. author: Adrian Castravete
+.. modified by : Jacques Fize (implemented for Python 3 and recursive objectification)
+"""
+
+from functools import wraps
+
+
+def objectify(func):
+    """Mimic an object given a dictionary.
+
+    Given a dictionary, create an object and make sure that each of its
+    keys is accessible via attributes.
+    If func is a function, act as a decorator; otherwise just convert the
+    dictionary and return it.
+    :param func: A function or another kind of object.
+    :returns: Either the wrapper for the decorator, or the changed value.
+
+    Example::
+
+        >>> obj = {'old_key': 'old_value'}
+        >>> oobj = objectify(obj)
+        >>> oobj['new_key'] = 'new_value'
+        >>> print(oobj['old_key'], oobj['new_key'], oobj.old_key, oobj.new_key)
+
+        >>> @objectify
+        ... def func():
+        ...     return {'old_key': 'old_value'}
+        >>> obj = func()
+        >>> obj['new_key'] = 'new_value'
+        >>> print(obj['old_key'], obj['new_key'], obj.old_key, obj.new_key)
+
+    """
+
+    def create_object(value):
+        """Create the object.
+
+        Given a dictionary, create an object and make sure that each of its
+        keys is accessible via attributes.
+        Ignore everything if the given value is not a dictionary.
+        :param value: A dictionary or another kind of object.
+        :returns: Either the created object or the given value.
+
+        """
+        if isinstance(value, dict):
+            # Build a simple generic object.
+            class Object(dict):
+                def __setitem__(self, key, val):
+                    setattr(self, key, val)
+                    return super(Object, self).__setitem__(key, val)
+
+            # Create that simple generic object.
+            ret_obj = Object()
+            # Assign the attributes given the dictionary keys.
+            for key, val in value.items():
+                if isinstance(val, dict):
+                    ret_obj[key] = objectify(val)
+                else:
+                    ret_obj[key] = val
+                # keep the attribute in sync with the (possibly objectified) item
+                setattr(ret_obj, key, ret_obj[key])
+            return ret_obj
+        else:
+            return value
+
+    # If func is a function, wrap around and act like a decorator.
+    if hasattr(func, '__call__'):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            """Wrapper function for the decorator.
+
+            :returns: The return value of the decorated function.
+
+            """
+            value = func(*args, **kwargs)
+            return create_object(value)
+
+        return wrapper
+
+    # Else just try to objectify the value given.
+    else:
+        return create_object(func)
diff --git a/strpython/helpers/sim_matrix.py b/strpython/helpers/sim_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..5ad26695ca6fc0cdfde3128a06fc44965913e7fe --- /dev/null +++ b/strpython/helpers/sim_matrix.py @@ -0,0 +1,42 @@
+# coding = utf-8
+
+import argparse, bz2, os
+import json
+
+import pandas as pd
+import numpy as np
+
+
+def read_bz2_matrix(file_path):
+    f = bz2.BZ2File(file_path, 'r')
+    matrix_ = np.load(f)
+    return matrix_
+
+
+def filter_selected(matrix, selected):
+    return matrix[selected]  # keep only the rows of the selected graphs
+
+
+def read_and_load(file_path, selected=None, bz2=True):
+    matrix = None
+    if bz2:
+        matrix = read_bz2_matrix(file_path)
+    else:
+        matrix = np.load(file_path)
+    if selected:
+        return filter_selected(matrix, selected)
+    else:
+        return matrix
+
+
+def matrix_to_pandas_dataframe(matrix, selected, sim_measure, type_str, n=5):
+    sim, type_ = sim_measure, type_str
+    tab_array = []
+    for line in range(len(matrix)):
+        top_n = np.argsort(matrix[line])[::-1][1:n + 1]  # best n matches, skipping the graph itself at rank 0
+        index = selected[line]
+        rank = 1
+        for val in top_n:
+            tab_array.append([index, val, sim, type_, rank, 0, 0, 0, 0])
+            rank += 1
+    return pd.DataFrame(tab_array, columns="G1 G2 sim_measure type_str rank c1 c2 c3 c4".split())
diff --git a/strpython/models/str.py b/strpython/models/str.py index 3c255b020a05cfd122b52aff4522a091f5e966da..be98e0019b7dd184dcc4f337db41abc5e90307dd 100644 --- a/strpython/models/str.py +++ b/strpython/models/str.py @@ -1,24 +1,22 @@
 # coding = utf-8
+import copy
+import logging
 import time
 import warnings
+import geopandas as gpd
 import networkx as nx
 import pandas as pd
-import logging
-from shapely.geometry import Point, MultiPoint, MultiLineString, LineString
+from shapely.geometry import MultiPoint,Polygon,Point,LineString
+from ..helpers.collision import collisionTwoSEBoundaries, add_cache_adjacency
+from ..helpers.geodict_helpers import get_data, get_data_by_wikidata_id
+from ..eval.stats import most_common
-from ..config.configuration import config
-#logging.basicConfig(filename=config.log_file,level=logging.INFO)
+from sklearn.cluster import MeanShift, estimate_bandwidth
+# logging.basicConfig(filename=config.log_file,level=logging.INFO)
-from ..helpers.boundary import is_intersect
-from ..helpers.collision_with_gazetteer_data import collisionTwoSEBoundaries
-from ..helpers.deprecated import deprecated
-from ..helpers.gazeteer_helpers import get_data, get_data_by_wikidata_id
-from ..nlp.ner.ner import NER
-import geopandas as gpd
 def get_inclusion_chain(id_, prop):
     """
     For an entity, return its geographical inclusion chain using a given property.
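[Editor's sketch] objectify() above wraps gazetteer query results so fields read as attributes while the value stays a plain dict. A hedged usage example with invented values (assumes strpython is importable):

    from strpython.helpers.objectify import objectify

    place = objectify({"id": "GD123", "en": "Montpellier",
                       "coord": {"lon": 3.87, "lat": 43.61}})
    print(place.en)               # attribute-style access -> Montpellier
    print(place["coord"]["lat"])  # regular dict access still works -> 43.61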
@@ -38,14 +36,10 @@ class STR(object):
     """
     Str basic structure
     """
-
-    def __init__(self, tagged_text, spatial_entities, shapes=None):
+    __cache_inclusion = {}
+    def __init__(self, tagged_text, spatial_entities):
         self.tagged_text = tagged_text
-        self.shapes = shapes
-        if self.shapes:
-            self.spatial_entities = {k: v for k, v in spatial_entities.items() if k in self.shapes}
-        else:
-            self.spatial_entities = spatial_entities
+        self.spatial_entities = spatial_entities
         self.adjacency_relationships = {}
         self.inclusion_relationships = {}
@@ -124,6 +118,7 @@ class STR(object):
         except:
             label = None
         self.add_spatial_entity(id, label, False)
+        # print(self.graph.nodes(data=True))
     def add_adjacency_rel(self, se1, se2,v=True):
         if not se1 in self.adjacency_relationships:
@@ -164,23 +159,34 @@ class STR(object):
         Method for updating links between spatial entities
         :return:
         """
-        nodes = self.graph.nodes(data=True)
+        nodes = copy.deepcopy(self.graph.nodes(data=True))
         self.graph.clear()
         self.graph.add_nodes_from(nodes)
+        print("inclusion")
+        self.get_inclusion_relationships()
+        for se1 in self.inclusion_relationships:
+            for se2 in self.inclusion_relationships[se1]:
+                if self.inclusion_relationships[se1][se2]:
+                    self.graph.add_edge(se1, se2, key=0, color="red")
+
+        print("adjacency")
         self.get_adjacency_relationships()
         for se1 in self.adjacency_relationships:
             for se2 in self.adjacency_relationships[se1]:
                 if self.adjacency_relationships[se1][se2]:
                     self.graph.add_edge(se1, se2, key=0, color="green")
+        print("adjacency done")
+
+
-        self.get_inclusion_relationships()
-        for se1 in self.inclusion_relationships:
-            for se2 in self.inclusion_relationships[se1]:
-                if self.inclusion_relationships[se1][se2]:
-                    self.graph.add_edge(se1, se2, key=0, color="red")
+    def add_cache_inclusion(self, id1, id2):
+        if not id1 in STR.__cache_inclusion:
+            STR.__cache_inclusion[id1] = set([])
+        STR.__cache_inclusion[id1].add(id2)
     def is_included_in(self, se1_id, se2_id):
         """
         Return True if the two spatial entities identified by @se1_id and @se2_id share an inclusion relationship
         :param se1_id:
@@ -191,13 +197,19 @@ class STR(object):
             if se2_id in self.inclusion_relationships[se1_id]:
                 return self.inclusion_relationships[se1_id][se2_id]
+        if se1_id in STR.__cache_inclusion:
+            if se2_id in STR.__cache_inclusion[se1_id]:
+                return True
+
         inc_chain_P131 = get_inclusion_chain(se1_id, "P131")
         inc_chain_P706 = get_inclusion_chain(se1_id, "P706")
         inc_chain = inc_chain_P131
         inc_chain.extend(inc_chain_P706)
         inc_chain = set(inc_chain)
         if se2_id in inc_chain:
+            self.add_cache_inclusion(se1_id, se2_id)
             return True
+
         return False
     def get_inclusion_relationships(self):
@@ -223,49 +235,65 @@ class STR(object):
         p47se1 = []
         for el in data["P47"]:
             d = get_data_by_wikidata_id(el)
+            if not d:
+                continue
             if "id" in d:
                 p47se1.append(d["id"])
         return p47se1
+    def is_adjacent(self, se1, se2, datase1=None, datase2=None):
+        f = False
+        stop_class = set(["A-PCLI", "A-ADM1"])
+        if self.is_included_in(se1, se2):
+            return f
+
+        elif self.is_included_in(se2, se1):
+            return f
+
+        data_se1 = get_data(se1) if not datase1 else datase1  # avoids reloading the data each time
+        data_se2 = get_data(se2) if not datase2 else datase2
+
+        # print("testP47")
+        if "P47" in data_se2:
+            if se1 in self.getP47AdjacencyData(data_se2):
+                return True
+        # print("P47")
+        if not f:
+            if "P47" in data_se1:
+                if se2 in self.getP47AdjacencyData(data_se1):
+                    return True
+        # print("P47")
+        if not f:
+            # print("test collision")
+            if collisionTwoSEBoundaries(se1, se2):
+                return True
+        if not f:
+            if "coord" in data_se1 and "coord" in data_se2:
+                if Point(data_se1["coord"]["lon"], data_se1["coord"]["lat"]).distance(
+                        Point(data_se2["coord"]["lon"], data_se2["coord"]["lat"])) < 1 and len(
+                    set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1:
+                    return True
+        return f
+
     def get_adjacency_relationships(self):
         """
         Return all the adjacency relationships between all the spatial entities in the STR.
         :return:
         """
-        stop_class=set(["A-PCLI","A-ADM1"])
-
+        data = {se: get_data(se) for se in self.spatial_entities}
         for se1 in self.spatial_entities:
-            data_se1 = get_data(se1)
+            data_se1 = data[se1]
             for se2 in self.spatial_entities:
                 if se1 == se2:
                     continue
-
-                if self.is_included_in(se1,se2) or self.is_included_in(se2,se1):
-                    continue
+                # print("test adjacency")
                 if se1 in self.adjacency_relationships:
                     if se2 in self.adjacency_relationships[se1]:
                         continue
                 if se2 in self.adjacency_relationships:
                     if se1 in self.adjacency_relationships[se2]:
                         continue
-                data_se2 = get_data(se2)
-                f = False
-                if "P47" in data_se2:
-                    if se1 in self.getP47AdjacencyData(data_se2):
-                        f = True
-                        #print(data_se1["en"], data_se2["en"], "P47")
-                if not f:
-                    if "P47" in data_se2:
-                        if se2 in self.getP47AdjacencyData(data_se2):
-                            f = True
-                            #print(data_se1["en"], data_se2["en"], "P47")
-                if not f:
-                    f = collisionTwoSEBoundaries(se1, se2)
-                if not f:
-                    if Point(data_se1["coord"]["lon"], data_se1["coord"]["lat"]).distance(
-                            Point(data_se2["coord"]["lon"], data_se2["coord"]["lat"])) < 1 and len(
-                        set(data_se1["class"]) & stop_class) < 1 and len(set(data_se2["class"]) & stop_class) < 1:
-                        f = True
-                self.add_adjacency_rel(se1, se2, f)
+                data_se2 = data[se2]
+                self.add_adjacency_rel(se1, se2, self.is_adjacent(se1, se2, data_se1, data_se2))
@@ -334,12 +362,55 @@ class STR(object):
             id1, id2 = edge[0], edge[1]
             if edge[2]["color"] == "green":
                 self.add_adjacency_rel(edge[0],edge[1])
+                add_cache_adjacency(id1, id2)
             elif edge[2]["color"] == "red":
                 self.add_inclusion_rel(edge[0], edge[1])
-    def set_all_shapes(self,shapes):
-        self.shapes=shapes
+                self.add_cache_inclusion(id1, id2)
+
-    def map_projection(self):
+    def get_geo_data_of_se(self):
+        points, label, class_ = [], [], []
+        for se in self.spatial_entities:
+            data = get_data(se)
+            try:
+                points.append(Point(data["coord"]["lon"], data["coord"]["lat"]))
+                label.append(data["en"])
+                class_.append(most_common(data["class"]))
+            except:
+                pass
+        df = gpd.GeoDataFrame({"geometry": points, "label": label, "classe": class_})
+        df["x"] = df.geometry.apply(lambda p: p.x)
+        df["y"] = df.geometry.apply(lambda p: p.y)
+        return df
+
+    def get_cluster(self):
+        # cluster the entities' coordinates with MeanShift and return buffered cluster hulls
+        data = self.get_geo_data_of_se()
+        bandwidth = estimate_bandwidth(data[["x", "y"]].values)
+        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+        ms.fit(data[["x", "y"]].values)
+        data["cluster"] = ms.labels_
+        """
+        # second clustering pass
+        c=data['cluster'].value_counts().idxmax()
+        X=data[data["cluster"] == c]
+        X=X[["x","y"]]
+        bandwidth = estimate_bandwidth(X.values)
+        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
+        ms.fit(X.values)
+        X["cluster"]=ms.labels_+(data['cluster'].max()+1)
+        lab=ms.labels_
+        lab+=data['cluster'].max()+1
+
+        data["cluster"][data["cluster"] == c]=X["cluster"]
+        """
+
+        geo = data.groupby("cluster").apply(to_Polygon)
+        cluster_polybuff = gpd.GeoDataFrame(geometry=geo)
+        return cluster_polybuff
+
+
+    def map_projection(self, show=False):  # param renamed from `plt`, which the matplotlib import below immediately shadowed
+        import matplotlib.pyplot as plt
+        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
+        base = world.plot(color='white', edgecolor='black', figsize=(16, 9))
@@ -376,7 +447,24 @@ class STR(object):
         gpd.GeoSeries(points).plot(ax=base, marker='o', markersize=5, color="blue")
         gpd.GeoSeries(lines_adj).plot(ax=base, color="green")
         gpd.GeoSeries(lines_inc).plot(ax=base, color="red")
-        print("adj",gpd.GeoSeries(lines_adj))
-        print("inc",gpd.GeoSeries(lines_inc))
+
+        if not show:
+            return base
         plt.show()
+
+def to_Multipoints(x):
+    # note: despite its name, this returns a buffered Polygon
+    #print(x[["x","y"]].values)
+    return Polygon([Point(z) for z in x[["x","y"]].values]).buffer(1)
+
+def to_Polygon(x):
+    points = [Point(z) for z in x[["x","y"]].values]
+    if len(points) > 2:
+        coords = [p.coords[:][0] for p in points]
+        poly = Polygon(coords).buffer(1)
+        return poly
+    elif len(points) == 1:
+        return points[0].buffer(1)
+    else:
+        coords = [p.coords[:][0] for p in points]
+        return LineString(coords).buffer(1)
\ No newline at end of file
diff --git a/strpython/models/transformation/transform.py b/strpython/models/transformation/transform.py index adbc968399ff40c386d38ed50ab519e88a9961fc..f97e123873503ea82df1398ca11d0e0f73c8429a 100644 --- a/strpython/models/transformation/transform.py +++ b/strpython/models/transformation/transform.py @@ -7,7 +7,7 @@ import numpy as np
 from elasticsearch import Elasticsearch
 from ...config.configuration import config
-from ...helpers.gazeteer_helpers import get_data
+from ...helpers.geodict_helpers import get_data
 from ..str import STR, get_inclusion_chain
 client = Elasticsearch(config.es_server)
@@ -15,12 +15,28 @@ client = Elasticsearch(config.es_server)
 class Transformation():
     """
-    Transform structure
+    Transformation base class
     """
+    def transform(self, str_: STR, **kwargs) -> STR:
+        """
+        Transform a STR
+        :param str_: STR input
+        :param kwargs: dict --> args needed for the transformation
+        :return: STR
+        """
         pass
+
class Generalisation(Transformation):
+    """
+    Generalisation transformation class declaration. A generalisation replaces each spatial entity
+    in a STR by one of its "upper" entities -- an upper entity e1 of e2 includes e2 -- following one
+    of two hypotheses. The first, "transform_all", generalises every spatial entity by climbing the
+    inclusion chain, while the second, "transform_bounded", only generalises entities up to a given
+    administrative level (town, region, country, ...).
+    """
+
+    # Levels accepted by transform_bounded()
     bounded_class_references = {
         "country": ["A-PCLI"],
         "region": ["A-ADM1"],
@@ -29,6 +45,12 @@ class Generalisation(Transformation):
     }
     def transform(self, str_: STR, **kwargs) -> STR:
+        """
+        Apply the generalisation.
+        :param str_: STR
+        :param kwargs: n (number of inclusion levels to climb), type_gen ("all" or "bounded"), bound (target level for "bounded")
+        :return: STR
+        """
         h = kwargs.get("n", 1)
         type_ = kwargs.get("type_gen", "all")
         bound = kwargs.get("bound", "country")
         """
         Store Inclusion Informations
         """
         if type_ == "all":
-            return self.transform_all(str_, h,cp=cp)
+            return self.transform_all(str_, h, cp=cp)
         if type_ == "bounded":
-            return self.transform_bounded(str_, bound,cp=cp)
+            return self.transform_bounded(str_, bound, cp=cp)
         else:
Using \"all\" generalisation by default") return self.transform_all(str_, h, cp=cp) @@ -50,14 +72,14 @@ class Generalisation(Transformation): for node in graph.nodes(): if not node in inclusion_dictionnary: inc_list = [] - data=get_data(node) + data = get_data(node) try: inc_list = data["inc_P131"] except: pass if not inc_list: if "inc_geoname" in data: - inc_list=data["inc_geoname"] + inc_list = data["inc_geoname"] if inc_list: inc_list = inc_list if isinstance(inc_list, list) else [inc_list] @@ -80,8 +102,7 @@ class Generalisation(Transformation): associated_classes[it] = classes_list return associated_classes - - def transform_bounded(self, str_: STR, bound: str,cp=True) -> STR: + def transform_bounded(self, str_: STR, bound: str, cp=True) -> STR: if not bound in Generalisation.bounded_class_references: print("'bound' must be a value from {0}".format(str(Generalisation.bounded_class_references))) exit() @@ -100,7 +121,7 @@ class Generalisation(Transformation): if t_: transform_map[es] = t_ if cp: - copy_= copy.deepcopy(str_) + copy_ = copy.deepcopy(str_) copy_.transform_spatial_entities(transform_map) copy_.update() return copy_ @@ -109,13 +130,13 @@ class Generalisation(Transformation): str_.update() return str_ - def transform_all(self, str_: STR, h: int,cp=True) -> STR: - h=int(h) - graph=str_.graph + def transform_all(self, str_: STR, h: int, cp=True) -> STR: + h = int(h) + graph = str_.graph inclusion_dict = Generalisation.get_inclusion_map(graph) transform_map = {} new_label = {} - i=0 + i = 0 for node in graph.nodes(): if node in inclusion_dict: inc_chain = inclusion_dict[node] @@ -126,7 +147,7 @@ class Generalisation(Transformation): transform_map[node] = inc_chain[h - 1] new_label[inc_chain[h - 1]] = get_data(inc_chain[h - 1])["en"] if cp: - copy_= copy.deepcopy(str_) + copy_ = copy.deepcopy(str_) copy_.transform_spatial_entities(transform_map) copy_.update() return copy_ @@ -136,7 +157,7 @@ class Generalisation(Transformation): class Expansion(Transformation): - def getAroundEntities(self, data, score, distance=150,unit="km",n=1): + def getAroundEntities(self, data, score, distance=150, unit="km", n=1): if not "coord" in data: return [] hits = client.search("gazetteer", "place", { @@ -155,7 +176,7 @@ class Expansion(Transformation): ], "filter": { "geo_distance": { - "distance": "{0}{1}".format(distance,unit), + "distance": "{0}{1}".format(distance, unit), "coord": data["coord"] } } @@ -165,69 +186,65 @@ class Expansion(Transformation): {"score": "desc"} ], "size": n}) if hits["hits"]["total"] > 0: - ids_=[] + ids_ = [] for h in hits["hits"]["hits"]: ids_.append(h["_source"]["id"]) return ids_ return [] - def select_es(self,graph): + def select_es(self, graph): es = np.array(list(graph.nodes)) - score = [np.inf for i in range(len(es))] + score = [-1 for i in range(len(es))] for e in range(len(es)): data = get_data(es[e]) if "score" in data: score[e] = float(data["score"]) - return es[score < np.median(score)] + return np.median(score), es[score < np.median(score)] def transform(self, str_: STR, **kwargs): type_ = "adjacency" distance = kwargs.get("distance", 150) unit = kwargs.get("unit", 150) n = kwargs.get("adjacent_count", 1) - cp=kwargs.get("cp", True) + cp = kwargs.get("cp", True) if type_ == "adjacency": - return self.transform_adj(str_, distance,unit,n,cp) + return self.transform_adj(str_, distance, unit, n, cp) - - def transform_adj(self, str_: STR, distance: int,unit : str,n :int,cp=True) -> STR: - graph=str_.graph - selected_se = self.select_es(graph) + def 
transform_adj(self, str_: STR, distance: int, unit: str, n: int, cp=True) -> STR: + graph = str_.graph + median, selected_se = self.select_es(graph) data_se, scores_ = {}, [] for node in selected_se: data_se[node] = get_data(node) if "score" in data_se[node]: scores_.append(float(data_se[node]["score"])) else: - scores_.append(np.inf) - median = np.median(scores_) - - new_nodes=[] + scores_.append(-1) + new_nodes = [] + labels = [] for node in selected_se: data_ = data_se[node] if (not "P-PPL" in data_["class"]) and (not "A-ADM4" in data_["class"]): continue if not "country" in data_: continue - neighbor = self.getAroundEntities(data_, median, distance,unit,n) + neighbor = self.getAroundEntities(data_, median, distance, unit, n) if not neighbor: try: - neighbor=[get_inclusion_chain(node,"P131")[0]] + neighbor = [get_inclusion_chain(node, "P131")[0]] except: - neighbor=[] + neighbor = [] + labels.extend([get_data(n)["en"] for n in neighbor]) new_nodes.extend(neighbor) - new_nodes=list(set(new_nodes)) - labels=[] - for no in new_nodes: - #print(no,get_data(no)) - labels.append(get_data(no)["en"]) + new_nodes = list(set(new_nodes)) if cp: - copy_= copy.deepcopy(str_) - copy_.add_spatial_entities(new_nodes,labels) + copy_ = copy.deepcopy(str_) + copy_.add_spatial_entities(new_nodes, labels) + copy_.update() return copy_ - str_.add_spatial_entities(new_nodes,labels) + str_.add_spatial_entities(new_nodes, labels) str_.update() return str_ diff --git a/strpython/nlp/disambiguator/geodict_gaurav.py b/strpython/nlp/disambiguator/geodict_gaurav.py index 14eb26050e8b8648bac5465fdc46ee0151a9a83a..f6ae42277bb7e9bd5e6c553799e76d8db7a999de 100644 --- a/strpython/nlp/disambiguator/geodict_gaurav.py +++ b/strpython/nlp/disambiguator/geodict_gaurav.py @@ -1,11 +1,13 @@ # coding = utf-8 import math -from ...helpers.collision_with_gazetteer_data import * -from ...helpers.gazeteer_helpers import * +from ...helpers.collision import * +from ...helpers.geodict_helpers import * from .disambiguator import Disambiguator from ...models.str import get_inclusion_chain + + class GauravGeodict(Disambiguator): def __init__(self): @@ -67,7 +69,7 @@ class GauravGeodict(Disambiguator): id_fixed = fixed_entities[fixed]["id"] if self.Adjacency_P47(id_cand, id_fixed): score_dc[id_cand] += 3 - if self.Adjacency_Hull(id_cand, id_fixed): + elif self.Adjacency_Hull(id_cand, id_fixed): score_dc[id_cand] += 2 score_dc[id_cand] += self.get_inclusion_score(id_cand, id_fixed) m = max(score_dc, key=score_dc.get) @@ -126,7 +128,7 @@ class GauravGeodict(Disambiguator): for amb_ent in ambiguous_entities: d = self.disambiguateOne(ambiguous_entities[amb_ent], fixed_entities) if not d: - d_amb_results[amb_ent] = get_most_common_id_v2(amb_ent, lang)[0] + d_amb_results[amb_ent] = get_most_common_id_v3(amb_ent, lang)[0] else: d_amb_results[amb_ent] = d for k, v in fixed_entities.items(): diff --git a/strpython/nlp/disambiguator/models/bigram.py b/strpython/nlp/disambiguator/models/bigram.py index d9ce129f14032e25035c7f69d597623e38529cbc..f45ba97b13382fa7cc7c7bccc421732284fba791 100644 --- a/strpython/nlp/disambiguator/models/bigram.py +++ b/strpython/nlp/disambiguator/models/bigram.py @@ -31,7 +31,7 @@ class BigramModel: def get_bigram_probability(self,uri1,uri2,pr1=1): - nna=0.00000000000000001 + nna=0.00000001 if uri1 in self.cooc_freq: if uri2 in self.cooc_freq[uri1]: return (self.cooc_freq[uri1][uri2] / self.count_associated[uri1])+pr1 diff --git a/strpython/nlp/disambiguator/most_common.py b/strpython/nlp/disambiguator/most_common.py index 
60de2a6c50a38365dbd0f7a64bf5b5ed5cfe6dcf..12e448912f613c3b12b44c5ee08df0da5b4532ae 100644 --- a/strpython/nlp/disambiguator/most_common.py +++ b/strpython/nlp/disambiguator/most_common.py @@ -1,11 +1,18 @@
 # coding = utf-8
-from ...helpers.gazeteer_helpers import label_exists, alias_exists, get_most_common_id_v2,get_most_common_id_v3, get_most_common_id_alias_v2
+from ...helpers.geodict_helpers import *
 from .disambiguator import Disambiguator
 import re, json, os
 from ...config.configuration import config
+from inflector import Inflector, English, Spanish, French
+
+inflectors = {
+    "en": Inflector(English()),
+    "fr": Inflector(French()),
+    "es": Inflector(Spanish())
+}
 stop_words = {
     "fr": set(open(os.path.join(config.language_resources_path,"stop_words_fr.txt")).read().split("\n")),
     "en": set(open(os.path.join(config.language_resources_path,"stop_words_en.txt")).read().split("\n"))
@@ -37,22 +44,26 @@ class MostCommonDisambiguator(Disambiguator):
     def disambiguate_(self, label, lang='fr'):
         if re.match("^\d+$", label):
             return 'O', -1
-        if label.lower().rstrip("s") in stop_words[lang] or label.lower().rstrip("s") in common_words[lang]:
-            return 'O', -1
+        if lang in stop_words:  #and lang in common_words:
+            if label.lower().rstrip("s") in stop_words[lang]:  #or label.lower().rstrip("s") in common_words[lang]:
+                return 'O', -1
-        plural = label.rstrip("s") + "s"
-        if plural.lower() in stop_words[lang] or plural.lower() in common_words[lang]:
-            return 'O', -1
+        if lang in inflectors:
+            plural = inflectors[lang].singularize(label)  # the singular form, despite the variable name
+        else:
+            plural = label.rstrip("s") + "s"
+        if plural.lower() in stop_words[lang]:  # or plural.lower() in common_words[lang]:
+            return 'O', -1
-        id_, score = get_most_common_id_v3(label, lang)
+        id_, score = most_common_label(label, lang)
         if id_:
             id_en, score_en = get_most_common_id_v3(label, "en")
             if id_en and score_en:
                 if score_en > score:
                     id_, score = id_en, score_en
-            id_alias, score_alias = get_most_common_id_alias_v2(label, lang)
+            id_alias, score_alias = most_common_alias(label, lang)
             if id_alias and score_alias:
                 if score_alias > score:
                     id_, score = id_alias, score_alias
-        print(label,id_,score)
+        #print(label,id_,score)
         return id_, score
diff --git a/strpython/nlp/disambiguator/pagerank.py b/strpython/nlp/disambiguator/pagerank.py index 7c95ee46197626051ccb976d4e02d0578582a841..25eb02eb7edb9cdc37cf918a70f8690339f7f19f 100644 --- a/strpython/nlp/disambiguator/pagerank.py +++ b/strpython/nlp/disambiguator/pagerank.py @@ -1,6 +1,6 @@
 # coding = utf-8
-from ...helpers.gazeteer_helpers import label_exists, alias_exists, get_most_common_id_v2, get_most_common_id_alias_v2
+from ...helpers.geodict_helpers import *
 from .disambiguator import Disambiguator
@@ -13,11 +13,11 @@ class PageRankDisambiguator(Disambiguator):
         new_count = {}
         selected_en = {}
         for en in se_:
-            en_most_common, score_en = get_most_common_id_v2(en, "en")
+            en_most_common, score_en = get_most_common_id_v3(en, "en")
             if label_exists(en, lang):
-                id_, score = get_most_common_id_v2(en, lang)
+                id_, score = get_most_common_id_v3(en, lang)
             elif alias_exists(en, lang):
-                id_, score = get_most_common_id_alias_v2(en, lang)
+                id_, score = get_most_common_id_alias_v2(en, lang)  # was `(en, lang)`, which dropped the lookup call
             if en_most_common and score_en > score:
                 selected_en[en_most_common] = en
diff --git a/strpython/nlp/disambiguator/wikipedia_cooc.py b/strpython/nlp/disambiguator/wikipedia_cooc.py index 485002244978a4c4e13b31cdb3781b3977dac47f..56ec7cd83f97971a9d392f74151eb4dc37c0f047 100644 --- a/strpython/nlp/disambiguator/wikipedia_cooc.py +++ b/strpython/nlp/disambiguator/wikipedia_cooc.py
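[Editor's sketch] The wikipedia_cooc.py changes below pick, for each toponym, the candidate with the strongest weighted ties in a co-occurrence graph. A toy version of that selection rule (ids and weights invented):

    import networkx as nx

    # Candidate graph for one toponym: edge weights play the role of the
    # co-occurrence probabilities computed below.
    g = nx.Graph()
    g.add_edge("paris_fr", "france", weight=0.9)
    g.add_edge("paris_tx", "france", weight=0.1)

    candidates = ["paris_fr", "paris_tx"]
    best = max(candidates, key=lambda c: g.degree(c, weight="weight"))
    print(best)  # -> paris_fr (strongest total co-occurrence)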
@@ -5,7 +5,7 @@ from .disambiguator import Disambiguator
 from .models.bigram import BigramModel
 import pickle
 from ...config.configuration import config
-from ...helpers.gazeteer_helpers import get_data,get_most_common_id_v3,get_top_candidate
+from ...helpers.geodict_helpers import get_data,get_most_common_id_v3,get_top_candidate
 from .most_common import stop_words,common_words
 import networkx as nx
@@ -14,11 +14,11 @@ def read_pickle(fn):
 class WikipediaDisambiguator(Disambiguator):
-    def __init__(self):
+    def __init__(self, measure="centrality"):
         Disambiguator.__init__(self)
         # Load model
         self.model=BigramModel(read_pickle(config.wiki_cooc_dis.cooc_freq),read_pickle(config.wiki_cooc_dis.count))
-
+        self.measure = measure
     def disambiguate(self, ner_result, lang="en"):
         count, se_ = self.extract_se_entities(ner_result)
         new_count = {}
@@ -38,21 +38,22 @@ class WikipediaDisambiguator(Disambiguator):
         for e in entities:
             if re.match("^\d+$", e):
                 continue
-            if e.lower().rstrip("s") in stop_words[lang] or e.lower().rstrip("s") in common_words[lang]:
+            if e.lower().rstrip("s") in stop_words[lang]:  # or e.lower().rstrip("s") in common_words[lang]:
                 continue
             plural = e.rstrip("s") + "s"
-            if plural.lower() in stop_words[lang] or plural.lower() in common_words[lang]:
+            if plural.lower() in stop_words[lang]:  # or plural.lower() in common_words[lang]:
                 continue
             spat_en.append(e)
-
+        spat_en = list(set(spat_en))
         g = nx.Graph()
         possible_candidates = []
         betw_cand = {}  # indicates which toponym group a candidate belongs to (possibly redundant)
         group_candidate = {}  # candidates per toponym
+
         for e in spat_en:
-            cand = get_top_candidate(e, lang)
+            cand = get_top_candidate(e, lang, 4)
             group_candidate[e] = cand
             betw_cand[e] = cand
             for n in cand:
@@ -62,35 +63,42 @@ class WikipediaDisambiguator(Disambiguator):
         for cand in possible_candidates:
             g.add_node(cand, label=get_data(cand)[lang])
+        data_candidate = {ca: get_data(ca) for ca in possible_candidates}
         for cand in possible_candidates:
             for cand2 in possible_candidates:
                 # Get PageRank score
-                d = get_data(cand)
+                d = data_candidate[cand]
+                sc = 1
                 if "score" in d:
                     sc = float(d["score"])
-                # Compute probability
                 prob = self.model.get_coocurence_probability(sc, cand, cand2)
+
                 if cand2 in betw_cand[cand] or cand in betw_cand[cand2]:
                     prob = 0.0
                 if prob < 0.0000001:
                     prob = 0.0
                 if not cand == cand2:
                     # take the lowest co-occurrence between two candidates
-                    if (cand2, cand) in list(g.edges):
-                        if g.edge[cand2][cand]["weight"] < prob:
+                    if g.has_edge(cand2, cand):
+                        if g.edges[cand2, cand]["weight"] < prob:
                             continue
                     g.add_edge(cand, cand2, weight=prob)
         selected = {}
         # Take the candidate with the highest weighted degree (or closeness centrality)
         for gr in group_candidate:
             try:
-                selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
+                if self.measure == "degree":
+                    selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
+                elif self.measure == "centrality":
+                    selected[gr] = max(group_candidate[gr], key=lambda x: nx.closeness_centrality(g, x, distance="weight"))
+                else:  # degree by default
+                    selected[gr] = max(group_candidate[gr], key=lambda x: g.degree(x, weight='weight'))
+
             except:
-                #print(group_candidate[gr]) empty group
-                selected[gr]=get_most_common_id_v3(gr,lang)
+                # empty candidate group: fall back to the most common id
+                selected[gr] = get_most_common_id_v3(gr, lang)[0]
         return selected
diff --git a/strpython/nlp/ner/by_dict.py b/strpython/nlp/ner/by_dict.py index af4a3619401de2a576929bd3ca5476d0f9a74fdf..ec7ed54fc0c51d685f69f1b92ebe5027584ec0b0 100644 --- a/strpython/nlp/ner/by_dict.py +++
b/strpython/nlp/ner/by_dict.py @@ -3,7 +3,7 @@ import numpy as np from polyglot.text import Text as PolyText from .ner import NER -from ...helpers import gazeteer_helpers +from ...helpers import geodict_helpers class ByDict(NER): @@ -37,9 +37,9 @@ class ByDict(NER): cur = f.tolist() for t in terms: - GID = gazeteer_helpers.get_most_common_id_v3(" ".join(pos_t[:, 0][t]), lang=self._lang)[0] + GID = geodict_helpers.get_most_common_id_v3(" ".join(pos_t[:, 0][t]), lang=self._lang)[0] if GID: - data = gazeteer_helpers.get_data(GID) + data = geodict_helpers.get_data(GID) if "score" in data: if not float(data["score"]) > self.threshold: continue diff --git a/strpython/pipeline.py b/strpython/pipeline.py index 013963d797b3ba090354cdec0633ea796a7f965c..c2d5feb71956894c83d87f2f5cce666e62e6d836 100644 --- a/strpython/pipeline.py +++ b/strpython/pipeline.py @@ -1,5 +1,6 @@ # coding =utf-8 -from .helpers.deprecated import deprecated +from strpython.models.str import STR + from .models.transformation.transform import Generalisation, Expansion from .nlp.disambiguator.disambiguator import Disambiguator from .nlp.disambiguator.most_common import MostCommonDisambiguator @@ -117,42 +118,6 @@ class Pipeline(object): str_=Expansion().transform(str_,**kwargs) return str_ - @deprecated("Have been proved to be not useable for now ...") - def buildSemSTR(self,text,win_size=5): - """ - Return the corresponding STR for a text. - :param text: - :return: STR - """ - _,output, se_identified = self.parse(text) - - str_=STR_SEM(output,se_identified) - str_.build(win_size=win_size) - return str_ - @deprecated("Have been proved to be not useable") - def build_class_variation_str(self,text): - """ - Return the corresponding STR for a text. - :param text: - :return: STR - """ - _,output, se_identified = self.parse(text) - str_=STRClassVariation(output,se_identified) - str_.build() - return str_ - - @deprecated("Have been proved to be not useable") - def build_population_variations_str(self,text): - """ - Return the corresponding STR for a text. 
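[Editor's sketch] How the two transformations wired into Pipeline above are invoked; the keyword arguments match the Generalisation/Expansion signatures earlier in this diff, and an already-built STR `str_` plus a running gazetteer are assumed:

    from strpython.models.transformation.transform import Generalisation, Expansion

    # generalise every entity one inclusion level up ...
    str_gen = Generalisation().transform(str_, type_gen="all", n=1)
    # ... or only up to country level
    str_country = Generalisation().transform(str_, type_gen="bounded", bound="country")

    # extend the STR with entities within 150 km of its below-median-score members
    str_ext = Expansion().transform(str_, distance=150, unit="km", adjacent_count=1)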
- :param text: - :return: STR - """ - _,output, se_identified = self.parse(text) - str_=STRPopulationVariation(output,se_identified) - str_.build() - return str_ - if __name__ == '__main__': pass \ No newline at end of file diff --git a/strpython/resources/language_resources/dic_fr.json b/strpython/resources/language_resources/dic_fr.json index a558ed10c05b6c5883911fb5da24829c8eb052ce..c36482ecbde2c0ebacdecf2849ca0beb45c743d5 100644 --- a/strpython/resources/language_resources/dic_fr.json +++ b/strpython/resources/language_resources/dic_fr.json @@ -229191,7 +229191,6 @@ "iatrogénique", "affronteuse", "emplacement", - "belgique", "reparlementer", "péquin", "surmiser", diff --git a/strpython/resources/language_resources/geo_term_en b/strpython/resources/language_resources/geo_term_en new file mode 100644 index 0000000000000000000000000000000000000000..3911cd2fcf4220bf27800b6113f94c819bd7405d --- /dev/null +++ b/strpython/resources/language_resources/geo_term_en @@ -0,0 +1,1030 @@ +absolute humidity +absolute location +accessibility resource +accessibility +acid rain +active volcano +agricultural geography +air mass +alluvia +alluvial soils +altitude +antarctic +antarctic circle +anthracite +anthropization +anticline +antimeridian +antipodes +aquifer +archipelago +arête +arroyo +arctic +arctic circle +ash +atlantic seaboard fall line +atlas +atmosphere +atoll +ayre +azimuth +badlands +barrier ridge +base level +basin +batholith +bay +beach +bearing +bedrock +bight +biological diversity +biogeography +biosphere +biota +bituminous +blowout +bocage +body of water +bootheel +border +break-in-bulk point +built environment +butte +calanque +caldera +canal +canyon +cape +caprock +cardinal directions +carrying capacity +cartography +cartogram +cave +cay +cenote +central business district +census-designated place (cdp) +channel +chaparral +chimney +chinook +chorography +cinder cone +circle of latitude +cirque +city +city-state +cliff +climax vegetation +coast +col +colony +commonwealth +compass +compass rose +confluence +coniferous +contiguity +continent +continental climate +continental divide +continental shelf +continentality +contour lines +conurbation +corrasion +core area +coulee +couloir +country +county +course +crater +crater lake +crop-lien system +crust +cryosphere +cryoturbation +cuesta +cultural geography +culture +culture hearth +cut bank +cwm +cyclopean stairs +dale +dam +de facto'' segregation +de jure'' segregation +deciduous forest +degree +degree day +dell +delta +demography +depression +desert +digital elevation model (dem) +dike +distributary +district +dome +dormant volcano +drainage basin +drainage divide +draw +drumlin +dry farming +dry point +dune +eastern hemisphere +economic geography +economies of agglomeration +edgelands +elevation +emergent coastline +enclave +endorheic basin +equator +erratic +escarpment +esker +estuary +evapotranspiration +exclave +exotic stream +extinct volcano +exurban +fall line +fallow +fault +fault-block mountain +fault zone +federation +fen +field +firth +fish ladder +fjord +floodplain +focality +forest +functional diversity +gazetteer +geodesy +geoid +geoinformatics +geographic information science (gis) +geographic information system (gis) +geographic names information system (gnis) +geography +geolocation +geomatics +geomorphology +geosphere +geostatistics +ghetto +glacial till +glaciation +glacier +global positioning system (gps) +globe +graben +great circle +great-circle distance +grid +groundwater +growing season +groyne +gulch +gulf +gully 
+guyot +hamlet +hanging valley +harmonic tremor +heading +headland +hearth +heartland +heath +hedgerow +hemisphere +highland +highway +hill +hillock +hinterland +historical geography +hogback +horizon +hotspot +human geography +hummock +humus +hydrography +hydrosphere +ice age +ice cap +iceberg +igneous rock +impact crater +impoundment +inertia costs of location +inlet +inselberg +insular +integrated geography +intercardinal directions +interfluve +intermediate directions +international date line +international waters +intervening opportunity +intracoastal waterway system +inverted river delta +island +islet +isohyet +isthmus +jurisdiction +jhum cultivation +jungle +kame +karst +kettle +key col +knoll +lacustrine plain +lagoon +lahar +lake +land bridge +landform +landmark +landmass +lateral blast +latitude +lava +leaching +leeward +legend +ledc +levee +life-cycle stage +lignite +lithosphere +lithospheric plates +location +loess +longitude +lowland +magma +main stem +mainland +makhtesh +mantle +map +map projection +maritime climate +marsh +mason–dixon line +massif +meander +meander scar +medc +mediterranean climate +megalopolis +megaregion +meridian +mesa +metamorphic rock +metes and bounds +metropolis +metropolitan area +metropolitan coalescence +mogote +monadnock +moor +moraine +mound +mountain +mountain range +mouth +mudflow +multilingual +municipality +nadir +nation +national mapping agency +national park +natural landscape +neighborhood +nodal region +north geographic pole +north magnetic pole +northern hemisphere +oasis +ocean +open range +ordinal directions +orographic rainfall +outwash +overburden +oxbow +palisade +panhandle +parish +permafrost +peninsula +photogrammetry +physical geography +physiographic region +physiography +piedmont +pit crater +place identity +plain +plate tectonics +plateau +platted land +plural society +polar circle +polar ice cap +polar region +pole of inaccessibility +political geography +polynodal +pond +populated place +population +population geography +post-industrial +pothole +precambrian rock +prevailing winds +primary sector +prime meridian +promontory +protected area +province +psychogeography +pueblo +quadrangle +quaternary sector +quarry +rail gauge +rainforest +rainshadow +ravine +region +regiopolis +relief +relief map +remote sensing +reservoir +resource +retroreflector +ribbon lake +ridge +rift valley +riparian rights +ria +river +riverine +rural +saddle +salient +salt pan +scale +scarp +sea +sea level +seamount +second home +secondary-intercardinal directions +secondary sector +sedimentary rock +seismograph +settlement +sheepback +shield +shield volcano +shoal +shore +sinkhole +site +situation +slough +smog +standard metropolitan statistical area (smsa) +snowline +soil horizon +solubility +sound +south geographic pole +south magnetic pole +southern hemisphere +space economy +spatial citizenship +spatial complementarity +spatial interaction +spatial reference system (srs) +spreading ridges +spring +spur +stack +state +steppe +strait +stratovolcano +stream +subduction zone +suburban +suburbanization +summit +surface water +surveying +swale +swamp +syrt +taiga +tarn +temperature inversion +tephra +terrain +territorial waters +territory +tertiary sector +thalweg +tide +till +time distance +time geography +time zone +topographic map +topographical relief +topographic isolation +topographic prominence +topography +toponymy +tor +town +township and range +transferability +transhumance +tree line +tributary +tropic of cancer +tropic of capricorn 
+tropics +tundra +underpopulation +uniform region +urban +urban geography +urban sprawl +urbanization +vale +valley +vertical exaggeration +vent +viewshed +village +volcanic avalanche +volcanic crater +volcano +wadi +water mapping +water pollution +water table +watershed +waterway +weathering +western hemisphere +wetland +wilderness +windward +world map +zoning +zenith +north +west +south +east \ No newline at end of file diff --git a/strpython/resources/language_resources/geo_term_fr b/strpython/resources/language_resources/geo_term_fr new file mode 100644 index
0000000000000000000000000000000000000000..3d038768f7ea5c7f8f273eec490dd6c2fefb854f --- /dev/null +++ b/strpython/resources/language_resources/geo_term_fr @@ -0,0 +1,416 @@ +About +Abrupt +Aérodrome +Aéroport +Affluent +Agglomération +Agglomération de recensement +Allée +Anse +Antre +Arboretum +Archipel +Arrêt ferroviaire +Arrière-arrière-fief +Arrière-fief +Arrondissement +Arrondissement historique +Arrondissement naturel +Autoroute +Avenue +Baie +Baignage +Baissière +Banc +Banc de pêche +Banc de sable +Barachois +Barrage +Barre +Barrière +Base +Base de plein air +Base militaire +Bas-fond +Bassin +Bassin hydrographique +Bassin portuaire +Batture +Belvédère +Berge +Bleuetière +Bogan +Bois +Bonnet +Bosquet +Boule +Boulevard +Branche +Bras +Bras mort +Brisant +Bureau de douane +Bureau de poste +Bureau de poste militaire +Butte +Buttereau +Button +Cabouron +Cairn +Calvette +Camp +Camp de plein air +Camp de vacances +Camp forestier +Camp militaire +Camp saisonnier +Camp sportif +Canal +Canton +Canyon +Cap +Carré +Carrefour +Carrière +Cascade +Cascatelle +Cataracte +Caverne +Caye +Cayon +Cédrière +Cénacle +Centrale +Centrale hydroélectrique +Centrale thermique +Centrale nucléaire +Centre de ski +Centre de villégiature +Centre d'interprétation de la nature +Centre écologique +Centre éducatif forestier +Cercle +Chaîne +Chaîne de montagnes +Chalet +Champ +Charge +Chaussée +Chemin +Chemin de front +Chemin de ligne +Chenail +Chenal +Chute +Chuton +Cimetière +Circonscription +Circonscription électorale +Circonscription électorale fédérale +Circonscription électorale provinciale +Cité +Club de chasse et de pêche +Col +Colline +Communauté régionale +Communauté urbaine +Commune +Concession +Confluent +Conseil régional de la santé et des services sociaux +Cordon +Cordon littoral +Corniche +Côte +Coteau +Coude +Coulée +Coupe +Courant +Courbe +Cours +Cours d'eau +Cours d'eau agricole +Cran +Cratère +Cratère météorique +Crête +Crevasse +Crique +Croissant +Cul-de-sac +Débarcadère +Décharge +Défilé +Dépôt +Dépôt forestier +Descente +Desserte +Détroit +Déversant +Digue +District électoral +District judiciaire +Division d'enregistrement +Division de recensement +Domaine +Dôme +Dune +Échangeur +Écluse +Écueil +Embranchement +Ensemble résidentiel +Entrée +Éperon +Escarpement +Esker +Esplanade +Est +Estran +Estuaire +Établissement amérindien +Établissement piscicole +Étang +Évacuateur +Falaise +Ferme +Fief +Fjord +Flèche +Fleuve +Fleuve côtier +Fond +Fondrière +Fontaine +Forêt +Forêt d'enseignement et de recherche +Forêt d'expérimentation +Fort +Fosse +Fossé +Fosse à saumon +Fourche +Fronteau +Gaine +Gare +Gare de triage +Golfe +Gorge +Gouffre +Goulet +Grève +Grotte +Halte +Halte routière +Hameau +Haut-fond +Havre +Héliport +Hydrobase +Île +Îlet +Îlot +Impasse +Jardin +Jardin zoologique +Jetée +Jonction +Kettle +Lac +Lac artificiel +Lacon +Lacotte +Lac salé +Lagune +Langue de terre +Lieu-dit +Ligne +Littoral +Localité +Marais +Marche +Mare +Marécage +Marina +Marmite +Massif +Méandre +Mer +Mine +Mont +Montagne +Montée +Morne +Mouillage +Municipalité +Municipalité de canton +Municipalité de cantons unis +Municipalité de comté +Municipalité de paroisse +Municipalité de village +Municipalité de village cri +Municipalité de village naskapi +Municipalité de village nordique +Municipalité régionale de comté +Niche +Nord +Ouest +Parc +Parc de conservation +Parc de maisons mobiles +Parc de récréation +Parc historique national +Parc industriel +Parc national +Parc national fédéral +Parc public +Paroi +Paroisse +Passage 
+Passe +Passerelle +Pâturage +Pavillon +Péninsule +Pépinière +Phare +Pic +Piémont +Piste +Piton +Place +Plage +Plaine +Plateau +Plate-forme insulaire +Platier +Platin +Platon +Plé +Plée +Pointe +Ponceau +Pont +Pont naturel +Pont-tunnel +Port +Portage +Port de plaisance +Poste +Poste d'accueil +Poste de douane +Poste de traite +Poste de transformation hydroélectrique +Poulier +Prairie +Pré +Prée +Presqu'île +Promenade +Promontoire +Puits +Puits artésien +Puits naturel +Quai +Quartier +Quartier résidentiel +Quartier scolaire +Rade +Rang +Rapide +Rapidon +Ravin +Ravine +Récif +Région +Région administrative +Remous +Réserve +Réserve de chasse +Réserve de chasse et de pêche +Réserve de la biosphère +Réserve de pêche +Réserve écologique +Réserve faunique +Réserve faunique de saumon +Réserve indienne +Réserve militaire +Réservoir +Rigole +Rigolet +Rivage +Rive +Rivière +Roche +Rocher +Route +Rue +Ruelle +Ruisseau +Ruisselet +Sanctuaire +Sanctuaire de pêche +Sault +Saut +Savane +Secteur +Secteur résidentiel +Seigneurie +Sente +Sentier +Sentier de migration +Sentier écologique +Site +Site historique +Sommet +Source +Square +Station +Station de métro +Station de pompage +Station de relais +Station de ski +Station forestière +Station halieutique +Station météorologique +Station militaire +Station radar +Sud +Terrain de camping +Terrain de jeu +Terrasse +Terrier +Territoire +Territoire non organisé +Tour +Tour à feu +Tourbière +Trait-carré +Traverse +Traverse d'animaux +Trou +Tunnel +Vallée +Vallon +Verger +Versant +Village +Village cri +Village forestier +Village historique +Village minier +Village naskapi +Ville +Ville minière +Voie +Voie de communication +Voie de desserte +Voie d'évitement +Zac +Zec +Zec-saumon +Zone +Zone d'aménagement et de conservation +Zone d'exploitation contrôlée +Zoo diff --git a/tools.py b/tools.py index d978b5a21a9c2e33ee1c1594bdf95e9d24dbf5a3..d417d4539b13172d65d20bdb2d8338a289e9dfef 100644 --- a/tools.py +++ b/tools.py @@ -4,7 +4,7 @@ import argparse from termcolor import colored -from strpython.helpers.gazeteer_helpers import get_most_common_id_v3, get_data, get_by_label +from strpython.helpers.geodict_helpers import get_most_common_id_v3, get_data, get_by_label parser = argparse.ArgumentParser()