Commit be395fef authored by Fize Jacques's avatar Fize Jacques

Add eval function

- pareto

Add notebooks

Debug data generation
parent 802203ee
Showing with 84454 additions and 21 deletions
......@@ -87,7 +87,7 @@ def compareBOC(graphs_array):
def compareVEO(graphs_array):
    return 1 - VertexEdgeOverlap.compare(graphs_array)
def compareJaccard(graphs):
def compareJaccard(graphs_array):
    return 1 - Jaccard.compare(graphs_array)
funcDict={
......@@ -110,12 +110,13 @@ parser.add_argument("distance")
parser.add_argument("texts_dir")
parser.add_argument("graphs_dir")
parser.add_argument("metadata_fn")
parser.add_argument("-s","--selectedGraph",default="data/graph_exp_fev_18/selected.json")
parser.add_argument("original_dir")
parser.add_argument("-s","--selectedGraph")
parser.add_argument("-a","--all",action="store_true")
parser.add_argument("-o","--output",help="Output Filename")
args = parser.parse_args()
original_dir="data/graph_exp_fev_18/normal"
original_dir=args.original_dir
if not args.distance in funcDict.keys():
    raise NotFoundDistance(args.distance,funcDict)
    exit()
......@@ -187,6 +188,7 @@ top_ten_documents=[]
final_data={}
deb=time.time()
print("Computing Similarity Matrix ...")
similarity_matrix = funcDict[args.distance](graphs_array)
print("Similarity Matrix Computed in {0} s.".format(time.time()-deb))
......@@ -194,6 +196,7 @@ graphs={}
for file in glob.glob(original_dir.rstrip("/")+"/*.gexf"):
    id=int(re.findall(r"\d+",file)[-1])
    graphs[id]=nx.read_gexf(file)
with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
    inc=0
    for doc_s in selected_documents_:
......
# coding = utf-8
\ No newline at end of file
# coding = utf-8
def is_pareto_front(dataf, row, columns):
    """
    Return True if the combination of values in `row` for the columns `columns`
    is a Pareto optimum in `dataf`.
    :param dataf: pandas DataFrame holding all candidate rows
    :param row: the row (pandas Series) to test
    :param columns: names of the criteria columns
    :return: bool
    """
    values = [row[col] for col in columns]
    boolean_is_max = []
    for c in range(len(columns)):
        val = values[c]
        col = columns[c]
        bool_temp = True
        for c2 in range(len(columns)):
            if c == c2:
                continue
            val2 = values[c2]
            col2 = columns[c2]
            # `val` must be maximal among the rows sharing this row's value on the other criterion
            bool_temp = bool_temp and (dataf.loc[dataf[col2] == val2].max()[col] <= val)
        boolean_is_max.append(bool_temp)
    # no other row may strictly dominate this one on every criterion
    daf = dataf.copy()
    for c in range(len(columns)):
        val = values[c]
        col = columns[c]
        daf = daf.loc[daf[col] > val]
    return sum(map(int, boolean_is_max)) == len(columns) and len(daf) == 0
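
A minimal usage sketch (toy data; assumes the module is importable as `pareto`):

``` python
import pandas as pd
from pareto import is_pareto_front  # adjust the import to the package layout

# hypothetical scores: the (0.5, 0.3) row is dominated by the others
df = pd.DataFrame({"precision": [0.9, 0.6, 0.8, 0.5],
                   "recall":    [0.4, 0.9, 0.7, 0.3]})
mask = df.apply(lambda row: is_pareto_front(df, row, ["precision", "recall"]), axis=1)
print(df.loc[mask])  # keeps only the non-dominated rows
```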
# coding = utf-8
from .pareto import is_pareto_front
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
cm = sns.color_palette("hls", 8)
def draw_pareto_static(df, x_label, criteria, x_ax_label="X", y_ax_label="Y", title="Title"):
    fig, ax = plt.subplots(figsize=(10, 5), ncols=1)
    for i in range(len(criteria)):
        y_label = criteria[i]
        df_is_pareto = df.apply(lambda row: is_pareto_front(df, row, [x_label, y_label]), axis=1)
        df_pareto = df.loc[df_is_pareto].sort_values(by=x_label)
        sns.swarmplot(x=x_label, y=y_label, data=df, ax=ax, color=cm[i])
        ax.plot(df_pareto[x_label].index, df_pareto[y_label].values, '--', color=cm[i],
                label='P. Frontier for {0}'.format(criteria[i]))
    plt.xlabel(x_ax_label)
    plt.ylabel(y_ax_label)
    plt.xticks(rotation=90)
    plt.title(title)
    plt.show()
def draw_pareto_dynamic(df, x_label, criteria, layout=None):
    if not layout:
        fig = go.Figure(data=data_pareto(df, x_label, criteria))
    else:
        fig = go.Figure(data=data_pareto(df, x_label, criteria), layout=layout)
    return iplot(fig)
def data_pareto(df, x_label, criteria):
    data = []
    for i in range(len(criteria)):
        y_label = criteria[i]
        # df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
        df_is_pareto = df.apply(lambda row: is_pareto_front(df, row, [x_label, y_label]), axis=1)
        df_pareto = df.loc[df_is_pareto].sort_values(by=x_label)
        data.append(go.Scatter(
            x=df[x_label],  # x values come from the x_label column
            y=df[y_label],
            mode="markers",
            marker=dict(
                color=("rgb" + str(cm[i])),
            ),
            name="{0} ".format(criteria[i]),
        ))
        data.append(
            go.Scatter(
                x=df_pareto[x_label],  # Pareto-optimal points only
                y=df_pareto[y_label],
                name="{0} Pareto Frontier".format(criteria[i]),
                line=dict(
                    color=("rgb" + str(cm[i])),
                    width=4, )
            )
        )
    return data
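
A possible call, assuming a DataFrame `df` with a `method` column and two hypothetical score columns (none of these names come from the repository):

``` python
import plotly.graph_objs as go

# illustrative layout; draw_pareto_dynamic falls back to plotly defaults without one
layout = go.Layout(title="Pareto frontiers per criterion",
                   xaxis=dict(title="method"),
                   yaxis=dict(title="score"))
draw_pareto_dynamic(df, "method", ["precision", "recall"], layout=layout)
```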
......@@ -21,11 +21,12 @@ fi
if [ "$1" == "eval" ]; then
## Normal STR eval
original=data/graph_exp_mar_12/normal
dir=normal;
mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
## Generalised STR eval
......@@ -33,26 +34,26 @@ if [ "$1" == "eval" ]; then
mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_all_2
mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_region
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_country
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
## Extended STR eval
......@@ -60,12 +61,12 @@ if [ "$1" == "eval" ]; then
mesure=( "MCS" "VEO" "JACCARD" "BOC" "WLSUBTREE" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=extension_2
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
fi
\ No newline at end of file
......@@ -122,8 +122,10 @@ for text in range(len(texts_)):
    else:
        try:
            lang=detect(texts_[text])
        except:
        except Exception as e:
            lang="en"
            print(lang, text)
    if lang not in data and lang in pipeline:
        data[lang]=[]
    if lang in pipeline:
......@@ -141,7 +143,7 @@ i=0
def workSTR(id_doc,text,count_per_doc,associated_es, list_gs,pg):
def workSTR(id_doc,text,count_per_doc,associated_es, list_gs,pg,lang):
    global i
    if not text:
        count_per_doc[id_doc] = {}
......@@ -176,7 +178,7 @@ with ThreadPoolExecutor(max_workers=4) as executor:
    pg.start()
    for lang in data:
        for id_doc in data[lang]:
            future = executor.submit(workSTR,id_doc,texts_[id_doc],count_per_doc,associated_es, list_gs,pg)
            future = executor.submit(workSTR,id_doc,texts_[id_doc],count_per_doc,associated_es, list_gs,pg,lang)
            # print(id_doc)
            # if not texts_[id_doc]:
            #     count_per_doc[id_doc] = {}
......
# coding = utf-8
import random
import networkx as nx
import glob,re
import argparse
import numpy as np
parser = argparse.ArgumentParser()
parser.add_argument("graph_input_dir")
args=parser.parse_args()
graphs={}
for file in glob.glob("data/graph_exp_mar_12/normal/*.gexf"):
for file in glob.glob(args.graph_input_dir+"/normal/*.gexf"):
    id=int(re.findall(r"\d+",file)[-1])
    graphs[id]=nx.read_gexf(file)
median=np.median([len(g) for g in graphs.values()])
if median <=2:
    median=int(np.mean([len(g) for g in graphs.values()]))
cat_interval=[
    [1,2],
    [2,median],
    [median,1000000]
]
size_selection=100
cat_size=[
    size_selection/5,
    (size_selection/5)*2,
    (size_selection/5)*2
]
per_size={0:[],1:[],2:[]}
for i,g in graphs.items():
    size_ = len(g)
    for c in range(len(cat_interval)):
        cat=cat_interval[c]
        if size_ >= cat[0] and size_ < cat[1]:
            per_size[c].append(i)
            break
for k,p in per_size.items():
    random.shuffle(p)
selected=[]
for k,p in per_size.items():
    selected.extend(p[:int(cat_size[k])])
print(sorted(selected))
count={0:0,1:0,2:0}
for i in selected:
    size_ = len(graphs[i])
    for c in range(len(cat_interval)):
        cat=cat_interval[c]
        if size_ >= cat[0] and size_ < cat[1]:
            count[c]+=1
            break
print("Check if good proportions {0}".format(count))
\ No newline at end of file
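For reference, a condensed sketch of the same stratified selection (toy sizes and a target of 10, purely illustrative):

``` python
import random
import numpy as np

sizes = {i: s for i, s in enumerate([1, 1, 2, 3, 4, 6, 8, 9, 12, 15])}
median = np.median(list(sizes.values()))          # 5.0 for these toy sizes
intervals = [(1, 2), (2, median), (median, 1e6)]  # [1,2), [2,median), [median,inf)
quotas = [2, 4, 4]                                # 1/5, 2/5, 2/5 of the target

buckets = [[i for i, s in sizes.items() if lo <= s < hi] for lo, hi in intervals]
selected = []
for bucket, quota in zip(buckets, quotas):
    random.shuffle(bucket)
    selected.extend(bucket[:quota])
print(sorted(selected))
```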
......@@ -67,7 +67,8 @@ pipeline= {
}
associated_es={}
count_per_doc={}
# Read Input Files
import re
graphs_={}
......@@ -76,6 +77,7 @@ if os.path.exists(args.graphs_input_dir):
    for fn in files_glob:
        id = int(re.findall(r"\d+", fn)[-1])
        graphs_[id]=STR.from_networkx_graph(nx.read_gexf(fn))
        associated_es[id]=graphs_[id].spatial_entities
if not graphs_:
    print("No .gexf files found in {0}".format(args.graphs_input_dir))
    exit()
......@@ -104,12 +106,12 @@ def workSTR(id_doc,g,list_gs,pg,argu):
    pg.update(i)
queue=[]
with ThreadPoolExecutor(max_workers=4) as executor:
with ThreadPoolExecutor(max_workers=4) as executor:
    with ProgressBar(max_value=len(graphs_),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg:
        pg.start()
        for id_doc in graphs_:
            workSTR(id_doc,graphs_[id_doc],list_gs,pg, args)
open(os.path.join(args.graphs_output_dir,"asso.json"),'w').write(json.dumps([associated_es,count_per_doc],indent=4))
print("--- %s seconds ---" % (time.time() - start))
\ No newline at end of file
{
"database_json":"../resources/database_graph_viewer.db"
"database_json":"../resources/database_graph_viewer_exp_mars_12.db"
}
\ No newline at end of file
......@@ -118,9 +118,7 @@ def getMeasureid(mesure):
@app.route("/save_eval/<g1id>/<g2id>/<mesure>/<type>/<int:c1>/<int:c2>/<int:c3>/<int:c4>")
def save_eval(g1id,g2id,mesure,type,c1,c2,c3,c4):
    print(g1id, g2id, mesure, type, c1, c2, c3, c4)
    c1,c2,c3,c4=bool(c1),bool(c2),bool(c3),bool(c4)
    print(g1id, g2id, mesure, type, c1, c2, c3, c4)
    eval_query = sql_session.query(Eval).filter_by(
        id_g1=g1id,
        id_g2=g2id,
......@@ -130,7 +128,9 @@ def save_eval(g1id,g2id,mesure,type,c1,c2,c3,c4):
    )
    if eval_query.count()< 1:
        sql_session.add(Eval(g1id,g2id,getMeasureid(mesure),type,current_user.id,c1,c2,c3,c4))
        print("ADD",g1id, g2id, mesure, type, c1, c2, c3, c4)
    else:
        print("UPD",g1id, g2id, mesure, type, c1, c2, c3, c4)
        eval_=eval_query.first()
        eval_.c1_val = c1
        eval_.c2_val = c2
......
%% Cell type:code id: tags:
``` python
cd ..
```
%% Output
/Users/jacquesfize/nas_cloud/Code/str-python
%% Cell type:code id: tags:
``` python
%load_ext autoreload
```
%% Cell type:code id: tags:
``` python
import glob, re, json, os
import numpy as np
```
%% Cell type:code id: tags:
``` python
dataEPI=[open(f).read() for f in glob.glob("data/EPI_ELENA/raw_text/*.txt")]
```
%% Cell type:code id: tags:
``` python
%autoreload
from pipeline import *
PipEn=Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en"))
```
%% Cell type:code id: tags:
``` python
count_global=[]
for text in dataEPI:
    if not text:
        count_global.append({})
        continue
    counting,_,_= PipEn.parse(text)
    count_global.append(counting)
```
%% Cell type:code id: tags:
``` python
count_all={}
for counting in count_global:
    for k,v in counting.items():
        if k not in count_all: count_all[k]=0
        count_all[k]+=v
count_all=np.array(list(count_all.items()),dtype=[("dd","<U10"),("de",int)])
```
%% Cell type:code id: tags:
``` python
tf=np.sort(count_all, order='de')[::-1]
```
%% Cell type:code id: tags:
``` python
count_idf={}
for counting in count_global:
    for k,v in counting.items():
        if k not in count_idf: count_idf[k]=0
        count_idf[k]+=1
idf=[[k,int(v)] for k,v in count_idf.items()]
for k in range(len(idf)):
    idf[k]=[get_data(idf[k][0])["en"],np.log(len(dataEPI)/idf[k][1])]
idf=np.array(idf)
sorted_=np.argsort(idf[:,1].astype(float))
idf=idf[sorted_]
```
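%% Cell type:markdown id: tags:
The cell above computes a standard inverse document frequency: for each entity $k$, $idf(k) = \log(N / n_k)$, where $N$ is the number of documents and $n_k$ the number of documents containing $k$. A quick sanity check (illustrative, not part of the original run):
%% Cell type:code id: tags:
``` python
# an entity present in 2 of 10 documents gets idf = ln(5)
assert abs(np.log(10 / 2) - 1.6094379124341003) < 1e-12
```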
%% Cell type:code id: tags:
``` python
with open("resources/tf_epi.csv",'w') as tf_w:
for t in tf:
tf_w.write("{0}\t{1}\n".format(get_data(t[0])["en"],t[1]))
```
%% Cell type:code id: tags:
``` python
with open("resources/idf_epi.csv",'w') as tf_w:
for t in idf:
tf_w.write("{0}\t{1}\n".format(t[0],t[1]))
```
%% Cell type:code id: tags:
``` python
dataBVLAC=[open(f).read() for f in glob.glob("data/BV_LAC21/*.txt")]
```
%% Cell type:code id: tags:
``` python
count_global_bv=json.load(open("associateJPT.json"))[1]
```
%% Cell type:code id: tags:
``` python
count_idf={}
for _, counting in count_global_bv.items():
    for k,v in counting.items():
        if k not in count_idf: count_idf[k]=0
        count_idf[k]+=1
idf=[[k,int(v)] for k,v in count_idf.items()]
for k in range(len(idf)):
    idf[k]=[get_data(idf[k][0])["en"],np.log(len(dataBVLAC)/idf[k][1])]
idf=np.array(idf)
sorted_=np.argsort(idf[:,1].astype(float))
idf=idf[sorted_]
```
%% Cell type:code id: tags:
``` python
with open("resources/idf_bvlac.csv",'w') as tf_w:
for t in idf:
tf_w.write("{0}\t{1}\n".format(t[0],t[1]))
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import pandas as pd
```
%% Cell type:code id: tags:
``` python
%pwd
```
%% Output
'/Users/jacquesfize/nas_cloud/Code/str-python'
%% Cell type:code id: tags:
``` python
df=pd.read_csv("resources/test.tsv",delimiter="\t")
```
%% Cell type:code id: tags:
``` python
freq_couples=df.groupby(["id_g1","id_g2"]).size().reset_index(name='Freq')
```
%% Cell type:code id: tags:
``` python
new_data=[]
for index, row in freq_couples.iterrows():
    df_temp=df.query('id_g1 == {0} & id_g2 == {1}'.format(row.id_g1,row.id_g2))
    freq_c_values=df_temp.groupby(["c1_val","c2_val","c3_val","c4_val"]).size().reset_index(name='Freq')
    n=len(freq_c_values.index)
    if n >1:
        #max_key=freq_c_values['Freq'].argmax()
        #new_data.append([row.id_g1,row.id_g2,list(freq_c_values.iloc[max_key].drop('Freq').values)])
        #new_data.append([row.id_g1,row.id_g2,df_temp.tail(1)[["c1_val","c2_val","c3_val","c4_val"]].values.tolist()[0]])
        new_val=df_temp.tail(1)[["c1_val","c2_val","c3_val","c4_val"]].values.tolist()[0]
        #print(new_val)
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c1_val']] = new_val[0]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c2_val']] = new_val[1]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c3_val']] = new_val[2]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c4_val']] = new_val[3]
```
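%% Cell type:markdown id: tags:
The cell above resolves conflicting annotations of the same graph pair by overwriting every duplicate with the values of the most recent row; the commented lines show the discarded majority-vote variant. A toy illustration (hypothetical data, not from the evaluation set):
%% Cell type:code id: tags:
``` python
# two conflicting annotations of pair (1, 2); the last one wins
d = pd.DataFrame({"id_g1": [1, 1], "id_g2": [2, 2], "c1_val": [True, False]})
last = d.tail(1)["c1_val"].values[0]
d.loc[(d.id_g1 == 1) & (d.id_g2 == 2), "c1_val"] = last
assert not d.c1_val.any()
```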
%% Cell type:code id: tags:
``` python
freq_couples=df.groupby(["id_g1","id_g2"]).size().reset_index(name='Freq')
```
%% Cell type:code id: tags:
``` python
new_data=[]
for index, row in freq_couples.iterrows():
    df_temp=df.query('id_g1 == {0} & id_g2 == {1}'.format(row.id_g1,row.id_g2))
    freq_c_values=df_temp.groupby(["c1_val","c2_val","c3_val","c4_val"]).size().reset_index(name='Freq')
    n=len(freq_c_values.index)
    if n >1:
        print(1)
```
%% Cell type:code id: tags:
``` python
df.to_csv("resources/test_updated.tsv",sep="\t")
```
%% Cell type:code id: tags:
``` python
```
requirements.txt 100644 → 100755
......@@ -21,3 +21,4 @@ progressbar2==3.35.0
scikit_bio==0.5.1
scikit_learn==0.19.1
typing==3.6.4
plotly
\ No newline at end of file