Commit be395fef authored by Fize Jacques's avatar Fize Jacques

Add eval function

- pareto

Add notebooks

Debug data generation
parent 802203ee
Showing with 84454 additions and 21 deletions
......@@ -87,7 +87,7 @@ def compareBOC(graphs_array):
def compareVEO(graphs_array):
    return 1 - VertexEdgeOverlap.compare(graphs_array)
def compareJaccard(graphs):
def compareJaccard(graphs_array):
    return 1 - Jaccard.compare(graphs_array)
funcDict={
......@@ -110,12 +110,13 @@ parser.add_argument("distance")
parser.add_argument("texts_dir")
parser.add_argument("graphs_dir")
parser.add_argument("metadata_fn")
parser.add_argument("-s","--selectedGraph",default="data/graph_exp_fev_18/selected.json")
parser.add_argument("original_dir")
parser.add_argument("-s","--selectedGraph")
parser.add_argument("-a","--all",action="store_true")
parser.add_argument("-o","--output",help="Output Filename")
args = parser.parse_args()
original_dir="data/graph_exp_fev_18/normal"
original_dir=args.original_dir
if not args.distance in funcDict.keys():
    raise NotFoundDistance(args.distance,funcDict)
    exit()
......@@ -187,6 +188,7 @@ top_ten_documents=[]
final_data={}
deb=time.time()
print("Computing Similarity Matrix ...")
similarity_matrix = funcDict[args.distance](graphs_array)
print("Similarity Matrix Computed in {0} s.".format(time.time()-deb))
......@@ -194,6 +196,7 @@ graphs={}
for file in glob.glob(original_dir.rstrip("/")+"/*.gexf"):
    id=int(re.findall(r"\d+",file)[-1])
    graphs[id]=nx.read_gexf(file)
with ProgressBar(max_value=len(selected_documents_),widgets=[' [', Timer(), '] ',Bar(),' (', ETA(), ') ',]) as pg:
    inc=0
    for doc_s in selected_documents_:
......
# coding = utf-8
\ No newline at end of file
# coding = utf-8
def is_pareto_front(dataf, row, columns):
    """
    Return True if the combination of values in `row` for the columns `columns`
    is a Pareto optimum in `dataf`.
    :param dataf: pandas DataFrame holding all candidate rows
    :param row: the row (pandas Series) to test
    :param columns: names of the criteria columns
    :return: bool
    """
    values = [row[col] for col in columns]
    boolean_is_max = []
    for c in range(len(columns)):
        val = values[c]
        col = columns[c]
        bool_temp = True
        for c2 in range(len(columns)):
            if c == c2:
                continue
            val2 = values[c2]
            col2 = columns[c2]
            # `val` must be maximal among the rows sharing this row's value on the other criterion
            bool_temp = bool_temp and (dataf.loc[dataf[col2] == val2].max()[col] <= val)
        boolean_is_max.append(bool_temp)
    # no other row may strictly dominate this one on every criterion
    daf = dataf.copy()
    for c in range(len(columns)):
        val = values[c]
        col = columns[c]
        daf = daf.loc[daf[col] > val]
    return sum(map(int, boolean_is_max)) == len(columns) and len(daf) == 0
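
A minimal usage sketch (toy data; assumes the module is importable as `pareto`):

``` python
import pandas as pd
from pareto import is_pareto_front  # adjust the import to the package layout

# hypothetical scores: the (0.5, 0.3) row is dominated by the others
df = pd.DataFrame({"precision": [0.9, 0.6, 0.8, 0.5],
                   "recall":    [0.4, 0.9, 0.7, 0.3]})
mask = df.apply(lambda row: is_pareto_front(df, row, ["precision", "recall"]), axis=1)
print(df.loc[mask])  # keeps only the non-dominated rows
```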
# coding = utf-8
from .pareto import is_pareto_front
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
cm = sns.color_palette("hls", 8)
def draw_pareto_static(df, x_label, criteria, x_ax_label="X", y_ax_label="Y", title="Title"):
    fig, ax = plt.subplots(figsize=(10, 5), ncols=1)
    for i in range(len(criteria)):
        y_label = criteria[i]
        df_is_pareto = df.apply(lambda row: is_pareto_front(df, row, [x_label, y_label]), axis=1)
        df_pareto = df.loc[df_is_pareto].sort_values(by=x_label)
        sns.swarmplot(x=x_label, y=y_label, data=df, ax=ax, color=cm[i])
        ax.plot(df_pareto[x_label].index, df_pareto[y_label].values, '--', color=cm[i],
                label='P. Frontier for {0}'.format(criteria[i]))
    plt.xlabel(x_ax_label)
    plt.ylabel(y_ax_label)
    plt.xticks(rotation=90)
    plt.title(title)
    plt.show()
def draw_pareto_dynamic(df, x_label, criteria, layout=None):
    if not layout:
        fig = go.Figure(data=data_pareto(df, x_label, criteria))
    else:
        fig = go.Figure(data=data_pareto(df, x_label, criteria), layout=layout)
    return iplot(fig)
def data_pareto(df, x_label, criteria):
    data = []
    for i in range(len(criteria)):
        y_label = criteria[i]
        # df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
        df_is_pareto = df.apply(lambda row: is_pareto_front(df, row, [x_label, y_label]), axis=1)
        df_pareto = df.loc[df_is_pareto].sort_values(by=x_label)
        data.append(go.Scatter(
            x=df[x_label],  # x values come from the x_label column
            y=df[y_label],
            mode="markers",
            marker=dict(
                color=("rgb" + str(cm[i])),
            ),
            name="{0} ".format(criteria[i]),
        ))
        data.append(
            go.Scatter(
                x=df_pareto[x_label],  # Pareto-optimal points only
                y=df_pareto[y_label],
                name="{0} Pareto Frontier".format(criteria[i]),
                line=dict(
                    color=("rgb" + str(cm[i])),
                    width=4, )
            )
        )
    return data
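
A possible call, assuming a DataFrame `df` with a `method` column and two hypothetical score columns (none of these names come from the repository):

``` python
import plotly.graph_objs as go

# illustrative layout; draw_pareto_dynamic falls back to plotly defaults without one
layout = go.Layout(title="Pareto frontiers per criterion",
                   xaxis=dict(title="method"),
                   yaxis=dict(title="score"))
draw_pareto_dynamic(df, "method", ["precision", "recall"], layout=layout)
```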
......@@ -21,11 +21,12 @@ fi
if [ "$1" == "eval" ]; then
## Normal STR eval
original=data/graph_exp_mar_12/normal
dir=normal;
mesure=("MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
## Generalised STR eval
......@@ -33,26 +34,26 @@ if [ "$1" == "eval" ]; then
mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "GED" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_all_2
mesure=( "MCS" "VEO" "JACCARD" "HED" "GREEDY" "BOC" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_region
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=gen_country
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
## Extended STR eval
......@@ -60,12 +61,12 @@ if [ "$1" == "eval" ]; then
mesure=( "MCS" "VEO" "JACCARD" "BOC" "WLSUBTREE" "BOWSE");
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
dir=extension_2
for me in ${mesure[@]}; do
echo $me" for STR "$dir;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json -s $output_dir/$dir/selected.json -o $output_dir/$dir/result_eval/$dir/;
python3 eval.py $me $path_texts $output_dir/$dir $output_dir/$dir/asso.json $original -s $output_dir/selected.json -o $output_dir/result_eval/$dir/;
done;
fi
\ No newline at end of file
......@@ -122,8 +122,10 @@ for text in range(len(texts_)):
    else:
        try:
            lang=detect(texts_[text])
        except:
        except Exception as e:
            lang="en"
            print(lang, text)
    if lang not in data and lang in pipeline:
        data[lang]=[]
    if lang in pipeline:
......@@ -141,7 +143,7 @@ i=0
def workSTR(id_doc,text,count_per_doc,associated_es, list_gs,pg):
def workSTR(id_doc,text,count_per_doc,associated_es, list_gs,pg,lang):
    global i
    if not text:
        count_per_doc[id_doc] = {}
......@@ -176,7 +178,7 @@ with ThreadPoolExecutor(max_workers=4) as executor:
    pg.start()
    for lang in data:
        for id_doc in data[lang]:
            future = executor.submit(workSTR,id_doc,texts_[id_doc],count_per_doc,associated_es, list_gs,pg)
            future = executor.submit(workSTR,id_doc,texts_[id_doc],count_per_doc,associated_es, list_gs,pg,lang)
            # print(id_doc)
            # if not texts_[id_doc]:
            #     count_per_doc[id_doc] = {}
......
# coding = utf-8
import random
import networkx as nx
import glob,re
import argparse
import numpy as np
parser = argparse.ArgumentParser()
parser.add_argument("graph_input_dir")
args=parser.parse_args()
graphs={}
for file in glob.glob("data/graph_exp_mar_12/normal/*.gexf"):
for file in glob.glob(args.graph_input_dir+"/normal/*.gexf"):
    id=int(re.findall(r"\d+",file)[-1])
    graphs[id]=nx.read_gexf(file)
median=np.median([len(g) for g in graphs.values()])
if median <=2:
    median=int(np.mean([len(g) for g in graphs.values()]))
cat_interval=[
    [1,2],
    [2,median],
    [median,1000000]
]
size_selection=100
cat_size=[
    size_selection/5,
    (size_selection/5)*2,
    (size_selection/5)*2
]
per_size={0:[],1:[],2:[]}
for i,g in graphs.items():
    size_ = len(g)
    for c in range(len(cat_interval)):
        cat=cat_interval[c]
        if size_ >= cat[0] and size_ < cat[1]:
            per_size[c].append(i)
            break
for k,p in per_size.items():
    random.shuffle(p)
selected=[]
for k,p in per_size.items():
    selected.extend(p[:int(cat_size[k])])
print(sorted(selected))
count={0:0,1:0,2:0}
for i in selected:
    size_ = len(graphs[i])
    for c in range(len(cat_interval)):
        cat=cat_interval[c]
        if size_ >= cat[0] and size_ < cat[1]:
            count[c]+=1
            break
print("Check if good proportions {0}".format(count))
\ No newline at end of file
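For reference, a condensed sketch of the same stratified selection (toy sizes and a target of 10, purely illustrative):

``` python
import random
import numpy as np

sizes = {i: s for i, s in enumerate([1, 1, 2, 3, 4, 6, 8, 9, 12, 15])}
median = np.median(list(sizes.values()))          # 5.0 for these toy sizes
intervals = [(1, 2), (2, median), (median, 1e6)]  # [1,2), [2,median), [median,inf)
quotas = [2, 4, 4]                                # 1/5, 2/5, 2/5 of the target

buckets = [[i for i, s in sizes.items() if lo <= s < hi] for lo, hi in intervals]
selected = []
for bucket, quota in zip(buckets, quotas):
    random.shuffle(bucket)
    selected.extend(bucket[:quota])
print(sorted(selected))
```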
......@@ -67,7 +67,8 @@ pipeline= {
}
associated_es={}
count_per_doc={}
# Read Input Files
import re
graphs_={}
......@@ -76,6 +77,7 @@ if os.path.exists(args.graphs_input_dir):
    for fn in files_glob:
        id = int(re.findall(r"\d+", fn)[-1])
        graphs_[id]=STR.from_networkx_graph(nx.read_gexf(fn))
        associated_es[id]=graphs_[id].spatial_entities
if not graphs_:
    print("No .gexf files found in {0}".format(args.graphs_input_dir))
    exit()
......@@ -104,12 +106,12 @@ def workSTR(id_doc,g,list_gs,pg,argu):
    pg.update(i)
queue=[]
with ThreadPoolExecutor(max_workers=4) as executor:
with ThreadPoolExecutor(max_workers=4) as executor:
    with ProgressBar(max_value=len(graphs_),widgets=[' [', Timer(), '] ',Bar(),'(', Counter(),')','(', ETA(), ')']) as pg:
        pg.start()
        for id_doc in graphs_:
            workSTR(id_doc,graphs_[id_doc],list_gs,pg, args)
open(os.path.join(args.graphs_output_dir,"asso.json"),'w').write(json.dumps([associated_es,count_per_doc],indent=4))
print("--- %s seconds ---" % (time.time() - start))
\ No newline at end of file
{
"database_json":"../resources/database_graph_viewer.db"
"database_json":"../resources/database_graph_viewer_exp_mars_12.db"
}
\ No newline at end of file
......@@ -118,9 +118,7 @@ def getMeasureid(mesure):
@app.route("/save_eval/<g1id>/<g2id>/<mesure>/<type>/<int:c1>/<int:c2>/<int:c3>/<int:c4>")
def save_eval(g1id,g2id,mesure,type,c1,c2,c3,c4):
    print(g1id, g2id, mesure, type, c1, c2, c3, c4)
    c1,c2,c3,c4=bool(c1),bool(c2),bool(c3),bool(c4)
    print(g1id, g2id, mesure, type, c1, c2, c3, c4)
    eval_query = sql_session.query(Eval).filter_by(
        id_g1=g1id,
        id_g2=g2id,
......@@ -130,7 +128,9 @@ def save_eval(g1id,g2id,mesure,type,c1,c2,c3,c4):
    )
    if eval_query.count()< 1:
        sql_session.add(Eval(g1id,g2id,getMeasureid(mesure),type,current_user.id,c1,c2,c3,c4))
        print("ADD",g1id, g2id, mesure, type, c1, c2, c3, c4)
    else:
        print("UPD",g1id, g2id, mesure, type, c1, c2, c3, c4)
        eval_=eval_query.first()
        eval_.c1_val = c1
        eval_.c2_val = c2
......
%% Cell type:code id: tags:
``` python
cd ..
```
%% Output
/Users/jacquesfize/nas_cloud/Code/str-python
%% Cell type:code id: tags:
``` python
%load_ext autoreload
```
%% Cell type:code id: tags:
``` python
import glob, re, json, os
import numpy as np
```
%% Cell type:code id: tags:
``` python
dataEPI=[open(f).read() for f in glob.glob("data/EPI_ELENA/raw_text/*.txt")]
```
%% Cell type:code id: tags:
``` python
%autoreload
from pipeline import *
PipEn=Pipeline(lang="english",tagger=Tagger(),ner=StanfordNER(lang="en"))
```
%% Cell type:code id: tags:
``` python
count_global=[]
for text in dataEPI:
    if not text:
        count_global.append({})
        continue
    counting,_,_= PipEn.parse(text)
    count_global.append(counting)
```
%% Cell type:code id: tags:
``` python
count_all={}
for counting in count_global:
    for k,v in counting.items():
        if k not in count_all: count_all[k]=0
        count_all[k]+=v
count_all=np.array(list(count_all.items()),dtype=[("dd","<U10"),("de",int)])
```
%% Cell type:code id: tags:
``` python
tf=np.sort(count_all, order='de')[::-1]
```
%% Cell type:code id: tags:
``` python
count_idf={}
for counting in count_global:
    for k,v in counting.items():
        if k not in count_idf: count_idf[k]=0
        count_idf[k]+=1
idf=[[k,int(v)] for k,v in count_idf.items()]
for k in range(len(idf)):
    idf[k]=[get_data(idf[k][0])["en"],np.log(len(dataEPI)/idf[k][1])]
idf=np.array(idf)
sorted_=np.argsort(idf[:,1].astype(float))
idf=idf[sorted_]
```
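%% Cell type:markdown id: tags:
The cell above computes a standard inverse document frequency: for each entity $k$, $idf(k) = \log(N / n_k)$, where $N$ is the number of documents and $n_k$ the number of documents containing $k$. A quick sanity check (illustrative, not part of the original run):
%% Cell type:code id: tags:
``` python
# an entity present in 2 of 10 documents gets idf = ln(5)
assert abs(np.log(10 / 2) - 1.6094379124341003) < 1e-12
```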
%% Cell type:code id: tags:
``` python
with open("resources/tf_epi.csv",'w') as tf_w:
for t in tf:
tf_w.write("{0}\t{1}\n".format(get_data(t[0])["en"],t[1]))
```
%% Cell type:code id: tags:
``` python
with open("resources/idf_epi.csv",'w') as tf_w:
for t in idf:
tf_w.write("{0}\t{1}\n".format(t[0],t[1]))
```
%% Cell type:code id: tags:
``` python
dataBVLAC=[open(f).read() for f in glob.glob("data/BV_LAC21/*.txt")]
```
%% Cell type:code id: tags:
``` python
count_global_bv=json.load(open("associateJPT.json"))[1]
```
%% Cell type:code id: tags:
``` python
count_idf={}
for _, counting in count_global_bv.items():
    for k,v in counting.items():
        if k not in count_idf: count_idf[k]=0
        count_idf[k]+=1
idf=[[k,int(v)] for k,v in count_idf.items()]
for k in range(len(idf)):
    idf[k]=[get_data(idf[k][0])["en"],np.log(len(dataBVLAC)/idf[k][1])]
idf=np.array(idf)
sorted_=np.argsort(idf[:,1].astype(float))
idf=idf[sorted_]
```
%% Cell type:code id: tags:
``` python
with open("resources/idf_bvlac.csv",'w') as tf_w:
for t in idf:
tf_w.write("{0}\t{1}\n".format(t[0],t[1]))
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import pandas as pd
```
%% Cell type:code id: tags:
``` python
%pwd
```
%% Output
'/Users/jacquesfize/nas_cloud/Code/str-python'
%% Cell type:code id: tags:
``` python
df=pd.read_csv("resources/test.tsv",delimiter="\t")
```
%% Cell type:code id: tags:
``` python
freq_couples=df.groupby(["id_g1","id_g2"]).size().reset_index(name='Freq')
```
%% Cell type:code id: tags:
``` python
new_data=[]
for index, row in freq_couples.iterrows():
    df_temp=df.query('id_g1 == {0} & id_g2 == {1}'.format(row.id_g1,row.id_g2))
    freq_c_values=df_temp.groupby(["c1_val","c2_val","c3_val","c4_val"]).size().reset_index(name='Freq')
    n=len(freq_c_values.index)
    if n >1:
        #max_key=freq_c_values['Freq'].argmax()
        #new_data.append([row.id_g1,row.id_g2,list(freq_c_values.iloc[max_key].drop('Freq').values)])
        #new_data.append([row.id_g1,row.id_g2,df_temp.tail(1)[["c1_val","c2_val","c3_val","c4_val"]].values.tolist()[0]])
        new_val=df_temp.tail(1)[["c1_val","c2_val","c3_val","c4_val"]].values.tolist()[0]
        #print(new_val)
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c1_val']] = new_val[0]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c2_val']] = new_val[1]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c3_val']] = new_val[2]
        df.loc[(df.id_g1 == row.id_g1) & (df.id_g2 == row.id_g2),['c4_val']] = new_val[3]
```
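%% Cell type:markdown id: tags:
The cell above resolves conflicting annotations of the same graph pair by overwriting every duplicate with the values of the most recent row; the commented lines show the discarded majority-vote variant. A toy illustration (hypothetical data, not from the evaluation set):
%% Cell type:code id: tags:
``` python
# two conflicting annotations of pair (1, 2); the last one wins
d = pd.DataFrame({"id_g1": [1, 1], "id_g2": [2, 2], "c1_val": [True, False]})
last = d.tail(1)["c1_val"].values[0]
d.loc[(d.id_g1 == 1) & (d.id_g2 == 2), "c1_val"] = last
assert not d.c1_val.any()
```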
%% Cell type:code id: tags:
``` python
freq_couples=df.groupby(["id_g1","id_g2"]).size().reset_index(name='Freq')
```
%% Cell type:code id: tags:
``` python
new_data=[]
for index, row in freq_couples.iterrows():
    df_temp=df.query('id_g1 == {0} & id_g2 == {1}'.format(row.id_g1,row.id_g2))
    freq_c_values=df_temp.groupby(["c1_val","c2_val","c3_val","c4_val"]).size().reset_index(name='Freq')
    n=len(freq_c_values.index)
    if n >1:
        print(1)
```
%% Cell type:code id: tags:
``` python
df.to_csv("resources/test_updated.tsv",sep="\t")
```
%% Cell type:code id: tags:
``` python
```
requirements.txt 100644 → 100755
......@@ -21,3 +21,4 @@ progressbar2==3.35.0
scikit_bio==0.5.1
scikit_learn==0.19.1
typing==3.6.4
plotly
\ No newline at end of file