Eval_mada_light_30_mars-checkpoint.ipynb
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import json,glob,re
from scipy.spatial.distance import jaccard
import numpy as np
from math import *

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
#from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf', 'svg')
#sns.set_palette(sns.color_palette("hls", 8))


import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


def jaccard_similarity(x,y): 
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)
%cd ..
Out [1]:
/Users/jacquesfize/nas_cloud/Code/str-python
In [2]:
df=pd.read_csv("resources/results_graph_exp18fev.tsv",delimiter="\t",index_col=0)
new_df=pd.DataFrame(columns=df.columns)

selected_graph=json.load(open("data/graph_exp_fev_18/selected.json"))
types=df.type.unique()
graph_size={}
graphs_={}

files_glob= glob.glob("data/graph_exp_fev_18/normal/*.gexf")
for fn in files_glob:
    id_ = int(re.findall(r"\d+", fn)[-1])  # the graph id is the last number in the filename
    graphs_[id_]=nx.read_gexf(fn)
    graph_size[id_]=len(graphs_[id_])
graph_size[999]=0  # 999 is the placeholder id used when no matching graph exists

# for each graph, count how many other graphs share at least one node (spatial entity)
nb_of_g_w_es_com={}
for g in graphs_:
    if not g in nb_of_g_w_es_com:
        nb_of_g_w_es_com[g]=0
    for g2 in graphs_:
        if not g2 == g:
            if set(graphs_[g].nodes()).intersection(set(graphs_[g2].nodes())):
                nb_of_g_w_es_com[g]+=1

In [3]:
df_mesure=pd.read_csv("resources/mesures.tsv",delimiter="\t")
type2int={}
int2type={}
i=0
for t in df.type.unique():
    type2int[t]=i
    int2type[i]=t
    i+=1
In [4]:
n=3
df_copy=pd.DataFrame(columns=df.columns)
for t in types:
    mesures=df[df.type == t].mesure.unique()
    for m in mesures:
        data=df[(df.mesure == m) & (df.type == t)]
        for g in selected_graph:
            subset=data[data.id_g1 == g].iloc[:n]
            if len(subset)<1:# no graph pair found: pad with n placeholder rows (id_g2=999)
                df_2=pd.DataFrame([[g,999,m,t,3,0,0,0,0]],columns=df.columns)
                for i in range(n):df_copy=df_copy.append(df_2)
            elif len(subset)<n:# assumed handling: keep the existing pairs and pad up to n with placeholders
                df_copy=df_copy.append(subset)
                df_2=pd.DataFrame([[g,999,m,t,3,0,0,0,0]],columns=df.columns)
                for i in range(n-len(subset)):df_copy=df_copy.append(df_2)
            else:
                df_copy=df_copy.append(subset)
In [5]:
print(df.sample(frac=0.001).to_latex())
Out [5]:
\begin{tabular}{lrrrlrrrrr}
\toprule
{} &  id\_g1 &  id\_g2 &  mesure &         type &  id\_user &  c1\_val &  c2\_val &  c3\_val &  c4\_val \\
\midrule
6441 &  346.0 &  190.0 &     8.0 &   gen\_region &      3.0 &     1.0 &     1.0 &     0.0 &     1.0 \\
1364 &    2.0 &  269.0 &     9.0 &       normal &      3.0 &     1.0 &     1.0 &     1.0 &     1.0 \\
1897 &   14.0 &    5.0 &     8.0 &  gen\_country &      3.0 &     1.0 &     1.0 &     1.0 &     1.0 \\
7336 &  264.0 &  363.0 &    10.0 &   gen\_region &      3.0 &     0.0 &     0.0 &     0.0 &     0.0 \\
\bottomrule
\end{tabular}

In [6]:
df["g1_size"]=df["id_g1"].apply(lambda x:graph_size[int(x)])
df["g2_size"]=df["id_g2"].apply(lambda x:graph_size[int(x)])
#df["mesure"]=df["mesure"].apply(lambda x:df_mesure[df_mesure.id==x].values[0][-1])
df["mesure"]=df["mesure"].apply(lambda x:int(x))
df["typeI"]=df["type"].apply(lambda x:type2int[x])
df['c1*c2']=df.c1_val*df.c2_val
df['c1+c2']=(df.c1_val+df.c2_val).apply(lambda x:x if x<2 else 1)
df['c1*c2*c3']=df.c1_val*df.c2_val*df.c3_val
df['c1*c3']=df.c1_val*df.c3_val
df['c1+c3']=(df.c1_val+df.c3_val).apply(lambda x:x if x<2 else 1)
df['c2*c3']=df.c2_val*df.c3_val
df["for_c"]=df["id_g2"].apply(lambda x:1)
df["es_in_common"]=df["id_g1"].apply(lambda x:nb_of_g_w_es_com[x])
normal=df[df.type == "normal"]
gen_country=df[df.type == "gen_country"]
gen_region=df[df.type == "gen_region"]
extension_1=df[df.type == "extension_1"]
In [7]:
colorized_subset=['c1_val', 'c2_val', 'c3_val',
       'c4_val', 'c1*c2', 'c1+c2', 'c1*c2*c3', 'c1*c3', 'c1+c3', 'c2*c3']
In [8]:
from eval.pareto import is_pareto_front
from eval.visualize import *
In [9]:
def get_pareto_graph_ids(df, criteria, x_label):
    # x_label: column paired with each criterion to build 2-D Pareto fronts;
    # returns the ids (id_g1) of the graphs appearing on at least one front
    set_=set()
    for c in criteria:
        df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, [x_label, c]), axis=1)
        df_pareto = df.loc[df_is_pareto]
        for i,row in df_pareto.iterrows():
            set_.add(int(row.id_g1))
    return list(set_)
In [10]:
keys_alone=['c1_val', 'c2_val', 'c3_val', 'c4_val']
keys_combined=['c1*c2', 'c1*c2*c3', 'c1*c3', 'c2*c3']

Which measure maximizes the four criteria?

Test procedure:

  • Retrieve the precision value for each measure and each criterion.
  • Compute the Pareto front over the four validation criteria (a sketch of the dominance test is given below).

Result: MCS and VEO maximize the criteria in terms of mean precision over the whole set of graph pairs.

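For reference, here is a minimal sketch of the kind of dominance test that is_pareto_front (imported from eval.pareto in cell In [8]) is assumed to perform, with every criterion treated as a value to maximize. The helper name is_dominated and its exact logic are illustrative, not the library's actual implementation.

def is_dominated(row, table, criteria):
    # another row dominates `row` if it is at least as good on every criterion
    # and strictly better on at least one (all criteria are maximized)
    others = table.drop(row.name)
    at_least_as_good = (others[criteria] >= row[criteria]).all(axis=1)
    strictly_better = (others[criteria] > row[criteria]).any(axis=1)
    return bool((at_least_as_good & strictly_better).any())

# a row belongs to the Pareto front if no other row dominates it;
# applied to the d_pc table built in the next cell, this yields the same kind
# of boolean mask as the is_pareto_front calls below
pareto_mask = d_pc.apply(lambda r: not is_dominated(r, d_pc, ['c1_val','c2_val','c3_val','c4_val']), axis=1)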
In [11]:
d_pc=df.groupby(['mesure'],as_index=False).mean()[['mesure','c1_val','c2_val','c3_val','c4_val']]
df_is_pareto = d_pc.apply(lambda row: is_pareto_front(d_pc,row, ['c1_val','c2_val','c3_val','c4_val']), axis=1)
df_pareto = d_pc.loc[df_is_pareto].sort_values(by=['c1_val','c2_val','c3_val','c4_val'])
df_pareto["mesureL"]=df_pareto["mesure"].apply(lambda x:df_mesure[df_mesure.id==x].values[0][-1])
%matplotlib inline
df_pareto
Out [11]:
   mesure    c1_val    c2_val    c3_val    c4_val mesureL
0       2  0.888889  0.862222  0.508889  0.444444     VEO
1       1  0.889632  0.869565  0.501672  0.433110     MCS
In [12]:
print(d_pc.to_latex(index=False))
Out [12]:
\begin{tabular}{rrrrr}
\toprule
 mesure &    c1\_val &    c2\_val &    c3\_val &    c4\_val \\
\midrule
      1 &  0.889632 &  0.869565 &  0.501672 &  0.433110 \\
      2 &  0.888889 &  0.862222 &  0.508889 &  0.444444 \\
      3 &  0.697842 &  0.575540 &  0.273381 &  0.251799 \\
      5 &  0.675610 &  0.587805 &  0.314634 &  0.258537 \\
      6 &  0.503371 &  0.505618 &  0.301124 &  0.238202 \\
      7 &  0.786667 &  0.706667 &  0.413333 &  0.280000 \\
      8 &  0.882747 &  0.829146 &  0.492462 &  0.423786 \\
      9 &  0.872910 &  0.779264 &  0.463211 &  0.394649 \\
     10 &  0.478333 &  0.490000 &  0.256667 &  0.243333 \\
\bottomrule
\end{tabular}

Which "Measure-STR type" pair maximizes validation of the four criteria?

Procedure:

  • Retrieve the mean precision value for each criterion, grouped by measure and STR type.
  • Retrieve the tuples belonging to the Pareto front over the four criteria.

Result: As in the previous results, the MCS and VEO measures obtain the best scores. The associated STR types giving the best scores are gen_region, extension_1, then normal (a quick per-type aggregate is sketched below). We can already conclude that generalization bounded at the country level distorts the information contained in the graphs too much: too much granularity is lost.

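A quick, hypothetical sanity check of this conclusion (not part of the original evaluation): average the four criteria per STR type directly on df, using only columns built in cell In [6].

# mean of the four validation criteria per STR type, then averaged together,
# to compare the generalization levels at a glance
type_scores = df.groupby('type')[['c1_val','c2_val','c3_val','c4_val']].mean()
print(type_scores.mean(axis=1).sort_values(ascending=False))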
In [13]:
# Group the data by measure and STR type --> for each criterion, we get the mean value
# returned by that criterion over all graph pairs of the measure.
d_pc=df.groupby(['mesure','typeI'],as_index=False).mean()[['mesure','typeI','c1_val','c2_val','c3_val','c4_val']]
df_is_pareto = d_pc.apply(lambda row: is_pareto_front(d_pc,row, ['c1_val','c2_val','c3_val','c4_val']), axis=1)
df_pareto = d_pc.loc[df_is_pareto].sort_values(by=['c1_val','c2_val','c3_val','c4_val'])
df_pareto["mesureL"]=df_pareto["mesure"].apply(lambda x:df_mesure[df_mesure.id==x].values[0][-1])
df_pareto["typeL"]=df_pareto["typeI"].apply(lambda x:int2type[x])
#df_pareto["type"]=df_pareto["type"].apply(lambda x:type2int.index(x))
%matplotlib inline
df_pareto
Out [13]:
   mesure  typeI    c1_val    c2_val    c3_val    c4_val mesureL        typeL
0       1      1  0.793333  0.926667  0.506667  0.420000     MCS  gen_country
1       2      1  0.820000  0.926667  0.533333  0.440000     VEO  gen_country
2       2      2  0.920000  0.820000  0.486667  0.453333     VEO  extension_1
3       1      3  0.920000  0.873333  0.520000  0.446667     MCS   gen_region
4       9      0  0.926667  0.746667  0.460000  0.406667     BOC       normal
5       8      2  0.926667  0.780000  0.460000  0.440000   BOWSE  extension_1
6       1      2  0.926667  0.833333  0.493333  0.440000     MCS  extension_1
7       8      3  0.926667  0.833333  0.513333  0.433333   BOWSE   gen_region
8       2      3  0.926667  0.840000  0.506667  0.440000     VEO   gen_region
In [14]:
df_pareto.std(axis=0)
Out [14]:
mesure    3.456074
typeI     1.054093
c1_val    0.052576
c2_val    0.060093
c3_val    0.025386
c4_val    0.014142
dtype: float64
In [19]:
print(d_pc.sample(frac=0.1).to_latex(index=False))
Out [19]:
\begin{tabular}{rrrrrr}
\toprule
 mesure &  typeI &    c1\_val &    c2\_val &    c3\_val &    c4\_val \\
\midrule
      5 &      1 &  0.748252 &  0.790210 &  0.468531 &  0.363636 \\
     10 &      0 &  0.466667 &  0.473333 &  0.266667 &  0.260000 \\
     10 &      3 &  0.533333 &  0.526667 &  0.280000 &  0.273333 \\
\bottomrule
\end{tabular}