Eval.ipynb 3.34 MiB
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import json,glob,re
from scipy.spatial.distance import jaccard
import numpy as np
from math import*

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
#from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf', 'svg')
#sns.set_palette(sns.color_palette("hls", 8))


import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


def jaccard_similarity(x,y): 
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)
%cd ..
Out [1]:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/requests/__init__.py:80: RequestsDependencyWarning: urllib3 (1.22) or chardet (2.3.0) doesn't match a supported version!
  RequestsDependencyWarning)
Out [1]:
/Users/jacquesfize/nas_cloud/Code/str-python
In [2]:
df=pd.read_csv("resources/results_graph_exp18fev.tsv",delimiter="\t",index_col=0)
new_df=pd.DataFrame(columns=df.columns)

selected_graph=json.load(open("data/graph_exp_fev_18/selected.json"))
types=df.type.unique()
graph_size={}
graphs_={}

files_glob= glob.glob("data/graph_exp_fev_18/normal/*.gexf")
for fn in files_glob:
    id_ = int(re.findall("\d+", fn)[-1])
    graphs_[id_]=nx.read_gexf(fn)
    graph_size[id_]=len(graphs_[id_])
graph_size[999]=0
nb_of_g_w_es_com={}
for g in graphs_:
    if not g in nb_of_g_w_es_com:
        nb_of_g_w_es_com[g]=0
    for g2 in graphs_:
        if not g2 == g:
            if set(graphs_[g].nodes()).intersection(set(graphs_[g2].nodes())):
                nb_of_g_w_es_com[g]+=1    

In [3]:
df_mesure=pd.read_csv("resources/mesures.tsv",delimiter="\t")
rank_data=json.load(open("data/graph_exp_fev_18/rank.json"))
In [4]:
n=3
df_copy=pd.DataFrame(columns=df.columns)
for t in types:
    mesures=df[df.type == t].mesure.unique()
    for m in mesures:
        data=df[(df.mesure == m) & (df.type == t)]
        for g in selected_graph:
            subset=data[data.id_g1 == g].iloc[:n]
            if len(subset)<1:#No graph found
                df_2=pd.DataFrame([[g,999,m,t,3,0,0,0,0]],columns=df.columns)
                for i in range(n):df_copy=df_copy.append(df_2)
            elif len(subset)
In [5]:
df.head(20)
Out [5]:
(Click to sort ascending)
id_g1
(Click to sort ascending)
id_g2
(Click to sort ascending)
mesure
(Click to sort ascending)
type
(Click to sort ascending)
id_user
(Click to sort ascending)
c1_val
(Click to sort ascending)
c2_val
(Click to sort ascending)
c3_val
(Click to sort ascending)
c4_val
(Click to sort ascending)
02.0230.01.0normal3.01.01.01.01.0
12.0275.01.0normal3.01.01.01.01.0
22.0269.01.0normal3.01.01.01.01.0
37.0437.01.0normal3.01.01.00.00.0
47.04.01.0normal3.01.01.00.00.0
57.0371.01.0normal3.01.01.00.00.0
614.0443.01.0normal3.01.01.01.01.0
714.0437.01.0normal3.01.01.01.01.0
814.04.01.0normal3.01.01.01.01.0
927.026.01.0normal3.01.00.00.00.0
1027.033.01.0normal3.01.01.00.00.0
1127.040.01.0normal3.01.00.00.00.0
1245.021.01.0normal3.01.01.00.01.0
1345.023.01.0normal3.01.01.00.00.0
1445.044.01.0normal3.01.01.00.00.0
1547.081.01.0normal3.01.01.00.00.0
1647.0170.01.0normal3.01.01.00.00.0
1747.0488.01.0normal3.01.01.00.00.0
1853.0154.01.0normal3.01.01.00.00.0
1953.0512.01.0normal3.01.01.00.00.0
In [6]:
df["g1_size"]=df["id_g1"].apply(lambda x:graph_size[int(x)])
df["g2_size"]=df["id_g2"].apply(lambda x:graph_size[int(x)])
df["mesureL"]=df["mesure"].apply(lambda x:df_mesure[df_mesure.id==x].values[0][-1])
df['c1*c2']=df.c1_val*df.c2_val
df['c1+c2']=(df.c1_val+df.c2_val).apply(lambda x:x if x<2 else 1)
df['c1*c2*c3']=df.c1_val*df.c2_val*df.c3_val
df['c1*c3']=df.c1_val*df.c3_val
df['c1+c3']=(df.c1_val+df.c3_val).apply(lambda x:x if x<2 else 1)
df['c2*c3']=df.c2_val*df.c3_val
df["for_c"]=df["id_g2"].apply(lambda x:1)
df["es_in_common"]=df["id_g1"].apply(lambda x:nb_of_g_w_es_com[x])
normal=df[df.type == "normal"]
gen_country=df[df.type == "gen_country"]
gen_region=df[df.type == "gen_region"]
extension_1=df[df.type == "extension_1"]
In [9]:
new_df=pd.DataFrame(data=None,columns=df.columns)
for id,row in df.iterrows():
    ranks=set(rank_data[row.type][row.mesureL][str(int(row.id_g1))][:5])
    if int(row.id_g2) in ranks:
        new_df=new_df.append(row)
In [10]:
df=new_df
In [11]:
colorized_subset=['c1_val', 'c2_val', 'c3_val',
       'c4_val', 'c1*c2', 'c1+c2', 'c1*c2*c3', 'c1*c3', 'c1+c3', 'c2*c3']

Raw Performance

Normal

In [12]:
score_per_mesure=normal.groupby(["id_g1","mesureL"], as_index=False).mean()
score_normal=score_per_mesure.groupby(["mesureL"],as_index=False).mean()
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: red;color:white;' if v else '' for v in is_max]
score_normal.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)
Out [12]:
In [14]:
score_normal[["c1_val","c2_val","c3_val","c4_val"]].plot.bar(x=score_normal["mesure"].unique())
plt.show()
score_normal[["c1*c2","c1+c2","c1*c2*c3","c1+c3"]].plot.bar(x=score_normal["mesure"].unique())
plt.show()

Generalisation Pays

In [16]:
score_per_mesure=gen_country.groupby(["id_g1","mesureL"], as_index=False).mean()
score_gen_country=score_per_mesure.groupby(["mesureL"],as_index=False).mean()
score_gen_country.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)
Out [16]:
In [37]:
score_gen_country[["c1_val","c2_val","c3_val","c4_val"]].plot.bar(x=score_gen_country["mesure"].unique())
plt.show()
score_gen_country[["c1*c2","c1+c2","c1*c2*c3","c1+c3"]].plot.bar(x=score_gen_country["mesure"].unique())
plt.show()

Generalisation region

In [21]:
score_per_mesure=gen_region.groupby(["id_g1","mesureL"], as_index=False).mean()
score_gen_region=score_per_mesure.groupby(["mesureL"],as_index=False).mean()
score_gen_region.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)
Out [21]:
In [39]:
score_gen_region[["c1_val","c2_val","c3_val","c4_val"]].plot.bar(x=score_gen_region["mesure"].unique())
plt.show()
score_gen_region[["c1*c2","c1+c2","c1*c2*c3","c1+c3"]].plot.bar(x=score_gen_region["mesure"].unique())
plt.show()

Extension 1

In [20]:
score_per_mesure=extension_1.groupby(["id_g1","mesureL"], as_index=False).mean()
score_ext_1=score_per_mesure.groupby(["mesureL"],as_index=False).mean()
score_ext_1.style.apply(highlight_max,subset=colorized_subset).apply(highlight_min,subset=colorized_subset)
Out [20]:
In [41]:
score_ext_1[["c1_val","c2_val","c3_val","c4_val"]].plot.bar(x=score_ext_1["mesure"].unique())
plt.show()
score_ext_1[["c1*c2","c1+c2","c1*c2*c3","c1+c3"]].plot.bar(x=score_ext_1["mesure"].unique())
plt.show()

Impact of the graph size over criteria

In [42]:
df=df[df["id_g2"] != 999]
In [43]:
x_label="g1_size"

Score Gradient over size

Selon la taille sans prendre en compte mesure

Normal

In [44]:
score_normal_size=normal.groupby([x_label],as_index=False).mean()
score_normal_size.style.background_gradient(subset=colorized_subset,low=0,high=0)
Out [44]:

Extension 1

In [45]:
score_extension_1_size=extension_1.groupby([x_label],as_index=False).mean()
score_extension_1_size.style.background_gradient(subset=colorized_subset,low=0,high=0)
Out [45]:

gen country

In [46]:
score_gen_country_size=gen_country.groupby([x_label],as_index=False).mean()
score_gen_country_size.style.background_gradient(subset=colorized_subset,low=0,high=0)
Out [46]:

Gen region

In [47]:
score_gen_region_size=gen_region.groupby([x_label],as_index=False).mean()
score_gen_region_size.style.background_gradient(subset=colorized_subset,low=0,high=0)
Out [47]:

Par taille et par mesure

In [48]:

score_normal_size=normal.groupby([x_label,"mesure"],as_index=False).mean()
score_normal_size.style.background_gradient(subset=colorized_subset,low=0,high=1)
Out [48]:
In [49]:
score_gen_country_size=gen_country.groupby([x_label,"mesure"],as_index=False).mean()
score_gen_country_size.style.background_gradient(subset=colorized_subset,low=0,high=1)
Out [49]:
In [50]:
score_gen_region_size=gen_region.groupby([x_label,"mesure"],as_index=False).mean()
score_gen_region_size.style.background_gradient(subset=colorized_subset,low=0,high=1)
Out [50]:
In [51]:
score_extension_1_size=extension_1.groupby([x_label,"mesure"],as_index=False).mean()
score_extension_1_size.style.background_gradient(subset=colorized_subset,low=0,high=1)
Out [51]:
In [52]:
key=["g1_size","mesure","c1_val","c2_val","c3_val","c4_val","c1*c2","c1+c2","c1*c2*c3","c1*c3","c1+c3","c2*c3"]
writer = pd.ExcelWriter('PerSizeAndPerMesurePerType.xlsx')
score_normal_size[key].to_excel(writer,'Normal')
score_gen_country_size[key].to_excel(writer,'GenCountry')
score_gen_region_size[key].to_excel(writer,'GenRegion')
score_extension_1_size[key].to_excel(writer,'Extension_1')
writer.save()

Pareto Frontier with x = G1_size

In [53]:
# Thanks to http://hinnefe2.github.io/python/tools/2015/09/21/mario-kart.html
def is_pareto_front(dataf,row, xlabel, ylabel):
    
    x = row[xlabel]
    y = row[ylabel]
    
    # look for points with the same y value but larger x value
    is_max_x = dataf.loc[dataf[ylabel]==y].max()[xlabel] <= x
    # look for points with the same x value but larger y value
    is_max_y = dataf.loc[dataf[xlabel]==x].max()[ylabel] <= y
    # look for points that are larger in both x and y
    is_double = len(dataf.loc[(dataf[xlabel]>x) & (dataf[ylabel]>y)])==0
    
    return is_max_x and is_max_y and is_double
In [244]:

import colorlover as cl
cm=sns.color_palette("hls", 8)
def draw_pareto_static(df,x_label,criteria,x_ax_label="X",y_ax_label="Y",title="Titre"):
    
    fig, ax = plt.subplots(figsize=(10,5),ncols=1)
    #axes=(ax1,ax2,ax3,ax4)
    for i in range(len(criteria)):
        y_label=criteria[i]
        #df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
        df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, [x_label, y_label]), axis=1)
        df_pareto = df.ix[df_is_pareto].sort_values(by=x_label)

        
        sns.swarmplot(x=x_label, y=y_label, data=df, ax=ax,color=cm[i])
        # plot the pareto frontier
        ax.plot(df_pareto[x_label].index,df_pareto[y_label].values, '--',color=cm[i], label='P. Frontier for {0}'.format(criteria[i]))

        #ax.legend(loc='best')
        plt.xlabel(x_ax_label)
        plt.ylabel(y_ax_label)
        plt.xticks(rotation=90)
        plt.title(title)
        

def data_pareto(df,x_label,criteria):
    
    data=[]
    for i in range(len(criteria)):
        y_label=criteria[i]
        #df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
        df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, [x_label, y_label]), axis=1)
        df_pareto = df.ix[df_is_pareto].sort_values(by=x_label)
        data.append(go.Scatter(
                x=df[x_label], # assign x as the dataframe column 'x'
                y=df[y_label],
                mode="markers",
                marker = dict(
                    color = ("rgb"+str(cm[i])),
                    ),
                name="{0} ".format(criteria[i]),
            ))

        data.append(
            go.Scatter(
                x=df_pareto[x_label], # assign x as the dataframe column 'x'
                y=df_pareto[y_label],
                name="{0} Pareto Frontier".format(criteria[i]),
                line = dict(
                    color = ("rgb"+str(cm[i])),
                    width = 4,)
                    )

        )
    return data

def draw_pareto2(df,x_label,criteria,typeSTR="normal",layout=None):
    if not layout:
        fig = go.Figure(data=data_pareto(df,x_label,criteria))
    else:
        fig = go.Figure(data=data_pareto(df,x_label,criteria), layout=layout)
    return iplot(fig,filename='{0}_{1}_{2}.png'.format(x_label,",".join(criteria),typeSTR))
In [108]:
def get_pareto_graph_ids(df,criteria):
    set_=set([])
    for c in criteria:
        df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, x_label, c), axis=1)
        df_pareto = df.ix[df_is_pareto]
        for i,row in df_pareto.iterrows():
            set_.add(int(row.id_g1))
    return list(set_)
In [58]:
# Données nécessaire
x_label="id_g1"
extension_1_g1_mean=extension_1.groupby([x_label],as_index=False).mean()
normal_g1_mean=normal.groupby([x_label],as_index=False).mean()
gen_region_g1_mean=gen_region.groupby([x_label],as_index=False).mean()
gen_country_g1_mean=gen_country.groupby([x_label],as_index=False).mean()
keys_alone=['c1_val', 'c2_val', 'c3_val', 'c4_val']
keys_combined=['c1*c2', 'c1*c2*c3', 'c1*c3', 'c2*c3']

pareto_graphs={
        "normal":sorted(get_pareto_graph_ids(normal_g1_mean,keys_alone)),
        "ext_1":sorted(get_pareto_graph_ids(extension_1_g1_mean,keys_alone)),
        "gen_country":sorted(get_pareto_graph_ids(gen_country_g1_mean,keys_alone)),
        "gen_region":sorted(get_pareto_graph_ids(gen_region_g1_mean,keys_alone)),
}
        
df_data={}
for d in pareto_graphs:
    if not d in df_data:df_data[d]={}
    for d2 in pareto_graphs:
        df_data[d][d2]=jaccard_similarity(pareto_graphs[d],pareto_graphs[d2])

Quelles graphes maximisent un critère ?

Normal

In [69]:
draw_pareto2(normal_g1_mean,x_label,keys_alone,"normal")
Out [69]:

Gen_region

In [70]:

draw_pareto2(gen_region_g1_mean,x_label,keys_alone,"gen_region")
Out [70]:

Gen_country

In [71]:
draw_pareto2(gen_country_g1_mean,x_label,keys_alone,"gen_country")
Out [71]:

Extension_1

In [72]:
draw_pareto2(extension_1_g1_mean,x_label,keys_alone,"extension")
Out [72]:

Est que ces graphes partagent une ou plusieurs entités avec beaucoup de graphe ?

In [337]:
normal_size_mean=normal.groupby(["id_g1"],as_index=False).mean()
keys_=['es_in_common']
draw_pareto2(normal_size_mean,[x_label,'es_in_common'],"normal")
Out [337]:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2174             try:
-> 2175                 return tslib.get_value_box(s, key)
   2176             except IndexError:

pandas/tslib.pyx in pandas.tslib.get_value_box (pandas/tslib.c:19053)()

pandas/tslib.pyx in pandas.tslib.get_value_box (pandas/tslib.c:18687)()

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-337-7cdd5e29f18a> in <module>()
      1 normal_size_mean=normal.groupby(["id_g1"],as_index=False).mean()
      2 keys_=['es_in_common']
----> 3 draw_pareto2(normal_size_mean,[x_label,'es_in_common'],"normal")

<ipython-input-244-85c76a567b54> in draw_pareto2(df, x_label, criteria, typeSTR, layout)
     57 def draw_pareto2(df,x_label,criteria,typeSTR="normal",layout=None):
     58     if not layout:
---> 59         fig = go.Figure(data=data_pareto(df,x_label,criteria))
     60     else:
     61         fig = go.Figure(data=data_pareto(df,x_label,criteria), layout=layout)

<ipython-input-244-85c76a567b54> in data_pareto(df, x_label, criteria)
     30         y_label=criteria[i]
     31         #df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
---> 32         df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, [x_label, y_label]), axis=1)
     33         df_pareto = df.ix[df_is_pareto].sort_values(by=x_label)
     34         data.append(go.Scatter(

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4150                     if reduce is None:
   4151                         reduce = True
-> 4152                     return self._apply_standard(f, axis, reduce=reduce)
   4153             else:
   4154                 return self._apply_broadcast(f, axis)

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4246             try:
   4247                 for i, v in enumerate(series_gen):
-> 4248                     results[i] = func(v)
   4249                     keys.append(v.name)
   4250             except Exception as e:

<ipython-input-244-85c76a567b54> in <lambda>(row)
     30         y_label=criteria[i]
     31         #df.assign(normalized=df.bought.div(df.groupby('user').bought.transform('sum')))
---> 32         df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, [x_label, y_label]), axis=1)
     33         df_pareto = df.ix[df_is_pareto].sort_values(by=x_label)
     34         data.append(go.Scatter(

<ipython-input-287-187d0a772f0a> in is_pareto_front(dataf, row, columns)
      2 def is_pareto_front(dataf,row,columns):
      3 
----> 4     values=[row[col] for col in columns]
      5     boolean_is_max=[]
      6     #if max(value)

<ipython-input-287-187d0a772f0a> in <listcomp>(.0)
      2 def is_pareto_front(dataf,row,columns):
      3 
----> 4     values=[row[col] for col in columns]
      5     boolean_is_max=[]
      6     #if max(value)

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    601         key = com._apply_if_callable(key, self)
    602         try:
--> 603             result = self.index.get_value(self, key)
    604 
    605             if not is_scalar(result):

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2181                     raise InvalidIndexError(key)
   2182                 else:
-> 2183                     raise e1
   2184             except Exception:  # pragma: no cover
   2185                 raise e1

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2167         try:
   2168             return self._engine.get_value(s, k,
-> 2169                                           tz=getattr(series.dtype, 'tz', None))
   2170         except KeyError as e1:
   2171             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3557)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3240)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: ('n', 'occurred at index 0')
In [76]:
pareto_graphs
Out [76]:
{'ext_1': [14, 236, 304, 346, 359, 429, 476, 499, 503, 527],
 'gen_country': [92, 236, 300, 375, 426, 439, 453, 499, 503, 527],
 'gen_region': [14, 92, 211, 236, 304, 346, 359, 429, 439, 499, 527],
 'normal': [14, 92, 236, 346, 359, 424, 426, 429, 476, 499, 527]}
In [77]:
pd.DataFrame(df_data,columns=df_data.keys())
Out [77]:
(Click to sort ascending)
normal
(Click to sort ascending)
ext_1
(Click to sort ascending)
gen_country
(Click to sort ascending)
gen_region
(Click to sort ascending)
00.6153851.0000000.25000.615385
10.3125000.2500001.00000.312500
20.5714290.6153850.31251.000000
31.0000000.6153850.31250.571429

GenCountry apporte le plus de changement dans les graphes "optimum", i.e., graphes validant un critère sur les différentes mesures.

In [140]:
# Données nécessaire
x_label="mesure"
dataf=df.groupby(['type',x_label],as_index=False).mean()
dataf['fusion'] = dataf[['type',x_label]].apply(lambda x: ''.join(x),axis=1)
#normal_g1_mean=normal.groupby(['type',x_label],as_index=False).mean()
#gen_region_g1_mean=gen_region.groupby([x_label],as_index=False).mean()
#gen_country_g1_mean=gen_country.groupby([x_label],as_index=False).mean()
keys_alone=['c1_val', 'c2_val', 'c3_val', 'c4_val']
keys_combined=['c1*c2', 'c1*c2*c3', 'c1*c3', 'c2*c3']

In [144]:
dataf
Out [144]:
(Click to sort ascending)
type
(Click to sort ascending)
mesure
(Click to sort ascending)
id_g1
(Click to sort ascending)
id_g2
(Click to sort ascending)
id_user
(Click to sort ascending)
c1_val
(Click to sort ascending)
c2_val
(Click to sort ascending)
c3_val
(Click to sort ascending)
c4_val
(Click to sort ascending)
g1_size
(Click to sort ascending)
g2_size
(Click to sort ascending)
c1*c2
(Click to sort ascending)
c1+c2
(Click to sort ascending)
c1*c2*c3
(Click to sort ascending)
c1*c3
(Click to sort ascending)
c1+c3
(Click to sort ascending)
c2*c3
(Click to sort ascending)
for_c
(Click to sort ascending)
es_in_common
(Click to sort ascending)
fusion
(Click to sort ascending)
0extension_1BOC241.380000252.4333333.00.9200000.7200000.4400000.4000004.9200003.8400000.6933330.9466670.4133330.4333330.9266670.4200001.070.320000extension_1BOC
1extension_1BOWSE241.380000252.8666673.00.9266670.7800000.4600000.4400004.9200004.0666670.7533330.9533330.4200000.4533330.9333330.4266671.070.320000extension_1BOWSE
2extension_1JACCARD241.380000288.7400003.00.4733330.4800000.2600000.2466674.9200004.9866670.4666670.4866670.2600000.2600000.4733330.2600001.070.320000extension_1JACCARD
3extension_1MCS241.380000270.1000003.00.9266670.8333330.4933330.4400004.9200004.7600000.8133330.9466670.4533330.4866670.9333330.4600001.070.320000extension_1MCS
4extension_1VEO241.380000255.6400003.00.9200000.8200000.4866670.4533334.9200004.5333330.8000000.9400000.4466670.4800000.9266670.4533331.070.320000extension_1VEO
5extension_1WLSUBTREE241.380000270.6000003.00.7866670.7066670.4133330.2800004.9200004.8400000.6733330.8200000.3866670.4000000.8000000.4000001.070.320000extension_1WLSUBTREE
6gen_countryBOC237.653061253.9387763.00.7278910.8843540.4897960.3673475.0000004.5306120.6802720.9319730.4149660.4285710.7891160.4761901.070.000000gen_countryBOC
7gen_countryBOWSE241.489796249.6190483.00.7687070.9115650.5306120.3945584.9795924.5918370.7142860.9659860.4421770.4693880.8299320.5034011.070.224490gen_countryBOWSE
8gen_countryGREEDY236.604167204.0625003.00.5486110.6527780.3541670.2569445.0625004.1111110.4513890.7500000.3055560.3194440.5833330.3402781.069.854167gen_countryGREEDY
9gen_countryHED242.325581264.1085273.00.8294570.8759690.5193800.4031015.3255813.9147290.7441860.9612400.4496120.4651160.8837210.5038761.078.488372gen_countryHED
10gen_countryJACCARD241.380000308.4133333.00.4400000.4800000.2200000.1933334.9200005.3733330.4400000.4800000.2133330.2133330.4466670.2200001.070.320000gen_countryJACCARD
11gen_countryMCS241.380000256.9733333.00.7933330.9266670.5066670.4200004.9200005.2800000.7466670.9733330.4533330.4733330.8266670.4866671.070.320000gen_countryMCS
12gen_countryVEO241.380000258.4733333.00.8200000.9266670.5333330.4400004.9200005.1000000.7733330.9733330.4666670.4866670.8666670.5133331.070.320000gen_countryVEO
13gen_regionBOC241.380000255.5133333.00.9200000.7733330.4666670.4066674.9200004.1400000.7333330.9600000.4400000.4600000.9266670.4466671.070.320000gen_regionBOC
14gen_regionBOWSE241.380000259.6400003.00.9266670.8333330.5133330.4333334.9200004.3866670.8000000.9600000.4733330.5066670.9333330.4800001.070.320000gen_regionBOWSE
15gen_regionGREEDY241.380000184.6933333.00.5200000.5000000.2933330.2400004.9200003.5733330.4466670.5733330.2800000.2800000.5333330.2933331.070.320000gen_regionGREEDY
16gen_regionHED221.592593254.8888893.01.0000000.7530860.3703700.2839517.4074073.1234570.7530861.0000000.3703700.3703701.0000000.3703701.0111.259259gen_regionHED
17gen_regionJACCARD241.380000288.4666673.00.5333330.5266670.2800000.2733334.9200005.1800000.5133330.5466670.2800000.2800000.5333330.2800001.070.320000gen_regionJACCARD
18gen_regionMCS241.380000266.9333333.00.9200000.8733330.5200000.4466674.9200004.7800000.8466670.9466670.4866670.5133330.9266670.4933331.070.320000gen_regionMCS
19gen_regionVEO241.380000259.2266673.00.9266670.8400000.5066670.4400004.9200004.6600000.8200000.9466670.4666670.5000000.9333330.4733331.070.320000gen_regionVEO
20normalBOC241.380000267.1000003.00.9266670.7466670.4600000.4066674.9200004.1000000.7200000.9533330.4333330.4533330.9333330.4400001.070.320000normalBOC
21normalBOWSE244.387755265.3877553.00.9251700.8095240.4761900.4353744.8775514.1360540.7823130.9523810.4353740.4693880.9319730.4421771.071.306122normalBOWSE
22normalGED221.088235273.1274513.00.9509800.7843140.3725490.3431376.2941183.3725490.7745100.9607840.3431370.3725490.9509800.3431371.094.823529normalGED
23normalGREEDY237.346939180.8707483.00.4557820.3809520.2653060.2244904.9387763.8979590.3673470.4693880.2653060.2653060.4557820.2653061.071.326531normalGREEDY
24normalHED219.700000246.2666673.00.9888890.7444440.3555560.3444446.9333333.0333330.7444440.9888890.3555560.3555560.9888890.3555561.0103.133333normalHED
25normalJACCARD241.380000290.1333333.00.4666670.4733330.2666670.2600004.9200005.1933330.4600000.4800000.2666670.2666670.4666670.2666671.070.320000normalJACCARD
26normalMCS240.102041265.9727893.00.9251700.8503400.4897960.4285714.9795924.9931970.8367350.9387760.4693880.4829930.9319730.4761901.071.448980normalMCS

Quelles couples mesures/types maximisent un critère ?

In [197]:
layout = go.Layout(
    xaxis=dict(
        tickangle=45
    ),
    margin = go.Margin(b = 160)
)
draw_pareto2(dataf,"fusion",keys_alone,"normal",layout)
Out [197]:

Quelles couples mesures/types maximisent des critères combinées?

In [81]:
layout = go.Layout(
    xaxis=dict(
        tickangle=45
    ),
    margin = go.Margin(b = 160)
)
draw_pareto2(dataf,"fusion",keys_combined,"normal",layout)
Out [81]:
In [287]:
# Thanks to http://hinnefe2.github.io/python/tools/2015/09/21/mario-kart.html
def is_pareto_front(dataf,row,columns):
    
    values=[row[col] for col in columns]
    boolean_is_max=[]
    #if max(value)
    for c in range(len(columns)):
        val=values[c]
        col=columns[c]
        bool_temp=True
        for c2 in range(len(columns)):
            if c != c2:break
            val2=values[c]
            col2=columns[c]
            bool_temp=bool_temp and (dataf.loc[dataf[col2]==val2].max()[col] <= val)
        boolean_is_max.append(bool_temp)
    #if no criteria superior
    daf=dataf.copy()
    for c in range(len(columns)):
        val=values[c]
        col=columns[c]
        daf=daf.loc[(dataf[col]>val)]
    return sum(map(int,boolean_is_max))==len(columns) and len(daf)==0
 
In [294]:
df_is_pareto = d_pc.apply(lambda row: is_pareto_front(d_pc,row, ['id_g1','c1_val','c2_val' ]), axis=1)
df_pareto = d_pc.ix[df_is_pareto].sort_values(by="id_g1")
def get_pareto_couples(df,criteria):
    data={}
    for c in criteria:
        set_=[]
        df_is_pareto = df.apply(lambda row: is_pareto_front(df,row, "fusion", c), axis=1)
        df_pareto = df.ix[df_is_pareto].sort_values(by=x_label)
        for i,row in df_pareto.iterrows():
            set_.append(row.fusion)
        data[c]=set_
    return data


df_data={}
par=get_pareto_couples(dataf,keys_alone)
for d in par:
    if not d in df_data:df_data[d]={}
    for d2 in par:
        df_data[d][d2]=jaccard_similarity(par[d],par[d2])
pd.DataFrame(df_data,columns=df_data.keys())
In [202]:
par
Out [202]:
{'c1_val': ['gen_regionHED', 'normalHED', 'normalMCS'],
 'c2_val': ['gen_regionMCS', 'normalMCS', 'gen_countryVEO'],
 'c3_val': ['gen_regionMCS', 'normalMCS', 'gen_countryVEO', 'gen_regionVEO'],
 'c4_val': ['normalBOWSE',
  'gen_regionMCS',
  'normalMCS',
  'extension_1VEO',
  'gen_regionVEO']}

Pour l'instant, c'est le couple VEO et gen_region qui obtient de meilleur score

In [204]:
df_data={}
par=get_pareto_couples(dataf,keys_combined)
for d in par:
    if not d in df_data:df_data[d]={}
    for d2 in par:
        df_data[d][d2]=jaccard_similarity(par[d],par[d2])
pd.DataFrame(df_data,columns=df_data.keys())
Out [204]:
(Click to sort ascending)
c1*c2
(Click to sort ascending)
c1*c2*c3
(Click to sort ascending)
c1*c3
(Click to sort ascending)
c2*c3
(Click to sort ascending)
01.0000001.0000000.6666670.666667
11.0000001.0000000.6666670.666667
20.6666670.6666671.0000000.500000
30.6666670.6666670.5000001.000000
In [205]:
par
Out [205]:
{'c1*c2': ['gen_regionMCS', 'normalMCS'],
 'c1*c2*c3': ['gen_regionMCS', 'normalMCS'],
 'c1*c3': ['gen_regionMCS', 'normalMCS', 'gen_regionVEO'],
 'c2*c3': ['gen_regionMCS', 'normalMCS', 'gen_countryVEO']}
In [293]:
d_pc=df.groupby(['id_g1'],as_index=False).mean()[['id_g1','c1_val','c2_val','c3_val','c4_val']]
d_pc['id_g1']=d_pc['id_g1'].apply(lambda x:int(x))
# Thanks to http://hinnefe2.github.io/python/tools/2015/09/21/mario-kart.html
def is_pareto_front(dataf,row, xlabel, ylabel):
    
    x = row[xlabel]
    y = row[ylabel]
    
    # look for points with the same y value but larger x value
    is_max_x = dataf.loc[dataf[ylabel]==y].max()[xlabel] <= x
    # look for points with the same x value but larger y value
    is_max_y = dataf.loc[dataf[xlabel]==x].max()[ylabel] <= y
    # look for points that are larger in both x and y
    is_double = len(dataf.loc[(dataf[xlabel]>x) & (dataf[ylabel]>y)])==0
    
    return is_max_x and is_max_y and is_double