diff --git a/src/py/analyse.py b/src/py/analyse.py
deleted file mode 100644
index 5ce096577bf18201f165eea7a6ac2cd2195d520f..0000000000000000000000000000000000000000
--- a/src/py/analyse.py
+++ /dev/null
@@ -1,330 +0,0 @@
-# https://docs.python.org/fr/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree
-#
-# Lit les fichiers xml générés par pdftotext :
-# pdftotext -bbox-layout ../tmp/viti/20180801_LOR_BSV_Viticulture_cle857461.pdf
-# et fait une sortie destinées à être lue rapidement sur un terminal,
-# avec une délimitation des blocs.
-#
-import xml.etree.ElementTree as ET
-import os
-import sys
-import re
-
-# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
-import subprocess
-
-### Parameters
-CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
-CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
-
-
-### Entering MAIN process
-
-#### Getting pdf filename as a parameter.
-if (len(sys.argv) < 1):
-    print("-U-> Usage : python analyse.py <fichier_pdf>")
-    sys.exit(-1)
-#print('Parsing %s' % sys.argv[1])
-
-basename = os.path.splitext(sys.argv[1])[0]
-
-#### Calling pdftotext command and getting its standard output
-cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
-proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-o, e = proc.communicate()
-if (proc.returncode != 0):
-    print('-S-> Command pdftotext returned an error :')
-    print('     '  + e.decode('utf8'))
-    sys.exit(-2)
-
-xml = o.decode('utf8')
-root = ET.fromstring(xml)
-
-
-
-#### Extract xml to lists and dictionaries for faster and easier access.
-#
-# Data format :
-#   flow=[{page, blocks = [{page, lines:[{height, text}]}]
-# Rq : page is redundant but for now we don't know which is the best
-#
-page_num = 0
-flow = []
-for body in root:
-    if (body.tag.endswith('body')):
-        for doc in body:
-            if (doc.tag.endswith('doc')):
-                for page in doc:
-                    if (page.tag.endswith('page')):
-                        page_num += 1
-                        for fl in page:
-                            if (fl.tag.endswith('flow')):
-                                blocks = []
-                                for bloc in fl:
-                                    if (bloc.tag.endswith('block')):
-                                        bl = {'page':page_num, 'lines':[]}
-                                        bwords = 0
-                                        bcars = 0
-                                        for line in bloc:
-                                            if (line.tag.endswith('line')):
-                                                h = float(line.get('yMax')) - float(line.get('yMin'))
-                                                li = ''
-                                                lwords = 0
-                                                last_nbcar = 0
-                                                last_h = 0
-                                                for word in line:
-                                                    if (word.tag.endswith('word')):
-                                                        hword = float(word.get('yMax')) - float(word.get('yMin'))
-                                                        if ((hword != last_h)
-                                                          and (last_nbcar < 2)):
-                                                            last_h = hword
-                                                            li = "%s%s" % (li, word.text)
-                                                        else:
-                                                            li = "%s %s" % (li, word.text)
-                                                        last_nbcar = len(word.text)
-                                                        lwords += 1
-                                                bl['lines'].append({
-                                                    'height':h,
-                                                    'text':li.strip(),
-                                                    'nb_cars': len(li.strip()),
-                                                    'nb_words':lwords})
-                                                bwords += lwords
-                                                bcars += len(li.strip())
-                                        bl['nb_words'] = bwords
-                                        bl['nb_cars'] = bcars
-                                        blocks.append(bl)
-                                flow.append({'page':page_num, 'blocks':blocks})
-
-
-
-
-#### Now, calls pdftohtml to improve font attributes
-cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
-proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-o, e = proc.communicate()
-if (proc.returncode != 0):
-    print('-S-> Command pdftohtml returned an error :')
-    print('     '  + e.decode('utf8'))
-    sys.exit(-2)
-
-xml = o.decode('utf8')
-root = ET.fromstring(xml)
-
-#### Extracts font information (id, size, family, color)
-#### and the link between lines of text and their font.
-fontspec = []
-p2x_text = []
-for page in root:
-    if (page.tag.endswith('page')):
-        pg = int(page.get('number'))
-        for tg in page:
-            if (tg.tag.endswith('fontspec')):
-                fontspec.append({
-                    'id': int(tg.get('id')),
-                    'size': int(tg.get('size')),
-                    'family': tg.get('family'),
-                    'color': tg.get('color')
-                })
-            elif (tg.tag.endswith('text')):
-                fnt = int(tg.get('font'))
-                while (tg.text is None) and (len(tg) > 0):
-                    tg = tg[0] # remove html style tags (like <b>, …)
-                if (tg.text is not None):
-                    li = "%s" % (tg.text)
-                    if (len(li.strip()) > 0):
-                        p2x_text.append({
-                            'page': pg,
-                            'font': fnt,
-                            'text': li.strip()
-                        })
-
-
-
-#### Try to find fontspec of flow's lines
-###### 1. By line recognition
-for fl in flow:
-    for bl in fl['blocks']:
-        for li in bl['lines']:
-            nocc = 0
-            fo = 0
-            for ligne in p2x_text:
-                if (ligne['text'] == li['text']) and (ligne['page'] == fl['page']):
-                    nocc += 1
-                    if (nocc == 2) and (ligne['font'] == fo):
-                        nocc = 1
-                    fo = ligne['font']
-            if (nocc == 1):
-                li['font'] = fo
-            else:
-                li['font'] = None
-###### 2. Block uniformization
-for fl in flow:
-    for bl in fl['blocks']:
-        for li in bl['lines']:
-            if (li['font'] is None):
-                h = round(li['height'])
-                fnt = None
-                for li2 in bl['lines']:
-                    if (fnt is None) \
-                        and (round(li2['height']) == h) \
-                        and (li2['font'] is not None):
-                            fnt = li2['font']
-                if (fnt is not None):
-                    li['font'] = fnt
-
-
-#### Page bottom detection
-pb = 'dummy'
-if (flow[-1]['page'] == 1): pb = None
-while (pb is not None):
-    pb = None
-    last_lines = []
-    last_read_page = flow[0]['page']
-    last_read_line = 'Foo'
-    for fl in flow:
-        if (fl['page'] != last_read_page):
-            last_lines.append(re.sub(r'[^a-zA-Z]', '', last_read_line))
-        last_read_line = fl['blocks'][-1]['lines'][-1]['text']
-        last_read_page = fl['page']
-    last_lines.append(re.sub(r'[^a-zA-Z]', '', last_read_line))
-
-    ### Is last_lines filled with the same string ?
-    pb = last_lines[0]
-    for li in last_lines[1:]:
-        if (pb is not None) and (pb != li): pb = None
-
-    ### Yes, so mark these lines to be removed
-    if (pb is not None):
-        print("#####> %s" % pb)
-        last_read_page = flow[0]['page']
-        last_read_flow = flow[0]
-        for fl in flow[1:]:
-            if (fl['page'] != last_read_page):
-                print('  xxx> %s' % last_read_flow['blocks'][-1]['lines'][-1]['text'])
-                del last_read_flow['blocks'][-1]['lines'][-1]
-                if not last_read_flow['blocks'][-1]['lines']: # ie it's empty
-                    del last_read_flow['blocks'][-1]
-                    #print('****> %d' % len(last_read_flow['blocks']))
-                    #if last_read_page == 1: print('  **> %s' % flow)
-                    if not last_read_flow['blocks']:
-                        flow.remove(last_read_flow)
-                    #if last_read_page == 1: print('  ··> %s' % flow)
-            last_read_flow = fl
-            last_read_page = fl['page']
-        print('  xxx> %s' % last_read_flow['blocks'][-1]['lines'][-1]['text'])
-        del last_read_flow['blocks'][-1]['lines'][-1]
-        if not last_read_flow['blocks'][-1]['lines']: # ie it's empty
-            del last_read_flow['blocks'][-1]
-            #print('****> %d' % len(last_read_flow['blocks']))
-            if not last_read_flow['blocks']:
-                flow.remove(last_read_flow)
-
-
-
-
-#### Calcultate some stats
-font_sizes = {}
-pipe = []
-bl_num = 0
-fl_num = 0
-for fl in flow:
-    for bl in fl['blocks']:
-        for li in bl['lines']:
-            h = round(li['height'])
-            if (pipe == []):
-                pipe.append(h)
-            else:
-                #if (h != pipe[-1]):
-                    pipe.append(h)
-            if (font_sizes.get(h) is None):
-                font_sizes[h] = {'nb_lines':1, 'nb_cars':li['nb_cars'],
-                    'nb_words':li['nb_words'], 'blocks':[bl_num],
-                    'flows':[fl_num]}
-            else:
-                font_sizes[h]['nb_lines'] += 1
-                font_sizes[h]['nb_cars'] += li['nb_cars']
-                font_sizes[h]['nb_words'] += li['nb_words']
-                if (font_sizes[h]['blocks'][-1] != bl_num):
-                    font_sizes[h]['blocks'].append(bl_num)
-                if (font_sizes[h]['flows'][-1] != fl_num):
-                    font_sizes[h]['flows'].append(fl_num)
-        bl_num += 1
-    fl_num += 1
-
-
-#### Choose "normal" fontsize.
-print('')
-normal_font = 0
-nf_nword = 0
-for ft in sorted(font_sizes.keys()):
-    f = font_sizes.get(ft)
-    if (f['nb_words'] > nf_nword):
-        normal_font = ft
-        nf_nword = f['nb_words']
-
-
-
-
-
-#### Prints p2x_text content
-for fnt in fontspec:
-    print(fnt)
-#print('=============================================>')
-#for li in p2x_text:
-#    print('[p. %d][%d] %s' % (li['page'], li['font'], li['text']))
-print('<=============================================')
-print('')
-
-
-#### Prints font stats
-print('')
-for ft in sorted(font_sizes.keys()):
-    f = font_sizes.get(ft)
-    print("[%d] ====> %d flows, %d blocks, %d lines, %d words, %d cars" %(ft,
-        len(f['flows']), len(f['blocks']),
-        f['nb_lines'], f['nb_words'], f['nb_cars']))
-print('')
-print('---> %d' % normal_font)
-print('')
-print('=============================================')
-
-
-#### Prints flow content, but just for normal and bigger than normal text.
-#for fl in flow:
-#    print('')
-#    print('[p. %d]' % fl['page'])
-#    for bl in fl['blocks']:
-#        h = 0.0;
-#        for li in bl['lines']: h += li['height']
-#        n = len(bl['lines'])
-#        print('    ---------------------------------------> [%d]' % round(h/n))
-#        for li in bl['lines']:
-#            if (li['font'] is not None):
-#                print('    [%2d] (%d) %s' % (li['font'], round(li['height']), li['text']))
-#            else:
-#                print('         (%d) %s' % (round(li['height']), li['text']))
-#    print('    <---------------------------------------')
-
-BIG_ONLY = True
-print('')
-nb_blocks = 0
-for fl in flow:
-    if (nb_blocks > 0):
-        print('')
-    #print('[p. %d]' % fl['page'])
-    nb_blocks = 0
-    for bl in fl['blocks']:
-        h = 0.0;
-        for li in bl['lines']: h += li['height']
-        n = len(bl['lines'])
-        h = round(h/n)
-        if (h >= normal_font) or not BIG_ONLY:
-            nb_blocks += 1
-            print('    <---------------------------------------- (p. %d)' % bl['page'])
-            for li in bl['lines']:
-                if (li['font'] is not None):
-                    print('    [%2d] (%d) %s' % (li['font'], round(li['height']), li['text']))
-                else:
-                    print('         (%d) %s' % (round(li['height']), li['text']))
-    if (nb_blocks > 0):
-        print('    ---------------------------------------->')
diff --git a/src/py/p2b.py b/src/py/p2b.py
deleted file mode 100644
index 0b80c61742793d08f3a3171fa0eb02dcf133587f..0000000000000000000000000000000000000000
--- a/src/py/p2b.py
+++ /dev/null
@@ -1,649 +0,0 @@
-import xml.etree.ElementTree as ET
-import os
-import sys
-import re
-
-# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
-import subprocess
-
-from p2b_utils import levenshtein
-
-### Script pour faire tout le corpus :
-# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done
-
-
-CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
-CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
-
-LEFT_THRESHOLD = 25 # In p2b_text_utils.add_lines() : the max horizontal space
-                    # to consider aligned items to be on the same line.
-
-FLAG_NONE = 0x0000
-SMALL_FONT = 0x0001
-# BIG_FONT = 0x0002 -> Unused
-PAGE_BOTTOM = 0x0004
-MANY_FONTS = 0x0010
-IS_BULLET = 0x0020
-DEFAULT_FONT_SIZE = 0x0040
-TITLE_SMALLER_THAN_SUBTITLE = 0x0080
-
-
-TITLE_MAX_LINES = 2
-
-TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never
-                  # used for more than TITLE_MIN_CHAR characters per line
-                  # is a kind of text styling and will take the next line's font
-
-SIMILARITY_THRESHOLD = 1.0
-
-# Celle là est un peu compliquée : Pour détecter la structure, on compte
-# le nombre de successions d'un changement de police de caractères vers
-# un autre (ex : la fonte 3 succède *2* fois à la fonte 8).
-# Si ce nombre est trop peu élevé (<= NB_SUCCESSION_FOR_SAME) alors
-# on considère que 8 n'est pas un titre de 3, et qu'ils sont au même niveau.
-# Sinon, on considère que 8 est un niveau au-dessus dans la hiérarchie des
-# titres, sous-titres, …
-NB_SUCCESSION_FOR_SAME = 0
-
-# Regex
-INDICES_EXPOSANTS_USUELS = [
-  'er|ère|ere', # 1er, 1ère, …
-  'nde?', # 2nd
-  'i?[eè]me', # 3ème, 4ieme, …
-  '°',
-]
-
-
-# +--------------------------------------------------------------+
-# |                       get_pdftotext                          |
-# +--------------------------------------------------------------+
-def get_pdftotext(filename):
-  # Calls pdftotext and retreive standard output in a string (o)
-  basename = os.path.splitext(filename)[0]
-  cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
-  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-  o, e = proc.communicate()
-  if (proc.returncode != 0):
-    print('-S-> Command pdftotext returned an error :')
-    print('     '  + e.decode('utf8'))
-    return []
-
-  # Parse xml code and create block table.
-  xml = o.decode('utf8')
-  root = ET.fromstring(xml)
-
-  page_num = 0
-  flow_num = 0
-  blocks = []
-  for body in root:
-    if (body.tag.endswith('body')):
-      for doc in body:
-        if (doc.tag.endswith('doc')):
-          for page in doc:
-            if (page.tag.endswith('page')):
-              page_num += 1
-              for fl in page:
-                if (fl.tag.endswith('flow')):
-                  flow_num += 1
-                  for bloc in fl:
-                    if (bloc.tag.endswith('block')):
-                      bl = {'page': page_num, 'flow': flow_num, 'lines': [],
-                            'flags': FLAG_NONE,
-                            'x_min': float(bloc.get('xMin')),
-                            'x_max': float(bloc.get('xMax')),
-                            'y_min': float(bloc.get('yMin')),
-                            'y_max': float(bloc.get('yMax')),
-                            }
-                      for line in bloc:
-                        if (line.tag.endswith('line')):
-                          h = float(line.get('yMax')) - float(line.get('yMin'))
-                          li = { 'text': '', 'height': h, 'words': [],
-                            'flags': FLAG_NONE,
-                            'x_min': float(bloc.get('xMin')),
-                            'x_max': float(bloc.get('xMax')),
-                            'y_min': float(bloc.get('yMin')),
-                            'y_max': float(bloc.get('yMax')),
-                          }
-                          last_nbcar = 0
-                          last_h = 0
-                          for word in line:
-                            if (word.tag.endswith('word')):
-                              hword = float(word.get('yMax')) - float(word.get('yMin'))
-                              li['words'].append({'height': hword, 'text': word.text})
-                              if ((hword != last_h) and (last_nbcar < 2)):
-                                  # This is to avoid separation of one big capital
-                                  # letter at the beginin of a title or paragraph.
-                                  last_h = hword
-                                  if len(re.sub(r'\W','', li['text'])) == 0:
-                                    li['text'] = "%s %s" % (li['text'], word.text)
-                                  else:
-                                    li['text'] = "%s%s" % (li['text'], word.text)
-                              else:
-                                  li['text'] = "%s %s" % (li['text'], word.text)
-                              li['text'] = li['text'].strip()
-                              last_nbcar = len(word.text)
-                          bl['lines'].append(li)
-                      blocks.append(bl)
-  return blocks
-
-
-# +--------------------------------------------------------------+
-# |                       get_pdftohtml                          |
-# +--------------------------------------------------------------+
-def get_pdftohtml(filename):
-  basename = os.path.splitext(filename)[0]
-  cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
-  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-  o, e = proc.communicate()
-  if (proc.returncode != 0):
-    print('-S-> Command pdftohtml returned an error :')
-    print('     '  + e.decode('utf8'))
-    return None
-
-  # Parse xml code and create block table.
-  xml = o.decode('utf8')
-  root = ET.fromstring(xml)
-
-  fontspec = []
-  segments = []
-  for page in root:
-    if (page.tag.endswith('page')):
-        pg = int(page.get('number'))
-        for tg in page:
-            if (tg.tag.endswith('fontspec')):
-                fontspec.append({
-                    'id': int(tg.get('id')),
-                    'size': int(tg.get('size')),
-                    'family': tg.get('family'),
-                    'color': tg.get('color'),
-                    'nb_cars': 0
-                })
-            elif (tg.tag.endswith('text')):
-                fnt = int(tg.get('font'))
-                top = int(tg.get('top'))
-                left = int(tg.get('left'))
-                width = int(tg.get('width'))
-                height = int(tg.get('height'))
-                while (tg.text is None) and (len(tg) > 0):
-                    tg = tg[0] # remove html style tags (like <b>, …)
-                if (tg.text is not None):
-                    li = "%s" % (tg.text)
-                    if (len(li.strip()) > 0):
-                        segments.append({'page': pg, 'font': fnt,
-                            'top': top, 'left': left,
-                            'width': width, 'height': height,
-                            'text': li.strip()
-                        })
-                        # Find font in fontspec
-                        for font in fontspec:
-                            if font['id'] == fnt: break
-                        font['nb_cars'] += len(li.strip())
-  return { 'fonts': fontspec, 'segments': segments }
-
-
-# +--------------------------------------------------------------+
-# |                   get_default_font_size                      |
-# +--------------------------------------------------------------+
-def get_default_font_size(fontspec):
-  sizes = {}
-  max_cars = 0
-  size_max_cars = 42 # Doesn't matter : it'll change
-  for f in fontspec:
-      if sizes.get(f['size']) is None:
-          sizes[f['size']] = f['nb_cars']
-      else:
-          sizes[f['size']] += f['nb_cars']
-      if sizes[f['size']] > max_cars:
-          max_cars = sizes[f['size']]
-          size_max_cars = f['size']
-  return size_max_cars
-
-
-# +--------------------------------------------------------------+
-# |                      mark_small_fonts                        |
-# +--------------------------------------------------------------+
-# RQ : Also marks bullet lines
-def mark_small_fonts(blocks, default_font_size):
-    for b in blocks:
-        for l in b['lines']:
-            if (round(l['height']) < default_font_size):
-                l['flags'] |= SMALL_FONT
-            if len(re.sub(r'\W','', l['text'])) == 0:
-                l['flags'] |= IS_BULLET
-
-
-# +--------------------------------------------------------------+
-# |                      mark_page_bottom                        |
-# +--------------------------------------------------------------+
-def mark_page_bottom(blocks):
-    if (blocks[-1]['page'] == 1): return
-
-    # Find indexes of last blocks in pages
-    bndx = []
-    for i in range(0, len(blocks) - 1):
-        if (blocks[i]['page'] != blocks[i+1]['page']):
-            bndx.append(i)
-    bndx.append(len(blocks)-1)
-
-    # Get last line indexes
-    lndx = []
-    for i in bndx:
-        lndx.append(len(blocks[i]['lines'])-1)
-
-    # Loop while finding always same characters in last lines
-    end = False
-    while not end:
-        txt = None
-        # Test if last lines characters are the same
-        for i,j in zip(bndx, lndx):
-            li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text'])
-            if txt is None: txt = li
-            else: end = (txt != li)
-        # All last line are the same, so mark them
-        if not end:
-            for i in range(0, len(bndx)):
-                blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= PAGE_BOTTOM
-                lndx[i] -= 1
-                if (lndx[i] < 0):
-                    #-# blocks[bndx[i]]['flags'] |= PAGE_BOTTOM
-                    bndx[i] -= 1
-                    lndx[i] = len(blocks[bndx[i]]['lines']) - 1
-                    end = bndx[i] < 0
-
-# +--------------------------------------------------------------+
-# |                         is_ind_exp                           |
-# +--------------------------------------------------------------+
-# Is it an indice or exposant ?
-def is_ind_exp(str):
-  for ie in INDICES_EXPOSANTS_USUELS:
-      if re.match(ie, str):
-          return True
-  return False
-
-# +--------------------------------------------------------------+
-# |                         get_lines                            |
-# +--------------------------------------------------------------+
-# Extract lines from 'text' attribute returned by get_pdftohtml and associates
-# a font id (and the page number), which is the font used by the higher number
-# of characters of the line.
-# Does a column splitting considering the value of LEFT_THRESHOLD
-def get_lines(segments, fontspec):
-    last_top = -1
-    line_no = -1
-    last_right = 0
-    for txt in segments:
-        if (txt['top'] == last_top) and ((txt['left'] - last_right) <= LEFT_THRESHOLD):
-            txt['line'] = line_no
-        elif is_ind_exp(txt['text'].strip()):
-            txt['line'] = line_no
-        else:
-            line_no += 1
-            txt['line'] = line_no
-            last_top = txt['top']
-        last_right = txt['left'] + txt['width']
-
-    for f in fontspec:
-        if 'same_line' not in f:
-            f['same_line'] = []
-
-    lines = []
-    last_line = -2
-    li = ''
-    fnt = {}
-    page_num = segments[0]['page']
-    for txt in segments:
-        if (txt['line'] != last_line) or (txt == segments[-1]):
-            if (len(li.strip()) > 0):
-                fnt_no = -1; max_car = 0;
-                for f in fnt.keys():
-                    if (fnt[f] > max_car):
-                        max_car = fnt[f]
-                        fnt_no = f
-                lines.append({ 'text': li.strip(),
-                    'most_used_font': fnt_no,
-                    'nb_fonts': len(fnt),
-                    'page': page_num})
-            li = txt['text'].strip()
-            last_line = txt['line']
-            for fi1 in fnt.keys():
-                for fi2 in fnt.keys():
-                    if fi1 != fi2:
-                        f1 = next(it for it in fontspec if it['id'] == int(fi1))
-                        f2 = next(it for it in fontspec if it['id'] == int(fi2))
-                        if (f2['id'] not in f1['same_line']):
-                            f1['same_line'].append(f2['id'])
-                            f2['same_line'].append(f1['id'])
-            fnt = {}
-            fnt[txt['font']] = len(li.strip())
-        else:
-            if (is_ind_exp(txt['text'])):
-                li = "%s%s" % (li, txt['text'].strip())
-            else:
-                li = "%s %s" % (li, txt['text'].strip())
-            if (fnt.get(txt['font']) is None):
-                fnt[txt['font']] = len(txt['text'].strip())
-            else:
-                fnt[txt['font']] += len(txt['text'].strip())
-        page_num = txt['page']
-    return lines
-
-# +--------------------------------------------------------------+
-# |                        guess_fonts                           |
-# +--------------------------------------------------------------+
-# Tries to guess fontspec of each line into blocks list.
-# It calculates the levenshtein distance with every segment of the same page
-# and assigns the best matching score's font.
-def guess_fonts(blocks, segments, fontspec):
-    lines = get_lines(segments, fontspec)
-    ndx_lines = [0,] # Indexation des indices de line par numéro de page
-    for ndx in range(1, len(lines)):
-        if (lines[ndx-1]['page'] != lines[ndx]['page']):
-            ndx_lines.append(ndx)
-    ndx_lines.append(len(lines))
-
-    for f in fontspec:
-        f['nb_lines'] = 0
-        f['dist_sum'] = 0
-        #f['block_pos_sum'] = 0
-
-    for bl in blocks:
-        for l in bl['lines']:
-            if (len(l['text']) > 0):
-                min_dist = len(l['text'])
-                min_score = 1.0
-                font_sel = -1
-                line_no = -1
-                for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]):
-                    if (len(lines[i]['text']) > 0):
-                        d = levenshtein(l['text'], lines[i]['text'])
-                        if (d == 0):
-                            min_dist = 0
-                            min_score = 0.0
-                            font_sel = lines[i]['most_used_font']
-                            line_no = i
-                            break;
-                        score = float(d) / float(max(len(l['text']), len(lines[i]['text'])))
-                        if (score <= SIMILARITY_THRESHOLD):
-                            if (d < min_dist):
-                                min_dist = d
-                                min_score = score
-                                font_sel = lines[i]['most_used_font']
-                                line_no = i
-                l['font'] = font_sel
-                if (font_sel >= 0):
-                  fnt = next(it for it in fontspec if it['id'] == font_sel)
-                  fnt['nb_lines'] +=1
-                  fnt['dist_sum'] += min_dist
-                l['score'] = min_score # For debuggin purpose
-                l['dist'] = min_dist   #    idem.
-                l['line_no'] = line_no # idem. Stores the "similar line" number
-                # print("> %s" % l['text'])
-                # print("  %s" % lines[line_no]['text'])
-                # print("  [%d]" % font_sel)
-                # print("")
-                if (lines[line_no]['nb_fonts'] > 1):
-                    l['flags'] |= MANY_FONTS
-
-# +--------------------------------------------------------------+
-# |                    replace_block_fonts                       |
-# +--------------------------------------------------------------+
-# Adds a 'short_font' attribute to lines which gives another font value which
-# doesn't care about style (bold, …).
-# RK: def_size is default_font_size, used to mark SMALL_FONT flag.
-def replace_block_fonts(blocks, fontspec, def_size):
-    for i in range(0, len(fontspec) - 1):
-        for j in range(i+1, len(fontspec)):
-          if (fontspec[j].get('replaceWith') is None):
-            if (fontspec[j]['id'] in fontspec[i]['same_line']):
-                if fontspec[i].get('replaceWith') is None:
-                    fontspec[j]['replaceWith'] = fontspec[i]['id']
-                else:
-                    fontspec[j]['replaceWith'] = fontspec[i]['replaceWith']
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['font'] < 0):
-                f = None
-            else:
-                f = next(it for it in fontspec if it['id'] == l['font'])
-            if (f is None) or (f.get('replaceWith') is None):
-                l['short_font'] = l['font']
-            else:
-                l['short_font'] = f.get('replaceWith')
-            if (f is not None):
-                f = next(it for it in fontspec if it['id'] == l['short_font'])
-                if (f['size'] < def_size):
-                    l['flags'] |= SMALL_FONT
-                if (f['size'] == def_size):
-                    l['flags'] |= DEFAULT_FONT_SIZE
-
-
-# +--------------------------------------------------------------+
-# |                      guess_structure                         |
-# +--------------------------------------------------------------+
-def guess_structure(blocks, fontspec,
-  remove_flags = SMALL_FONT | PAGE_BOTTOM | IS_BULLET):
-    t = [] # A list used here and there
-    n = [] # Another one
-
-    # Search for the most used font
-    # Here, t will be used to count the number of cars of each font.
-    #   and n will be used to store the maximum line size for each font.
-    for i in range(len(fontspec)):
-        t.append(0)
-        n.append(0)
-    nb_max = -1
-    ndx_most_used = -1
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE):
-                lon = len(l['text'].strip())
-                t[l['short_font']] += lon
-                if lon > n[l['short_font']]: n[l['short_font']] = lon
-                if (t[l['short_font']] > nb_max):
-                    nb_max = t[l['short_font']]
-                    ndx_most_used = l['short_font']
-    b = [nb <= TITLE_MIN_CHAR for nb in n]
-
-    ### ndx_most_used is the most used font number.
-    ### b[font_number] is True if the font seems used for bullets.
-
-    t = [] # We'll use it to list the fonts succession
-    n = [] # Used to count the number of lines
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if t == []:
-                    t.append(l['short_font'])
-                    n.append(1)
-                else:
-                    if (t[-1] != l['short_font']):
-                        t.append(l['short_font'])
-                        n.append(1)
-                    else:
-                        n[-1] += 1
-
-    f = {} # Will contain used font numbers and number of occurences in t
-    for i,j in zip(t,n):
-        if i not in f.keys():
-            f[i] = {'nb': 1, 'nl':j, 'maxl': j,
-                    'is_bullet': b[i], 'flags': FLAG_NONE}
-        else:
-            f[i]['nb'] += 1
-            f[i]['nl'] += j
-            if (j > f[i]['maxl']):
-                f[i]['maxl'] = j
-
-    for i in f.keys():
-        f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES)
-
-    # Replace short_font for lines considered as bullets (or text styling).
-    last_bullet_lines = []
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if f[l['short_font']]['is_bullet']:
-                    last_bullet_lines.append(l)
-                else:
-                    if (len(last_bullet_lines) > 0):
-                        for last in last_bullet_lines:
-                            last['short_font'] = l['short_font']
-                        last_bullet_lines = []
-    if (len(last_bullet_lines) > 0):
-        for last in last_bullet_lines:
-            last['short_font'] = ndx_most_used
-
-    # n and b won't be used anymore I think. So they're free
-
-    # Rebuild the font succession list (is not optimized but is the safest)
-    t = []
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if t == []: t.append(l['short_font'])
-                else:
-                    if (t[-1] != l['short_font']):
-                        t.append(l['short_font'])
-
-    b = [] # We'll do a 2d table with b[i][j] = number of transitions
-           # from fonti to fontj (will be a tree of font transitions)
-    for i in range(len(fontspec)+1): # Consider len+1 to have font number -1
-        b.append([0 for j in range(len(fontspec)+1)])
-    for i in range(len(t)-1):
-        j = i+1
-        if not f[t[i]]['isnt_title']:
-            b[t[i]][t[j]] += 1
-
-    # Should we do this : ? &&&&&&&&&&&&&&&&&&&& A tester
-    # Un moyen de s'assurer que tout sera parcouru...
-    # Signifie qu'on ne finit pas sur un titre.
-    f[t[-1]]['isnt_title'] = True
-
-    # Create a deep attribute in f which contains distance from leaves
-    for k,v in f.items():
-        if v['isnt_title']:
-            v['deep'] = 0
-            v['nb_transitions'] = 999999999
-        else: v['deep'] = None
-
-    # Algo : dans le tableau b, on parcourt les colonnes (j) pour les fontes qui ont un deep.
-    #        Un indice (i) de ligne pour lequel la valeur b[i][j] est non nulle signifie
-    #        que la fonte i
-    #        précède la fonte j b[i][j] fois.
-    # Rq : Une colonne vide pour un indice dont la ligne est non-vide est une racine
-    #      Une ligne vide pour un indice dont la colonne est non-vide est une feuille
-    # On répète tant qu'on change des valeurs (c'est pas optimisé mais crotte,
-    # le tableau n'est pas si grand)
-    has_changed = True
-    deep_max = 0
-    while has_changed:
-        has_changed = False
-        for k,v in f.items():
-            if v['deep'] is not None:
-                for i in range(-1,len(b)-1):
-                    if b[i][k] != 0:
-                        if f[i]['deep'] is None:
-                            if (b[i][k] <= NB_SUCCESSION_FOR_SAME):
-                                f[i]['deep'] = v['deep']
-                                f[i]['nb_transitions'] = b[i][k]
-                            else:
-                                f[i]['deep'] = v['deep'] + 1
-                                f[i]['nb_transitions'] = b[i][k]
-                            if f[i]['deep'] > deep_max:
-                                deep_max = f[i]['deep']
-                            has_changed = True
-                            if (fontspec[i]['size'] < fontspec[k]['size']):
-                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
-                        elif f[i]['nb_transitions'] < b[i][k]:
-                            f[i]['deep'] = v['deep'] + 1
-                            f[i]['nb_transitions'] = b[i][k]
-                            has_changed = True
-                            if (fontspec[i]['size'] < fontspec[k]['size']):
-                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
-
-    # Reverse deepness value, to make it distance from root
-    for v in f.values():
-        if (v['deep'] is not None):
-            v['deep'] = deep_max - v['deep']
-
-    # Add deep in blocks lines
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                l['deep'] = f[l['short_font']]['deep']
-                if ((f[l['short_font']]['flags']) & TITLE_SMALLER_THAN_SUBTITLE != 0):
-                   l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
-            else:
-                l['deep'] = deep_max
-
-
-
-
-
-
-
-# +--------------------------------------------------------------+
-# |                      print_block_list                        |
-# +--------------------------------------------------------------+
-def print_block_list(t, remove_flags = FLAG_NONE):
-    last_page = -1
-    deep_max = -1
-    for bl in t:
-        for l in bl['lines']:
-            if (l.get('deep') is not None):
-                if deep_max < l['deep']: deep_max = l['deep']
-    if deep_max > 10: deep_max = 10
-    ttl = "#############"
-    last_deep = -1
-
-    for block in t:
-        if (block['page'] != last_page):
-            if (last_page > 0):
-                print("")
-            last_page = block['page']
-            print("________________________________")
-            print("*page %d*" % last_page)
-
-        print("")
-
-        for l in block['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                pre = ''
-                post = '  '
-                if (l.get('deep') is None):
-                    pre = '!! '
-                    last_deep = -1
-                else:
-                    if (l['flags'] & SMALL_FONT) != 0:
-                        pre = "> %s" % pre
-                    #if (len(l['text']) > 20) and \
-                    #   len(re.sub(r'\w','', l['text']).strip()) > 5:
-                    #    post = "%s  " % post
-                    if (l['flags'] & TITLE_SMALLER_THAN_SUBTITLE) != 0 and \
-                       (l['flags'] & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0:
-                        pre = "%s**" % (pre)
-                        post = "**%s" % post
-                    elif l['deep'] < deep_max:
-                        pre = "%s%s " % (pre, ttl[0:(l['deep']+1)])
-                    last_deep = l['deep']
-                print("%s%s%s" % (pre, l['text'], post))
-
-
-# +--------------------------------------------------------------+
-# |                           main                               |
-# +--------------------------------------------------------------+
-if (len(sys.argv) < 1):
-    print("-U-> Usage : python pdf2blocks.py <fichier_pdf>")
-    sys.exit(-1)
-
-blocks = get_pdftotext(sys.argv[1])
-p2h = get_pdftohtml(sys.argv[1])
-fontspec = p2h['fonts']
-segments = p2h['segments']
-
-default_font_size = get_default_font_size(fontspec)
-# mark_small_fonts(blocks, default_font_size)
-mark_page_bottom(blocks)
-guess_fonts(blocks, segments, fontspec)
-replace_block_fonts(blocks, fontspec, default_font_size)
-guess_structure(blocks, fontspec)
-print_block_list(blocks, PAGE_BOTTOM | IS_BULLET)
diff --git a/src/py/p2b_analyse.py b/src/py/p2b_analyse.py
deleted file mode 100644
index 4339529817dab29ed543a7ff2e6f9a07db5c779e..0000000000000000000000000000000000000000
--- a/src/py/p2b_analyse.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Quelques fonctions pour analyser les retours des fonctions
-# get_pdftotext et get_pdftohtml
-
-
-def get_pdftotext_fontstats(blocks):
-    # TODO: Changer en comptage mot par mot.
-  font_sizes = {}
-  pipe = []
-  bl_num = 0
-  fl_num = 0
-  for bl in blocks:
-        for li in bl['lines']:
-            h = round(li['height']) ##### IMPORTANT : on arrondit.
-            if (font_sizes.get(h) is None):
-                font_sizes[h] = {'nb_lines':len(bl['lines']),
-                    'nb_cars':bl['nb_cars'], 'nb_words':bl['nb_words'],
-                    'nb_blocks': 1}
-            else:
-                font_sizes[h]['nb_lines'] += len(bl['lines'])
-                font_sizes[h]['nb_cars'] += bl['nb_cars']
-                #font_sizes[h]['nb_words'] += li['nb_words']
-                font_sizes[h]['nb_blocks'] += 1
-  return font_sizes
-
-def get_pdftotext_normal_fontsize(font_sizes):
-  normal_font = 0
-  nf_nword = 0
-  for ft in sorted(font_sizes.keys()):
-    f = font_sizes.get(ft)
-#    if (f['nb_words'] > nf_nword):
-#        normal_font = ft
-#        nf_nword = f['nb_words']
-    if (f['nb_cars'] > nf_nword):
-        normal_font = ft
-        nf_nword = f['nb_cars']
-  return normal_font
diff --git a/src/py/p2b_blocks_utils.py b/src/py/p2b_blocks_utils.py
deleted file mode 100644
index 0c60f624cbcf15d01a62df94ebc39602b61201be..0000000000000000000000000000000000000000
--- a/src/py/p2b_blocks_utils.py
+++ /dev/null
@@ -1,371 +0,0 @@
-import re
-from p2b_config import *
-from p2b_utils import levenshtein
-from p2b_text_utils import get_lines
-
-
-
-def print_block_list(t, remove_flags = FLAG_NONE):
-    last_page = -1
-    last_flow = -1
-    write_flow = False
-
-    deep_max = -1
-    for bl in t:
-        for l in bl['lines']:
-            if (l.get('deep') is not None):
-                if deep_max < l['deep']: deep_max = l['deep']
-    if deep_max > 10: deep_max = 10
-    ttl = "###########"
-    last_deep = -1
-
-    for block in t:
-        if (block['page'] != last_page):
-            if (last_page > 0):
-                print("")
-            last_page = block['page']
-            print("________________________________")
-            print("*page %d*" % last_page)
-
-        if (block['flow'] != last_flow):
-         write_flow = True
-         last_flow = block['flow']
-
-        print("")
-
-        for l in block['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if write_flow:
-                    #print("+----------------------------------------------------------------------------+")
-                    write_flow = False
-                pre = ''
-                if (l.get('deep') is None):
-                    pre = '!! '
-                    last_deep = -1
-                else:
-                    if l['deep'] < deep_max:
-                        pre = "%s " % ttl[0:(l['deep']+1)]
-                        #if (l['deep'] <= 3) and (l['deep'] != last_deep):
-                        #    print("")
-                    last_deep = l['deep']
-                print("%s%s" % (pre, l['text']))
-
-
-def print_block_list_old(t, remove_flags = FLAG_NONE):
-    last_page = -1
-    last_flow = -1
-    for block in t:
-        if (block['page'] != last_page):
-            if (last_page > 0):
-                print("")
-            last_page = block['page']
-            print("== [page %d] ==================================================================" % last_page)
-        if (block['flow'] != last_flow):
-            print("+----------------------------------------------------------------------------+")
-            last_flow = block['flow']
-
-        flags = ''
-        if (block['flags'] == FLAG_NONE): flags = '      '
-        else: flags = "<%x>" % block['flags']
-
-        #print("| → %s w:%d c:%d, [(%.2f, %.2f) ; (%.2f, %.2f)]" % (flags,
-        #    block['nb_words'], block['nb_cars'],
-        #    block['x_min'], block['y_min'], block['x_max'], block['y_max']))
-        for l in block['lines']:
-            if (l['flags'] & remove_flags != 0):
-                break
-            lflags = ' '
-            if (l['flags'] == FLAG_NONE):
-                lflags = '---→'
-            else:
-                lflags = "(%2x)" % l['flags']
-            mf_flag = '-' if ((l.get('flags') & FLAG_MANY_FONTS) == 0) else '*'
-
-            if l.get('font') is None:
-                font = "[ - %2d - ]" % round(l['height'])
-            else:
-                if l.get('short_font') is None:
-                    font = "[%2d %s %.2f]" %(l.get('font'), mf_flag, l['score'])
-                else:
-                    font = "%s%2d%s" % (mf_flag, l.get('short_font'), mf_flag)
-
-            print("    %s %s %s" % (lflags, font, l['text']))
-
-        #print("")
-
-
-def mark_blocks_fontsize(blocks, normal_size):
-    for b in blocks:
-        if (round(b['h_max']) < normal_size):
-            b['flags'] |= FLAG_SMALL_FONT
-        if (round(b['h_min']) > normal_size):
-            b['flags'] |= FLAG_BIG_FONT
-        for l in b['lines']:
-            if (round(l['height']) < normal_size):
-                l['flags'] |= FLAG_SMALL_FONT
-            elif (round(l['height']) > normal_size):
-                l['flags'] |= FLAG_BIG_FONT
-
-
-def mark_bullet_lines(blocks):
-    for b in blocks:
-        for l in b['lines']:
-            if len(re.sub(r'\W','', l['text'])) == 0:
-                l['flags'] |= FLAG_BULLET
-
-
-def mark_page_bottom(blocks):
-    if (blocks[-1]['page'] == 1): return
-
-    # Find indexes of last blocks in pages
-    bndx = []
-    for i in range(0, len(blocks) - 1):
-        if (blocks[i]['page'] != blocks[i+1]['page']):
-            bndx.append(i)
-    bndx.append(len(blocks)-1)
-
-    # Get last line indexes
-    lndx = []
-    for i in bndx:
-        lndx.append(len(blocks[i]['lines'])-1)
-
-    # Loop while finding always same characters in those lines
-    end = False
-    while not end:
-        txt = None
-        # Test if last lines characters are the same
-        for i,j in zip(bndx, lndx):
-            li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text'])
-            if txt is None: txt = li
-            else: end = (txt != li)
-        # All last line are the same, so mark them
-        if not end:
-            for i in range(0, len(bndx)):
-                blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= FLAG_PAGE_BOTTOM
-                lndx[i] -= 1
-                if (lndx[i] < 0):
-                    blocks[bndx[i]]['flags'] |= FLAG_PAGE_BOTTOM
-                    bndx[i] -= 1
-                    lndx[i] = len(blocks[bndx[i]]['lines']) - 1
-                    end = bndx[i] < 0
-
-
-# Tries to guess fontspec of each line of text.
-# It calculates the levenshtein distance with every line of the same page.
-# If the result divided by the longuest string length is lower than
-# SIMILARITY_THRESHOLD (defined in p2b_config), then it's used to calculate
-# the best matching string. Of course, if a distance of 0 is found,
-# it's considered to be the line we're looking for.
-def guess_fonts(blocks, text, fontspec):
-    lines = get_lines(text, fontspec)
-    ndx_lines = [0,] # Indexation des indices de line par numéro de page
-    for ndx in range(1, len(lines)):
-        if (lines[ndx-1]['page'] != lines[ndx]['page']):
-            ndx_lines.append(ndx)
-    ndx_lines.append(len(lines))
-
-    for f in fontspec:
-        f['nb_lines'] = 0
-        f['dist_sum'] = 0
-        #f['block_pos_sum'] = 0
-
-    for bl in blocks:
-        for l in bl['lines']:
-            if (len(l['text']) > 0):
-                min_dist = len(l['text'])
-                min_score = 1.0
-                font_sel = -1
-                line_no = -1
-                for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]):
-                    if (len(lines[i]['text']) > 0):
-                        d = levenshtein(l['text'], lines[i]['text'])
-                        if (d == 0):
-                            min_dist = 0
-                            min_score = 0.0
-                            font_sel = lines[i]['most_used_font']
-                            line_no = i
-                            break;
-                        score = float(d) / float(max(len(l['text']), len(lines[i]['text'])))
-                        if (score <= SIMILARITY_THRESHOLD):
-                            if (d < min_dist):
-                                min_dist = d
-                                min_score = score
-                                font_sel = lines[i]['most_used_font']
-                                line_no = i
-                l['font'] = font_sel
-                if (font_sel >= 0):
-                  fnt = next(it for it in fontspec if it['id'] == font_sel)
-                  fnt['nb_lines'] +=1
-                  fnt['dist_sum'] += min_dist
-                l['score'] = min_score # For debuggin purpose
-                l['dist'] = min_dist   #    idem.
-                l['line_no'] = line_no # idem. Stores the "similar line" number
-                if (lines[line_no]['nb_fonts'] > 1):
-                    l['flags'] |= FLAG_MANY_FONTS
-
-
-# Adds a 'short_font' attribute to lines which gives another font value which
-# doesn't care about style (bold, …).
-def replace_block_fonts(blocks, fonts):
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['font'] < 0) or (fonts[l['font']].get('replaceWith') is None):
-                l['short_font'] = l['font']
-            else:
-                l['short_font'] = fonts[l['font']].get('replaceWith')
-
-
-
-def guess_structure(blocks, fonts,
-  remove_flags = FLAG_SMALL_FONT | FLAG_PAGE_BOTTOM | FLAG_BULLET):
-    t = [] # A list used here and there
-    n = [] # Another one
-
-    # Search for the most used font
-    # Here, t will be used to count the number of cars of each font.
-    #   and n will be used to store the maximum line size for each font.
-    for i in range(len(fonts)):
-        t.append(0)
-        n.append(0)
-    nb_max = -1
-    ndx_most_used = -1
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE):
-                lon = len(l['text'].strip())
-                t[l['short_font']] += lon
-                if lon > n[l['short_font']]: n[l['short_font']] = lon
-                if (t[l['short_font']] > nb_max):
-                    nb_max = t[l['short_font']]
-                    ndx_most_used = l['short_font']
-    b = [nb <= TITLE_MIN_CHAR for nb in n]
-
-    ### ndx_most_used is the most used font number.
-    ### b[font_number] is True if the font seems used for bullets.
-
-    t = [] # We'll use it to list the fonts succession
-    n = [] # Used to count the number of lines
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if t == []:
-                    t.append(l['short_font'])
-                    n.append(1)
-                else:
-                    if (t[-1] != l['short_font']):
-                        t.append(l['short_font'])
-                        n.append(1)
-                    else:
-                        n[-1] += 1
-
-    f = {} # Will contain used font numbers and number of occurences in t
-    for i,j in zip(t,n):
-        if i not in f.keys():
-            f[i] = {'nb': 1, 'nl':j, 'maxl': j, 'is_bullet': b[i]}
-        else:
-            f[i]['nb'] += 1
-            f[i]['nl'] += j
-            if (j > f[i]['maxl']):
-                f[i]['maxl'] = j
-
-    for i in f.keys():
-        # a = float(f[i]['nl']) / f[i]['nb']
-        # f[i]['isnt_title'] = (a > TITLE_MAX_MEAN_LINES)
-        f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES)
-
-    # Replace short_font for lines considered as bullets (or text styling).
-    last_bullet_lines = []
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if f[l['short_font']]['is_bullet']:
-                    last_bullet_lines.append(l)
-                else:
-                    if (len(last_bullet_lines) > 0):
-                        for last in last_bullet_lines:
-                            last['short_font'] = l['short_font']
-                        last_bullet_lines = []
-    if (len(last_bullet_lines) > 0):
-        for last in last_bullet_lines:
-            last['short_font'] = ndx_most_used
-
-    # n and b won't be used anymore I think. So they're free
-
-    # Rebuild the font succession list (is not optimized but is the safest)
-    t = []
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                if t == []: t.append(l['short_font'])
-                else:
-                    if (t[-1] != l['short_font']):
-                        t.append(l['short_font'])
-
-    b = [] # We'll do a 2d table with b[i][j] = number of transitions
-           # from fonti to fontj (will be a tree of font transitions)
-    for i in range(len(fonts)+1): # Consider len+1 to have font number -1
-        b.append([0 for j in range(len(fonts)+1)])
-    for i in range(len(t)-1):
-        j = i+1
-        if not f[t[i]]['isnt_title']:
-            b[t[i]][t[j]] += 1
-
-    # Should we do this : ? &&&&&&&&&&&&&&&&&&&& A tester
-    # Un moyen de s'assurer que tout sera parcouru...
-    # Signifie qu'on ne finit pas sur un titre.
-    f[t[-1]]['isnt_title'] = True
-
-    # Create a deep attribute in f which contains distance from leaves
-    for k,v in f.items():
-        if v['isnt_title']:
-            v['deep'] = 0
-            v['nb_transitions'] = 999999999
-        else: v['deep'] = None
-
-    # Algo : dans le tableau b, on parcourt les colonnes (j) pour les fontes qui ont un deep.
-    #        Un indice (i) de ligne pour lequel la valeur b[i][j] est non nulle signifie
-    #        que la fonte i
-    #        précède la fonte j b[i][j] fois.
-    # Rq : Une colonne vide pour un indice dont la ligne est non-vide est une racine
-    #      Une ligne vide pour un indice dont la colonne est non-vide est une feuille
-    # On répète tant qu'on change des valeurs (c'est pas optimisé mais crotte,
-    # le tableau n'est pas si grand)
-    has_changed = True
-    deep_max = 0
-    while has_changed:
-        has_changed = False
-        for k,v in f.items():
-            if v['deep'] is not None:
-                for i in range(-1,len(b)-1):
-                    if b[i][k] != 0:
-                        if f[i]['deep'] is None:
-                            if (b[i][k] <= NB_SUCCESSION_FOR_SAME):
-                                f[i]['deep'] = v['deep']
-                                f[i]['nb_transitions'] = b[i][k]
-                            else:
-                                f[i]['deep'] = v['deep'] + 1
-                                f[i]['nb_transitions'] = b[i][k]
-                            if f[i]['deep'] > deep_max:
-                                deep_max = f[i]['deep']
-                            has_changed = True
-                        elif f[i]['nb_transitions'] < b[i][k]:
-                            f[i]['deep'] = v['deep'] + 1
-                            f[i]['nb_transitions'] = b[i][k]
-                            has_changed = True
-
-    # Reverse deepness value, to make it distance from root
-    for v in f.values():
-        if (v['deep'] is not None):
-            v['deep'] = deep_max - v['deep']
-
-    # Add deep in blocks lines
-    for bl in blocks:
-        for l in bl['lines']:
-            if (l['flags'] & remove_flags) == FLAG_NONE:
-                l['deep'] = f[l['short_font']]['deep']
-            else:
-                l['deep'] = deep_max
-
-    #for k,v in f.items():
-    #    if (v['deep'] is not None):
-    #        print("[%2d] → %d" % (k, v['deep']))
diff --git a/src/py/p2b_config.py b/src/py/p2b_config.py
deleted file mode 100644
index b5f6172572ef1c4765c080d9e997ceb45a244e47..0000000000000000000000000000000000000000
--- a/src/py/p2b_config.py
+++ /dev/null
@@ -1,45 +0,0 @@
-
-
-CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
-CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
-
-FLAG_NONE = 0x0000
-FLAG_SMALL_FONT = 0x0001 # Block or line has too small fontsize
-FLAG_BIG_FONT = 0x0002 # Block or line has bigger then normal fontsize
-FLAG_PAGE_BOTTOM = 0x0004
-FLAG_PAGE_TOP = 0x0008
-FLAG_MANY_FONTS = 0x0010 # pdftohtml return different fonts for the associated line
-FLAG_BULLET = 0x0020
-
-# For font guessing (see guess_fonts in p2b_blocks_utils)
-SIMILARITY_THRESHOLD = 1.0
-LEFT_THRESHOLD = 20 # In p2b_text_utils.add_lines() : the max horizontal space
-                    # to consider aligned items to be on the same line.
-
-#BOLD_FONT_THRESHOLD = 0.5
-#TITLE_MAX_MEAN_LINES = 2.0 # For each “short font”, the mean number of succesive
-                           # lines is computed. If this mean is higher than
-                           # TITLE_MAX_MEAN_LINES, we consider this font is not
-                           # used for titles.
-TITLE_MAX_LINES = 2 # Replaces TITLE_MAX_MEAN_LINES
-
-TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never
-                  # used for more than TITLE_MIN_CHAR characters per line
-                  # is a kind of text styling and will take the next line's font
-
-# Celle là est un peu compliquée : Pour détecter la structure, on compte
-# le nombre de successions d'un changement de police de caractères vers
-# un autre (ex : la fonte 3 succède *2* fois à la fonte 8).
-# Si ce nombre est trop peu élevé (<= NB_SUCCESSION_FOR_SAME) alors
-# on considère que 8 n'est pas un titre de 3, et qu'ils sont au même niveau.
-# Sinon, on considère que 8 est un niveau au-dessus dans la hiérarchie des
-# titres, sous-titres, …
-NB_SUCCESSION_FOR_SAME = 0
-
-# Regex
-INDICES_EXPOSANTS_USUELS = [
-  'er|ère|ere', # 1er, 1ère, …
-  'nde?', # 2nd
-  'i?[eè]me', # 3ème, 4ieme, …
-  '°',
-]
diff --git a/src/py/p2b_file.py b/src/py/p2b_file.py
deleted file mode 100644
index 60d9111eb99cf099e68a75914a5a9ecc59a83b8b..0000000000000000000000000000000000000000
--- a/src/py/p2b_file.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import xml.etree.ElementTree as ET
-import os
-#import sys
-import re
-
-# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
-import subprocess
-
-from p2b_config import CMD_PDFTOTEXT, CMD_PDFTOHTML, FLAG_NONE
-
-def get_pdftotext(filename):
-  # Calls pdftotext and retreive standard output in a string (o)
-  basename = os.path.splitext(filename)[0]
-  cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
-  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-  o, e = proc.communicate()
-  if (proc.returncode != 0):
-    print('-S-> Command pdftotext returned an error :')
-    print('     '  + e.decode('utf8'))
-    return []
-
-  # Parse xml code and create block table.
-  xml = o.decode('utf8')
-  root = ET.fromstring(xml)
-
-  page_num = 0
-  flow_num = 0
-  blocks = []
-  for body in root:
-    if (body.tag.endswith('body')):
-      for doc in body:
-        if (doc.tag.endswith('doc')):
-          for page in doc:
-            if (page.tag.endswith('page')):
-              page_num += 1
-              for fl in page:
-                if (fl.tag.endswith('flow')):
-                  flow_num += 1
-                  for bloc in fl:
-                    if (bloc.tag.endswith('block')):
-                      bl = {'page': page_num, 'flow': flow_num, 'lines': [],
-                            'h_min': 150000, 'h_max': 0,
-                            'x_min': float(bloc.get('xMin')),
-                            'x_max': float(bloc.get('xMax')),
-                            'y_min': float(bloc.get('yMin')),
-                            'y_max': float(bloc.get('yMax')),
-                            'nb_cars': 0, 'nb_words': 0, 'flags': FLAG_NONE }
-                      bwords = 0
-                      bcars = 0
-                      for line in bloc:
-                        if (line.tag.endswith('line')):
-                          h = float(line.get('yMax')) - float(line.get('yMin'))
-                          if (h < bl['h_min']): bl['h_min'] = h
-                          if (h > bl['h_max']): bl['h_max'] = h
-                          li = { 'text': '', 'height': h, 'words': [],
-                          'nb_words': 0, 'nb_cars': 0, 'flags': FLAG_NONE }
-                          last_nbcar = 0
-                          last_h = 0
-                          for word in line:
-                            if (word.tag.endswith('word')):
-                              hword = float(word.get('yMax')) - float(word.get('yMin'))
-                              li['words'].append({'height': hword, 'text': word.text})
-                              if ((hword != last_h) and (last_nbcar < 2)):
-                                  # This is to avoid separation of one big capital
-                                  # letter at the beginin of a title or paragraph.
-                                  last_h = hword
-                                  if len(re.sub(r'\W','', li['text'])) == 0:
-                                    li['text'] = "%s %s" % (li['text'], word.text)
-                                  else:
-                                    li['text'] = "%s%s" % (li['text'], word.text)
-                              else:
-                                  li['text'] = "%s %s" % (li['text'], word.text)
-                              li['text'] = li['text'].strip()
-                              last_nbcar = len(word.text)
-                              li['nb_words'] += 1
-                          bl['lines'].append(li)
-                          bl['nb_cars'] += len(li['text'].strip())
-                          bl['nb_words'] += li['nb_words']
-                      blocks.append(bl)
-  return blocks
-
-
-def get_pdftohtml(filename):
-  basename = os.path.splitext(filename)[0]
-  cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
-  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-  o, e = proc.communicate()
-  if (proc.returncode != 0):
-    print('-S-> Command pdftohtml returned an error :')
-    print('     '  + e.decode('utf8'))
-    return None
-
-  # Parse xml code and create block table.
-  xml = o.decode('utf8')
-  root = ET.fromstring(xml)
-
-  fontspec = []
-  p2x_text = []
-  for page in root:
-    if (page.tag.endswith('page')):
-        pg = int(page.get('number'))
-        for tg in page:
-            if (tg.tag.endswith('fontspec')):
-                fontspec.append({
-                    'id': int(tg.get('id')),
-                    'size': int(tg.get('size')),
-                    'family': tg.get('family'),
-                    'color': tg.get('color')
-                })
-            elif (tg.tag.endswith('text')):
-                fnt = int(tg.get('font'))
-                top = int(tg.get('top'))
-                left = int(tg.get('left'))
-                width = int(tg.get('width'))
-                height = int(tg.get('height'))
-                while (tg.text is None) and (len(tg) > 0):
-                    tg = tg[0] # remove html style tags (like <b>, …)
-                if (tg.text is not None):
-                    li = "%s" % (tg.text)
-                    if (len(li.strip()) > 0):
-                        p2x_text.append({
-                            'page': pg,
-                            'font': fnt,
-                            'top': top, 'left': left,
-                            'width': width, 'height': height,
-                            'text': li.strip()
-                        })
-  return { 'fonts': fontspec, 'text': p2x_text }
diff --git a/src/py/p2b_fonts_utils.py b/src/py/p2b_fonts_utils.py
deleted file mode 100644
index 0cb507765b01a23a867f7f737097e69abb997da5..0000000000000000000000000000000000000000
--- a/src/py/p2b_fonts_utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import re
-from p2b_config import *
-
-# Defines a 'sameAs' attribute, containing a list of similar fonts,
-# considering style attributes (bold, slanted, …) not important.
-def group_fonts(fonts):
-    for i in range(0, len(fonts) - 1):
-        for j in range(i+1, len(fonts)):
-            if (fonts[i]['size'] == fonts[j]['size']) and \
-                (fonts[i]['color'] == fonts[j]['color']) and \
-                (fonts[i]['family'].split(',')[0] == fonts[j]['family'].split(',')[0]):
-                if (fonts[i].get('sameAs') is None):
-                    fonts[i]['sameAs'] = [j]
-                else:
-                    fonts[i]['sameAs'].append(j)
-                if (fonts[j].get('sameAs') is None):
-                    fonts[j]['sameAs'] = [i]
-                else:
-                    fonts[j]['sameAs'].append(i)
-
-
-# Same as group_fonts, except that every sameAs (which is named replaceWith)
-# points out only one font (the first one identified 'sameAs'). For example,
-# if the fonts n° 5, 8 and 12 are identified "same", it will return :
-# - replaceWith is None for font n° 5
-# - replaceWith is 5 for fonts 8 and 12.
-# Thus, replacing whole block fonts with replaceWith value will reduce
-# the number of fonts used for the whole document.
-def replace_fonts(fonts):
-    for i in range(0, len(fonts) - 1):
-        for j in range(i+1, len(fonts)):
-          if (fonts[j].get('replaceWith') is None):
-            if (fonts[j]['id'] in fonts[i]['same_line']):
-                if fonts[i].get('replaceWith') is None:
-                    fonts[j]['replaceWith'] = i
-                else:
-                    fonts[j]['replaceWith'] = fonts[i]['replaceWith']
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-####      BUG !!!!!!!!!
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-# L'algo ne convient pas à la nouvelle forme d'unification ;
-# Si on a 2 sur la même ligne que 4, puis 4 sur la même ligne que 7, alors on a :
-# 2:same_line = [4], et 4:same_line = [2,7]
-# et la conséquence est que l'on obtient :
-# 2:replaceWith <- None, 4:replaceWith <- 2, 7:replaceWith <- 4.
-#
-# Rustine posée, (if fonts[i]['replaceWith'] is None) à tester.
-
-
-            ## Regroupement par famille/couleur/taille :
-            #if (fonts[i]['size'] == fonts[j]['size']) and \
-            #    (fonts[i]['color'] == fonts[j]['color']) and \
-            #    (fonts[i]['family'].split(',')[0] == fonts[j]['family'].split(',')[0]):
-            #        fonts[j]['replaceWith'] = i
diff --git a/src/py/p2b_text_utils.py b/src/py/p2b_text_utils.py
deleted file mode 100644
index 11d5134d4fe483e4364c11f0fb0f1f2b209ade8d..0000000000000000000000000000000000000000
--- a/src/py/p2b_text_utils.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import re
-from p2b_config import *
-
-# Is it an indice or expoesant ?
-def is_ind_exp(str):
-  for ie in INDICES_EXPOSANTS_USUELS:
-      if re.match(ie, str):
-          return True
-  return False
-
-def add_lines(text):
-    last_top = -1
-    line_no = -1
-    last_right = 0
-    for txt in text:
-        if (txt['top'] == last_top) and ((txt['left'] - last_right) < LEFT_THRESHOLD):
-            txt['line'] = line_no
-        elif is_ind_exp(txt['text'].strip()):
-            txt['line'] = line_no
-        else:
-            line_no += 1
-            txt['line'] = line_no
-            last_top = txt['top']
-        last_right = txt['left'] + txt['width']
-#    for txt in text:
-#        print("[%d] (%d | %d ←→ %d) %s" %(txt['line'], txt['top'], txt['left'], (txt['left'] + txt['width']), txt['text']))
-
-
-# Extract lines from 'text' attribute returned by get_pdftohtml and associates
-# a font id (and the page number), which is the font used by the higher number
-# of characters of the line.
-def get_lines(text, fontspec):
-    add_lines(text)
-
-    for f in fontspec:
-        if 'same_line' not in f:
-            f['same_line'] = []
-
-    lines = []
-    last_line = -2
-    li = ''
-    fnt = {}
-    page_num = text[0]['page']
-    for txt in text:
-        if (txt['line'] != last_line) or (txt == text[-1]):
-            if (len(li.strip()) > 0):
-                fnt_no = -1; max_car = 0;
-                for f in fnt.keys():
-                    if (fnt[f] > max_car):
-                        max_car = fnt[f]
-                        fnt_no = f
-                lines.append({ 'text': li.strip(),
-                    'most_used_font': fnt_no,
-                    'nb_fonts': len(fnt),
-                    'page': page_num})
-            li = txt['text'].strip()
-            last_line = txt['line']
-            for fi1 in fnt.keys():
-                for fi2 in fnt.keys():
-                    if fi1 != fi2:
-                        f1 = next(it for it in fontspec if it['id'] == int(fi1))
-                        f2 = next(it for it in fontspec if it['id'] == int(fi2))
-                        if (f2['id'] not in f1['same_line']):
-                            f1['same_line'].append(f2['id'])
-                            f2['same_line'].append(f1['id'])
-            fnt = {}
-            fnt[txt['font']] = len(li.strip())
-        else:
-            if (is_ind_exp(txt['text'])):
-                li = "%s%s" % (li, txt['text'].strip())
-            else:
-                li = "%s %s" % (li, txt['text'].strip())
-            if (fnt.get(txt['font']) is None):
-                fnt[txt['font']] = len(txt['text'].strip())
-            else:
-                fnt[txt['font']] += len(txt['text'].strip())
-        page_num = txt['page']
-    return lines
-
-def print_lines(lines):
-    for l in lines:
-        print("(%d) [%d] %s" % (l['page'], l['most_used_font'], l['text']))
diff --git a/src/py/pdf2blocks.py b/src/py/pdf2blocks.py
index 186ff3991d2a67541be09695a58aa88bb23e2c86..0b80c61742793d08f3a3171fa0eb02dcf133587f 100644
--- a/src/py/pdf2blocks.py
+++ b/src/py/pdf2blocks.py
@@ -1,63 +1,649 @@
+import xml.etree.ElementTree as ET
+import os
 import sys
+import re
 
-from p2b_file import get_pdftotext, get_pdftohtml
-from p2b_blocks_utils import *
-from p2b_text_utils import *
-from p2b_fonts_utils import *
-from p2b_analyse import get_pdftotext_fontstats, get_pdftotext_normal_fontsize
+# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
+import subprocess
 
+from p2b_utils import levenshtein
+
+### Shell one-liner to process the whole corpus:
+# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done
+
+
+CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
+CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
+
+LEFT_THRESHOLD = 25 # In get_lines() : the max horizontal space
+                    # to consider aligned items to be on the same line.
+
+FLAG_NONE = 0x0000
+SMALL_FONT = 0x0001
+# BIG_FONT = 0x0002 -> Unused
+PAGE_BOTTOM = 0x0004
+MANY_FONTS = 0x0010
+IS_BULLET = 0x0020
+DEFAULT_FONT_SIZE = 0x0040
+TITLE_SMALLER_THAN_SUBTITLE = 0x0080
+
+
+TITLE_MAX_LINES = 2
+
+TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never
+                  # used for more than TITLE_MIN_CHAR characters per line
+                  # is a kind of text styling and will take the next line's font
+
+SIMILARITY_THRESHOLD = 1.0
+
+# This one is a bit tricky: to detect the structure, we count how many
+# times a font change goes from one given font to another
+# (e.g. font 3 follows font 8 *2* times).
+# If that count is too low (<= NB_SUCCESSION_FOR_SAME), then
+# we consider that 8 is not a title of 3, and that both are at the same level.
+# Otherwise, we consider 8 to be one level above in the hierarchy of
+# titles, subtitles, …
+NB_SUCCESSION_FOR_SAME = 0
+
+# Regex
+INDICES_EXPOSANTS_USUELS = [
+  'er|ère|ere', # 1er, 1ère, …
+  'nde?', # 2nd
+  'i?[eè]me', # 3ème, 4ieme, …
+  '°',
+]
+
+
+# +--------------------------------------------------------------+
+# |                       get_pdftotext                          |
+# +--------------------------------------------------------------+
+def get_pdftotext(filename):
+  # Calls pdftotext and retrieves its standard output (o)
+  basename = os.path.splitext(filename)[0]
+  cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
+  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  o, e = proc.communicate()
+  if (proc.returncode != 0):
+    print('-S-> Command pdftotext returned an error :')
+    print('     '  + e.decode('utf8'))
+    return []
+
+  # Parse xml code and create block table.
+  xml = o.decode('utf8')
+  root = ET.fromstring(xml)
+
+  page_num = 0
+  flow_num = 0
+  blocks = []
+  for body in root:
+    if (body.tag.endswith('body')):
+      for doc in body:
+        if (doc.tag.endswith('doc')):
+          for page in doc:
+            if (page.tag.endswith('page')):
+              page_num += 1
+              for fl in page:
+                if (fl.tag.endswith('flow')):
+                  flow_num += 1
+                  for bloc in fl:
+                    if (bloc.tag.endswith('block')):
+                      bl = {'page': page_num, 'flow': flow_num, 'lines': [],
+                            'flags': FLAG_NONE,
+                            'x_min': float(bloc.get('xMin')),
+                            'x_max': float(bloc.get('xMax')),
+                            'y_min': float(bloc.get('yMin')),
+                            'y_max': float(bloc.get('yMax')),
+                            }
+                      for line in bloc:
+                        if (line.tag.endswith('line')):
+                          h = float(line.get('yMax')) - float(line.get('yMin'))
+                          li = { 'text': '', 'height': h, 'words': [],
+                            'flags': FLAG_NONE,
+                            'x_min': float(bloc.get('xMin')),
+                            'x_max': float(bloc.get('xMax')),
+                            'y_min': float(bloc.get('yMin')),
+                            'y_max': float(bloc.get('yMax')),
+                          }
+                          last_nbcar = 0
+                          last_h = 0
+                          for word in line:
+                            if (word.tag.endswith('word')):
+                              hword = float(word.get('yMax')) - float(word.get('yMin'))
+                              li['words'].append({'height': hword, 'text': word.text})
+                              if ((hword != last_h) and (last_nbcar < 2)):
+                                  # This is to avoid separation of one big capital
+                                  # letter at the beginning of a title or paragraph.
+                                  last_h = hword
+                                  if len(re.sub(r'\W','', li['text'])) == 0:
+                                    li['text'] = "%s %s" % (li['text'], word.text)
+                                  else:
+                                    li['text'] = "%s%s" % (li['text'], word.text)
+                              else:
+                                  li['text'] = "%s %s" % (li['text'], word.text)
+                              li['text'] = li['text'].strip()
+                              last_nbcar = len(word.text)
+                          bl['lines'].append(li)
+                      blocks.append(bl)
+  return blocks
+
+
+# +--------------------------------------------------------------+
+# |                       get_pdftohtml                          |
+# +--------------------------------------------------------------+
+def get_pdftohtml(filename):
+  basename = os.path.splitext(filename)[0]
+  cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
+  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  o, e = proc.communicate()
+  if (proc.returncode != 0):
+    print('-S-> Command pdftohtml returned an error :')
+    print('     '  + e.decode('utf8'))
+    return None
+
+  # Parse xml code and create block table.
+  xml = o.decode('utf8')
+  root = ET.fromstring(xml)
+
+  fontspec = []
+  segments = []
+  for page in root:
+    if (page.tag.endswith('page')):
+        pg = int(page.get('number'))
+        for tg in page:
+            if (tg.tag.endswith('fontspec')):
+                fontspec.append({
+                    'id': int(tg.get('id')),
+                    'size': int(tg.get('size')),
+                    'family': tg.get('family'),
+                    'color': tg.get('color'),
+                    'nb_cars': 0
+                })
+            elif (tg.tag.endswith('text')):
+                fnt = int(tg.get('font'))
+                top = int(tg.get('top'))
+                left = int(tg.get('left'))
+                width = int(tg.get('width'))
+                height = int(tg.get('height'))
+                while (tg.text is None) and (len(tg) > 0):
+                    tg = tg[0] # remove html style tags (like <b>, …)
+                if (tg.text is not None):
+                    li = "%s" % (tg.text)
+                    if (len(li.strip()) > 0):
+                        segments.append({'page': pg, 'font': fnt,
+                            'top': top, 'left': left,
+                            'width': width, 'height': height,
+                            'text': li.strip()
+                        })
+                        # Find font in fontspec
+                        for font in fontspec:
+                            if font['id'] == fnt: break
+                        font['nb_cars'] += len(li.strip())
+  return { 'fonts': fontspec, 'segments': segments }
+
+
+# +--------------------------------------------------------------+
+# |                   get_default_font_size                      |
+# +--------------------------------------------------------------+
+def get_default_font_size(fontspec):
+  sizes = {}
+  max_cars = 0
+  size_max_cars = 42 # Doesn't matter : it'll change
+  for f in fontspec:
+      if sizes.get(f['size']) is None:
+          sizes[f['size']] = f['nb_cars']
+      else:
+          sizes[f['size']] += f['nb_cars']
+      if sizes[f['size']] > max_cars:
+          max_cars = sizes[f['size']]
+          size_max_cars = f['size']
+  return size_max_cars
+
+
+# +--------------------------------------------------------------+
+# |                      mark_small_fonts                        |
+# +--------------------------------------------------------------+
+# RQ : Also marks bullet lines
+def mark_small_fonts(blocks, default_font_size):
+    for b in blocks:
+        for l in b['lines']:
+            if (round(l['height']) < default_font_size):
+                l['flags'] |= SMALL_FONT
+            if len(re.sub(r'\W','', l['text'])) == 0:
+                l['flags'] |= IS_BULLET
+
+
+# +--------------------------------------------------------------+
+# |                      mark_page_bottom                        |
+# +--------------------------------------------------------------+
+def mark_page_bottom(blocks):
+    if (blocks[-1]['page'] == 1): return
+
+    # Find indexes of last blocks in pages
+    bndx = []
+    for i in range(0, len(blocks) - 1):
+        if (blocks[i]['page'] != blocks[i+1]['page']):
+            bndx.append(i)
+    bndx.append(len(blocks)-1)
+
+    # Get last line indexes
+    lndx = []
+    for i in bndx:
+        lndx.append(len(blocks[i]['lines'])-1)
+
+    # Loop while finding always same characters in last lines
+    end = False
+    while not end:
+        txt = None
+        # Test if last lines characters are the same
+        for i,j in zip(bndx, lndx):
+            li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text'])
+            if txt is None: txt = li
+            else: end = (txt != li)
+        # All last line are the same, so mark them
+        if not end:
+            for i in range(0, len(bndx)):
+                blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= PAGE_BOTTOM
+                lndx[i] -= 1
+                if (lndx[i] < 0):
+                    #-# blocks[bndx[i]]['flags'] |= PAGE_BOTTOM
+                    bndx[i] -= 1
+                    lndx[i] = len(blocks[bndx[i]]['lines']) - 1
+                    end = bndx[i] < 0
+
+# +--------------------------------------------------------------+
+# |                         is_ind_exp                           |
+# +--------------------------------------------------------------+
+# Is it a subscript or superscript (e.g. an ordinal suffix: 1er, 2nd, 3ème, °) ?
+def is_ind_exp(str):
+  for ie in INDICES_EXPOSANTS_USUELS:
+      if re.match(ie, str):
+          return True
+  return False
+
+# +--------------------------------------------------------------+
+# |                         get_lines                            |
+# +--------------------------------------------------------------+
+# Extracts lines from the 'segments' list returned by get_pdftohtml and associates
+# each with a font id (and the page number): the font used by the highest number
+# of characters of the line.
+# Does a column splitting considering the value of LEFT_THRESHOLD
+def get_lines(segments, fontspec):
+    last_top = -1
+    line_no = -1
+    last_right = 0
+    for txt in segments:
+        if (txt['top'] == last_top) and ((txt['left'] - last_right) <= LEFT_THRESHOLD):
+            txt['line'] = line_no
+        elif is_ind_exp(txt['text'].strip()):
+            txt['line'] = line_no
+        else:
+            line_no += 1
+            txt['line'] = line_no
+            last_top = txt['top']
+        last_right = txt['left'] + txt['width']
+
+    for f in fontspec:
+        if 'same_line' not in f:
+            f['same_line'] = []
+
+    lines = []
+    last_line = -2
+    li = ''
+    fnt = {}
+    page_num = segments[0]['page']
+    for txt in segments:
+        if (txt['line'] != last_line) or (txt == segments[-1]):
+            if (len(li.strip()) > 0):
+                fnt_no = -1; max_car = 0;
+                for f in fnt.keys():
+                    if (fnt[f] > max_car):
+                        max_car = fnt[f]
+                        fnt_no = f
+                lines.append({ 'text': li.strip(),
+                    'most_used_font': fnt_no,
+                    'nb_fonts': len(fnt),
+                    'page': page_num})
+            li = txt['text'].strip()
+            last_line = txt['line']
+            for fi1 in fnt.keys():
+                for fi2 in fnt.keys():
+                    if fi1 != fi2:
+                        f1 = next(it for it in fontspec if it['id'] == int(fi1))
+                        f2 = next(it for it in fontspec if it['id'] == int(fi2))
+                        if (f2['id'] not in f1['same_line']):
+                            f1['same_line'].append(f2['id'])
+                            f2['same_line'].append(f1['id'])
+            fnt = {}
+            fnt[txt['font']] = len(li.strip())
+        else:
+            if (is_ind_exp(txt['text'])):
+                li = "%s%s" % (li, txt['text'].strip())
+            else:
+                li = "%s %s" % (li, txt['text'].strip())
+            if (fnt.get(txt['font']) is None):
+                fnt[txt['font']] = len(txt['text'].strip())
+            else:
+                fnt[txt['font']] += len(txt['text'].strip())
+        page_num = txt['page']
+    return lines
+
+# +--------------------------------------------------------------+
+# |                        guess_fonts                           |
+# +--------------------------------------------------------------+
+# Tries to guess fontspec of each line into blocks list.
+# It calculates the levenshtein distance with every segment of the same page
+# and assigns the best matching score's font.
+def guess_fonts(blocks, segments, fontspec):
+    lines = get_lines(segments, fontspec)
+    ndx_lines = [0,] # Indexes the line indices by page number
+    for ndx in range(1, len(lines)):
+        if (lines[ndx-1]['page'] != lines[ndx]['page']):
+            ndx_lines.append(ndx)
+    ndx_lines.append(len(lines))
+
+    for f in fontspec:
+        f['nb_lines'] = 0
+        f['dist_sum'] = 0
+        #f['block_pos_sum'] = 0
+
+    for bl in blocks:
+        for l in bl['lines']:
+            if (len(l['text']) > 0):
+                min_dist = len(l['text'])
+                min_score = 1.0
+                font_sel = -1
+                line_no = -1
+                for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]):
+                    if (len(lines[i]['text']) > 0):
+                        d = levenshtein(l['text'], lines[i]['text'])
+                        if (d == 0):
+                            min_dist = 0
+                            min_score = 0.0
+                            font_sel = lines[i]['most_used_font']
+                            line_no = i
+                            break;
+                        score = float(d) / float(max(len(l['text']), len(lines[i]['text'])))
+                        if (score <= SIMILARITY_THRESHOLD):
+                            if (d < min_dist):
+                                min_dist = d
+                                min_score = score
+                                font_sel = lines[i]['most_used_font']
+                                line_no = i
+                l['font'] = font_sel
+                if (font_sel >= 0):
+                  fnt = next(it for it in fontspec if it['id'] == font_sel)
+                  fnt['nb_lines'] +=1
+                  fnt['dist_sum'] += min_dist
+                l['score'] = min_score # For debugging purposes
+                l['dist'] = min_dist   #    idem.
+                l['line_no'] = line_no # idem. Stores the "similar line" number
+                # print("> %s" % l['text'])
+                # print("  %s" % lines[line_no]['text'])
+                # print("  [%d]" % font_sel)
+                # print("")
+                if (lines[line_no]['nb_fonts'] > 1):
+                    l['flags'] |= MANY_FONTS
+
+# +--------------------------------------------------------------+
+# |                    replace_block_fonts                       |
+# +--------------------------------------------------------------+
+# Adds a 'short_font' attribute to lines which gives another font value which
+# doesn't care about style (bold, …).
+# RK: def_size is default_font_size, used to mark SMALL_FONT flag.
+def replace_block_fonts(blocks, fontspec, def_size):
+    for i in range(0, len(fontspec) - 1):
+        for j in range(i+1, len(fontspec)):
+          if (fontspec[j].get('replaceWith') is None):
+            if (fontspec[j]['id'] in fontspec[i]['same_line']):
+                if fontspec[i].get('replaceWith') is None:
+                    fontspec[j]['replaceWith'] = fontspec[i]['id']
+                else:
+                    fontspec[j]['replaceWith'] = fontspec[i]['replaceWith']
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['font'] < 0):
+                f = None
+            else:
+                f = next(it for it in fontspec if it['id'] == l['font'])
+            if (f is None) or (f.get('replaceWith') is None):
+                l['short_font'] = l['font']
+            else:
+                l['short_font'] = f.get('replaceWith')
+            if (f is not None):
+                f = next(it for it in fontspec if it['id'] == l['short_font'])
+                if (f['size'] < def_size):
+                    l['flags'] |= SMALL_FONT
+                if (f['size'] == def_size):
+                    l['flags'] |= DEFAULT_FONT_SIZE
+
+
+# +--------------------------------------------------------------+
+# |                      guess_structure                         |
+# +--------------------------------------------------------------+
+def guess_structure(blocks, fontspec,
+  remove_flags = SMALL_FONT | PAGE_BOTTOM | IS_BULLET):
+    t = [] # A list used here and there
+    n = [] # Another one
+
+    # Search for the most used font
+    # Here, t will be used to count the number of cars of each font.
+    #   and n will be used to store the maximum line size for each font.
+    for i in range(len(fontspec)):
+        t.append(0)
+        n.append(0)
+    nb_max = -1
+    ndx_most_used = -1
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE):
+                lon = len(l['text'].strip())
+                t[l['short_font']] += lon
+                if lon > n[l['short_font']]: n[l['short_font']] = lon
+                if (t[l['short_font']] > nb_max):
+                    nb_max = t[l['short_font']]
+                    ndx_most_used = l['short_font']
+    b = [nb <= TITLE_MIN_CHAR for nb in n]
+
+    ### ndx_most_used is the most used font number.
+    ### b[font_number] is True if the font seems used for bullets.
+
+    t = [] # We'll use it to list the fonts succession
+    n = [] # Used to count the number of lines
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['flags'] & remove_flags) == FLAG_NONE:
+                if t == []:
+                    t.append(l['short_font'])
+                    n.append(1)
+                else:
+                    if (t[-1] != l['short_font']):
+                        t.append(l['short_font'])
+                        n.append(1)
+                    else:
+                        n[-1] += 1
+
+    f = {} # Will contain used font numbers and number of occurrences in t
+    for i,j in zip(t,n):
+        if i not in f.keys():
+            f[i] = {'nb': 1, 'nl':j, 'maxl': j,
+                    'is_bullet': b[i], 'flags': FLAG_NONE}
+        else:
+            f[i]['nb'] += 1
+            f[i]['nl'] += j
+            if (j > f[i]['maxl']):
+                f[i]['maxl'] = j
+
+    for i in f.keys():
+        f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES)
+
+    # Replace short_font for lines considered as bullets (or text styling).
+    last_bullet_lines = []
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['flags'] & remove_flags) == FLAG_NONE:
+                if f[l['short_font']]['is_bullet']:
+                    last_bullet_lines.append(l)
+                else:
+                    if (len(last_bullet_lines) > 0):
+                        for last in last_bullet_lines:
+                            last['short_font'] = l['short_font']
+                        last_bullet_lines = []
+    if (len(last_bullet_lines) > 0):
+        for last in last_bullet_lines:
+            last['short_font'] = ndx_most_used
+
+    # n and b won't be used anymore I think. So they're free
+
+    # Rebuild the font succession list (is not optimized but is the safest)
+    t = []
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['flags'] & remove_flags) == FLAG_NONE:
+                if t == []: t.append(l['short_font'])
+                else:
+                    if (t[-1] != l['short_font']):
+                        t.append(l['short_font'])
+
+    b = [] # We'll do a 2d table with b[i][j] = number of transitions
+           # from fonti to fontj (will be a tree of font transitions)
+    for i in range(len(fontspec)+1): # Consider len+1 to have font number -1
+        b.append([0 for j in range(len(fontspec)+1)])
+    for i in range(len(t)-1):
+        j = i+1
+        if not f[t[i]]['isnt_title']:
+            b[t[i]][t[j]] += 1
+
+    # TODO: should we really do this? (to be tested)
+    # It is a way to make sure everything gets traversed...
+    # It means that we never end on a title.
+    f[t[-1]]['isnt_title'] = True
+
+    # Create a deep attribute in f which contains distance from leaves
+    for k,v in f.items():
+        if v['isnt_title']:
+            v['deep'] = 0
+            v['nb_transitions'] = 999999999
+        else: v['deep'] = None
+
+    # Algorithm: in table b, we scan the columns (j) of the fonts that have a deep.
+    #        A row index (i) for which the value b[i][j] is non-zero means
+    #        that font i
+    #        precedes font j b[i][j] times.
+    # Note: an empty column for an index whose row is non-empty is a root;
+    #       an empty row for an index whose column is non-empty is a leaf.
+    # We repeat as long as some value changes (this is not optimized, but never
+    # mind, the table is not that big)
+    has_changed = True
+    deep_max = 0
+    while has_changed:
+        has_changed = False
+        for k,v in f.items():
+            if v['deep'] is not None:
+                for i in range(-1,len(b)-1):
+                    if b[i][k] != 0:
+                        if f[i]['deep'] is None:
+                            if (b[i][k] <= NB_SUCCESSION_FOR_SAME):
+                                f[i]['deep'] = v['deep']
+                                f[i]['nb_transitions'] = b[i][k]
+                            else:
+                                f[i]['deep'] = v['deep'] + 1
+                                f[i]['nb_transitions'] = b[i][k]
+                            if f[i]['deep'] > deep_max:
+                                deep_max = f[i]['deep']
+                            has_changed = True
+                            if (fontspec[i]['size'] < fontspec[k]['size']):
+                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
+                        elif f[i]['nb_transitions'] < b[i][k]:
+                            f[i]['deep'] = v['deep'] + 1
+                            f[i]['nb_transitions'] = b[i][k]
+                            has_changed = True
+                            if (fontspec[i]['size'] < fontspec[k]['size']):
+                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
+
+    # Reverse deepness value, to make it distance from root
+    for v in f.values():
+        if (v['deep'] is not None):
+            v['deep'] = deep_max - v['deep']
+
+    # Add deep in blocks lines
+    for bl in blocks:
+        for l in bl['lines']:
+            if (l['flags'] & remove_flags) == FLAG_NONE:
+                l['deep'] = f[l['short_font']]['deep']
+                if ((f[l['short_font']]['flags']) & TITLE_SMALLER_THAN_SUBTITLE != 0):
+                   l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
+            else:
+                l['deep'] = deep_max
+
+
+
+
+
+
+
+# +--------------------------------------------------------------+
+# |                      print_block_list                        |
+# +--------------------------------------------------------------+
+def print_block_list(t, remove_flags = FLAG_NONE):
+    """Print every block of `t` on stdout, page by page.
+
+    t            -- list of blocks: dicts with a 'page' value and a 'lines'
+                    list of dicts ('text', 'flags' and, optionally, 'deep').
+    remove_flags -- bit mask; a line having any of these flags is skipped.
+    Markers: '!! ' no 'deep'; '> ' SMALL_FONT; '**..**' TITLE_SMALLER_THAN_SUBTITLE
+    at default/small size; otherwise '#' * (deep + 1) when deep < deep_max.
+    """
+    # Deepest level found (floor -1, cap 10); lines at or past it get no '#'.
+    deeps = [line['deep'] for bl in t for line in bl['lines']
+             if line.get('deep') is not None]
+    deep_max = min(10, max([-1] + deeps))
+    ttl = "#############"
+    last_page = -1
+
+    for block in t:
+        if (block['page'] != last_page):
+            if (last_page > 0):
+                print("")
+            last_page = block['page']
+            print("________________________________")
+            print("*page %d*" % last_page)
+
+        print("")
+
+        for line in block['lines']:
+            if (line['flags'] & remove_flags) != FLAG_NONE:
+                continue
+            pre, post = '', '  '
+            if (line.get('deep') is None):
+                pre = '!! '
+            else:
+                if (line['flags'] & SMALL_FONT) != 0:
+                    pre = "> "
+                if (line['flags'] & TITLE_SMALLER_THAN_SUBTITLE) != 0 and \
+                   (line['flags'] & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0:
+                    pre = "%s**" % pre
+                    post = "**%s" % post
+                elif line['deep'] < deep_max:
+                    pre = "%s%s " % (pre, ttl[0:(line['deep']+1)])
+            print("%s%s%s" % (pre, line['text'], post))
+
+
+# +--------------------------------------------------------------+
+# |                           main                               |
+# +--------------------------------------------------------------+
 if (len(sys.argv) < 1):
     print("-U-> Usage : python pdf2blocks.py <fichier_pdf>")
     sys.exit(-1)
 
 blocks = get_pdftotext(sys.argv[1])
-fnt_sizes = get_pdftotext_fontstats(blocks)
-normal_font = get_pdftotext_normal_fontsize(fnt_sizes)
-mark_blocks_fontsize(blocks, normal_font)
-mark_bullet_lines(blocks)
-mark_page_bottom(blocks)
-#print_block_list(blocks)
-##print("<<< %d >>>" % normal_font)
-
 p2h = get_pdftohtml(sys.argv[1])
-#li = get_lines(p2h['text'])
-#print_lines(li)
-
-####group_fonts(p2h['fonts'])
-##########replace_fonts(p2h['fonts'])
-# for f in p2h['fonts']:
-#     sa = '(  )' if f.get('replaceWith') is None else ("(%2d)" % f.get('replaceWith'))
-#     print("[%2d] %s %2dpx '%s' <%s>" % (f['id'], sa, f['size'], f['family'], f['color']))
-#
-# print('')
-# print("#######################################################################")
-# print('')
-
-guess_fonts(blocks, p2h['text'], p2h['fonts'])
-
-
-#### Ce qui a été modifié :
-## On n'unifie que les polices détectées présentes sur une même ligne.
-## Amélioration de la détection de puces (lignes sans car alphanum '\W')
-## et ajout d'un FLAG 'BULLET', pour sortir du traitement les lignes
-## qui ne contiennent qu'un caractère alacon.
-## Du coup on a zingué le short_font, il faudrait réécrire au propre.
-
-
-#for f in p2h['fonts']:
-#    print("(%2d) [%s ; %s] %s" % (f['id'], f["family"], f["size"], f['same_line']))
-#print('')
-#print('------------------------------------------------')
-#print('')
-for bl in blocks:
-    for l in bl['lines']:
-        l['short_font'] = l['font']
-#        print("[%2d] %s"%(l['font'], l['text']))
-#
-#exit(0)
-
-replace_fonts(p2h['fonts'])
-replace_block_fonts(blocks, p2h['fonts'])
-
-guess_structure(blocks, p2h['fonts'])
-print_block_list(blocks, FLAG_SMALL_FONT | FLAG_PAGE_BOTTOM | FLAG_BULLET)
+fontspec = p2h['fonts']        # 'fonts' entry of the get_pdftohtml() result
+segments = p2h['segments']     # 'segments' entry of the same result
+# NOTE(review): the helpers below presumably update `blocks` in place; confirm.
+default_font_size = get_default_font_size(fontspec)
+# mark_small_fonts(blocks, default_font_size)
+mark_page_bottom(blocks)
+guess_fonts(blocks, segments, fontspec)
+replace_block_fonts(blocks, fontspec, default_font_size)
+guess_structure(blocks, fontspec)
+print_block_list(blocks, PAGE_BOTTOM | IS_BULLET)  # flagged lines are not printed