# p2b.py

import xml.etree.ElementTree as ET
import os
import sys
import re

# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
import subprocess

from p2b_utils import levenshtein

### Shell one-liner to process the whole corpus:
# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done


# Absolute paths of the external commands run by get_pdftotext() and
# get_pdftohtml().
CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
CMD_PDFTOHTML = '/usr/sbin/pdftohtml'

LEFT_THRESHOLD = 25 # Used by get_lines(): the max horizontal gap between the
                    # end of a segment and the start of the next one for both
                    # to be considered part of the same line.
                    # (The original note also mentions
                    # p2b_text_utils.add_lines(), which is not visible here.)

# Bit flags stored in the 'flags' attribute of lines (and blocks).
FLAG_NONE = 0x0000
# Line uses a font smaller than the default size
# (set by mark_small_fonts() and replace_block_fonts()).
SMALL_FONT = 0x0001
# BIG_FONT = 0x0002 -> Unused
# Line belongs to a footer repeated on every page (set by mark_page_bottom()).
PAGE_BOTTOM = 0x0004
# The matching pdftohtml line mixes several fonts (set by guess_fonts()).
MANY_FONTS = 0x0010
# Line contains no word character at all (set by mark_small_fonts()).
IS_BULLET = 0x0020
# Line uses exactly the default font size (set by replace_block_fonts()).
DEFAULT_FONT_SIZE = 0x0040
# The font was found to precede (i.e. to title) a font of bigger size
# (set by guess_structure()).
TITLE_SMALLER_THAN_SUBTITLE = 0x0080


# A font seen on more than TITLE_MAX_LINES consecutive lines is not
# considered a title font (see guess_structure()).
TITLE_MAX_LINES = 2

TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never
                  # used for more than TITLE_MIN_CHAR characters per line
                  # is a kind of text styling and will take the next line's font

# Highest normalised Levenshtein score (distance / longest length) accepted
# by guess_fonts() when matching a pdftotext line with a pdftohtml line.
SIMILARITY_THRESHOLD = 1.0

# This one is a bit tricky: to detect the structure, we count how many times
# a given font is directly followed by another one (e.g. font 3 follows
# font 8 *2* times).
# If that count is too low (<= NB_SUCCESSION_FOR_SAME) we consider that 8 is
# not a title of 3, and that both are at the same level.
# Otherwise we consider that 8 is one level higher in the hierarchy of
# titles, subtitles, …
NB_SUCCESSION_FOR_SAME = 0

# Regex patterns of the usual (French) subscript / superscript suffixes,
# tested by is_ind_exp().
INDICES_EXPOSANTS_USUELS = [
  'er|ère|ere', # 1er, 1ère, …
  'nde?', # 2nd
  'i?[eè]me', # 3ème, 4ieme, …
  '°',
]


# +--------------------------------------------------------------+
# |                       get_pdftotext                          |
# +--------------------------------------------------------------+
def _tagged_children(parent, suffix):
    """Return the direct children of *parent* whose tag ends with *suffix*."""
    # Tags are compared by suffix so that an XML namespace prefix is ignored.
    return [child for child in parent if child.tag.endswith(suffix)]


def _bbox(element):
    """Return the bounding box attributes of *element* as a dict of floats."""
    return {
        'x_min': float(element.get('xMin')),
        'x_max': float(element.get('xMax')),
        'y_min': float(element.get('yMin')),
        'y_max': float(element.get('yMax')),
    }


def _parse_line(line):
    """Build the line dict (text, height, words, flags, bbox) of a <line>."""
    li = {
        'text': '',
        'height': float(line.get('yMax')) - float(line.get('yMin')),
        'words': [],
        'flags': FLAG_NONE,
    }
    # The bounding box is the one of the line itself, not of its block.
    li.update(_bbox(line))

    last_nbcar = 0  # Length of the previous word
    last_h = 0      # Last word height recorded by the branch below
    for word in _tagged_children(line, 'word'):
        word_text = word.text or ''  # An empty <word> has text None
        hword = float(word.get('yMax')) - float(word.get('yMin'))
        li['words'].append({'height': hword, 'text': word_text})

        # A height change right after a very short word is taken as a big
        # initial capital: the word is glued to the text without a space,
        # to avoid separating that capital letter from the beginning of
        # the title or paragraph.
        glue = False
        if (hword != last_h) and (last_nbcar < 2):
            last_h = hword
            glue = len(re.sub(r'\W', '', li['text'])) != 0
        if glue:
            li['text'] = "%s%s" % (li['text'], word_text)
        else:
            li['text'] = "%s %s" % (li['text'], word_text)
        li['text'] = li['text'].strip()
        last_nbcar = len(word_text)
    return li


def get_pdftotext(filename):
    """Extract the text blocks of a PDF with ``pdftotext -bbox-layout``.

    The extension of *filename* (if any) is replaced by ``.pdf`` to build the
    path given to the command.

    Returns a list of block dicts with keys 'page', 'flow', 'lines', 'flags',
    'x_min', 'x_max', 'y_min' and 'y_max'. Each entry of 'lines' is a dict
    with keys 'text', 'height', 'words', 'flags' and its own bounding box.
    An empty list is returned (after printing the command's error output)
    when pdftotext exits with a non-zero status.
    """
    basename = os.path.splitext(filename)[0]
    cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    o, e = proc.communicate()
    if proc.returncode != 0:
        print('-S-> Command pdftotext returned an error :')
        print('     '  + e.decode('utf8', errors='replace'))
        return []

    # Parse the XML output and walk body > doc > page > flow > block > line.
    root = ET.fromstring(o.decode('utf8'))

    page_num = 0
    flow_num = 0  # Numbered over the whole document, not per page
    blocks = []
    for body in _tagged_children(root, 'body'):
        for doc in _tagged_children(body, 'doc'):
            for page in _tagged_children(doc, 'page'):
                page_num += 1
                for flow in _tagged_children(page, 'flow'):
                    flow_num += 1
                    for bloc in _tagged_children(flow, 'block'):
                        bl = {'page': page_num, 'flow': flow_num,
                              'lines': [], 'flags': FLAG_NONE}
                        bl.update(_bbox(bloc))
                        for line in _tagged_children(bloc, 'line'):
                            bl['lines'].append(_parse_line(line))
                        blocks.append(bl)
    return blocks


# +--------------------------------------------------------------+
# |                       get_pdftohtml                          |
# +--------------------------------------------------------------+
def get_pdftohtml(filename):
    """Extract fonts and text segments of a PDF with ``pdftohtml -xml``.

    The extension of *filename* (if any) is replaced by ``.pdf`` to build the
    path given to the command.

    Returns a dict ``{'fonts': [...], 'segments': [...]}`` where:
      * each font is a dict with keys 'id', 'size', 'family', 'color' and
        'nb_cars' (number of characters of the segments using that font);
      * each segment is a dict with keys 'page', 'font', 'top', 'left',
        'width', 'height' and 'text' (stripped, never empty).
    Returns None (after printing the command's error output) when pdftohtml
    exits with a non-zero status.
    """
    basename = os.path.splitext(filename)[0]
    cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    o, e = proc.communicate()
    if proc.returncode != 0:
        print('-S-> Command pdftohtml returned an error :')
        print('     '  + e.decode('utf8', errors='replace'))
        return None

    # Parse the XML output: <page> elements hold <fontspec> and <text> tags.
    root = ET.fromstring(o.decode('utf8'))

    fontspec = []
    fonts_by_id = {}  # id -> first font declared with that id
    segments = []
    for page in root:
        if not page.tag.endswith('page'):
            continue
        pg = int(page.get('number'))
        for tg in page:
            if tg.tag.endswith('fontspec'):
                font = {
                    'id': int(tg.get('id')),
                    'size': int(tg.get('size')),
                    'family': tg.get('family'),
                    'color': tg.get('color'),
                    'nb_cars': 0
                }
                fontspec.append(font)
                fonts_by_id.setdefault(font['id'], font)
            elif tg.tag.endswith('text'):
                fnt = int(tg.get('font'))
                top = int(tg.get('top'))
                left = int(tg.get('left'))
                width = int(tg.get('width'))
                height = int(tg.get('height'))
                # Step into html style tags (like <b>, …) until some text
                # is found.
                node = tg
                while (node.text is None) and (len(node) > 0):
                    node = node[0]
                if node.text is None:
                    continue
                text = node.text.strip()
                if not text:
                    continue
                segments.append({'page': pg, 'font': fnt,
                    'top': top, 'left': left,
                    'width': width, 'height': height,
                    'text': text
                })
                # Credit the characters to the segment's font. A font id
                # that was never declared is ignored rather than credited
                # to an unrelated font.
                font = fonts_by_id.get(fnt)
                if font is not None:
                    font['nb_cars'] += len(text)
    return { 'fonts': fontspec, 'segments': segments }


# +--------------------------------------------------------------+
# |                   get_default_font_size                      |
# +--------------------------------------------------------------+
def get_default_font_size(fontspec):
    """Return the font size that carries the most characters.

    *fontspec* is a list of font dicts with at least 'size' and 'nb_cars'.
    Character counts are accumulated per size while walking the list; the
    size returned is the last one whose running total strictly exceeded
    every total seen before. When no total is ever positive (empty list, or
    only zero counts) the placeholder value 42 is returned.
    """
    chars_by_size = {}
    best_size = 42  # Placeholder, replaced as soon as a font has characters
    best_count = 0
    for font in fontspec:
        size = font['size']
        total = chars_by_size.get(size, 0) + font['nb_cars']
        chars_by_size[size] = total
        if total > best_count:
            best_count, best_size = total, size
    return best_size


# +--------------------------------------------------------------+
# |                      mark_small_fonts                        |
# +--------------------------------------------------------------+
# NOTE: also marks bullet lines (lines without any word character).
def mark_small_fonts(blocks, default_font_size):
    """Flag small-font lines and bullet lines, in place.

    A line gets SMALL_FONT when its rounded 'height' is below
    *default_font_size*, and IS_BULLET when its 'text' contains no word
    character at all.
    """
    every_line = (line for block in blocks for line in block['lines'])
    for line in every_line:
        if round(line['height']) < default_font_size:
            line['flags'] |= SMALL_FONT
        if not re.sub(r'\W','', line['text']):
            line['flags'] |= IS_BULLET


# +--------------------------------------------------------------+
# |                      mark_page_bottom                        |
# +--------------------------------------------------------------+
def mark_page_bottom(blocks):
    """Flag the footer lines repeated at the bottom of every page.

    Starting from the last line of each page and moving upwards, lines are
    flagged with PAGE_BOTTOM (in place) as long as they are the same on
    *all* pages. Only ASCII letters are compared, so a changing page number
    does not prevent the match. The scan stops at the first mismatch, or
    when the shortest page has no line left.

    Nothing is done for an empty list or a single-page document.
    """
    if not blocks or blocks[-1]['page'] == 1:
        return

    # Flatten the lines of each page, in reading order. A page starts each
    # time the page number changes between two consecutive blocks.
    pages = []
    current_page = None
    for block in blocks:
        if not pages or block['page'] != current_page:
            pages.append([])
            current_page = block['page']
        pages[-1].extend(block['lines'])
    if len(pages) < 2:
        return

    shortest = min(len(page_lines) for page_lines in pages)
    for depth in range(1, shortest + 1):
        candidates = [page_lines[-depth] for page_lines in pages]
        keys = set(re.sub(r'[^a-zA-Z]', '', line['text'])
                   for line in candidates)
        if len(keys) > 1:
            # At least one page differs: this is no longer the footer.
            break
        for line in candidates:
            line['flags'] |= PAGE_BOTTOM

# +--------------------------------------------------------------+
# |                         is_ind_exp                           |
# +--------------------------------------------------------------+
# Is it a usual subscript / superscript suffix (e.g. the "er" of "1er")?
def is_ind_exp(str):
    """Tell whether *str* is one of the usual subscript/superscript suffixes.

    The whole string must match one of the INDICES_EXPOSANTS_USUELS patterns
    ("er", "ème", "nd", "°", …). A longer text which merely starts with such
    a suffix (e.g. "erreur") is not accepted.

    Returns True or False.
    """
    # NOTE: the parameter keeps its historical name although it shadows the
    # builtin, so that keyword callers keep working.
    return any(re.fullmatch(pattern, str) is not None
               for pattern in INDICES_EXPOSANTS_USUELS)

# +--------------------------------------------------------------+
# |                         get_lines                            |
# +--------------------------------------------------------------+
# Extract lines from 'text' attribute returned by get_pdftohtml and associates
# a font id (and the page number), which is the font used by the higher number
# of characters of the line.
# Does a column splitting considering the value of LEFT_THRESHOLD
def _flush_line(lines, fonts_by_id, text, chars_by_font, page):
    """Append the line being built to *lines* and record its font pairs."""
    text = text.strip()
    if text:
        # The line's font is the one used by the most characters.
        best_font, best_count = -1, 0
        for font_id, count in chars_by_font.items():
            if count > best_count:
                best_font, best_count = font_id, count
        lines.append({'text': text,
                      'most_used_font': best_font,
                      'nb_fonts': len(chars_by_font),
                      'page': page})
    # Fonts found together on one line are recorded in each other's
    # 'same_line' list. Undeclared font ids are skipped.
    for id1 in chars_by_font:
        for id2 in chars_by_font:
            if id1 == id2:
                continue
            f1 = fonts_by_id.get(int(id1))
            f2 = fonts_by_id.get(int(id2))
            if f1 is None or f2 is None:
                continue
            if f2['id'] not in f1['same_line']:
                f1['same_line'].append(f2['id'])
                f2['same_line'].append(f1['id'])


def get_lines(segments, fontspec):
    """Group the pdftohtml text segments into lines.

    Two consecutive segments belong to the same line when they share the
    same 'top' and are at most LEFT_THRESHOLD apart horizontally (which
    also splits columns), or when the second one is a subscript/superscript
    suffix (see is_ind_exp()).

    Side effects: every segment gets a 'line' number, every font of
    *fontspec* gets a 'same_line' list holding the ids of the fonts seen on
    a same line as it.

    Returns a list of dicts with keys 'text', 'most_used_font' (id of the
    font used by the most characters of the line), 'nb_fonts' and 'page'.
    """
    fonts_by_id = {}
    for font in fontspec:
        font.setdefault('same_line', [])
        fonts_by_id.setdefault(font['id'], font)
    if not segments:
        return []

    # Pass 1: give a line number to every segment.
    last_top = -1
    line_no = -1
    last_right = 0
    for seg in segments:
        same_row = (seg['top'] == last_top) and \
                   ((seg['left'] - last_right) <= LEFT_THRESHOLD)
        if not same_row and not is_ind_exp(seg['text'].strip()):
            line_no += 1
            last_top = seg['top']
        seg['line'] = line_no
        last_right = seg['left'] + seg['width']

    # Pass 2: concatenate the segments of each line.
    lines = []
    text = ''
    chars_by_font = {}  # font id -> number of characters on current line
    current_line = None
    page_num = segments[0]['page']
    for seg in segments:
        seg_text = seg['text'].strip()
        if seg['line'] != current_line:
            # page_num is still the page of the previous segment, i.e. the
            # page of the line being flushed.
            _flush_line(lines, fonts_by_id, text, chars_by_font, page_num)
            text = seg_text
            chars_by_font = {seg['font']: len(seg_text)}
            current_line = seg['line']
        else:
            # Suffixes such as "er" are glued to the previous text.
            separator = '' if is_ind_exp(seg_text) else ' '
            text = "%s%s%s" % (text, separator, seg_text)
            chars_by_font[seg['font']] = \
                chars_by_font.get(seg['font'], 0) + len(seg_text)
        page_num = seg['page']
    # Do not forget the line still being built when the segments run out.
    _flush_line(lines, fonts_by_id, text, chars_by_font, page_num)
    return lines

# +--------------------------------------------------------------+
# |                        guess_fonts                           |
# +--------------------------------------------------------------+
# Tries to guess fontspec of each line into blocks list.
# It calculates the levenshtein distance with every segment of the same page
# and assigns the best matching score's font.
def guess_fonts(blocks, segments, fontspec):
    """Guess the font of every pdftotext line, in place.

    Each line of *blocks* is compared (Levenshtein distance) with the lines
    built by get_lines() for the same page; the font of the closest one is
    selected. A candidate is only kept when its distance is lower than the
    length of the line and its normalised score is at most
    SIMILARITY_THRESHOLD.

    Side effects:
      * every line gets 'font' (-1 when nothing matched), plus 'score',
        'dist' and 'line_no' for debugging purposes;
      * MANY_FONTS is set when the matched line mixes several fonts;
      * every font of *fontspec* gets 'nb_lines' and 'dist_sum' (and the
        'same_line' list added by get_lines()).
    """
    lines = get_lines(segments, fontspec)

    # Index the candidate lines by page number, so that a page without any
    # segment does not shift the lookup of the following pages.
    lines_by_page = {}
    for ndx, line in enumerate(lines):
        lines_by_page.setdefault(line['page'], []).append(ndx)

    fonts_by_id = {}
    for f in fontspec:
        f['nb_lines'] = 0
        f['dist_sum'] = 0
        fonts_by_id.setdefault(f['id'], f)

    for bl in blocks:
        candidates = lines_by_page.get(bl['page'], [])
        for l in bl['lines']:
            text = l['text']
            min_dist = len(text)
            min_score = 1.0
            font_sel = -1
            line_no = -1
            if text:
                for i in candidates:
                    other = lines[i]['text']
                    if not other:
                        continue
                    d = levenshtein(text, other)
                    if d == 0:
                        # Exact match: no need to look any further.
                        min_dist = 0
                        min_score = 0.0
                        font_sel = lines[i]['most_used_font']
                        line_no = i
                        break
                    score = float(d) / float(max(len(text), len(other)))
                    if (score <= SIMILARITY_THRESHOLD) and (d < min_dist):
                        min_dist = d
                        min_score = score
                        font_sel = lines[i]['most_used_font']
                        line_no = i
            # Lines without text also get a (negative) font, so that later
            # stages can read l['font'] safely.
            l['font'] = font_sel
            if font_sel >= 0:
                fnt = fonts_by_id.get(font_sel)
                if fnt is not None:
                    fnt['nb_lines'] += 1
                    fnt['dist_sum'] += min_dist
            l['score'] = min_score # For debugging purpose
            l['dist'] = min_dist   #    idem.
            l['line_no'] = line_no # idem. Stores the "similar line" number
            # Only a real match can tell whether several fonts are mixed.
            if (line_no >= 0) and (lines[line_no]['nb_fonts'] > 1):
                l['flags'] |= MANY_FONTS

# +--------------------------------------------------------------+
# |                    replace_block_fonts                       |
# +--------------------------------------------------------------+
# Adds a 'short_font' attribute to lines which gives another font value which
# doesn't care about style (bold, …).
# RK: def_size is default_font_size, used to mark SMALL_FONT flag.
def replace_block_fonts(blocks, fontspec, def_size):
    """Give every line a 'short_font' which ignores text styling.

    Fonts seen together on a same line (their 'same_line' lists) are taken
    as style variants (bold, …) of one another: the later ones in *fontspec*
    get a 'replaceWith' attribute pointing to the earliest one. Each line's
    'short_font' is then its font's 'replaceWith' when there is one, else
    its own 'font'.

    Lines whose font is known are also flagged SMALL_FONT when the size of
    their short font is below *def_size* (the default font size), and
    DEFAULT_FONT_SIZE when it is equal.

    Everything is updated in place; nothing is returned.
    """
    # Pass 1: chain every font to the first font it shares a line with.
    for i, base in enumerate(fontspec[:-1]):
        for other in fontspec[i+1:]:
            if other.get('replaceWith') is not None:
                continue
            if other['id'] in base.get('same_line', ()):
                if base.get('replaceWith') is None:
                    other['replaceWith'] = base['id']
                else:
                    other['replaceWith'] = base['replaceWith']

    fonts_by_id = {}
    for font in fontspec:
        fonts_by_id.setdefault(font['id'], font)

    # Pass 2: set short_font and the size flags of every line.
    for bl in blocks:
        for l in bl['lines']:
            # A line that never got a font is handled like an unmatched one.
            font_id = l.setdefault('font', -1)
            f = fonts_by_id.get(font_id) if font_id >= 0 else None
            if (f is None) or (f.get('replaceWith') is None):
                l['short_font'] = font_id
            else:
                l['short_font'] = f['replaceWith']
            if f is None:
                continue
            short = fonts_by_id.get(l['short_font'])
            if short is None:
                continue
            if short['size'] < def_size:
                l['flags'] |= SMALL_FONT
            if short['size'] == def_size:
                l['flags'] |= DEFAULT_FONT_SIZE


# +--------------------------------------------------------------+
# |                      guess_structure                         |
# +--------------------------------------------------------------+
def guess_structure(blocks, fontspec,
  remove_flags = SMALL_FONT | PAGE_BOTTOM | IS_BULLET):
    """Guess the title hierarchy and store it in each line's 'deep'.

    Lines having any of *remove_flags* set are ignored by the analysis.
    The other lines are reduced to the succession of their 'short_font';
    a font never seen on more than TITLE_MAX_LINES consecutive lines is a
    title candidate, and a title font is placed above the fonts it
    precedes. 'deep' is the resulting distance from the root level (0 is
    the top level; plain text gets the highest value). It may stay None
    for a title font that could not be linked to any level.

    Lines may also get the TITLE_SMALLER_THAN_SUBTITLE flag, and the
    'short_font' of bullet-like lines is replaced by the font of the text
    that follows them. Everything is updated in place.

    NOTE(review): fonts are used as list indexes, which assumes that font
    ids are 0..len(fontspec)-1 in fontspec order; id -1 (no font guessed)
    relies on negative indexing — to be confirmed.
    """
    t = [] # A list used here and there
    n = [] # Another one

    # Search for the most used font
    # Here, t will be used to count the number of cars of each font.
    #   and n will be used to store the maximum line size for each font.
    for i in range(len(fontspec)):
        t.append(0)
        n.append(0)
    nb_max = -1
    ndx_most_used = -1
    for bl in blocks:
        for l in bl['lines']:
            if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE):
                lon = len(l['text'].strip())
                t[l['short_font']] += lon
                if lon > n[l['short_font']]: n[l['short_font']] = lon
                if (t[l['short_font']] > nb_max):
                    nb_max = t[l['short_font']]
                    ndx_most_used = l['short_font']
    b = [nb <= TITLE_MIN_CHAR for nb in n]

    ### ndx_most_used is the most used font number.
    ### b[font_number] is True if the font seems used for bullets.

    t = [] # We'll use it to list the fonts succession
    n = [] # Used to count the number of lines
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if t == []:
                    t.append(l['short_font'])
                    n.append(1)
                else:
                    if (t[-1] != l['short_font']):
                        t.append(l['short_font'])
                        n.append(1)
                    else:
                        n[-1] += 1

    f = {} # Will contain used font numbers and number of occurences in t
    for i,j in zip(t,n):
        if i not in f.keys():
            f[i] = {'nb': 1, 'nl':j, 'maxl': j,
                    'is_bullet': b[i], 'flags': FLAG_NONE}
        else:
            f[i]['nb'] += 1
            f[i]['nl'] += j
            if (j > f[i]['maxl']):
                f[i]['maxl'] = j

    # A font seen on too many consecutive lines cannot be a title font.
    for i in f.keys():
        f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES)

    # Replace short_font for lines considered as bullets (or text styling).
    last_bullet_lines = []
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if f[l['short_font']]['is_bullet']:
                    last_bullet_lines.append(l)
                else:
                    if (len(last_bullet_lines) > 0):
                        for last in last_bullet_lines:
                            last['short_font'] = l['short_font']
                        last_bullet_lines = []
    # Trailing bullets have no following text: give them the main font.
    if (len(last_bullet_lines) > 0):
        for last in last_bullet_lines:
            last['short_font'] = ndx_most_used

    # n and b are not needed any more from here.

    # Rebuild the font succession list (is not optimized but is the safest)
    t = []
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if t == []: t.append(l['short_font'])
                else:
                    if (t[-1] != l['short_font']):
                        t.append(l['short_font'])

    b = [] # We'll do a 2d table with b[i][j] = number of transitions
           # from fonti to fontj (will be a tree of font transitions)
    for i in range(len(fontspec)+1): # Consider len+1 to have font number -1
        b.append([0 for j in range(len(fontspec)+1)])
    for i in range(len(t)-1):
        j = i+1
        if not f[t[i]]['isnt_title']:
            b[t[i]][t[j]] += 1

    # The document is considered not to end on a title: this is a way to
    # make sure everything gets visited below. (The original author marked
    # this step as still to be tested.)
    # When no line is eligible at all, t is empty and there is nothing to do.
    if t:
        f[t[-1]]['isnt_title'] = True

    # Create a deep attribute in f which contains distance from leaves
    for k,v in f.items():
        if v['isnt_title']:
            v['deep'] = 0
            v['nb_transitions'] = 999999999
        else: v['deep'] = None

    # Algorithm: in table b, the columns (k) of the fonts which already have
    #        a deep are scanned. A row index (i) with a non-zero b[i][k]
    #        means that font i precedes font k b[i][k] times.
    # Rk: an empty column for an index whose row is not empty is a root.
    #     An empty row for an index whose column is not empty is a leaf.
    # This is repeated as long as some value changes (not optimized, but
    # the table is not that big).
    has_changed = True
    deep_max = 0
    while has_changed:
        has_changed = False
        for k,v in f.items():
            if v['deep'] is not None:
                for i in range(-1,len(b)-1):
                    if b[i][k] == 0:
                        continue
                    if f[i]['deep'] is None:
                        # First link found for font i.
                        if (b[i][k] <= NB_SUCCESSION_FOR_SAME):
                            f[i]['deep'] = v['deep']
                        else:
                            f[i]['deep'] = v['deep'] + 1
                    elif f[i]['nb_transitions'] < b[i][k]:
                        # Font i precedes font k more often than the font it
                        # was attached to: attach it above k instead.
                        f[i]['deep'] = v['deep'] + 1
                    else:
                        continue
                    f[i]['nb_transitions'] = b[i][k]
                    # Keep deep_max in sync in both cases, otherwise the
                    # reversal below could yield negative depths.
                    if f[i]['deep'] > deep_max:
                        deep_max = f[i]['deep']
                    has_changed = True
                    if (fontspec[i]['size'] < fontspec[k]['size']):
                        f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE

    # Reverse deepness value, to make it distance from root
    for v in f.values():
        if (v['deep'] is not None):
            v['deep'] = deep_max - v['deep']

    # Add deep in blocks lines
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                l['deep'] = f[l['short_font']]['deep']
                if ((f[l['short_font']]['flags']) & TITLE_SMALLER_THAN_SUBTITLE != 0):
                   l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
            else:
                # Ignored lines are put at plain-text level.
                l['deep'] = deep_max


# +--------------------------------------------------------------+
# |                      print_block_list                        |
# +--------------------------------------------------------------+
def print_block_list(t, remove_flags = FLAG_NONE):
    """Print the blocks of *t* as Markdown on standard output.

    A rule and a ``*page N*`` line are printed whenever the page changes,
    and an empty line before each block. Lines having any of *remove_flags*
    set are skipped. For the others:
      * '!! ' is prepended when the line has no 'deep';
      * '> ' is prepended to SMALL_FONT lines;
      * lines flagged TITLE_SMALLER_THAN_SUBTITLE and using the default or
        a small font size are wrapped in '**';
      * otherwise lines above the deepest level get 'deep' + 1 '#' signs.
    Every line ends with two spaces.
    """
    hashes = "#############"

    # Deepest level found in the document, capped at 10.
    known_deeps = [l['deep'] for bl in t for l in bl['lines']
                   if l.get('deep') is not None]
    deep_max = min(max(known_deeps + [-1]), 10)

    current_page = -1
    for block in t:
        if block['page'] != current_page:
            if current_page > 0:
                print("")
            current_page = block['page']
            print("________________________________")
            print("*page %d*" % current_page)

        print("")

        for l in block['lines']:
            flags = l['flags']
            if (flags & remove_flags) != FLAG_NONE:
                continue
            deep = l.get('deep')
            suffix = '  '
            if deep is None:
                prefix = '!! '
            else:
                prefix = "> " if (flags & SMALL_FONT) != 0 else ''
                is_small_title = \
                    (flags & TITLE_SMALLER_THAN_SUBTITLE) != 0 and \
                    (flags & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0
                if is_small_title:
                    prefix = "%s**" % (prefix)
                    suffix = "**%s" % suffix
                elif deep < deep_max:
                    prefix = "%s%s " % (prefix, hashes[0:(deep+1)])
            print("%s%s%s" % (prefix, l['text'], suffix))


# +--------------------------------------------------------------+
# |                           main                               |
# +--------------------------------------------------------------+
# sys.argv[0] is the script itself: the PDF path is the second item.
if (len(sys.argv) < 2):
    print("-U-> Usage : python pdf2blocks.py <fichier_pdf>")
    sys.exit(-1)

blocks = get_pdftotext(sys.argv[1])
p2h = get_pdftohtml(sys.argv[1])
# Both extractions are needed: stop cleanly when one of them failed (its
# error has already been printed) or when the PDF holds no text at all.
if (not blocks) or (p2h is None) or (not p2h['segments']):
    sys.stderr.write("-S-> No text could be extracted from %s\n" % sys.argv[1])
    sys.exit(1)
fontspec = p2h['fonts']
segments = p2h['segments']

default_font_size = get_default_font_size(fontspec)
# mark_small_fonts(blocks, default_font_size)
mark_page_bottom(blocks)
guess_fonts(blocks, segments, fontspec)
replace_block_fonts(blocks, fontspec, default_font_size)
guess_structure(blocks, fontspec)
# Footers and bullet-only lines are left out of the Markdown output.
print_block_list(blocks, PAGE_BOTTOM | IS_BULLET)