Commit 585c0b0f authored by Bernard Stephan
Browse files

Mise à jour de l'algo et de son descriptif. Ajout de figure.

parent 5df6f37c
This diff is collapsed.
......@@ -9,7 +9,7 @@ import subprocess
from p2b_utils import levenshtein
### Script pour faire tout le corpus :
# D=/home/phan/Boulot/Ontology/BSV/tmp/Corpus/2019/GrandesCultures; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python pdf2blocks.py ${D}/$j > ${D}/${j}.html ; done
# D=/home/phan/Boulot/Ontology/BSV/tmp/Corpus/Tests/GrandesCultures; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python pdf2blocks.py ${D}/$j > ${D}/${j}.html ; done
CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
......@@ -106,9 +106,8 @@ NUMBERING_THRESHOLD = 0.3
# Below the TABLE_LENGTH value,
TABLE_LINE_SIZE = 25
TITLE_MAX_LINES = 2
TITLE_SIZE_MAX = 60 # Max size for titles in 1st phase
TITLE_SIZE_LIMIT = 140 # At the end, all title>TITLE_SIZE_LIMIT are marked <p>
TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never
# used for more than TITLE_MIN_CHAR characters per line
......@@ -135,7 +134,7 @@ INDICES_EXPOSANTS_USUELS = [
NUMBERING_REGEX = r'[0-9,\.\-\+\(\)%°±~ ]'
PUNCTUATION_REGEX = r'[\.\?\!]'
PUNCTUATION_REGEX = r'[\.\;\,\?\!]'
CAPTION_REGEX = ['cr[ée]dit photo', 'photo ?:']
......@@ -619,7 +618,8 @@ def expand_blocks(page_blocks, default_font_size):
# | sort_blocks |
# +--------------------------------------------------------------+
def sort_blocks(blocks, columns, default_font_size):
res = []
PRINT_SORT = False
ordered_blocks = []
nb_blocks = 0
for b in blocks:
b['treat'] = (b['class'] == BL_IGNORE \
......@@ -638,7 +638,7 @@ def sort_blocks(blocks, columns, default_font_size):
for b in blocks:
if b['class'] == BL_TOP_PAGE:
res.append(b)
ordered_blocks.append(b)
nb_blocks += 1
# curr_col stands for "current columns". It is designated by its index in the
......@@ -648,13 +648,15 @@ def sort_blocks(blocks, columns, default_font_size):
curr_col_min = 0
curr_col_max = 1
while len(res) != nb_blocks:
#&& print("===> %d / %d" % (len(res), len(blocks)))
while len(ordered_blocks) != nb_blocks:
if PRINT_SORT:
print("===> %d / %d" % (len(ordered_blocks), len(blocks)))
# On cherche la hauteur du plus haut des blocs dans la colonne
bl_top = []
while len(bl_top) == 0:
#&& print("* [%d,%d]" % (curr_col_min, curr_col_max))
if PRINT_SORT:
print("* [%d,%d]" % (curr_col_min, curr_col_max))
# Tant qu'on n'en trouve pas, on élargit les colonnes.
# Si on n'en trouvait toujours pas c'est qu'il y a un gros bug :
......@@ -688,19 +690,21 @@ def sort_blocks(blocks, columns, default_font_size):
for b in aligned_bl[1:]:
if b['x_min'] < selected_bl['x_min']: selected_bl = b
#&& print("{(%s, %s);(%s, %s)} %s" % (selected_bl['x_min'],
#&& selected_bl['x_max'], selected_bl['y_min'], selected_bl['y_max'],
#&& selected_bl['lines'][0]['text']))
if PRINT_SORT:
print("{(%s, %s);(%s, %s)} %s" % (selected_bl['x_min'],
selected_bl['x_max'], selected_bl['y_min'], selected_bl['y_max'],
selected_bl['lines'][0]['text']))
# On a le prochain bloc : (selected_bl)
# On a le prochain bloc candidat : (selected_bl)
# Cas de figure :
# 1. Il y a un bloc non traité qui commence +haut dans une colonne plus
# à gauche. C'est qu'on est passé un truc petit (centré?) qui nous
# a mis dans la mauvaise colonne. On va donc changer de colonne.
# 2. Le bloc trouvé ne chevauche pas la colonne suivante
# 1. (C) Il y a un bloc non traité qui commence +haut que le bas
# de selected_bl dans une colonne
# plus à gauche. C'est qu'on est passé un truc petit (centré?) qui
# nous a mis dans la mauvaise colonne. On va donc changer de colonne.
# 2. (A) Le bloc trouvé ne chevauche pas la colonne suivante
# Alors on l'ajoute et RAS, mais on met les colonnes en cours
# à son col_min.
# 3. Le bloc trouvé chevauche la colonne suivante ou
# 3. (B) Le bloc trouvé chevauche la colonne suivante ou
# il y a un bloc non traité plus haut en FONT_HUGE. Alors on déplace
# son temp_left dans la colonne suivante et on cherche s'il
# n'y a pas des blocs plus hauts dans la colonne suivante.
......@@ -710,41 +714,43 @@ def sort_blocks(blocks, columns, default_font_size):
b['y_min'] <= selected_bl['y_max']]
if len(higher_bl) > 0: # ← Case 1
#&& print("→ Case 1 : %s" % selected_bl['lines'][0]['text'])
if PRINT_SORT:
print("→ Case C : %s" % selected_bl['lines'][0]['text'])
# &&& À tester : col_max à 1 ?
curr_col_min = 0
curr_col_max = 1
# curr_col_max = len(columns) - 1
elif selected_bl['col_max'] > curr_col_max : # ← Case 3
#&& print("→ Case 3 : %s" % selected_bl['lines'][0]['text'])
if PRINT_SORT:
print("→ Case B : %s" % selected_bl['lines'][0]['text'])
selected_bl['temp_left'] += 1
curr_col_max += 1
curr_col_min += 1
else: # ← Case 2
#&& print("→ Case 2 : %s" % selected_bl['lines'][0]['text'])
if PRINT_SORT:
print("→ Case A : %s" % selected_bl['lines'][0]['text'])
curr_col_max = selected_bl['col_max']
curr_col_min = selected_bl['col_min']
res.append(selected_bl)
ordered_blocks.append(selected_bl)
selected_bl['treat'] = True
# - End loop : all blocks are in res.
# - End loop : all blocks are in ordered_blocks.
for b in blocks:
if b['class'] == BL_IGNORE:
res.append(b)
ordered_blocks.append(b)
for b in blocks:
if b['class'] == BL_BOTTOM_PAGE:
res.append(b)
ordered_blocks.append(b)
# Before leaving function
for b in blocks:
del b['treat']
del b['temp_left']
return res
return ordered_blocks
# +--------------------------------------------------------------+
......@@ -1178,11 +1184,11 @@ def guess_structure(blocks, fontspec, def_size):
last_block_font = -2
for p in pages.values():
for b in p['blocks']:
if b['class'] == BL_UNDEF and b['nb_lines'] <= TITLE_MAX_LINES and \
(b['font_class'] > FONT_DEFAULT \
or (b['font_class'] == FONT_DEFAULT and \
if b['class'] == BL_UNDEF and b['block_size'] <= TITLE_SIZE_MAX and \
re.search(PUNCTUATION_REGEX, b['last_character']) is None \
and b['font']['has_style']))\
and (b['font_class'] > FONT_DEFAULT \
or (b['font_class'] == FONT_DEFAULT \
and b['font']['has_style']))\
and not block_contains_keywords(b, LINK_REGEX):
if b['font']['id'] == last_block_font:
title_nb[-1] += 1
......@@ -1206,21 +1212,6 @@ def guess_structure(blocks, fontspec, def_size):
del(title_blocks[0])
break
# for n,f in zip(title_nb, title_block_fonts):
# print("==[%d]==> %s" % (n, f))
# We delete titles occurring only once, and titles having too many
# successive occurrences (let's say that more than TITLE_MAX_LINES
# occurrences is certainly not a title (that can be 4 lines)).
i = 0
while i < len(title_block_fonts):
if title_block_fonts[i]['nb_blocks'] == 1 or \
title_nb[i] > TITLE_MAX_LINES:
del(title_nb[i])
del(title_block_fonts[i])
del(title_blocks[i])
else:
i += 1
#print("====> %s" % title_block_fonts)
......@@ -1238,13 +1229,47 @@ def guess_structure(blocks, fontspec, def_size):
#print("======> %s" % title_font_sizes)
# On fait un tableau tel que profondeur(blks[i])=title_depth[i]
title_depth = []
for blks in title_blocks:
b = blks[0]
sz = b['font']['size']
if b['font']['has_style']: sz += 0.5
sz += 0.45 - min(0.45, 0.1 * float(round(b['font']['nb_blocks'] / 3)))
title_depth.append(title_font_sizes.index(sz))
#&& print("======> %s" % title_depth)
# Et maintenant on essaye d'optimiser ce tableau pour remonter
# le niveau des titres.
TITLE_OPTIMISE_LEVEL = True
if (TITLE_OPTIMISE_LEVEL):
preced = [[] for _ in range(len(title_font_sizes))]
if (len(title_depth) > 0):
last_depth = title_depth[0]
for d,p in zip(title_depth[1:], title_depth[:-1]):
if d > p and not p in preced[d]:
preced[d].append(p)
title_font_depth = []
thresh = def_size + FONT_THRESHOLDS[FONT_HUGE]
for p,s in zip(preced,title_font_sizes):
if len(p) > 0:
title_font_depth.append(max([title_font_depth[d] for d in p])+1)
else:
if s >= thresh or len(title_font_depth) == 0:
title_font_depth.append(0)
else:
title_font_depth.append(1)
for i in range(len(title_depth)):
title_depth[i] = min(title_font_depth[title_depth[i]],
len(TITLE_CLASSES)-1)
#&& print("------> %s" % title_depth)
#&& print("")
title_fonts = [] # To be used at the beginning of the prototype distance process
for blks,depth in zip(title_blocks, title_depth):
for b in blks:
sz = b['font']['size']
if b['font']['has_style']: sz += 0.5
sz += 0.45 - min(0.45, 0.1 * float(round(b['font']['nb_blocks'] / 3)))
depth = min(title_font_sizes.index(sz),
len(TITLE_CLASSES)-1)
if b['font'] not in title_fonts:
title_fonts.append(b['font'])
b['class'] = TITLE_CLASSES[depth]
adjust_block_prototypes(b)
......@@ -1259,6 +1284,25 @@ def guess_structure(blocks, fontspec, def_size):
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# - Now, we complete undefined blocks with closest prototypes.
# First, titles that are too long are marked as paragraphs, but we want
# them to be used for prototype definition. So we do an extra pass just for
# this case. We can also mark the other (non-title) blocks here: that will
# make the second pass shorter.
for p in pages.values():
for b in p['blocks']:
if b['class'] == BL_UNDEF:
ndx = get_closest_block_class(b)
if ndx["class"] in TITLE_CLASSES:
if b['block_size'] > TITLE_SIZE_LIMIT:
b['class'] = BL_PARAGRAPH
b['score'] = ndx["score"]
adjust_block_prototypes(b)
# else nothing : we'll test again other titles.
else : # not titles
b['class'] = ndx["class"]
b['score'] = ndx["score"]
# Titles haven't been marked yet, but some may be paragraphs.
for p in pages.values():
for b in p['blocks']:
if b['class'] == BL_UNDEF:
......@@ -1271,6 +1315,7 @@ def guess_structure(blocks, fontspec, def_size):
# Some adjustments :
## 1. A table cannot stand alone, so a lone table is marked MISC
## 2. Titles having size > TITLE_SIZE_LIMIT are marked paragraph
prev_table = 0
prev_block = None
for p in pages.values():
......@@ -1282,6 +1327,7 @@ def guess_structure(blocks, fontspec, def_size):
prev_block['class'] = BL_MISC
prev_table = 0
prev_block = b
return pages
# +--------------------------------------------------------------+
......@@ -1327,7 +1373,7 @@ def print_html(pages, fontspec):
BLOCK_TAGS = ['div', 'p', 'footer', 'header', 'figcaption', 'table',
'a', 'h1', 'h1', 'h2', 'h3', 'h4', 'h5',
'div', 'div', 'div', 'p', '']
'div', 'div', 'div', 'small', '']
BLOCK_ENDLINES = ['<br />', '', '<br />', '<br />', '', '',
'', '', '', '', '', '', '', '<br />', '<br />', '<br />', '',
'']
......@@ -1366,11 +1412,14 @@ def print_html(pages, fontspec):
if cl == BL_CAPTION:
pre = '<figure>'
post = '</figure>'
if cl == BL_MISC:
pre = '<p>'
post = '</p>'
if not DEBUG_PRINT:
txt = '%s%s<%s%s>' % (txt, pre, BLOCK_TAGS[cl], id_cl)
#&& txt = '%s%s<%s%s>(%s, %s)' % (txt, pre, BLOCK_TAGS[cl], id_cl, blocks[i]['x_min'],blocks[i]['x_max'])
elif blocks[i]['score'] is None:
txt = '%s%s<%s%s font="%d">' % (txt, pre, LOCK_TAGS[cl],
txt = '%s%s<%s%s font="%d">' % (txt, pre, BLOCK_TAGS[cl],
id_cl, blocks[i]['font']['id'])
else:
txt = '%s%s<%s%s font="%d" score="%f">' % (txt, pre,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment