Commit 9a4a368d authored by Bernard Stephan's avatar Bernard Stephan
Browse files

Version légèrement améliorée, moins de <div> dans les sorties, et documentation de l'algo à jour.

parent c250fac8
......@@ -14,7 +14,7 @@ table {
}
.caption {
figcaption {
font-size: small;
font-style: italic;
color: #555555;
......@@ -30,7 +30,7 @@ table {
padding-left: 50px;
}
.page_top, .bottom_page {
header, footer {
font-family: sans;
font-size: small;
text-align: right;
......
This diff is collapsed.
......@@ -15,13 +15,14 @@ from p2b_utils import levenshtein
CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
DEBUG_PRINT = True
DEBUG_PRINT = False
PRINT_CSS = False
LEFT_THRESHOLD = 25 # In p2b_text_utils.add_lines() : the max horizontal space
# to consider aligned items to be on the same line.
HORIZONTAL_ALIGMENT_THRESHOLD = 10
VERTICAL_ALIGMENT_THRESHOLD = 8
HORIZONTAL_ALIGMENT_THRESHOLD = 8
VERTICAL_ALIGMENT_THRESHOLD = 14 # Avant, c'était 8
# En dessous de ces valeurs, un bloc n'est pas considéré comme
# représentatif pour définir une colonne.
......@@ -289,7 +290,7 @@ def get_pdftohtml(filename):
def get_default_font_size(fontspec):
sizes = {}
max_cars = 0
size_max_cars = 42 # Doesn't matter : it'll change
size_max_cars = 42 # Doesn't matter : it's going to change
for f in fontspec:
if sizes.get(f['size']) is None:
sizes[f['size']] = f['nb_cars']
......@@ -583,13 +584,41 @@ def get_columns(blocks, default_font_size):
return columns
# +--------------------------------------------------------------+
# | expand_blocks |
# +--------------------------------------------------------------+
# Make the big-fonted blocks (titles?) take the most space they can, growing
# right without intersecting another block.
# NOTE: this is to be applied on a list of blocks in the SAME PAGE
def expand_blocks(page_blocks, default_font_size):
    """Widen every big-fonted block of a page as far as its neighbours allow.

    A block whose font size is strictly greater than ``default_font_size``
    is stretched horizontally up to the page extent (smallest ``x_min`` and
    largest ``x_max`` found in ``page_blocks``), but stops at the facing edge
    of any block that overlaps it vertically and lies entirely to its left
    or to its right. Blocks are modified in place.

    All blocks must belong to the SAME PAGE.

    Args:
        page_blocks: list of block dicts; each one is read through its
            'x_min', 'x_max', 'y_min', 'y_max' keys and b['font']['size'].
        default_font_size: font size above which a block is expanded.

    Returns:
        None. The 'x_min' / 'x_max' of the expanded blocks are updated.
    """
    # Nothing to do (and min()/max() would raise) on an empty page.
    if not page_blocks:
        return

    # Horizontal extent of the page: the widest a block may become.
    page_left = min(b['x_min'] for b in page_blocks)
    page_right = max(b['x_max'] for b in page_blocks)

    big_blocks = [b for b in page_blocks
                  if b['font']['size'] > default_font_size]

    # Compute every new extent from the ORIGINAL geometry first, so that
    # expanding one block does not influence the expansion of another.
    new_bounds = []
    for block in big_blocks:
        left, right = page_left, page_right
        for other in page_blocks:
            overlaps_vertically = (other['y_min'] < block['y_max']
                                   and other['y_max'] > block['y_min'])
            if not overlaps_vertically:
                continue
            # Neighbour on the right: do not grow past its left edge.
            if other['x_min'] >= block['x_max'] and right > other['x_min']:
                right = other['x_min']
            # Neighbour on the left: do not grow past its right edge.
            if other['x_max'] <= block['x_min'] and left < other['x_max']:
                left = other['x_max']
        new_bounds.append((left, right))

    for block, (left, right) in zip(big_blocks, new_bounds):
        block['x_min'] = left
        block['x_max'] = right
# +--------------------------------------------------------------+
# | sort_blocks |
# +--------------------------------------------------------------+
def sort_blocks(blocks, columns, default_font_size):
HUGE_RESIZE_LEFT = False # If FONT_HUGE, resize block as large as possible.
# But resize to left may not be a good idea
# Set this to True or False and test results.
res = []
nb_blocks = 0
for b in blocks:
......@@ -602,29 +631,11 @@ def sort_blocks(blocks, columns, default_font_size):
c = 1
while columns[c] <= b['x_min']: c += 1
b['col_min'] = c - 1
while columns[c] < b['x_max'] - VERTICAL_ALIGMENT_THRESHOLD: c += 1
while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1
b['col_max'] = c
# ATTENTION : if block in column 1, col_min ← 0 and col_max ← 1.
# WARNING : if block in column 1, col_min ← 0 and col_max ← 1.
b['temp_left'] = b['col_min']
# HUGE_FONT blocks are resized the largest possible.
for b in blocks:
if not b['treat']:
if b['font_class'] == FONT_HUGE:
oth_bl = [bl for bl in blocks if (not bl['treat']) and bl != b \
and bl['y_min'] <= b['y_max'] and bl['y_max'] >= b['y_min']]
if len(oth_bl) == 0:
if HUGE_RESIZE_LEFT:
b['col_min'] = b['temp_left'] = 0
b['col_max'] = len(columns) - 1
else :
if HUGE_RESIZE_LEFT:
while len([bl for bl in oth_bl if bl['col_max'] < b['col_min']]) != 0:
b['col_min'] -= 1
b['temp_left'] = b['col_min']
while len([bl for bl in oth_bl if bl['col_min'] > b['col_max']]) != 0:
b['col_max'] += 1
for b in blocks:
if b['class'] == BL_TOP_PAGE:
res.append(b)
......@@ -635,7 +646,7 @@ def sort_blocks(blocks, columns, default_font_size):
# "we're looking for blocks contained into columns 0 to 2" (there, min is 0
# and max is 2).
curr_col_min = 0
curr_col_max = len(columns) - 1
curr_col_max = 1
while len(res) != nb_blocks:
#&& print("===> %d / %d" % (len(res), len(blocks)))
......@@ -647,7 +658,10 @@ def sort_blocks(blocks, columns, default_font_size):
# Tant qu'on n'en trouve pas, on élargit les colonnes.
# Si on n'en trouvait toujours pas c'est qu'il y a un gros bug :
# donc on choisit de faire planter
# donc on choisit de faire planter. PS : ça n'est jamais arrivé
# dans le corpus.
# WARNING : bl_top is badly named. It doesn't contain the top blocks
# but all blocks in current column(s)
bl_top = [b for b in blocks if not b['treat'] and \
b['temp_left'] >= curr_col_min and \
b['temp_left'] < curr_col_max]
......@@ -663,13 +677,15 @@ def sort_blocks(blocks, columns, default_font_size):
y_min = min([b['y_min'] for b in bl_top])
y_max = max([b['y_max'] for b in bl_top if b['y_min'] == y_min])
#&& print("----- %d %f %f" % (len(bl_top), y_min, y_max))
## y a-t-il plusieurs blocs alignés ? Si oui, on prend le plus à gauche.
aligned_bl = [b for b in blocks if not b['treat'] and \
b['temp_left'] >= curr_col_min and \
b['temp_left'] < curr_col_max and \
b['y_min'] >= y_min - VERTICAL_ALIGMENT_THRESHOLD and \
b['y_min'] < y_max]
#abs(b['y_min'] - y_min) <= VERTICAL_ALIGMENT_THRESHOLD]
selected_bl = aligned_bl[0]
for b in aligned_bl[1:]:
if b['x_min'] < selected_bl['x_min']: selected_bl = b
......@@ -697,15 +713,19 @@ def sort_blocks(blocks, columns, default_font_size):
if len(higher_bl) > 0: # ← Case 1
#&& print("→ Case 1 : %s" % selected_bl['lines'][0]['text'])
# &&& À tester : col_max à 1 ?
curr_col_min = 0
curr_col_max = len(columns) - 1
elif selected_bl['col_max'] > curr_col_max : # ← Case 3
#&& print("→ Case 3 : %s" % selected_bl['lines'][0]['text'])
selected_bl['temp_left'] += 1
curr_col_max += 1
curr_col_min += 1
else: # ← Case 2
#&& print("→ Case 2 : %s" % selected_bl['lines'][0]['text'])
curr_col_max = selected_bl['col_max']
curr_col_min = selected_bl['col_min']
res.append(selected_bl)
......@@ -988,7 +1008,6 @@ def guess_structure(blocks, fontspec, def_size):
bl['font']['has_style'] = (len(bl['font']['family'].split(',')) > 1)
# Alignment
if len(bl['lines']) <= 1:
bl['alignment'] = ALIGN_UNDEF
......@@ -1036,9 +1055,10 @@ def guess_structure(blocks, fontspec, def_size):
# Reorder blocks
first_block_tagged = False
for p in pages.values():
expand_blocks(p['blocks'], default_font['size'])
col = get_columns(p['blocks'], default_font['size'])
col.append(max(b['x_max'] for b in p['blocks']))
#&& print("================================================== %s" % col)
#print("================================================== %s" % col)
p['blocks'] = sort_blocks(p['blocks'], col, default_font['size'])
#print(col)
if not first_block_tagged:
......@@ -1069,8 +1089,6 @@ def guess_structure(blocks, fontspec, def_size):
for bl in p['blocks']:
if bl['class'] == BL_UNDEF:
t = "".join([t['text'] for t in bl['lines']])
# print("====> %s" % t)
# print(" --> [%s]" % re.sub(r'\W','',t).strip())
if len(re.sub(r'\W','',t).strip()) == 0:
bl['class'] = BL_IGNORE
......@@ -1114,10 +1132,6 @@ def guess_structure(blocks, fontspec, def_size):
adjust_block_prototypes(bl)
for inc in [-1, 1]:
j = i+inc
#&& while j >= 0 and j < len(p['blocks']) and \
#&& p['blocks'][j]['class'] == BL_UNDEF and \
#&& p['blocks'][j]['font']['size'] == bl['font']['size'] and \
#&& p['blocks'][j]['max_line_size'] <= TABLE_LINE_SIZE:
while j >= 0 and j < len(p['blocks']) and \
p['blocks'][j]['class'] == BL_UNDEF and \
p['blocks'][j]['font_class'] <= FONT_DEFAULT and \
......@@ -1289,8 +1303,9 @@ def print_html(pages, fontspec):
print('<!DOCTYPE html>')
print('<html lang="fr">')
print('<head><meta charset="utf-8">')
print('<link rel="stylesheet" href="http://ontology.inrae.fr/bsv/html/bsv.css" />')
print('<link rel="stylesheet" href="bsv.css" />')
if PRINT_CSS:
print('<link rel="stylesheet" href="http://ontology.inrae.fr/bsv/html/bsv.css" />')
print('<link rel="stylesheet" href="bsv.css" />')
if DEBUG_PRINT:
print("<!-- Fonts :")
......@@ -1315,9 +1330,9 @@ def print_html(pages, fontspec):
print("</head>")
print("<body>")
BLOCK_TAGS = ['div', 'p', 'div', 'div', 'div', 'table',
BLOCK_TAGS = ['div', 'p', 'footer', 'header', 'figcaption', 'table',
'a', 'div', 'h1', 'h2', 'h3', 'h4', 'h5',
'div', 'div', 'div', 'div', '']
'div', 'div', 'div', 'p', '']
BLOCK_ENDLINES = ['<br />', '', '<br />', '<br />', '<br />', '',
'', '', '', '', '', '', '', '<br />', '<br />', '<br />', '<br />',
'<br />']
......@@ -1346,15 +1361,24 @@ def print_html(pages, fontspec):
print('</table>')
if i < len(blocks):
i -= 1
elif cl != BL_IGNORE:
pre = post = ''
if PRINT_CSS:
id_cl = ' class="%s"' % BLOCKS_CLASSES[cl]
else:
id_cl = ''
if cl == BL_CAPTION:
pre = '<figure>'
post = '</figure>'
if not DEBUG_PRINT:
print('<%s class="%s">' % (BLOCK_TAGS[cl], BLOCKS_CLASSES[cl]))
print('%s<%s%s>' % (pre, BLOCK_TAGS[cl], id_cl))
elif blocks[i]['score'] is None:
print('<%s class="%s" font="%d">' % (
BLOCK_TAGS[cl], BLOCKS_CLASSES[cl], blocks[i]['font']['id']))
print('%s<%s%s font="%d">' % (
pre, BLOCK_TAGS[cl], id_cl, blocks[i]['font']['id']))
else:
print('<%s class="%s" font="%d" score="%f">' % (
BLOCK_TAGS[cl], BLOCKS_CLASSES[cl],
print('%s<%s%s font="%d" score="%f">' % (pre,
BLOCK_TAGS[cl], id_cl,
blocks[i]['font']['id'], blocks[i]['score']))
for l in blocks[i]['lines']:
......@@ -1364,16 +1388,12 @@ def print_html(pages, fontspec):
print(" <br />%s" % l['text'])
else:
print(" %s%s" % (l['text'], BLOCK_ENDLINES[cl]))
print("</%s>" % BLOCK_TAGS[cl])
print("</%s>%s" % (BLOCK_TAGS[cl], post))
else: # cl == BL_IGNORE:
print("<!--")
[print(" %s" % l['text']) for l in blocks[i]['lines']]
print("-->")
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
i += 1
print("</body>")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment