Commit 5e2d0575 authored by Stéphan Bernard's avatar Stéphan Bernard
Browse files

Version 1, actuellement utilisée (10/2021), avant création d'une nouvelle...

Version 1, actuellement utilisée (10/2021), avant création d'une nouvelle branche pour une version 2.
parent efb6f235
......@@ -143,6 +143,9 @@ TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font neve
# used for more than TITLE_MIN_CHAR characters per line
# is a kind of text styling and will take the next line's font
TITLE_MAX_LINES = 2 # a paragraph having more than TITLE_MAX_LINES lines
# is surely not a title.
# SIMILARITY THRESHOLD : For line similarity (to assign font).
# 1.0 : no threshold, 0.0 : no similarity (score 0.0 is perfect match)
SIMILARITY_THRESHOLD = 1.0
......
......@@ -13,7 +13,6 @@ from p2b_config import *
# +--------------------------------------------------------------+
# | is_ind_exp |
# +--------------------------------------------------------------+
......@@ -630,42 +629,227 @@ def get_columns(blocks, default_font_size):
def _keep_nearest(block, side, dist, other):
    """Store `other` as neighbour of `block` on `side` if it is the closest so far.

    A stored distance of -1 means "no neighbour recorded yet". When two
    candidates are at the same distance, the first one encountered is kept.
    """
    current = block[side]
    if current < 0 or current > dist:
        block[side] = dist
        block[side + '_block'] = other


def compute_lrud(page_blocks):
    """Compute, for every block of a page, its nearest neighbour on each side.

    Each block is a dict holding its bounding box in 'x_min', 'x_max',
    'y_min' and 'y_max' ('y_min' is the upper edge: a block whose 'y_min'
    is greater than or equal to another's 'y_max' is considered under it).

    The blocks are updated in place:
      * 'left', 'right', 'top', 'down' receive the distance to the nearest
        neighbour on that side, or -1 when there is none;
      * 'left_block', 'right_block', 'top_block', 'down_block' receive the
        neighbour itself. These keys are only written when a neighbour
        exists, so always test the distance before reading them.

    Left/right neighbours must strictly overlap the block's vertical
    extent; top/down neighbours must overlap (or touch) its horizontal
    extent.

    Returns None.
    """
    for b in page_blocks:
        b['left'] = b['right'] = b['top'] = b['down'] = -1
    for b in page_blocks:
        for ob in page_blocks:  # ob: other block
            if ob is b:
                continue
            # Strict overlap of the vertical extents: ob lies on the same "row".
            if ob['y_min'] < b['y_max'] and ob['y_max'] > b['y_min']:
                if ob['x_min'] >= b['x_max']:  # ob on the right
                    _keep_nearest(b, 'right', ob['x_min'] - b['x_max'], ob)
                # NOTE(review): strict '<' here while the right-hand test uses
                # '>=': a block exactly touching b's left edge is not recorded
                # as its left neighbour. Kept as is.
                if ob['x_max'] < b['x_min']:  # ob on the left
                    _keep_nearest(b, 'left', b['x_min'] - ob['x_max'], ob)
            # Overlap (edges included) of the horizontal extents: same "column".
            if ob['x_min'] <= b['x_max'] and ob['x_max'] >= b['x_min']:
                if ob['y_min'] >= b['y_max']:  # ob under b
                    _keep_nearest(b, 'down', ob['y_min'] - b['y_max'], ob)
                if ob['y_max'] < b['y_min']:  # ob on top of b
                    _keep_nearest(b, 'top', b['y_min'] - ob['y_max'], ob)
# +--------------------------------------------------------------+
# | table_detection |
# +--------------------------------------------------------------+
# A good test :
# python -u pdf2blocks.py ../../../../CorpusTESTs/corpus-alea/bsv/BSV_GRANDES_CULTURES_2018_No38_cle0b961f.pdf
# table_detection tries to find tables within a page.
# It uses the left/right/top/down neighbours computed by compute_lrud().
# It looks for a succession of blocks on the same line that have
# different blocks on 'top' or 'down', which distinguishes a table from badly
# justified text (sharing the same top or down). It cannot detect multi-column cells.
def _table_first_row(start):
    """Return `start` followed by its chain of right-hand BL_UNDEF neighbours."""
    row = [start]
    cur = start
    while cur['right'] > 0:
        cur = cur['right_block']
        if cur['class'] != BL_UNDEF:
            break
        row.append(cur)
    return row


def _table_next_row(prev_row):
    """Return the row lying directly under `prev_row`, or None if it is not a table row.

    The candidate row starts at the down-neighbour of the first cell of
    `prev_row` and is followed to the right. It is accepted only if it has
    one BL_UNDEF cell under every cell of `prev_row`, each being the
    down-neighbour of the cell above it. The caller must guarantee that
    prev_row[0] has a down-neighbour.
    """
    cell = prev_row[0]['down_block']
    if cell['class'] != BL_UNDEF:
        return None
    row = [cell]
    for above in prev_row[1:]:
        if cell['right'] <= 0:
            return None  # candidate row is shorter than the previous one
        cell = cell['right_block']
        # Identity, not equality: blocks are dicts pointing at each other.
        if not (above['down'] > 0 and above['down_block'] is cell
                and cell['class'] == BL_UNDEF):
            return None
        row.append(cell)
    return row


def _table_rows_html(rows_of_texts):
    """Render rows of cell texts as <tr>/<td> markup (without the <table> tag)."""
    parts = []
    for texts in rows_of_texts:
        parts.append(' <tr>\n')
        # text_tr is what print_html applies to cell text in its own table
        # branch; presumably it makes the text HTML-safe -- TODO confirm.
        parts.extend(' <td>%s</td>\n' % text_tr(text) for text in texts)
        parts.append(' </tr>\n')
    return ''.join(parts)


def table_detection(page_blocks):
    """Detect tables among the blocks of one page and merge each into one block.

    Relies on the 'left'/'right'/'top'/'down' distances and the matching
    '*_block' neighbours written by compute_lrud().

    Starting from every BL_UNDEF block that has both a right and a down
    neighbour, a first row is built by following right-neighbours, then
    further rows are added by following down-neighbours as long as each
    row lines up cell by cell with the previous one. Two shapes are
    accepted as tables:
      * at least two rows of at least two cells;
      * a single row of at least two cells that all have the same number
        (> 1) of lines; each line then becomes a table row.

    When a table is found, the starting block `b` becomes the table: its
    bounding box is enlarged to cover all cells, its class is set to
    BL_TABLE and its 'html' key receives the <tr>/<td> markup. The other
    cells are set to BL_IGNORE, and neighbour pointers of blocks outside
    the table that targeted a cell are redirected to `b`.

    Multi-column cells are not detected. Returns None.
    """
    for b in page_blocks:
        if not (b['class'] == BL_UNDEF and b['right'] > 0 and b['down'] > 0):
            continue

        rows = [_table_first_row(b)]
        growing = len(rows[0]) > 1
        while growing:
            candidate = _table_next_row(rows[-1])
            if candidate is None:
                break
            rows.append(candidate)
            # Go further down only if every cell of the new row has a
            # down-neighbour.
            growing = all(cell['down'] > 0 for cell in candidate)

        html = None
        if len(rows) > 1 and len(rows[0]) > 1:
            # Grid of blocks: one cell per block.
            # NOTE(review): only the first line of each block is used as
            # cell text; any further lines of a block are not rendered.
            html = _table_rows_html(
                [[cell['lines'][0]['text'] for cell in row] for row in rows])
        if len(rows) == 1 and len(rows[0]) > 1:
            line_counts = [len(cell['lines']) for cell in rows[0]]
            # The "> 1" lower bound could/should be a parameter.
            if min(line_counts) == max(line_counts) and min(line_counts) > 1:
                # Side-by-side blocks with the same number of lines:
                # one table row per line.
                html = _table_rows_html(
                    [[cell['lines'][i]['text'] for cell in rows[0]]
                     for i in range(line_counts[0])])
        if html is None:
            continue

        cells = [cell for row in rows for cell in row]
        for cell in cells:
            # Prototypes are adjusted with the block seen as a table, then
            # the cell is hidden; `b` is re-marked as the table below.
            cell['class'] = BL_TABLE
            adjust_block_prototypes(cell)
            cell['class'] = BL_IGNORE
        b['x_min'] = min(cell['x_min'] for cell in cells)
        b['y_min'] = min(cell['y_min'] for cell in cells)
        b['x_max'] = max(cell['x_max'] for cell in cells)
        b['y_max'] = max(cell['y_max'] for cell in cells)
        b['class'] = BL_TABLE
        b['html'] = html

        # Redirect neighbour pointers of outside blocks to the merged block.
        # Membership is tested on identity: comparing the dicts by value
        # would walk their mutually referencing '*_block' entries.
        cell_ids = {id(cell) for cell in cells}
        for other in page_blocks:
            if id(other) in cell_ids:
                continue
            for side in ('left', 'right', 'top', 'down'):
                key = '%s_block' % side
                if other[side] > 0 and id(other[key]) in cell_ids:
                    other[key] = b
# +--------------------------------------------------------------+
......@@ -710,7 +894,8 @@ def sort_blocks(blocks, columns, default_font_size):
for b in blocks:
b['treat'] = (b['class'] == BL_IGNORE \
or b['class'] == BL_BOTTOM_PAGE \
or b['class'] == BL_TOP_PAGE)
or b['class'] == BL_TOP_PAGE \
or b['class'] == BL_TABLE)
if not b['treat']:
nb_blocks += 1
......@@ -1170,6 +1355,10 @@ def guess_structure(blocks, fontspec, def_size):
# Prototypes are not adjusted, because vertical text
# is too different and should need different treatments.
# Tables should be easy to detect
for p in pages.values():
table_detection(p['blocks'])
# Ignore blocks containing no character or number
for p in pages.values():
for bl in p['blocks']:
......@@ -1262,6 +1451,7 @@ def guess_structure(blocks, fontspec, def_size):
title_block_fonts = []
title_nb = []
title_blocks = []
title_nb_other = [0]
last_block_font = -2
for p in pages.values():
for b in p['blocks']:
......@@ -1269,16 +1459,31 @@ def guess_structure(blocks, fontspec, def_size):
re.search(PUNCTUATION_REGEX, b['last_character']) is None \
and (b['font_class'] > FONT_DEFAULT \
or (b['font_class'] == FONT_DEFAULT \
and b['font']['has_style']))\
and b['font']['has_style'])) \
and not block_contains_keywords(b, LINK_REGEX):
if b['font']['id'] == last_block_font:
title_nb[-1] += 1
title_nb[-1] += 1 #len(b['lines'])
title_blocks[-1].append(b)
else:
title_block_fonts.append(b['font'])
title_nb.append(1)
title_nb_other.append(0)
title_blocks.append([b])
last_block_font = b['font']['id']
else:
title_nb_other[-1] += 1
## We remove elements having too many successive lines
del title_nb_other[0] # The 1st element has no mean
i = 0
while i < len(title_nb):
if title_nb[i] > TITLE_MAX_LINES and title_nb_other[i] == 0:
del title_block_fonts[i]
del title_blocks[i]
del title_nb[i]
del title_nb_other[i]
else:
i += 1
## document title
if len(title_block_fonts) > 0 and \
......@@ -1351,33 +1556,61 @@ def guess_structure(blocks, fontspec, def_size):
# First, too long titles are marked as paragraphs, but we want this
# to be used for prototype definition. So we do an extra pass just for this
# case. But we can also mark others : that'll make 2 pass shorter.
title_stack = []
score_stack = []
prev_class = BL_UNDEF
for p in pages.values():
for b in p['blocks']:
if b['class'] == BL_UNDEF:
ndx = get_closest_block_class(b)
if ndx["class"] in TITLE_CLASSES:
if b['block_size'] > TITLE_SIZE_LIMIT:
if b['block_size'] > TITLE_SIZE_LIMIT or \
len(b['lines']) >= TITLE_MAX_LINES:
b['class'] = BL_PARAGRAPH
b['score'] = ndx["score"]
adjust_block_prototypes(b)
# else nothing : we'll test again other titles.
if prev_class == ndx["class"]:
title_stack.append(b)
score_stack.append(ndx["score"])
else:
if len(title_stack) > TITLE_MAX_LINES:
for bk,s in zip(title_stack,score_stack):
bk['class'] = BL_PARAGRAPH
bk['score'] = s
title_stack = []
score_stack = []
prev_class = ndx["class"]
else : # not titles
b['class'] = ndx["class"]
b['score'] = ndx["score"]
# Titles haven't been marked yet, but some may be paragraphs.
if len(title_stack) > TITLE_MAX_LINES:
for bk,s in zip(title_stack,score_stack):
bk['class'] = BL_PARAGRAPH
bk['score'] = s
title_stack = []
score_stack = []
# Titles haven't been marked yet.
for p in pages.values():
for b in p['blocks']:
if b['class'] == BL_UNDEF:
ndx = get_closest_block_class(b)
b['class'] = ndx["class"]
b['score'] = ndx["score"]
if ndx["class"] in TITLE_CLASSES and \
len(b['lines']) >= TITLE_MAX_LINES:
b['class'] = BL_PARAGRAPH
b['score'] = ndx["score"]
else:
b['class'] = ndx["class"]
b['score'] = ndx["score"]
adjust_block_prototypes(b)
else:
b['score'] = None
# Some adjustments :
## 1. Table cannot be alone. So an alone table will be MISC
## 1. Table cannot be alone, except for table having 'html' (result of
## table_detection). So an alone table having no 'html' will be MISC.
## 2. Titles having size > TITLE_SIZE_LIMIT are marked paragraph
prev_table = 0
prev_block = None
......@@ -1387,7 +1620,8 @@ def guess_structure(blocks, fontspec, def_size):
prev_table += 1
else:
if prev_table == 1:
prev_block['class'] = BL_MISC
if not prev_block.get('html'):
prev_block['class'] = BL_MISC
prev_table = 0
prev_block = b
......@@ -1463,17 +1697,20 @@ def print_html(pages, fontspec, out=sys.stdout):
cl = blocks[i]['class']
if cl == BL_TABLE:
if not DEBUG_PRINT:
print('<table>', file=out)
print('<table border="1">', file=out)
elif blocks[i]['score'] is None:
print('<table font="%d">' % blocks[i]['font']['id'], file=out)
print('<table border="1" font="%d">' % blocks[i]['font']['id'], file=out)
else:
print('<table font="%d" score="%f">' % (
print('<table border="1" font="%d" score="%f">' % (
blocks[i]['font']['id'], blocks[i]['score']), file=out)
while i < len(blocks) and blocks[i]['class'] == BL_TABLE:
print(' <tr>', file=out)
for l in blocks[i]['lines']:
print(" <td>%s</td>" % text_tr(l['text']), file=out)
print(' </tr>', file=out)
if blocks[i].get('html'):
print(blocks[i]['html'], file=out)
else:
print(' <tr>', file=out)
for l in blocks[i]['lines']:
print(" <td>%s</td>" % text_tr(l['text']), file=out)
print(' </tr>', file=out)
i += 1
print('</table>', file=out)
if i < len(blocks):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment