From a9906b581d613b7b61555b753f30cc8e377205bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phan=20Bernard?= <stephan.bernard@inrae.fr> Date: Fri, 26 May 2023 17:19:25 +0200 Subject: [PATCH] =?UTF-8?q?Quelques=20corrections=20de=20bugs=20d=C3=A9tec?= =?UTF-8?q?t=C3=A9s=20lors=20de=20la=20conversion=20de=20l'ensemble=20des?= =?UTF-8?q?=20BSVs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/py/p2b_functions.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py index 813705f..4a0b276 100644 --- a/src/py/p2b_functions.py +++ b/src/py/p2b_functions.py @@ -378,7 +378,7 @@ pdftotext. But ghostscript is faster. try: #tl = camelot.read_pdf(pdf_file, backend="poppler", flavor='stream', pages="all") #tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", split_text=True) - tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all") + tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", shift_text=[''], strip_text='.\n') except: # e.g : "PyPDF2.errors.PdfReadError: Could not read malformed PDF file" pass # tl is [] so should return [] @@ -473,7 +473,7 @@ def mark_page_btotp(pages, ndx, increment, MARK): if len(pages) > 0: end = (abs(ndx) > min([len(p['blocks']) for p in pages.values()])) else: - end = 0 + end = 1 def mark_page_bottom(pages): mark_page_btotp(pages, -1, -1, BL_BOTTOM_PAGE) @@ -822,12 +822,15 @@ def sort_blocks(blocks, columns, default_font_size): if not b['treat']: nb_blocks += 1 - c = 1 - while columns[c] <= b['x_min']: c += 1 - b['col_min'] = c - 1 - while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 - b['col_max'] = c - # WARNING : if block in column 1, col_min ↠0 and col_max ↠1. + b['col_min'] = 0 + b['col_max'] = 1 + if len(columns) > 0: + c = 1 + while columns[c] < b['x_min']: c += 1 + b['col_min'] = c - 1 + while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 + b['col_max'] = c + # WARNING : if block in column 1, col_min ↠0 and col_max ↠1. b['temp_left'] = b['col_min'] for b in blocks: @@ -1365,9 +1368,12 @@ def guess_structure(blocks, fontspec, tables): p['blocks'] = sort_blocks(p['blocks'], col, default_font['size']) if not first_block_tagged: - bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE) - bl['flags'] |= FLAG_FIRST_BLOCK - first_block_tagged = True + try: + bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE) + bl['flags'] |= FLAG_FIRST_BLOCK + first_block_tagged = True + except : + pass # - End loop on pages # ...but sometimes it works better after blocks sorting @@ -1746,7 +1752,7 @@ def print_html(pages, fontspec, out=sys.stdout): if cl == BL_CAPTION: pre = '<figure num="%d" order="%d">' % (NUM_FIGURE, ORDER) NUM_FIGURE += 1 - ORDER += 1 + #ORDER += 1 post = '</figure>' if cl == BL_MISC: post = '</p>' @@ -1754,22 +1760,22 @@ def print_html(pages, fontspec, out=sys.stdout): txt = '%s<p num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) elif blocks[i]['score'] is None: txt = '%s<p font="%d" num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, blocks[i]['font']['id'], BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) else: txt = '%s<p font="%d" score="%f" num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, blocks[i]['font']['id'], blocks[i]['score'], BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]] += 1 BLOCK_NUM[BLOCK_TAGS[cl]] += 1 - ORDER += 2 + ORDER += 1 else: if not DEBUG_PRINT: txt = '%s%s<%s%s num="%d" order="%d">' % (txt, pre, -- GitLab