diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py index 813705f5dadd7d110082a67a34936c2212c252fa..4a0b27672d2efa9e91080af45e9310d1842ceca9 100644 --- a/src/py/p2b_functions.py +++ b/src/py/p2b_functions.py @@ -378,7 +378,7 @@ pdftotext. But ghostscript is faster. try: #tl = camelot.read_pdf(pdf_file, backend="poppler", flavor='stream', pages="all") #tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", split_text=True) - tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all") + tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", shift_text=[''], strip_text='.\n') except: # e.g : "PyPDF2.errors.PdfReadError: Could not read malformed PDF file" pass # tl is [] so should return [] @@ -473,7 +473,7 @@ def mark_page_btotp(pages, ndx, increment, MARK): if len(pages) > 0: end = (abs(ndx) > min([len(p['blocks']) for p in pages.values()])) else: - end = 0 + end = 1 def mark_page_bottom(pages): mark_page_btotp(pages, -1, -1, BL_BOTTOM_PAGE) @@ -822,12 +822,15 @@ def sort_blocks(blocks, columns, default_font_size): if not b['treat']: nb_blocks += 1 - c = 1 - while columns[c] <= b['x_min']: c += 1 - b['col_min'] = c - 1 - while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 - b['col_max'] = c - # WARNING : if block in column 1, col_min ↠0 and col_max ↠1. + b['col_min'] = 0 + b['col_max'] = 1 + if len(columns) > 0: + c = 1 + while columns[c] < b['x_min']: c += 1 + b['col_min'] = c - 1 + while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 + b['col_max'] = c + # WARNING : if block in column 1, col_min ↠0 and col_max ↠1. b['temp_left'] = b['col_min'] for b in blocks: @@ -1365,9 +1368,12 @@ def guess_structure(blocks, fontspec, tables): p['blocks'] = sort_blocks(p['blocks'], col, default_font['size']) if not first_block_tagged: - bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE) - bl['flags'] |= FLAG_FIRST_BLOCK - first_block_tagged = True + try: + bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE) + bl['flags'] |= FLAG_FIRST_BLOCK + first_block_tagged = True + except : + pass # - End loop on pages # ...but sometimes it works better after blocks sorting @@ -1746,7 +1752,7 @@ def print_html(pages, fontspec, out=sys.stdout): if cl == BL_CAPTION: pre = '<figure num="%d" order="%d">' % (NUM_FIGURE, ORDER) NUM_FIGURE += 1 - ORDER += 1 + #ORDER += 1 post = '</figure>' if cl == BL_MISC: post = '</p>' @@ -1754,22 +1760,22 @@ def print_html(pages, fontspec, out=sys.stdout): txt = '%s<p num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) elif blocks[i]['score'] is None: txt = '%s<p font="%d" num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, blocks[i]['font']['id'], BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) else: txt = '%s<p font="%d" score="%f" num="%d" order="%d"><%s%s num="%d" order="%d">' % ( txt, blocks[i]['font']['id'], blocks[i]['score'], BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER, BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], - ORDER+1) + ORDER) BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]] += 1 BLOCK_NUM[BLOCK_TAGS[cl]] += 1 - ORDER += 2 + ORDER += 1 else: if not DEBUG_PRINT: txt = '%s%s<%s%s num="%d" order="%d">' % (txt, pre,