From a9906b581d613b7b61555b753f30cc8e377205bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phan=20Bernard?= <stephan.bernard@inrae.fr>
Date: Fri, 26 May 2023 17:19:25 +0200
Subject: [PATCH] =?UTF-8?q?Quelques=20corrections=20de=20bugs=20d=C3=A9tec?=
 =?UTF-8?q?t=C3=A9s=20lors=20de=20la=20conversion=20de=20l'ensemble=20des?=
 =?UTF-8?q?=20BSVs.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/py/p2b_functions.py | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py
index 813705f..4a0b276 100644
--- a/src/py/p2b_functions.py
+++ b/src/py/p2b_functions.py
@@ -378,7 +378,7 @@ pdftotext. But ghostscript is faster.
   try:
     #tl = camelot.read_pdf(pdf_file, backend="poppler", flavor='stream', pages="all")
     #tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", split_text=True)
-    tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all")
+    tl = camelot.read_pdf(pdf_file, backend="poppler", pages="all", shift_text=[''], strip_text='.\n')
   except:
     # e.g : "PyPDF2.errors.PdfReadError: Could not read malformed PDF file"
     pass # tl is [] so should return []
@@ -473,7 +473,7 @@ def mark_page_btotp(pages, ndx, increment, MARK):
             if len(pages) > 0:
               end = (abs(ndx) > min([len(p['blocks']) for p in pages.values()]))
             else:
-              end = 0
+              end = 1
 
 def mark_page_bottom(pages):
     mark_page_btotp(pages, -1, -1, BL_BOTTOM_PAGE)
@@ -822,12 +822,15 @@ def sort_blocks(blocks, columns, default_font_size):
         if not b['treat']:
             nb_blocks += 1
 
-        c = 1
-        while columns[c] <= b['x_min']: c += 1
-        b['col_min'] = c - 1
-        while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1
-        b['col_max'] = c
-        # WARNING : if block in column 1, col_min ← 0 and col_max ← 1.
+        # Defaults, kept as-is when `columns` holds fewer than two bounds.
+        b['col_min'], b['col_max'] = 0, 1
+        if len(columns) > 1:
+          c = 1
+          while c < len(columns) - 1 and columns[c] < b['x_min']: c += 1
+          b['col_min'] = c - 1
+          while c < len(columns) - 1 and columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1
+          b['col_max'] = c
+          # WARNING : if block in column 1, col_min ← 0 and col_max ← 1.
         b['temp_left'] = b['col_min']
 
     for b in blocks:
@@ -1365,9 +1368,12 @@ def guess_structure(blocks, fontspec, tables):
         p['blocks'] = sort_blocks(p['blocks'], col, default_font['size'])
         
         if not first_block_tagged:
-          bl = next(b  for b in p['blocks'] if b['class'] != BL_TOP_PAGE)
-          bl['flags'] |= FLAG_FIRST_BLOCK
-          first_block_tagged = True
+          # next() with a default: a page with no block, or whose blocks are
+          # all BL_TOP_PAGE, is skipped without hiding unrelated exceptions.
+          bl = next((b for b in p['blocks'] if b['class'] != BL_TOP_PAGE), None)
+          if bl is not None:
+            bl['flags'] |= FLAG_FIRST_BLOCK
+            first_block_tagged = True
     # - End loop on pages
 
     # ...but sometimes it works better after blocks sorting
@@ -1746,7 +1752,7 @@ def print_html(pages, fontspec, out=sys.stdout):
             if cl == BL_CAPTION:
                 pre = '<figure num="%d" order="%d">' % (NUM_FIGURE, ORDER)
                 NUM_FIGURE += 1
-                ORDER += 1
+                #ORDER += 1
                 post = '</figure>'
             if cl == BL_MISC:
               post = '</p>'
@@ -1754,22 +1760,22 @@ def print_html(pages, fontspec, out=sys.stdout):
                 txt = '%s<p num="%d" order="%d"><%s%s num="%d" order="%d">' % (
                       txt, BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER,
                       BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]],
-                      ORDER+1)
+                      ORDER)
               elif blocks[i]['score'] is None:
                 txt = '%s<p font="%d" num="%d" order="%d"><%s%s num="%d" order="%d">' % (
                       txt, blocks[i]['font']['id'],
                       BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER,
                       BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]],
-                      ORDER+1)
+                      ORDER)
               else:
                 txt = '%s<p font="%d" score="%f" num="%d" order="%d"><%s%s num="%d" order="%d">' % (
                       txt, blocks[i]['font']['id'], blocks[i]['score'], 
                       BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]], ORDER,
                       BLOCK_TAGS[cl], id_cl, BLOCK_NUM[BLOCK_TAGS[cl]], 
-                      ORDER+1)
+                      ORDER)
               BLOCK_NUM[BLOCK_TAGS[BL_PARAGRAPH]] += 1
               BLOCK_NUM[BLOCK_TAGS[cl]] += 1
-              ORDER += 2
+              ORDER += 1
             else: 
               if not DEBUG_PRINT:
                 txt = '%s%s<%s%s num="%d" order="%d">' % (txt, pre,
-- 
GitLab