From 0ce942a6fe76a95e53b820cf1a6a7ffcbac207e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phan=20Bernard?= <stephan.bernard@inrae.fr>
Date: Wed, 1 Jun 2022 18:20:59 +0200
Subject: [PATCH] =?UTF-8?q?Inclusion=20des=20tableaux=20d'une=20colonne=20?=
 =?UTF-8?q?pour=20lesquels=20il=20y=20a=20d=C3=A9tection=20de=20caract?=
 =?UTF-8?q?=C3=A8res=20de=20s=C3=A9paration=20(retours=20chariot=20ou=20ta?=
 =?UTF-8?q?bulations).?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/py/p2b_config.py    |  2 ++
 src/py/p2b_functions.py | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/py/p2b_config.py b/src/py/p2b_config.py
index 87c9156..4a96417 100644
--- a/src/py/p2b_config.py
+++ b/src/py/p2b_config.py
@@ -176,6 +176,8 @@ CAPTION_REGEX = [ r'cr[ée]dit photo', r'photo ?:' ]
 
 LINK_REGEX = [r'https?://', r'clique[rz]', r'\.gouv\.fr' ]
 
+TABLE_SEPARATORS_REGEX = re.compile(r'[\n\t]') # compiled pattern: matches one newline or one tab
+
 CREDITS_REGEX = [
   r'R[ée]daction ?:', r'R[ée]dacteurs? ?:', r'R[ée]dactrices? ?:',
   r'cr[ée]dits photos', # Au pluriel ici, singulier dans CAPTION
diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py
index d84d576..1a0a68e 100644
--- a/src/py/p2b_functions.py
+++ b/src/py/p2b_functions.py
@@ -374,8 +374,12 @@ pdftotext. But ghostscript is faster.
     # e.g : "PyPDF2.errors.PdfReadError: Could not read malformed PDF file"
     pass # tl is [] so should return []
   for t in tl:
+    n_tabs = 0 # max number of separators (newline/tab) found in one cell of a one-column table
+    if t.shape[1] == 1:
+      for lines in t.cells:
+        n_tabs = max(n_tabs, len(TABLE_SEPARATORS_REGEX.findall(lines[0].text)))
     if (t.accuracy > ACCURACY_THR) and (t.whitespace < WHITESPACE_THR) \
-      and (t.shape[1] > 1):
+      and ((t.shape[1] > 1) or (n_tabs > 0)): # one-column tables are kept only if a separator was seen
       c = []
       tt = { 'cells': t.cells, 'page': t.page, 'table': t # Maybe Not necessary
         }
@@ -1138,12 +1142,21 @@ def guess_structure(blocks, fontspec, tables, def_size):
                     'height': lines[0].y2-lines[0].y1, 'font':bl['font'],
                     'x_min': lines[0].x1, 'x_max': lines[-1].x2,
                     'y_min': lines[0].y1, 'y_max': lines[0].y2}
+                if len(lines) == 1: # single-cell row: split its text on separators, one <td> per piece
+                  cel = lines[0]
+                  for ttxt in map(str.strip, TABLE_SEPARATORS_REGEX.split(cel.text)): # strip like the multi-cell branch
+                    bl['nb_words'] += len(word_counter.findall(ttxt))
+                    li['text'] += '<td blk="c-%d">%s</td>\n' % (blk_cel, ttxt)
+                    blk_cel += 1
+                    li['words'].append({'height': cel.y2-cel.y1, 'text': ttxt})
+                elif len(lines) > 0:
                   for cel in lines:
                     ttxt = cel.text.strip()
                     bl['nb_words'] += len(word_counter.findall(ttxt))
                     li['text'] += '<td blk="c-%d">%s</td>\n' % (blk_cel, ttxt)
                     blk_cel += 1
                     li['words'].append({'height': cel.y2-cel.y1, 'text': cel.text.strip()})
+                if len(lines) > 0:
                   li['text'] += '</tr>'
                   bl['lines'].append(li)
               pages.get(bl['page'])['blocks'].append(bl)
-- 
GitLab