diff --git a/src/py/p2b_config.py b/src/py/p2b_config.py index 87c91568bde5826e9f659bcf6968bbb387879db9..4a964171f4ac9a32c2557d8b15f8b7bc03263fa3 100644 --- a/src/py/p2b_config.py +++ b/src/py/p2b_config.py @@ -176,6 +176,8 @@ CAPTION_REGEX = [ r'cr[ée]dit photo', r'photo ?:' ] LINK_REGEX = [r'https?://', r'clique[rz]', r'\.gouv\.fr' ] +TABLE_SEPARATORS_REGEX = re.compile(r'[\n\t]') + CREDITS_REGEX = [ r'R[ée]daction ?:', r'R[ée]dacteurs? ?:', r'R[ée]dactrices? ?:', r'cr[ée]dits photos', # Au pluriel ici, singulier dans CAPTION diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py index d84d576e77f6abf1776d1fd1a5fd5a9dd7e81d3c..1a0a68e62a7537e60a3b14b2c6b65630a7cc64e4 100644 --- a/src/py/p2b_functions.py +++ b/src/py/p2b_functions.py @@ -374,8 +374,12 @@ pdftotext. But ghostscript is faster. # e.g : "PyPDF2.errors.PdfReadError: Could not read malformed PDF file" pass # tl is [] so should return [] for t in tl: + n_tabs = 0 + if t.shape[1] == 1: + for lines in t.cells: + n_tabs = max(n_tabs, len(TABLE_SEPARATORS_REGEX.split(lines[0].text))) if (t.accuracy > ACCURACY_THR) and (t.whitespace < WHITESPACE_THR) \ - and (t.shape[1] > 1): + and ((t.shape[1] > 1) or (n_tabs > 0)): c = [] tt = { 'cells': t.cells, 'page': t.page, 'table': t # Maybe Not necessary } @@ -1138,12 +1142,21 @@ def guess_structure(blocks, fontspec, tables, def_size): 'height': lines[0].y2-lines[0].y1, 'font':bl['font'], 'x_min': lines[0].x1, 'x_max': lines[-1].x2, 'y_min': lines[0].y1, 'y_max': lines[0].y2} + if len(lines) == 1: + cel = lines[0] + for ttxt in TABLE_SEPARATORS_REGEX.split(cel.text): + bl['nb_words'] += len(word_counter.findall(ttxt)) + li['text'] += '<td blk="c-%d">%s</td>\n' % (blk_cel, ttxt) + blk_cel += 1 + li['words'].append({'height': cel.y2-cel.y1, 'text': ttxt}) + elif len(lines) > 0: for cel in lines: ttxt = cel.text.strip() bl['nb_words'] += len(word_counter.findall(ttxt)) li['text'] += '<td blk="c-%d">%s</td>\n' % (blk_cel, ttxt) blk_cel += 1 li['words'].append({'height': cel.y2-cel.y1, 'text': cel.text.strip()}) + if len(lines) > 0: li['text'] += '</tr>' bl['lines'].append(li) pages.get(bl['page'])['blocks'].append(bl)