diff --git a/src/py/p2b_config.py b/src/py/p2b_config.py index df5a276ea24edb915d4ac381e8d0e66ed33a0050..c93eb8f30883fcf74d33651a3549a7df25bb8f9d 100644 --- a/src/py/p2b_config.py +++ b/src/py/p2b_config.py @@ -14,6 +14,8 @@ OUTPUT_LICENCE="""<!-- Licence --> DEBUG_PRINT = False PRINT_CSS = True +#URL_CSS = 'http://ontology.inrae.fr/bsv/html/bsv.css' +URL_CSS = 'https://rdf.codex.cati.inrae.fr/bsv/files/html/bsv.css' # With version 22-06 (that is june, 2022), poppler pdftohtml # returns different fontsize values, which completely messes up @@ -247,7 +249,7 @@ DOUBLEQUOTE_REGEX = r'[“â€Â«Â»]' # TO_BE_REMOVED can be None. TO_BE_REMOVED = re.compile( "[" - "*#$~¤¥¦§¨©¬®\s_" + r"*#$~¤¥¦§¨©¬®\s_" "\U000002BE-\U0000FFFD" # Symboles unicode divers "\U0001F1E0-\U0001F1FF" # flags (iOS) "\U0001F300-\U0001F5FF" # symbols & pictographs diff --git a/src/py/p2b_functions.py b/src/py/p2b_functions.py index 4a0b27672d2efa9e91080af45e9310d1842ceca9..ca3cc5385a881e7274d36060a26acba947438683 100644 --- a/src/py/p2b_functions.py +++ b/src/py/p2b_functions.py @@ -166,7 +166,7 @@ def get_pdftotext(filename): and DICT.check(word.text): li['text'] = "%s %s" % (li['text'], word.text) elif check_dict(word_to_test) or \ - check_dict(re.sub('\W','',word_to_test)): + check_dict(re.sub(r'\W','',word_to_test)): # Quelques explications sur le test qui précède : # Il peut sembler redondant mais ne l'est # pas pour les apostrophes. @@ -259,13 +259,13 @@ def get_pdftotext(filename): nb_ok_new = 0 ls = re.sub(r' +', ' ', re.sub(r"[^'’\w]", ' ', ltxt)).strip().split(' ') for wo in ls: - if len(re.sub('\d','',wo)) > 0: # Don't check numbers + if len(re.sub(r'\d','',wo)) > 0: # Don't check numbers if check_dict(wo): nb_ok_new += 1 prop_ok_new = float(nb_ok_new) / float(len(ls)) nb_ok_old = 0 ls = re.sub(r' +', ' ', re.sub(r"[^'’\w]", ' ', li['text'])).strip().split(' ') for wo in ls: - if len(re.sub('\d','',wo)) > 0: # Don't check numbers + if len(re.sub(r'\d','',wo)) > 0: # Don't check numbers if check_dict(wo): nb_ok_old += 1 prop_ok_old = float(nb_ok_old) / float(len(ls)) if prop_ok_new > prop_ok_old : @@ -297,7 +297,10 @@ def get_pdftohtml(filename): return None, None # Parse xml code and create block table. - xml = o.decode('utf8') + try: + xml = o.decode('utf8') + except UnicodeDecodeError: + xml = o.decode('unicode_escape') # Remove <b> and <i> tags which give parse errors and are not used. xml = HTML_BOLD_ITALIC_REGEX.sub('', xml) @@ -822,11 +825,18 @@ def sort_blocks(blocks, columns, default_font_size): if not b['treat']: nb_blocks += 1 +## c = 1 +## while columns[c] <= b['x_min']: c += 1 +## b['col_min'] = c - 1 +## while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 +## b['col_max'] = c +## # WARNING : if block in column 1, col_min ↠0 and col_max ↠1. b['col_min'] = 0 b['col_max'] = 1 if len(columns) > 0: c = 1 - while columns[c] < b['x_min']: c += 1 + while (c < len(columns)) and (columns[c] <= b['x_min']): c += 1 + if (c == len(columns)): c -= 1 b['col_min'] = c - 1 while columns[c] < b['x_max'] - HORIZONTAL_ALIGMENT_THRESHOLD: c += 1 b['col_max'] = c @@ -1666,7 +1676,7 @@ def print_html(pages, fontspec, out=sys.stdout): print('<html lang="fr">', file=out) print('<head><meta charset="utf-8" />', file=out) if PRINT_CSS: - print('<link rel="stylesheet" href="http://ontology.inrae.fr/bsv/html/bsv.css" />', file=out) + print('<link rel="stylesheet" href="%s" />' % URL_CSS, file=out) print('<link rel="stylesheet" href="bsv.css" />', file=out) if DEBUG_PRINT: