Commit d4521af9 authored by Bernard Stephan's avatar Bernard Stephan
Browse files

Utilisation d'un dictionnaire du Français (Librairie python enchant) pour...

Utilisation d'un dictionnaire du Français (Librairie python enchant) pour gérer les césures et effets de style des titres
parent b29ee44a
......@@ -49,6 +49,12 @@ dans un fichier html.
sur la bibliothèque poppler, dérivée de Xpdf. Ces deux outils prennent en entrée
un fichier pdf.
*pdf2blocs* utilise aussi la bibliothèque
[PyEnchant](https://pyenchant.github.io/pyenchant/index.html) qui lui fournit
un dictionnaire et des outils de recherche. PyEnchant est distribué sous
la licence [LGPL](http://www.gnu.org/copyleft/lesser.html).
#### 1.4.1 pdftotext
*pdftotext* est destiné à produire une sortie en texte brut,
lisible dans une console texte par exemple.
......
import re # To define some re.compile() regular expressions.
import enchant # To define DICT
# Paths of the pdftotext and pdftohtml executables.
# NOTE(review): many distributions install these tools under /usr/bin rather
# than /usr/sbin — confirm the paths for the target system.
CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
# Output switches; the code reading them is not visible in this excerpt.
# Presumably DEBUG_PRINT enables debug traces and PRINT_CSS emits CSS in the
# generated HTML — TODO confirm against the callers.
DEBUG_PRINT = False
PRINT_CSS = False
## Dictionary:
# French dictionary extended with a personal word list (PWL) file.
# NOTE: the PWL path is relative, so it is resolved against the current
# working directory of the process.
#DICT = enchant.Dict("fr_FR")
DICT = enchant.DictWithPWL("fr_FR", "liste.de.mots.a.ajouter.au.dictionnaire.txt")
# Exclude the unaccented single letters, which this dictionary otherwise
# accepts as words. 'a' and 'A' are not in the list, so they stay valid
# (presumably on purpose, since "a" is a real French word — confirm).
for c in "bcdefghijklmnopqrstuvwxyzBCDEFGHIJKLMNOPQRSTUVWXYZ":
    DICT.remove(c)
#DICT = enchant.request_pwl_dict("liste.de.mots.francais.frgut.txt")
LEFT_THRESHOLD = 25 # In p2b_text_utils.add_lines() : the max horizontal space
# to consider aligned items to be on the same line.
......@@ -99,7 +112,10 @@ TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font neve
# used for more than TITLE_MIN_CHAR characters per line
# is a kind of text styling and will take the next line's font
# SIMILARITY THRESHOLD : For line similarity (to assign font).
# 1.0 : no threshold, 0.0 : no similarity (score 0.0 is perfect match)
SIMILARITY_THRESHOLD = 1.0
# SIMILARITY_THRESHOLD = 0.6
# Celle là est un peu compliquée : Pour détecter la structure, on compte
# le nombre de successions d'un changement de police de caractères vers
......@@ -112,10 +128,10 @@ NB_SUCCESSION_FOR_SAME = 0
#### Regex
# Usual French ordinal suffixes / superscripts that can follow a number.
# is_ind_exp() tries every entry in turn with re.match(), which accepts both
# pattern strings and compiled patterns. The plain-string entries are only
# anchored at the start of the tested string (by re.match itself); the
# compiled entries also require the end of the string ('$').
INDICES_EXPOSANTS_USUELS = [
    'er|ère|ere', # 1er, 1ère, …
    'nde?', # 2nd
    'i?[eè]me', # 3ème, 4ieme, …
    '°',
    re.compile(r'^(er|ère|ere)$'), # 1er, 1ère, …
    re.compile(r'^nde?$'), # 2nd
    re.compile(r'^(e|i?[eè]me)$'), # 3ème, 4ieme, …
    re.compile(r'^°$'),
]
NUMBERING_REGEX = r'[0-9,\.\-\+\(\)%°±~ ]'
......@@ -137,3 +153,36 @@ CREDITS_REGEX = ['R[ée]daction ?:', 'R[ée]dacteurs? ?:', 'R[ée]dactrices? ?:'
COPYRIGHT_REGEX = ['Reproduction int[ée]grale', 'Reproduction partielle']
CONTACT_REGEX = [r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"]
## --- Final transformations:
# Typographic apostrophe: will be replaced with ' which is recognized by
# tree tagger.
# Can be None.
APOSTROPHE_REGEX = re.compile(r"[’]")
# Typographic double quotes and guillemets: will be replaced with " which is
# recognized by tree tagger.
# Can be None.
DOUBLEQUOTE_REGEX = re.compile(r'[“”«»]')
# https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1
# Those characters will be substituted with spaces, and multiple spaces will
# be substituted with only one. This is done when printing results.
# The class also contains \s, so every run of whitespace is matched too.
# TO_BE_REMOVED can be None.
TO_BE_REMOVED = re.compile(
    "["
    # Raw string: "\s" is a regex class, not a Python string escape. In a
    # normal literal it is an invalid escape sequence that Python warns about.
    r"*#$~¤¥¦§¨©¬®\s_"
    "\U000002BE-\U0000FFFD"  # Miscellaneous unicode symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+"
)
import xml.etree.ElementTree as ET
import enchant
import os
import sys
import re
......@@ -11,6 +12,39 @@ from p2b_utils import levenshtein
from p2b_config import *
# +--------------------------------------------------------------+
# | is_ind_exp |
# +--------------------------------------------------------------+
# Is it a subscript or a superscript (e.g. the "er" of "1er")?
def is_ind_exp(str):
    """Tell whether `str` is a usual subscript/superscript suffix.

    Every entry of INDICES_EXPOSANTS_USUELS is tried with re.match(), i.e.
    matched from the beginning of `str`.

    The parameter name shadows the `str` builtin; it is kept unchanged so
    that existing callers are not affected.

    Returns:
        True if at least one entry matches, False otherwise.
    """
    return any(re.match(pattern, str) for pattern in INDICES_EXPOSANTS_USUELS)
# +--------------------------------------------------------------+
# | unaccent_fr |
# +--------------------------------------------------------------+
# Replace the accented lowercase letters of French with their unaccented form.
# One-pass translation table: each accented lowercase letter maps to the
# unaccented letter at the same position in the second string.
_FR_UNACCENT_TABLE = str.maketrans("éèêëàâäôöîïùüûç", "eeeeaaaooiiuuuc")


def unaccent_fr(ch):
    """Strip French accents from a lowercase string.

    Only the lowercase letters é è ê ë à â ä ô ö î ï ù ü û ç are replaced;
    every other character (uppercase accented letters included) is returned
    unchanged, so callers are expected to lowercase the string first.

    Args:
        ch: the string to process.

    Returns:
        A new string of the same length with the accents removed.
    """
    # A single str.translate() pass replaces the six chained re.sub() calls.
    return ch.translate(_FR_UNACCENT_TABLE)
# +--------------------------------------------------------------+
# | is_equal_wo_accents |
# +--------------------------------------------------------------+
# Return true if ch1 and ch2 are equal accentless and caseless.
def is_equal_wo_accents(ch1, ch2):
    """Compare two strings, ignoring case, French accents and outer blanks.

    Each string is stripped and lowercased, then passed through
    unaccent_fr() before the comparison.

    Returns:
        True if both normalized strings are identical, False otherwise.
    """
    first, second = (unaccent_fr(s.strip().lower()) for s in (ch1, ch2))
    return first == second
# +--------------------------------------------------------------+
# | get_pdftotext |
# +--------------------------------------------------------------+
......@@ -58,6 +92,7 @@ def get_pdftotext(filename):
'y_min': float(bloc.get('yMin')),
'y_max': float(bloc.get('yMax')),
}
first_line = True
for line in bloc:
if (line.tag.endswith('line')):
h = float(line.get('yMax')) - float(line.get('yMin'))
......@@ -69,28 +104,55 @@ def get_pdftotext(filename):
'y_min': float(line.get('yMin')),
'y_max': float(line.get('yMax')),
}
last_nbcar = 0
last_h = 0
last_word = ''
nb_one_letter = 0
no_space_pls = False
for word in line:
if (word.tag.endswith('word')):
hword = float(word.get('yMax')) - float(word.get('yMin'))
li['words'].append({'height': hword, 'text': word.text})
if ((hword != last_h) and (last_nbcar < 2)):
# This is to avoid separation of one big capital
# letter at the beginin of a title or paragraph.
last_h = hword
if len(re.sub(r'\W','', li['text'])) == 0:
li['text'] = "%s %s" % (li['text'], word.text)
if len(word.text) == 1:
nb_one_letter += 1
if no_space_pls:
word_to_test = ("%s%s" % (li['text'], word.text)).split(" ")[-1]
#print(">>> %s" % word_to_test)
if DICT.check(li['text'].split(" ")[-1]) \
and DICT.check(word.text):
li['text'] = "%s %s" % (li['text'], word.text)
elif DICT.check(word_to_test):
li['text'] = "%s%s" % (li['text'], word.text)
else:
li['text'] = "%s%s" % (li['text'], word.text)
# We give a try accentless because this
# case is often in titles, which may be
# accentless.
found = False
for sug in DICT.suggest(word_to_test):
found = is_equal_wo_accents(sug, word_to_test)
if found: break
if found:
li['text'] = "%s%s" % (li['text'], word.text)
else:
li['text'] = "%s %s" % (li['text'], word.text)
no_space_pls = False
elif last_word.isdecimal() and is_ind_exp(word.text):
li['text'] = "%s%s" % (li['text'], word.text)
elif first_line and len(li['text']) == 0 and \
re.match(r'^[A-ZÉÈÂÊÄËÏÎÔÖÙÜÛ]$', word.text.strip()):
no_space_pls = True
li['text'] = "%s %s" % (li['text'], word.text)
else:
li['text'] = "%s %s" % (li['text'], word.text)
li['text'] = li['text'].strip()
last_nbcar = len(word.text)
last_word = word.text
if len(li['text']) > 3 and w < h: # Probably vertical text
li['height'] = w
bl['flags'] |= FLAG_VERTICAL
bl['lines'].append(li)
first_line = False
blocks.append(bl)
return blocks
......@@ -230,16 +292,6 @@ def mark_page_top(pages):
mark_page_btotp(pages, 0, 1, BL_TOP_PAGE)
# +--------------------------------------------------------------+
# | is_ind_exp |
# +--------------------------------------------------------------+
# Is it a subscript or a superscript (e.g. the "er" of "1er")?
def is_ind_exp(str):
    """Tell whether `str` is a usual subscript/superscript suffix.

    Every entry of INDICES_EXPOSANTS_USUELS is tried with re.match(), i.e.
    matched from the beginning of `str`.

    The parameter name shadows the `str` builtin; it is kept unchanged so
    that existing callers are not affected.

    Returns:
        True if at least one entry matches, False otherwise.
    """
    return any(re.match(pattern, str) for pattern in INDICES_EXPOSANTS_USUELS)
# +--------------------------------------------------------------+
# | get_lines |
# +--------------------------------------------------------------+
......@@ -282,6 +334,7 @@ def get_lines(segments, fontspec):
max_car = fnt[f]
fnt_no = f
lines.append({ 'text': li.strip(),
'exact_match': False, # Used by guess_fonts()
'most_used_font': fnt_no,
'nb_fonts': len(fnt),
'page': page_num})
......@@ -338,12 +391,13 @@ def guess_fonts(blocks, segments, fontspec):
font_sel = -1
line_no = -1
for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]):
if (len(lines[i]['text']) > 0):
if (not lines[i]['exact_match']) and (len(lines[i]['text']) > 0):
d = levenshtein(l['text'], lines[i]['text'])
if (d == 0):
min_dist = 0
min_score = 0.0
font_sel = lines[i]['most_used_font']
lines[i]['exact_match'] = True
line_no = i
break;
score = float(d) / float(max(len(l['text']), len(lines[i]['text'])))
......@@ -361,6 +415,8 @@ def guess_fonts(blocks, segments, fontspec):
l['score'] = min_score # For debuggin purpose
l['dist'] = min_dist # idem.
l['line_no'] = line_no # idem. Stores the "similar line" number
# >>> <score> >>> <block_line> | <text line>
# print(">>> %s >>> %s | %s" %(l['score'], l['text'], lines[line_no]['text']))
if line_no >= 0:
if (lines[line_no]['nb_fonts'] > 1):
l['flags'] |= MANY_FONTS
......@@ -452,6 +508,111 @@ def get_columns(blocks, default_font_size):
return columns
# +--------------------------------------------------------------+
# | compute_lrud |
# +--------------------------------------------------------------+
# Compute 'left', 'right', 'top' and 'down' on blocks,
# where each one is the minimum distance to another block
# on the left, on the right, above or below.
# The value is negative when there is no other block in that direction.
# Should be computed before expand_blocks(...)
def _record_nearest(block, side, dist, other):
    """Store `other` as the neighbour of `block` on `side` if it is the closest so far."""
    # A negative value means "no neighbour recorded yet" on that side.
    if block[side] < 0 or dist < block[side]:
        block[side] = dist
        block[side + '_block'] = other


def compute_lrud(page_blocks):
    """Annotate each block with its nearest neighbour in the four directions.

    For every block, the keys 'left', 'right', 'top' and 'down' are set to
    the gap to the closest other block on that side, or to -1 when there is
    none. When a neighbour exists, the matching '<side>_block' key references
    it. The blocks are modified in place and every pair of blocks is compared.

    Args:
        page_blocks: the blocks of one page, as dicts providing 'x_min',
            'x_max', 'y_min' and 'y_max'.

    Returns:
        None.
    """
    for b in page_blocks:
        b['left'] = b['right'] = b['top'] = b['down'] = -1
        for ob in page_blocks:  # ob : other block
            if ob is b:
                continue
            # The y ranges overlap: ob is beside b, on its left or right.
            if ob['y_min'] <= b['y_max'] and ob['y_max'] >= b['y_min']:
                if ob['x_min'] >= b['x_max']:  # ob on right
                    _record_nearest(b, 'right', ob['x_min'] - b['x_max'], ob)
                # '<=' (like the 'right' test) so that two touching blocks
                # see each other from both sides.
                if ob['x_max'] <= b['x_min']:  # ob on left
                    _record_nearest(b, 'left', b['x_min'] - ob['x_max'], ob)
            # The x ranges overlap: ob is above or under b.
            if ob['x_min'] <= b['x_max'] and ob['x_max'] >= b['x_min']:
                if ob['y_min'] >= b['y_max']:  # ob under b
                    _record_nearest(b, 'down', ob['y_min'] - b['y_max'], ob)
                if ob['y_max'] <= b['y_min']:  # ob on top of b
                    # The gap must be positive: a negative distance would be
                    # mistaken for the "no neighbour" marker.
                    _record_nearest(b, 'top', b['y_min'] - ob['y_max'], ob)
# +--------------------------------------------------------------+
# | table_detection |
# +--------------------------------------------------------------+
# Tries to find tables of a page.
# Uses compute_lrud()'s left/right, ...
# Will try to detect a succession of blocks on the same line having
# different blocks on 'top' or 'down', which distinguishes it from poorly
# justified text (having the same top or down). But cannot detect multi-column cells.
def table_detection(page_blocks):
    """Placeholder for the table detection of one page.

    The detection itself is not written yet: for now the function only
    computes the nearest-neighbour annotations ('left', 'right', 'top',
    'down' and the matching '<side>_block' keys) on every block.

    Args:
        page_blocks: the blocks of one page, as dicts providing 'x_min',
            'x_max', 'y_min' and 'y_max'.

    Returns:
        None.
    """
    # TODO: implement the table detection on top of these annotations.
    # The body used to be a verbatim copy of compute_lrud(); delegating keeps
    # a single implementation of the neighbour computation.
    compute_lrud(page_blocks)
# +--------------------------------------------------------------+
# | expand_blocks |
# +--------------------------------------------------------------+
......@@ -926,6 +1087,7 @@ def guess_structure(blocks, fontspec, def_size):
first_block_tagged = False
for p in pages.values():
if len(p['blocks']) > 0:
compute_lrud(p['blocks'])
expand_blocks(p['blocks'], default_font['size'])
col = get_columns(p['blocks'], default_font['size'])
col.append(max(b['x_max'] for b in p['blocks']))
......@@ -987,35 +1149,13 @@ def guess_structure(blocks, fontspec, def_size):
bl['class'] = BL_PARAGRAPH
adjust_block_prototypes(bl)
# Tables
## First, we look for full of numbers blocks having small columns and
## default or small fonts, then we'll look for small columns before
## and after these ones.
for p in pages.values():
for i,bl in enumerate(p['blocks']):
if bl['class'] == BL_UNDEF and \
bl['not_numbers'] <= NUMBERING_THRESHOLD and \
bl['max_line_size'] <= TABLE_LINE_SIZE and \
bl['font_class'] <= FONT_DEFAULT:
bl['class'] = BL_TABLE
adjust_block_prototypes(bl)
for inc in [-1, 1]:
j = i+inc
while j >= 0 and j < len(p['blocks']) and \
p['blocks'][j]['class'] == BL_UNDEF and \
p['blocks'][j]['font_class'] <= FONT_DEFAULT and \
p['blocks'][j]['block_size'] / p['blocks'][j]['nb_lines'] <= TABLE_LINE_SIZE:
p['blocks'][j]['class'] = BL_TABLE
adjust_block_prototypes(p['blocks'][j])
j += inc
# Captions
## We'll look for keywords like "Crédit photo".
## If we don't find any, we'll try small fonts
## centered or aligned left and quite short (less than PARAGRAH_LINE_SIZE)
nb_caption = 0
for bl in blocks:
if bl['class'] == BL_UNDEF and bl['font_class'] < FONT_DEFAULT and \
if bl['class'] == BL_UNDEF and bl['font_class'] <= FONT_DEFAULT and \
block_contains_keywords(bl, CAPTION_REGEX):
bl['class'] = BL_CAPTION
adjust_block_prototypes(bl)
......@@ -1032,6 +1172,28 @@ def guess_structure(blocks, fontspec, def_size):
adjust_block_prototypes(bl)
nb_caption += 1
# Tables
## First, we look for full of numbers blocks having small columns and
## default or small fonts, then we'll look for small columns before
## and after these ones.
for p in pages.values():
for i,bl in enumerate(p['blocks']):
if bl['class'] == BL_UNDEF and \
bl['not_numbers'] <= NUMBERING_THRESHOLD and \
bl['max_line_size'] <= TABLE_LINE_SIZE and \
bl['font_class'] <= FONT_DEFAULT:
bl['class'] = BL_TABLE
adjust_block_prototypes(bl)
for inc in [-1, 1]:
j = i+inc
while j >= 0 and j < len(p['blocks']) and \
p['blocks'][j]['class'] == BL_UNDEF and \
p['blocks'][j]['font_class'] <= FONT_DEFAULT and \
p['blocks'][j]['block_size'] / p['blocks'][j]['nb_lines'] <= TABLE_LINE_SIZE:
p['blocks'][j]['class'] = BL_TABLE
adjust_block_prototypes(p['blocks'][j])
j += inc
# Links (or "See also")
## Just look for http(s)://
## Won't use it as a style. We just avoid it being a title.
......@@ -1176,6 +1338,19 @@ def guess_structure(blocks, fontspec, def_size):
return pages
# +--------------------------------------------------------------+
# | text_transformations |
# +--------------------------------------------------------------+
def text_tr(text: str) -> str:
    """Apply the final text transformations to a string about to be printed.

    Each step is skipped when its configuration regex is falsy (e.g. None):
      * APOSTROPHE_REGEX matches are replaced with an ASCII apostrophe;
      * DOUBLEQUOTE_REGEX matches are replaced with an ASCII double quote;
      * TO_BE_REMOVED matches are replaced with a space.

    Returns:
        The transformed text, stripped of leading and trailing whitespace.
    """
    if APOSTROPHE_REGEX:
        text = re.sub(APOSTROPHE_REGEX, "'", text)
    if DOUBLEQUOTE_REGEX:
        text = re.sub(DOUBLEQUOTE_REGEX, '"', text)
    if TO_BE_REMOVED:
        text = re.sub(TO_BE_REMOVED, ' ', text)
        # Collapse the runs of spaces left by the substitution above.
        # NOTE(review): the nesting of this line under the `if` is assumed;
        # confirm against the original file.
        text = re.sub(r' +', ' ', text)
    return text.strip()
# +--------------------------------------------------------------+
# | print_html |
# +--------------------------------------------------------------+
......@@ -1213,7 +1388,7 @@ def print_html(pages, fontspec, out=sys.stdout):
txt = '%s%s ' % (txt, l['text'])
if txt[-1] == ' ':
txt = txt[:-1]
print("%s</title>" % txt, file=out)
print("%s</title>" % text_tr(txt), file=out)
print("</head>", file=out)
print("<body>", file=out)
......@@ -1242,7 +1417,7 @@ def print_html(pages, fontspec, out=sys.stdout):
while i < len(blocks) and blocks[i]['class'] == BL_TABLE:
print(' <tr>', file=out)
for l in blocks[i]['lines']:
print(" <td>%s</td>" % l['text'], file=out)
print(" <td>%s</td>" % text_tr(l['text']), file=out)
print(' </tr>', file=out)
i += 1
print('</table>', file=out)
......@@ -1276,22 +1451,54 @@ def print_html(pages, fontspec, out=sys.stdout):
if (l['flags'] & (HAS_BULLET | IS_DESCRIPTION)) != 0 and \
BLOCK_ENDLINES[cl] == '' and \
l != blocks[i]['lines'][0]:
txt = "%s<br />%s" % (txt, l['text'])
if txt.strip()[-1] == '-' and l['text'][0] != ' ': # Césure
word_to_test = ("%s%s" % (txt.strip()[:-1].split(" ")[-1],
l['text'])).split(" ")[0]
word_to_test = re.sub(r'[^a-zA-Z\'’àâäéèêëïîôöùûüÀÂÄÉÈÊËÏÎÔÖÙÛÜ]', '', word_to_test)
if len(word_to_test) > 0:
#print(">>> -> %s [%s]" % (word_to_test, DICT.check(word_to_test)))
if DICT.check(word_to_test):
txt = "%s%s " % (txt.strip()[:-1], l['text'])
else:
txt = "%s<br />%s " % (txt.strip(), l['text'])
else:
txt = "%s<br />%s " % (txt.strip(), l['text'])
else:
txt = "%s<br />%s " % (txt.strip(), l['text'])
else:
if l == blocks[i]['lines'][-1]:
txt = "%s%s" % (txt, l['text'])
elif BLOCK_ENDLINES[cl] == '':
txt = "%s%s " % (txt, l['text'])
if txt.strip()[-1] == '-' and l['text'][0] != ' ': # Césure
word_to_test = ("%s%s" % (txt.strip()[:-1].split(" ")[-1],
l['text'])).split(" ")[0]
word_to_test = re.sub(r'[^a-zA-Z\'’àâäéèêëïîôöùûüÀÂÄÉÈÊËÏÎÔÖÙÛÜ]', '', word_to_test)
if len(word_to_test) > 0:
#print(">>> %s [%s]" % (word_to_test, DICT.check(word_to_test)))
if DICT.check(word_to_test):
txt = "%s%s " % (txt.strip()[:-1], l['text'])
else:
txt = "%s%s " % (txt.strip(), l['text'])
else:
txt = "%s%s " % (txt, l['text'])
else:
txt = "%s%s%s" % (txt, l['text'], BLOCK_ENDLINES[cl])
txt = "%s%s " % (txt, l['text'])
if BLOCK_ENDLINES[cl] != '':
txt = "%s%s " % (txt.strip(),BLOCK_ENDLINES[cl])
if l == blocks[i]['lines'][-1]:
txt = txt.strip()
# &&&&&&&&&& Tester les césures
#if l == blocks[i]['lines'][-1]:
# txt = "%s%s" % (txt, l['text'])
#elif BLOCK_ENDLINES[cl] == '':
# txt = "%s%s " % (txt, l['text'])
#else:
# txt = "%s%s%s" % (txt, l['text'], BLOCK_ENDLINES[cl])
if txt[-1] == ' ':
txt = txt[:-1]
print("%s</%s>%s" % (txt, BLOCK_TAGS[cl], post), file=out)
print("%s</%s>%s" % (text_tr(txt), BLOCK_TAGS[cl], post), file=out)
else: # cl == BL_IGNORE:
txt = "<!-- "
for l in blocks[i]['lines']:
txt = "%s%s " % (txt, l['text'])
print("%s-->" % txt, file=out)
print("%s-->" % text_tr(txt), file=out)
i += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment