Commit d4521af9 authored by Bernard Stephan's avatar Bernard Stephan
Browse files

Utilisation d'un dictionnaire du Français (Librairie python enchant) pour...

Utilisation d'un dictionnaire du Français (Librairie python enchant) pour gérer les césures et effets de style des titres
parent b29ee44a
......@@ -49,6 +49,12 @@ dans un fichier html.
sur la librairie poppler, dérivée de Xpdf. Ces deux outils prennent en entrée
un fichier pdf.
*pdf2blocs* utilise aussi la librairie
[PyEnchant](https://pyenchant.github.io/pyenchant/index.html) qui lui fournit
un dictionnaire et des outils de recherche. PyEnchant est distribué sous
la licence [LGPL](http://www.gnu.org/copyleft/lesser.html).
#### 1.4.1 pdftotext
*pdftotext* est destiné à produire une sortie en texte brut,
lisible dans une console texte par exemple.
......
import re # To define some re.compile() regular expressions.
import enchant # To define DICT
# Absolute paths of the two external command-line tools.
CMD_PDFTOTEXT = "/usr/sbin/pdftotext"
CMD_PDFTOHTML = "/usr/sbin/pdftohtml"
# Both switches are off by default.
# NOTE(review): presumably they enable debug traces and CSS output; confirm
# against the code that reads them.
DEBUG_PRINT = PRINT_CSS = False
## Dictionary:
# DICT = enchant.Dict("fr_FR")
# French dictionary combined with a personal word list (PWL) file.
DICT = enchant.DictWithPWL("fr_FR", "liste.de.mots.a.ajouter.au.dictionnaire.txt")
# Remove the unaccented single letters that appear in this dictionary.
# 'a' and 'A' are not in the list below, so they stay in the dictionary.
for c in "bcdefghijklmnopqrstuvwxyzBCDEFGHIJKLMNOPQRSTUVWXYZ":
    DICT.remove(c)
# DICT = enchant.request_pwl_dict("liste.de.mots.francais.frgut.txt")
LEFT_THRESHOLD = 25  # In p2b_text_utils.add_lines(): the maximum horizontal
                     # space for which aligned items are still considered
                     # to be on the same line.
......@@ -99,7 +112,10 @@ TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font neve
# used for more than TITLE_MIN_CHAR characters per line
# is a kind of text styling and will take the next line's font
# SIMILARITY_THRESHOLD: threshold for line similarity (used to assign a font).
# A similarity score of 0.0 is a perfect match.
# 1.0: no threshold; 0.0: no similarity.
# NOTE(review): presumably 0.0 means only perfect matches are accepted;
# confirm against the code that compares scores to this value.
SIMILARITY_THRESHOLD = 1.0
# SIMILARITY_THRESHOLD = 0.6
# Celle là est un peu compliquée : Pour détecter la structure, on compte
# le nombre de successions d'un changement de police de caractères vers
......@@ -112,10 +128,10 @@ NB_SUCCESSION_FOR_SAME = 0
#### Regex
# Usual ordinal suffixes / superscripts. Every entry is a compiled pattern
# anchored at both ends, so it only matches a token made of the suffix alone.
# The former plain-string, unanchored patterns ('er|ère|ere', 'nde?',
# 'i?[eè]me', '°') are superseded by these compiled equivalents and no longer
# listed, which keeps the list homogeneous (compiled patterns only).
INDICES_EXPOSANTS_USUELS = [
    re.compile(r'^(er|ère|ere)$'),    # 1er, 1ère, …
    re.compile(r'^nde?$'),            # 2nd, 2nde
    re.compile(r'^(e|i?[eè]me)$'),    # 3e, 3ème, 4ieme, …
    re.compile(r'^°$'),               # degree sign alone
]
# Character class matching one "numbering" character: a digit, comma, dot,
# minus, plus, parenthesis, '%', '°', '±', '~' or space.
NUMBERING_REGEX = r'[0-9,\.\-\+\(\)%°±~ ]'
......@@ -137,3 +153,36 @@ CREDITS_REGEX = ['R[ée]daction ?:', 'R[ée]dacteurs? ?:', 'R[ée]dactrices? ?:'
# Patterns for the phrases "Reproduction intégrale" (with or without the
# accent) and "Reproduction partielle".
COPYRIGHT_REGEX = ['Reproduction int[ée]grale', 'Reproduction partielle']
# Pattern capturing an e-mail-like token: local part, '@', then a dotted domain.
CONTACT_REGEX = [r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"]
## --- Final transformations:
# Typographic apostrophe; it will be replaced with the plain ' which is
# recognized by tree tagger. May be set to None.
APOSTROPHE_REGEX = re.compile(r'[’]')
# Typographic double quotes and guillemets; they will be replaced with the
# plain " which is recognized by tree tagger. May be set to None.
DOUBLEQUOTE_REGEX = re.compile(r"[“”«»]")
# https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1
# Every run of the characters below is meant to be substituted with a space
# when printing results, so several consecutive spaces collapse into one
# (whitespace itself is part of the class through \s).
# TO_BE_REMOVED can be None.
TO_BE_REMOVED = re.compile(
    "["
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning since Python 3.12). The resulting
    # pattern text is unchanged.
    r"*#$~¤¥¦§¨©¬®\s_"
    "\U000002BE-\U0000FFFD"  # miscellaneous unicode symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+"
)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment