Commit b29ee44a authored by Bernard Stephan
Browse files

Corrections relatives à la lecture de caractères spéciaux dans les pdf.

parent 4fd22c7c
......@@ -21,13 +21,20 @@ def get_pdftotext(filename):
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
o, e = proc.communicate()
if (proc.returncode != 0):
print('-S-> Command pdftotext returned an error :')
print(' ' + e.decode('utf8'))
#print('-S-> Command pdftotext returned an error :')
#print(' ' + e.decode('utf8'))
return []
# Parse xml code and create block table.
xml = o.decode('utf8')
root = ET.fromstring(xml)
## Quelques cas particuliers déjà rencontrés :-(
xml = re.sub(r">[]<",'>*<', xml)
root = None
try:
root = ET.fromstring(xml)
except Exception as e:
return []
#root = ET.fromstring(xml)
page_num = 0
flow_num = 0
......@@ -97,13 +104,17 @@ def get_pdftohtml(filename):
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
o, e = proc.communicate()
if (proc.returncode != 0):
print('-S-> Command pdftohtml returned an error :')
print(' ' + e.decode('utf8'))
#print('-S-> Command pdftohtml returned an error :')
#print(' ' + e.decode('utf8'))
return None
# Parse xml code and create block table.
xml = o.decode('utf8')
root = ET.fromstring(xml)
root = None
try:
root = ET.fromstring(xml)
except Exception as e:
return { 'fonts': [], 'segments': [] }
fontspec = []
segments = []
......@@ -187,21 +198,28 @@ def mark_page_btotp(pages, ndx, increment, MARK):
while not end:
li = None
for p in k:
try:
if li == None:
li = "".join([re.sub(r'[^a-zA-Z]', '', l['text']) \
for l in pages[p]['blocks'][ndx]['lines']])
else:
end |= (li != "".join([re.sub(r'[^a-zA-Z]', '', l['text']) \
for l in pages[p]['blocks'][ndx]['lines']]))
except Exception as e:
end = True
if not end:
for p in k:
# if p['blocks'][ndx]['class'] == BL_UNDEF:
pages[p]['blocks'][ndx]['class'] = MARK
if len(pages) > 2:
if li == "".join([re.sub(r'[^a-zA-Z]', '', l['text']) \
for l in pages[1]['blocks'][ndx]['lines']]):
pages[1]['blocks'][ndx]['class'] = MARK
try:
if li == "".join([re.sub(r'[^a-zA-Z]', '', l['text']) \
for l in pages[1]['blocks'][ndx]['lines']]):
pages[1]['blocks'][ndx]['class'] = MARK
except Exception as e:
pass # <--- Booooo : VERY BAD !!!
ndx += increment
end = (abs(ndx) > min([len(p['blocks']) for p in pages.values()]))
......@@ -233,6 +251,8 @@ def get_lines(segments, fontspec):
last_top = -1
line_no = -1
last_right = 0
if len(segments) == 0:
return []
for txt in segments:
if (txt['top'] == last_top) and ((txt['left'] - last_right) <= LEFT_THRESHOLD):
txt['line'] = line_no
......@@ -341,8 +361,9 @@ def guess_fonts(blocks, segments, fontspec):
l['score'] = min_score # For debuggin purpose
l['dist'] = min_dist # idem.
l['line_no'] = line_no # idem. Stores the "similar line" number
if (lines[line_no]['nb_fonts'] > 1):
l['flags'] |= MANY_FONTS
if line_no >= 0:
if (lines[line_no]['nb_fonts'] > 1):
l['flags'] |= MANY_FONTS
if block_fonts.get(font_sel) is None:
block_fonts[font_sel] = len(l['text'])
else:
......@@ -439,6 +460,7 @@ def get_columns(blocks, default_font_size):
# RQ : this is to be applied on a list of blocs in the SAME PAGE
def expand_blocks(page_blocks, default_font_size):
pb = page_blocks # for shorter writing
if len(pb) == 0: return
max_x = max([b['x_max'] for b in pb])
min_x = min([b['x_min'] for b in pb])
for b in pb:
......@@ -662,7 +684,8 @@ def get_closest_block_class(block):
closest_index = -1
for i in range(len(BLOCK_PROTOTYPES)):
if BLOCK_PROTOTYPES[i] is not None:
if BLOCK_PROTOTYPES[i] is not None and \
BLOCK_PROTOTYPES[BL_PARAGRAPH] is not None:
# Exclude comparison with titles if font_size smaller than default's
exclude = block['font']['size'] < BLOCK_PROTOTYPES[BL_PARAGRAPH]['font_size'] \
and i >= BL_DOCUMENT_TITLE and i <= BL_TITLE_5
......@@ -784,7 +807,10 @@ def guess_structure(blocks, fontspec, def_size):
c = (l['x_min'] + l['x_max']) / 2
if c < min_center: min_center = c
if c > max_center: max_center = c
last_character = l['text'].strip()[-1]
if len(l['text']) > 0:
last_character = l['text'].strip()[-1]
else:
last_character = '.'
if len(l['text'].split(':')) > 1 and \
len(re.sub(r'[A-ZÀÇÉÈÊÂÔÛ0-9]','', l['text'][0])) == 0:
l['flags'] |= IS_DESCRIPTION
......@@ -885,8 +911,12 @@ def guess_structure(blocks, fontspec, def_size):
if fonts[bl['font']['id']] > default_font_nb_car:
default_font_nb_car = fonts[bl['font']['id']]
default_font = bl['font']
if default_font is None:
return pages
default_font['is_default'] = True
# Page top and bottom detection is better before block sorting because of
# column detection consequences...
mark_page_top(pages)
......@@ -895,14 +925,15 @@ def guess_structure(blocks, fontspec, def_size):
# Reorder blocks
first_block_tagged = False
for p in pages.values():
expand_blocks(p['blocks'], default_font['size'])
col = get_columns(p['blocks'], default_font['size'])
col.append(max(b['x_max'] for b in p['blocks']))
p['blocks'] = sort_blocks(p['blocks'], col, default_font['size'])
if not first_block_tagged:
bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE)
bl['flags'] |= FLAG_FIRST_BLOCK
first_block_tagged = True
if len(p['blocks']) > 0:
expand_blocks(p['blocks'], default_font['size'])
col = get_columns(p['blocks'], default_font['size'])
col.append(max(b['x_max'] for b in p['blocks']))
p['blocks'] = sort_blocks(p['blocks'], col, default_font['size'])
if not first_block_tagged:
bl = next(b for b in p['blocks'] if b['class'] != BL_TOP_PAGE)
bl['flags'] |= FLAG_FIRST_BLOCK
first_block_tagged = True
# - End loop on pages
# ...but sometimes it works better after blocks sorting
......@@ -1273,6 +1304,8 @@ def print_html(pages, fontspec, out=sys.stdout):
def get_pdf2html(filename):
blocks = get_pdftotext(filename)
p2h = get_pdftohtml(filename)
if p2h is None:
return ' '
fontspec = p2h['fonts']
segments = p2h['segments']
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment