diff --git a/src/py/analyse.py b/src/py/analyse.py deleted file mode 100644 index 5ce096577bf18201f165eea7a6ac2cd2195d520f..0000000000000000000000000000000000000000 --- a/src/py/analyse.py +++ /dev/null @@ -1,330 +0,0 @@ -# https://docs.python.org/fr/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree -# -# Lit les fichiers xml générés par pdftotext : -# pdftotext -bbox-layout ../tmp/viti/20180801_LOR_BSV_Viticulture_cle857461.pdf -# et fait une sortie destinées à être lue rapidement sur un terminal, -# avec une délimitation des blocs. -# -import xml.etree.ElementTree as ET -import os -import sys -import re - -# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python -import subprocess - -### Parameters -CMD_PDFTOTEXT = '/usr/sbin/pdftotext' -CMD_PDFTOHTML = '/usr/sbin/pdftohtml' - - -### Entering MAIN process - -#### Getting pdf filename as a parameter. -if (len(sys.argv) < 1): - print("-U-> Usage : python analyse.py <fichier_pdf>") - sys.exit(-1) -#print('Parsing %s' % sys.argv[1]) - -basename = os.path.splitext(sys.argv[1])[0] - -#### Calling pdftotext command and getting its standard output -cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-'] -proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) -o, e = proc.communicate() -if (proc.returncode != 0): - print('-S-> Command pdftotext returned an error :') - print(' ' + e.decode('utf8')) - sys.exit(-2) - -xml = o.decode('utf8') -root = ET.fromstring(xml) - - - -#### Extract xml to lists and dictionaries for faster and easier access. 
-# -# Data format : -# flow=[{page, blocks = [{page, lines:[{height, text}]}] -# Rq : page is redundant but for now we don't know which is the best -# -page_num = 0 -flow = [] -for body in root: - if (body.tag.endswith('body')): - for doc in body: - if (doc.tag.endswith('doc')): - for page in doc: - if (page.tag.endswith('page')): - page_num += 1 - for fl in page: - if (fl.tag.endswith('flow')): - blocks = [] - for bloc in fl: - if (bloc.tag.endswith('block')): - bl = {'page':page_num, 'lines':[]} - bwords = 0 - bcars = 0 - for line in bloc: - if (line.tag.endswith('line')): - h = float(line.get('yMax')) - float(line.get('yMin')) - li = '' - lwords = 0 - last_nbcar = 0 - last_h = 0 - for word in line: - if (word.tag.endswith('word')): - hword = float(word.get('yMax')) - float(word.get('yMin')) - if ((hword != last_h) - and (last_nbcar < 2)): - last_h = hword - li = "%s%s" % (li, word.text) - else: - li = "%s %s" % (li, word.text) - last_nbcar = len(word.text) - lwords += 1 - bl['lines'].append({ - 'height':h, - 'text':li.strip(), - 'nb_cars': len(li.strip()), - 'nb_words':lwords}) - bwords += lwords - bcars += len(li.strip()) - bl['nb_words'] = bwords - bl['nb_cars'] = bcars - blocks.append(bl) - flow.append({'page':page_num, 'blocks':blocks}) - - - - -#### Now, calls pdftohtml to improve font attributes -cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename] -proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) -o, e = proc.communicate() -if (proc.returncode != 0): - print('-S-> Command pdftohtml returned an error :') - print(' ' + e.decode('utf8')) - sys.exit(-2) - -xml = o.decode('utf8') -root = ET.fromstring(xml) - -#### Extracts font information (id, size, family, color) -#### and the link between lines of text and their font. 
-fontspec = [] -p2x_text = [] -for page in root: - if (page.tag.endswith('page')): - pg = int(page.get('number')) - for tg in page: - if (tg.tag.endswith('fontspec')): - fontspec.append({ - 'id': int(tg.get('id')), - 'size': int(tg.get('size')), - 'family': tg.get('family'), - 'color': tg.get('color') - }) - elif (tg.tag.endswith('text')): - fnt = int(tg.get('font')) - while (tg.text is None) and (len(tg) > 0): - tg = tg[0] # remove html style tags (like <b>, …) - if (tg.text is not None): - li = "%s" % (tg.text) - if (len(li.strip()) > 0): - p2x_text.append({ - 'page': pg, - 'font': fnt, - 'text': li.strip() - }) - - - -#### Try to find fontspec of flow's lines -###### 1. By line recognition -for fl in flow: - for bl in fl['blocks']: - for li in bl['lines']: - nocc = 0 - fo = 0 - for ligne in p2x_text: - if (ligne['text'] == li['text']) and (ligne['page'] == fl['page']): - nocc += 1 - if (nocc == 2) and (ligne['font'] == fo): - nocc = 1 - fo = ligne['font'] - if (nocc == 1): - li['font'] = fo - else: - li['font'] = None -###### 2. Block uniformization -for fl in flow: - for bl in fl['blocks']: - for li in bl['lines']: - if (li['font'] is None): - h = round(li['height']) - fnt = None - for li2 in bl['lines']: - if (fnt is None) \ - and (round(li2['height']) == h) \ - and (li2['font'] is not None): - fnt = li2['font'] - if (fnt is not None): - li['font'] = fnt - - -#### Page bottom detection -pb = 'dummy' -if (flow[-1]['page'] == 1): pb = None -while (pb is not None): - pb = None - last_lines = [] - last_read_page = flow[0]['page'] - last_read_line = 'Foo' - for fl in flow: - if (fl['page'] != last_read_page): - last_lines.append(re.sub(r'[^a-zA-Z]', '', last_read_line)) - last_read_line = fl['blocks'][-1]['lines'][-1]['text'] - last_read_page = fl['page'] - last_lines.append(re.sub(r'[^a-zA-Z]', '', last_read_line)) - - ### Is last_lines filled with the same string ? 
- pb = last_lines[0] - for li in last_lines[1:]: - if (pb is not None) and (pb != li): pb = None - - ### Yes, so mark these lines to be removed - if (pb is not None): - print("#####> %s" % pb) - last_read_page = flow[0]['page'] - last_read_flow = flow[0] - for fl in flow[1:]: - if (fl['page'] != last_read_page): - print(' xxx> %s' % last_read_flow['blocks'][-1]['lines'][-1]['text']) - del last_read_flow['blocks'][-1]['lines'][-1] - if not last_read_flow['blocks'][-1]['lines']: # ie it's empty - del last_read_flow['blocks'][-1] - #print('****> %d' % len(last_read_flow['blocks'])) - #if last_read_page == 1: print(' **> %s' % flow) - if not last_read_flow['blocks']: - flow.remove(last_read_flow) - #if last_read_page == 1: print(' ··> %s' % flow) - last_read_flow = fl - last_read_page = fl['page'] - print(' xxx> %s' % last_read_flow['blocks'][-1]['lines'][-1]['text']) - del last_read_flow['blocks'][-1]['lines'][-1] - if not last_read_flow['blocks'][-1]['lines']: # ie it's empty - del last_read_flow['blocks'][-1] - #print('****> %d' % len(last_read_flow['blocks'])) - if not last_read_flow['blocks']: - flow.remove(last_read_flow) - - - - -#### Calcultate some stats -font_sizes = {} -pipe = [] -bl_num = 0 -fl_num = 0 -for fl in flow: - for bl in fl['blocks']: - for li in bl['lines']: - h = round(li['height']) - if (pipe == []): - pipe.append(h) - else: - #if (h != pipe[-1]): - pipe.append(h) - if (font_sizes.get(h) is None): - font_sizes[h] = {'nb_lines':1, 'nb_cars':li['nb_cars'], - 'nb_words':li['nb_words'], 'blocks':[bl_num], - 'flows':[fl_num]} - else: - font_sizes[h]['nb_lines'] += 1 - font_sizes[h]['nb_cars'] += li['nb_cars'] - font_sizes[h]['nb_words'] += li['nb_words'] - if (font_sizes[h]['blocks'][-1] != bl_num): - font_sizes[h]['blocks'].append(bl_num) - if (font_sizes[h]['flows'][-1] != fl_num): - font_sizes[h]['flows'].append(fl_num) - bl_num += 1 - fl_num += 1 - - -#### Choose "normal" fontsize. 
-print('') -normal_font = 0 -nf_nword = 0 -for ft in sorted(font_sizes.keys()): - f = font_sizes.get(ft) - if (f['nb_words'] > nf_nword): - normal_font = ft - nf_nword = f['nb_words'] - - - - - -#### Prints p2x_text content -for fnt in fontspec: - print(fnt) -#print('=============================================>') -#for li in p2x_text: -# print('[p. %d][%d] %s' % (li['page'], li['font'], li['text'])) -print('<=============================================') -print('') - - -#### Prints font stats -print('') -for ft in sorted(font_sizes.keys()): - f = font_sizes.get(ft) - print("[%d] ====> %d flows, %d blocks, %d lines, %d words, %d cars" %(ft, - len(f['flows']), len(f['blocks']), - f['nb_lines'], f['nb_words'], f['nb_cars'])) -print('') -print('---> %d' % normal_font) -print('') -print('=============================================') - - -#### Prints flow content, but just for normal and bigger than normal text. -#for fl in flow: -# print('') -# print('[p. %d]' % fl['page']) -# for bl in fl['blocks']: -# h = 0.0; -# for li in bl['lines']: h += li['height'] -# n = len(bl['lines']) -# print(' ---------------------------------------> [%d]' % round(h/n)) -# for li in bl['lines']: -# if (li['font'] is not None): -# print(' [%2d] (%d) %s' % (li['font'], round(li['height']), li['text'])) -# else: -# print(' (%d) %s' % (round(li['height']), li['text'])) -# print(' <---------------------------------------') - -BIG_ONLY = True -print('') -nb_blocks = 0 -for fl in flow: - if (nb_blocks > 0): - print('') - #print('[p. %d]' % fl['page']) - nb_blocks = 0 - for bl in fl['blocks']: - h = 0.0; - for li in bl['lines']: h += li['height'] - n = len(bl['lines']) - h = round(h/n) - if (h >= normal_font) or not BIG_ONLY: - nb_blocks += 1 - print(' <---------------------------------------- (p. 
%d)' % bl['page']) - for li in bl['lines']: - if (li['font'] is not None): - print(' [%2d] (%d) %s' % (li['font'], round(li['height']), li['text'])) - else: - print(' (%d) %s' % (round(li['height']), li['text'])) - if (nb_blocks > 0): - print(' ---------------------------------------->') diff --git a/src/py/p2b.py b/src/py/p2b.py deleted file mode 100644 index 0b80c61742793d08f3a3171fa0eb02dcf133587f..0000000000000000000000000000000000000000 --- a/src/py/p2b.py +++ /dev/null @@ -1,649 +0,0 @@ -import xml.etree.ElementTree as ET -import os -import sys -import re - -# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python -import subprocess - -from p2b_utils import levenshtein - -### Script pour faire tout le corpus : -# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done - - -CMD_PDFTOTEXT = '/usr/sbin/pdftotext' -CMD_PDFTOHTML = '/usr/sbin/pdftohtml' - -LEFT_THRESHOLD = 25 # In p2b_text_utils.add_lines() : the max horizontal space - # to consider aligned items to be on the same line. - -FLAG_NONE = 0x0000 -SMALL_FONT = 0x0001 -# BIG_FONT = 0x0002 -> Unused -PAGE_BOTTOM = 0x0004 -MANY_FONTS = 0x0010 -IS_BULLET = 0x0020 -DEFAULT_FONT_SIZE = 0x0040 -TITLE_SMALLER_THAN_SUBTITLE = 0x0080 - - -TITLE_MAX_LINES = 2 - -TITLE_MIN_CHAR = 2 # To avoid “styled†bullet : we consider that a font never - # used for more than TITLE_MIN_CHAR characters per line - # is a kind of text styling and will take the next line's font - -SIMILARITY_THRESHOLD = 1.0 - -# Celle là est un peu compliquée : Pour détecter la structure, on compte -# le nombre de successions d'un changement de police de caractères vers -# un autre (ex : la fonte 3 succède *2* fois à la fonte 8). -# Si ce nombre est trop peu élevé (<= NB_SUCCESSION_FOR_SAME) alors -# on considère que 8 n'est pas un titre de 3, et qu'ils sont au même niveau. 
-# Sinon, on considère que 8 est un niveau au-dessus dans la hiérarchie des -# titres, sous-titres, … -NB_SUCCESSION_FOR_SAME = 0 - -# Regex -INDICES_EXPOSANTS_USUELS = [ - 'er|ère|ere', # 1er, 1ère, … - 'nde?', # 2nd - 'i?[eè]me', # 3ème, 4ieme, … - '°', -] - - -# +--------------------------------------------------------------+ -# | get_pdftotext | -# +--------------------------------------------------------------+ -def get_pdftotext(filename): - # Calls pdftotext and retreive standard output in a string (o) - basename = os.path.splitext(filename)[0] - cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-'] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - o, e = proc.communicate() - if (proc.returncode != 0): - print('-S-> Command pdftotext returned an error :') - print(' ' + e.decode('utf8')) - return [] - - # Parse xml code and create block table. - xml = o.decode('utf8') - root = ET.fromstring(xml) - - page_num = 0 - flow_num = 0 - blocks = [] - for body in root: - if (body.tag.endswith('body')): - for doc in body: - if (doc.tag.endswith('doc')): - for page in doc: - if (page.tag.endswith('page')): - page_num += 1 - for fl in page: - if (fl.tag.endswith('flow')): - flow_num += 1 - for bloc in fl: - if (bloc.tag.endswith('block')): - bl = {'page': page_num, 'flow': flow_num, 'lines': [], - 'flags': FLAG_NONE, - 'x_min': float(bloc.get('xMin')), - 'x_max': float(bloc.get('xMax')), - 'y_min': float(bloc.get('yMin')), - 'y_max': float(bloc.get('yMax')), - } - for line in bloc: - if (line.tag.endswith('line')): - h = float(line.get('yMax')) - float(line.get('yMin')) - li = { 'text': '', 'height': h, 'words': [], - 'flags': FLAG_NONE, - 'x_min': float(bloc.get('xMin')), - 'x_max': float(bloc.get('xMax')), - 'y_min': float(bloc.get('yMin')), - 'y_max': float(bloc.get('yMax')), - } - last_nbcar = 0 - last_h = 0 - for word in line: - if (word.tag.endswith('word')): - hword = float(word.get('yMax')) - 
float(word.get('yMin')) - li['words'].append({'height': hword, 'text': word.text}) - if ((hword != last_h) and (last_nbcar < 2)): - # This is to avoid separation of one big capital - # letter at the beginin of a title or paragraph. - last_h = hword - if len(re.sub(r'\W','', li['text'])) == 0: - li['text'] = "%s %s" % (li['text'], word.text) - else: - li['text'] = "%s%s" % (li['text'], word.text) - else: - li['text'] = "%s %s" % (li['text'], word.text) - li['text'] = li['text'].strip() - last_nbcar = len(word.text) - bl['lines'].append(li) - blocks.append(bl) - return blocks - - -# +--------------------------------------------------------------+ -# | get_pdftohtml | -# +--------------------------------------------------------------+ -def get_pdftohtml(filename): - basename = os.path.splitext(filename)[0] - cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - o, e = proc.communicate() - if (proc.returncode != 0): - print('-S-> Command pdftohtml returned an error :') - print(' ' + e.decode('utf8')) - return None - - # Parse xml code and create block table. 
- xml = o.decode('utf8') - root = ET.fromstring(xml) - - fontspec = [] - segments = [] - for page in root: - if (page.tag.endswith('page')): - pg = int(page.get('number')) - for tg in page: - if (tg.tag.endswith('fontspec')): - fontspec.append({ - 'id': int(tg.get('id')), - 'size': int(tg.get('size')), - 'family': tg.get('family'), - 'color': tg.get('color'), - 'nb_cars': 0 - }) - elif (tg.tag.endswith('text')): - fnt = int(tg.get('font')) - top = int(tg.get('top')) - left = int(tg.get('left')) - width = int(tg.get('width')) - height = int(tg.get('height')) - while (tg.text is None) and (len(tg) > 0): - tg = tg[0] # remove html style tags (like <b>, …) - if (tg.text is not None): - li = "%s" % (tg.text) - if (len(li.strip()) > 0): - segments.append({'page': pg, 'font': fnt, - 'top': top, 'left': left, - 'width': width, 'height': height, - 'text': li.strip() - }) - # Find font in fontspec - for font in fontspec: - if font['id'] == fnt: break - font['nb_cars'] += len(li.strip()) - return { 'fonts': fontspec, 'segments': segments } - - -# +--------------------------------------------------------------+ -# | get_default_font_size | -# +--------------------------------------------------------------+ -def get_default_font_size(fontspec): - sizes = {} - max_cars = 0 - size_max_cars = 42 # Doesn't matter : it'll change - for f in fontspec: - if sizes.get(f['size']) is None: - sizes[f['size']] = f['nb_cars'] - else: - sizes[f['size']] += f['nb_cars'] - if sizes[f['size']] > max_cars: - max_cars = sizes[f['size']] - size_max_cars = f['size'] - return size_max_cars - - -# +--------------------------------------------------------------+ -# | mark_small_fonts | -# +--------------------------------------------------------------+ -# RQ : Also marks bullet lines -def mark_small_fonts(blocks, default_font_size): - for b in blocks: - for l in b['lines']: - if (round(l['height']) < default_font_size): - l['flags'] |= SMALL_FONT - if len(re.sub(r'\W','', l['text'])) == 0: - l['flags'] 
|= IS_BULLET - - -# +--------------------------------------------------------------+ -# | mark_page_bottom | -# +--------------------------------------------------------------+ -def mark_page_bottom(blocks): - if (blocks[-1]['page'] == 1): return - - # Find indexes of last blocks in pages - bndx = [] - for i in range(0, len(blocks) - 1): - if (blocks[i]['page'] != blocks[i+1]['page']): - bndx.append(i) - bndx.append(len(blocks)-1) - - # Get last line indexes - lndx = [] - for i in bndx: - lndx.append(len(blocks[i]['lines'])-1) - - # Loop while finding always same characters in last lines - end = False - while not end: - txt = None - # Test if last lines characters are the same - for i,j in zip(bndx, lndx): - li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text']) - if txt is None: txt = li - else: end = (txt != li) - # All last line are the same, so mark them - if not end: - for i in range(0, len(bndx)): - blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= PAGE_BOTTOM - lndx[i] -= 1 - if (lndx[i] < 0): - #-# blocks[bndx[i]]['flags'] |= PAGE_BOTTOM - bndx[i] -= 1 - lndx[i] = len(blocks[bndx[i]]['lines']) - 1 - end = bndx[i] < 0 - -# +--------------------------------------------------------------+ -# | is_ind_exp | -# +--------------------------------------------------------------+ -# Is it an indice or exposant ? -def is_ind_exp(str): - for ie in INDICES_EXPOSANTS_USUELS: - if re.match(ie, str): - return True - return False - -# +--------------------------------------------------------------+ -# | get_lines | -# +--------------------------------------------------------------+ -# Extract lines from 'text' attribute returned by get_pdftohtml and associates -# a font id (and the page number), which is the font used by the higher number -# of characters of the line. 
-# Does a column splitting considering the value of LEFT_THRESHOLD -def get_lines(segments, fontspec): - last_top = -1 - line_no = -1 - last_right = 0 - for txt in segments: - if (txt['top'] == last_top) and ((txt['left'] - last_right) <= LEFT_THRESHOLD): - txt['line'] = line_no - elif is_ind_exp(txt['text'].strip()): - txt['line'] = line_no - else: - line_no += 1 - txt['line'] = line_no - last_top = txt['top'] - last_right = txt['left'] + txt['width'] - - for f in fontspec: - if 'same_line' not in f: - f['same_line'] = [] - - lines = [] - last_line = -2 - li = '' - fnt = {} - page_num = segments[0]['page'] - for txt in segments: - if (txt['line'] != last_line) or (txt == segments[-1]): - if (len(li.strip()) > 0): - fnt_no = -1; max_car = 0; - for f in fnt.keys(): - if (fnt[f] > max_car): - max_car = fnt[f] - fnt_no = f - lines.append({ 'text': li.strip(), - 'most_used_font': fnt_no, - 'nb_fonts': len(fnt), - 'page': page_num}) - li = txt['text'].strip() - last_line = txt['line'] - for fi1 in fnt.keys(): - for fi2 in fnt.keys(): - if fi1 != fi2: - f1 = next(it for it in fontspec if it['id'] == int(fi1)) - f2 = next(it for it in fontspec if it['id'] == int(fi2)) - if (f2['id'] not in f1['same_line']): - f1['same_line'].append(f2['id']) - f2['same_line'].append(f1['id']) - fnt = {} - fnt[txt['font']] = len(li.strip()) - else: - if (is_ind_exp(txt['text'])): - li = "%s%s" % (li, txt['text'].strip()) - else: - li = "%s %s" % (li, txt['text'].strip()) - if (fnt.get(txt['font']) is None): - fnt[txt['font']] = len(txt['text'].strip()) - else: - fnt[txt['font']] += len(txt['text'].strip()) - page_num = txt['page'] - return lines - -# +--------------------------------------------------------------+ -# | guess_fonts | -# +--------------------------------------------------------------+ -# Tries to guess fontspec of each line into blocks list. -# It calculates the levenshtein distance with every segment of the same page -# and assigns the best matching score's font. 
-def guess_fonts(blocks, segments, fontspec): - lines = get_lines(segments, fontspec) - ndx_lines = [0,] # Indexation des indices de line par numéro de page - for ndx in range(1, len(lines)): - if (lines[ndx-1]['page'] != lines[ndx]['page']): - ndx_lines.append(ndx) - ndx_lines.append(len(lines)) - - for f in fontspec: - f['nb_lines'] = 0 - f['dist_sum'] = 0 - #f['block_pos_sum'] = 0 - - for bl in blocks: - for l in bl['lines']: - if (len(l['text']) > 0): - min_dist = len(l['text']) - min_score = 1.0 - font_sel = -1 - line_no = -1 - for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]): - if (len(lines[i]['text']) > 0): - d = levenshtein(l['text'], lines[i]['text']) - if (d == 0): - min_dist = 0 - min_score = 0.0 - font_sel = lines[i]['most_used_font'] - line_no = i - break; - score = float(d) / float(max(len(l['text']), len(lines[i]['text']))) - if (score <= SIMILARITY_THRESHOLD): - if (d < min_dist): - min_dist = d - min_score = score - font_sel = lines[i]['most_used_font'] - line_no = i - l['font'] = font_sel - if (font_sel >= 0): - fnt = next(it for it in fontspec if it['id'] == font_sel) - fnt['nb_lines'] +=1 - fnt['dist_sum'] += min_dist - l['score'] = min_score # For debuggin purpose - l['dist'] = min_dist # idem. - l['line_no'] = line_no # idem. Stores the "similar line" number - # print("> %s" % l['text']) - # print(" %s" % lines[line_no]['text']) - # print(" [%d]" % font_sel) - # print("") - if (lines[line_no]['nb_fonts'] > 1): - l['flags'] |= MANY_FONTS - -# +--------------------------------------------------------------+ -# | replace_block_fonts | -# +--------------------------------------------------------------+ -# Adds a 'short_font' attribute to lines which gives another font value which -# doesn't care about style (bold, …). -# RK: def_size is default_font_size, used to mark SMALL_FONT flag. 
-def replace_block_fonts(blocks, fontspec, def_size): - for i in range(0, len(fontspec) - 1): - for j in range(i+1, len(fontspec)): - if (fontspec[j].get('replaceWith') is None): - if (fontspec[j]['id'] in fontspec[i]['same_line']): - if fontspec[i].get('replaceWith') is None: - fontspec[j]['replaceWith'] = fontspec[i]['id'] - else: - fontspec[j]['replaceWith'] = fontspec[i]['replaceWith'] - for bl in blocks: - for l in bl['lines']: - if (l['font'] < 0): - f = None - else: - f = next(it for it in fontspec if it['id'] == l['font']) - if (f is None) or (f.get('replaceWith') is None): - l['short_font'] = l['font'] - else: - l['short_font'] = f.get('replaceWith') - if (f is not None): - f = next(it for it in fontspec if it['id'] == l['short_font']) - if (f['size'] < def_size): - l['flags'] |= SMALL_FONT - if (f['size'] == def_size): - l['flags'] |= DEFAULT_FONT_SIZE - - -# +--------------------------------------------------------------+ -# | guess_structure | -# +--------------------------------------------------------------+ -def guess_structure(blocks, fontspec, - remove_flags = SMALL_FONT | PAGE_BOTTOM | IS_BULLET): - t = [] # A list used here and there - n = [] # Another one - - # Search for the most used font - # Here, t will be used to count the number of cars of each font. - # and n will be used to store the maximum line size for each font. - for i in range(len(fontspec)): - t.append(0) - n.append(0) - nb_max = -1 - ndx_most_used = -1 - for bl in blocks: - for l in bl['lines']: - if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE): - lon = len(l['text'].strip()) - t[l['short_font']] += lon - if lon > n[l['short_font']]: n[l['short_font']] = lon - if (t[l['short_font']] > nb_max): - nb_max = t[l['short_font']] - ndx_most_used = l['short_font'] - b = [nb <= TITLE_MIN_CHAR for nb in n] - - ### ndx_most_used is the most used font number. - ### b[font_number] is True if the font seems used for bullets. 
- - t = [] # We'll use it to list the fonts succession - n = [] # Used to count the number of lines - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if t == []: - t.append(l['short_font']) - n.append(1) - else: - if (t[-1] != l['short_font']): - t.append(l['short_font']) - n.append(1) - else: - n[-1] += 1 - - f = {} # Will contain used font numbers and number of occurences in t - for i,j in zip(t,n): - if i not in f.keys(): - f[i] = {'nb': 1, 'nl':j, 'maxl': j, - 'is_bullet': b[i], 'flags': FLAG_NONE} - else: - f[i]['nb'] += 1 - f[i]['nl'] += j - if (j > f[i]['maxl']): - f[i]['maxl'] = j - - for i in f.keys(): - f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES) - - # Replace short_font for lines considered as bullets (or text styling). - last_bullet_lines = [] - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if f[l['short_font']]['is_bullet']: - last_bullet_lines.append(l) - else: - if (len(last_bullet_lines) > 0): - for last in last_bullet_lines: - last['short_font'] = l['short_font'] - last_bullet_lines = [] - if (len(last_bullet_lines) > 0): - for last in last_bullet_lines: - last['short_font'] = ndx_most_used - - # n and b won't be used anymore I think. So they're free - - # Rebuild the font succession list (is not optimized but is the safest) - t = [] - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if t == []: t.append(l['short_font']) - else: - if (t[-1] != l['short_font']): - t.append(l['short_font']) - - b = [] # We'll do a 2d table with b[i][j] = number of transitions - # from fonti to fontj (will be a tree of font transitions) - for i in range(len(fontspec)+1): # Consider len+1 to have font number -1 - b.append([0 for j in range(len(fontspec)+1)]) - for i in range(len(t)-1): - j = i+1 - if not f[t[i]]['isnt_title']: - b[t[i]][t[j]] += 1 - - # Should we do this : ? 
&&&&&&&&&&&&&&&&&&&& A tester - # Un moyen de s'assurer que tout sera parcouru... - # Signifie qu'on ne finit pas sur un titre. - f[t[-1]]['isnt_title'] = True - - # Create a deep attribute in f which contains distance from leaves - for k,v in f.items(): - if v['isnt_title']: - v['deep'] = 0 - v['nb_transitions'] = 999999999 - else: v['deep'] = None - - # Algo : dans le tableau b, on parcourt les colonnes (j) pour les fontes qui ont un deep. - # Un indice (i) de ligne pour lequel la valeur b[i][j] est non nulle signifie - # que la fonte i - # précède la fonte j b[i][j] fois. - # Rq : Une colonne vide pour un indice dont la ligne est non-vide est une racine - # Une ligne vide pour un indice dont la colonne est non-vide est une feuille - # On répète tant qu'on change des valeurs (c'est pas optimisé mais crotte, - # le tableau n'est pas si grand) - has_changed = True - deep_max = 0 - while has_changed: - has_changed = False - for k,v in f.items(): - if v['deep'] is not None: - for i in range(-1,len(b)-1): - if b[i][k] != 0: - if f[i]['deep'] is None: - if (b[i][k] <= NB_SUCCESSION_FOR_SAME): - f[i]['deep'] = v['deep'] - f[i]['nb_transitions'] = b[i][k] - else: - f[i]['deep'] = v['deep'] + 1 - f[i]['nb_transitions'] = b[i][k] - if f[i]['deep'] > deep_max: - deep_max = f[i]['deep'] - has_changed = True - if (fontspec[i]['size'] < fontspec[k]['size']): - f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE - elif f[i]['nb_transitions'] < b[i][k]: - f[i]['deep'] = v['deep'] + 1 - f[i]['nb_transitions'] = b[i][k] - has_changed = True - if (fontspec[i]['size'] < fontspec[k]['size']): - f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE - - # Reverse deepness value, to make it distance from root - for v in f.values(): - if (v['deep'] is not None): - v['deep'] = deep_max - v['deep'] - - # Add deep in blocks lines - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - l['deep'] = f[l['short_font']]['deep'] - if ((f[l['short_font']]['flags']) & 
TITLE_SMALLER_THAN_SUBTITLE != 0): - l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE - else: - l['deep'] = deep_max - - - - - - - -# +--------------------------------------------------------------+ -# | print_block_list | -# +--------------------------------------------------------------+ -def print_block_list(t, remove_flags = FLAG_NONE): - last_page = -1 - deep_max = -1 - for bl in t: - for l in bl['lines']: - if (l.get('deep') is not None): - if deep_max < l['deep']: deep_max = l['deep'] - if deep_max > 10: deep_max = 10 - ttl = "#############" - last_deep = -1 - - for block in t: - if (block['page'] != last_page): - if (last_page > 0): - print("") - last_page = block['page'] - print("________________________________") - print("*page %d*" % last_page) - - print("") - - for l in block['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - pre = '' - post = ' ' - if (l.get('deep') is None): - pre = '!! ' - last_deep = -1 - else: - if (l['flags'] & SMALL_FONT) != 0: - pre = "> %s" % pre - #if (len(l['text']) > 20) and \ - # len(re.sub(r'\w','', l['text']).strip()) > 5: - # post = "%s " % post - if (l['flags'] & TITLE_SMALLER_THAN_SUBTITLE) != 0 and \ - (l['flags'] & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0: - pre = "%s**" % (pre) - post = "**%s" % post - elif l['deep'] < deep_max: - pre = "%s%s " % (pre, ttl[0:(l['deep']+1)]) - last_deep = l['deep'] - print("%s%s%s" % (pre, l['text'], post)) - - -# +--------------------------------------------------------------+ -# | main | -# +--------------------------------------------------------------+ -if (len(sys.argv) < 1): - print("-U-> Usage : python pdf2blocks.py <fichier_pdf>") - sys.exit(-1) - -blocks = get_pdftotext(sys.argv[1]) -p2h = get_pdftohtml(sys.argv[1]) -fontspec = p2h['fonts'] -segments = p2h['segments'] - -default_font_size = get_default_font_size(fontspec) -# mark_small_fonts(blocks, default_font_size) -mark_page_bottom(blocks) -guess_fonts(blocks, segments, fontspec) -replace_block_fonts(blocks, fontspec, 
default_font_size) -guess_structure(blocks, fontspec) -print_block_list(blocks, PAGE_BOTTOM | IS_BULLET) diff --git a/src/py/p2b_analyse.py b/src/py/p2b_analyse.py deleted file mode 100644 index 4339529817dab29ed543a7ff2e6f9a07db5c779e..0000000000000000000000000000000000000000 --- a/src/py/p2b_analyse.py +++ /dev/null @@ -1,36 +0,0 @@ -# Quelques fonctions pour analyser les retours des fonctions -# get_pdftotext et get_pdftohtml - - -def get_pdftotext_fontstats(blocks): - # TODO: Changer en comptage mot par mot. - font_sizes = {} - pipe = [] - bl_num = 0 - fl_num = 0 - for bl in blocks: - for li in bl['lines']: - h = round(li['height']) ##### IMPORTANT : on arrondit. - if (font_sizes.get(h) is None): - font_sizes[h] = {'nb_lines':len(bl['lines']), - 'nb_cars':bl['nb_cars'], 'nb_words':bl['nb_words'], - 'nb_blocks': 1} - else: - font_sizes[h]['nb_lines'] += len(bl['lines']) - font_sizes[h]['nb_cars'] += bl['nb_cars'] - #font_sizes[h]['nb_words'] += li['nb_words'] - font_sizes[h]['nb_blocks'] += 1 - return font_sizes - -def get_pdftotext_normal_fontsize(font_sizes): - normal_font = 0 - nf_nword = 0 - for ft in sorted(font_sizes.keys()): - f = font_sizes.get(ft) -# if (f['nb_words'] > nf_nword): -# normal_font = ft -# nf_nword = f['nb_words'] - if (f['nb_cars'] > nf_nword): - normal_font = ft - nf_nword = f['nb_cars'] - return normal_font diff --git a/src/py/p2b_blocks_utils.py b/src/py/p2b_blocks_utils.py deleted file mode 100644 index 0c60f624cbcf15d01a62df94ebc39602b61201be..0000000000000000000000000000000000000000 --- a/src/py/p2b_blocks_utils.py +++ /dev/null @@ -1,371 +0,0 @@ -import re -from p2b_config import * -from p2b_utils import levenshtein -from p2b_text_utils import get_lines - - - -def print_block_list(t, remove_flags = FLAG_NONE): - last_page = -1 - last_flow = -1 - write_flow = False - - deep_max = -1 - for bl in t: - for l in bl['lines']: - if (l.get('deep') is not None): - if deep_max < l['deep']: deep_max = l['deep'] - if deep_max > 10: deep_max = 
10 - ttl = "###########" - last_deep = -1 - - for block in t: - if (block['page'] != last_page): - if (last_page > 0): - print("") - last_page = block['page'] - print("________________________________") - print("*page %d*" % last_page) - - if (block['flow'] != last_flow): - write_flow = True - last_flow = block['flow'] - - print("") - - for l in block['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if write_flow: - #print("+----------------------------------------------------------------------------+") - write_flow = False - pre = '' - if (l.get('deep') is None): - pre = '!! ' - last_deep = -1 - else: - if l['deep'] < deep_max: - pre = "%s " % ttl[0:(l['deep']+1)] - #if (l['deep'] <= 3) and (l['deep'] != last_deep): - # print("") - last_deep = l['deep'] - print("%s%s" % (pre, l['text'])) - - -def print_block_list_old(t, remove_flags = FLAG_NONE): - last_page = -1 - last_flow = -1 - for block in t: - if (block['page'] != last_page): - if (last_page > 0): - print("") - last_page = block['page'] - print("== [page %d] ==================================================================" % last_page) - if (block['flow'] != last_flow): - print("+----------------------------------------------------------------------------+") - last_flow = block['flow'] - - flags = '' - if (block['flags'] == FLAG_NONE): flags = ' ' - else: flags = "<%x>" % block['flags'] - - #print("| → %s w:%d c:%d, [(%.2f, %.2f) ; (%.2f, %.2f)]" % (flags, - # block['nb_words'], block['nb_cars'], - # block['x_min'], block['y_min'], block['x_max'], block['y_max'])) - for l in block['lines']: - if (l['flags'] & remove_flags != 0): - break - lflags = ' ' - if (l['flags'] == FLAG_NONE): - lflags = '---→' - else: - lflags = "(%2x)" % l['flags'] - mf_flag = '-' if ((l.get('flags') & FLAG_MANY_FONTS) == 0) else '*' - - if l.get('font') is None: - font = "[ - %2d - ]" % round(l['height']) - else: - if l.get('short_font') is None: - font = "[%2d %s %.2f]" %(l.get('font'), mf_flag, l['score']) - else: - 
font = "%s%2d%s" % (mf_flag, l.get('short_font'), mf_flag) - - print(" %s %s %s" % (lflags, font, l['text'])) - - #print("") - - -def mark_blocks_fontsize(blocks, normal_size): - for b in blocks: - if (round(b['h_max']) < normal_size): - b['flags'] |= FLAG_SMALL_FONT - if (round(b['h_min']) > normal_size): - b['flags'] |= FLAG_BIG_FONT - for l in b['lines']: - if (round(l['height']) < normal_size): - l['flags'] |= FLAG_SMALL_FONT - elif (round(l['height']) > normal_size): - l['flags'] |= FLAG_BIG_FONT - - -def mark_bullet_lines(blocks): - for b in blocks: - for l in b['lines']: - if len(re.sub(r'\W','', l['text'])) == 0: - l['flags'] |= FLAG_BULLET - - -def mark_page_bottom(blocks): - if (blocks[-1]['page'] == 1): return - - # Find indexes of last blocks in pages - bndx = [] - for i in range(0, len(blocks) - 1): - if (blocks[i]['page'] != blocks[i+1]['page']): - bndx.append(i) - bndx.append(len(blocks)-1) - - # Get last line indexes - lndx = [] - for i in bndx: - lndx.append(len(blocks[i]['lines'])-1) - - # Loop while finding always same characters in those lines - end = False - while not end: - txt = None - # Test if last lines characters are the same - for i,j in zip(bndx, lndx): - li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text']) - if txt is None: txt = li - else: end = (txt != li) - # All last line are the same, so mark them - if not end: - for i in range(0, len(bndx)): - blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= FLAG_PAGE_BOTTOM - lndx[i] -= 1 - if (lndx[i] < 0): - blocks[bndx[i]]['flags'] |= FLAG_PAGE_BOTTOM - bndx[i] -= 1 - lndx[i] = len(blocks[bndx[i]]['lines']) - 1 - end = bndx[i] < 0 - - -# Tries to guess fontspec of each line of text. -# It calculates the levenshtein distance with every line of the same page. -# If the result divided by the longuest string length is lower than -# SIMILARITY_THRESHOLD (defined in p2b_config), then it's used to calculate -# the best matching string. 
Of course, if a distance of 0 is found, -# it's considered to be the line we're looking for. -def guess_fonts(blocks, text, fontspec): - lines = get_lines(text, fontspec) - ndx_lines = [0,] # Indexation des indices de line par numéro de page - for ndx in range(1, len(lines)): - if (lines[ndx-1]['page'] != lines[ndx]['page']): - ndx_lines.append(ndx) - ndx_lines.append(len(lines)) - - for f in fontspec: - f['nb_lines'] = 0 - f['dist_sum'] = 0 - #f['block_pos_sum'] = 0 - - for bl in blocks: - for l in bl['lines']: - if (len(l['text']) > 0): - min_dist = len(l['text']) - min_score = 1.0 - font_sel = -1 - line_no = -1 - for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]): - if (len(lines[i]['text']) > 0): - d = levenshtein(l['text'], lines[i]['text']) - if (d == 0): - min_dist = 0 - min_score = 0.0 - font_sel = lines[i]['most_used_font'] - line_no = i - break; - score = float(d) / float(max(len(l['text']), len(lines[i]['text']))) - if (score <= SIMILARITY_THRESHOLD): - if (d < min_dist): - min_dist = d - min_score = score - font_sel = lines[i]['most_used_font'] - line_no = i - l['font'] = font_sel - if (font_sel >= 0): - fnt = next(it for it in fontspec if it['id'] == font_sel) - fnt['nb_lines'] +=1 - fnt['dist_sum'] += min_dist - l['score'] = min_score # For debuggin purpose - l['dist'] = min_dist # idem. - l['line_no'] = line_no # idem. Stores the "similar line" number - if (lines[line_no]['nb_fonts'] > 1): - l['flags'] |= FLAG_MANY_FONTS - - -# Adds a 'short_font' attribute to lines which gives another font value which -# doesn't care about style (bold, …). 
-def replace_block_fonts(blocks, fonts): - for bl in blocks: - for l in bl['lines']: - if (l['font'] < 0) or (fonts[l['font']].get('replaceWith') is None): - l['short_font'] = l['font'] - else: - l['short_font'] = fonts[l['font']].get('replaceWith') - - - -def guess_structure(blocks, fonts, - remove_flags = FLAG_SMALL_FONT | FLAG_PAGE_BOTTOM | FLAG_BULLET): - t = [] # A list used here and there - n = [] # Another one - - # Search for the most used font - # Here, t will be used to count the number of cars of each font. - # and n will be used to store the maximum line size for each font. - for i in range(len(fonts)): - t.append(0) - n.append(0) - nb_max = -1 - ndx_most_used = -1 - for bl in blocks: - for l in bl['lines']: - if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE): - lon = len(l['text'].strip()) - t[l['short_font']] += lon - if lon > n[l['short_font']]: n[l['short_font']] = lon - if (t[l['short_font']] > nb_max): - nb_max = t[l['short_font']] - ndx_most_used = l['short_font'] - b = [nb <= TITLE_MIN_CHAR for nb in n] - - ### ndx_most_used is the most used font number. - ### b[font_number] is True if the font seems used for bullets. 
- - t = [] # We'll use it to list the fonts succession - n = [] # Used to count the number of lines - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if t == []: - t.append(l['short_font']) - n.append(1) - else: - if (t[-1] != l['short_font']): - t.append(l['short_font']) - n.append(1) - else: - n[-1] += 1 - - f = {} # Will contain used font numbers and number of occurences in t - for i,j in zip(t,n): - if i not in f.keys(): - f[i] = {'nb': 1, 'nl':j, 'maxl': j, 'is_bullet': b[i]} - else: - f[i]['nb'] += 1 - f[i]['nl'] += j - if (j > f[i]['maxl']): - f[i]['maxl'] = j - - for i in f.keys(): - # a = float(f[i]['nl']) / f[i]['nb'] - # f[i]['isnt_title'] = (a > TITLE_MAX_MEAN_LINES) - f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES) - - # Replace short_font for lines considered as bullets (or text styling). - last_bullet_lines = [] - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if f[l['short_font']]['is_bullet']: - last_bullet_lines.append(l) - else: - if (len(last_bullet_lines) > 0): - for last in last_bullet_lines: - last['short_font'] = l['short_font'] - last_bullet_lines = [] - if (len(last_bullet_lines) > 0): - for last in last_bullet_lines: - last['short_font'] = ndx_most_used - - # n and b won't be used anymore I think. 
So they're free - - # Rebuild the font succession list (is not optimized but is the safest) - t = [] - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - if t == []: t.append(l['short_font']) - else: - if (t[-1] != l['short_font']): - t.append(l['short_font']) - - b = [] # We'll do a 2d table with b[i][j] = number of transitions - # from fonti to fontj (will be a tree of font transitions) - for i in range(len(fonts)+1): # Consider len+1 to have font number -1 - b.append([0 for j in range(len(fonts)+1)]) - for i in range(len(t)-1): - j = i+1 - if not f[t[i]]['isnt_title']: - b[t[i]][t[j]] += 1 - - # Should we do this : ? &&&&&&&&&&&&&&&&&&&& A tester - # Un moyen de s'assurer que tout sera parcouru... - # Signifie qu'on ne finit pas sur un titre. - f[t[-1]]['isnt_title'] = True - - # Create a deep attribute in f which contains distance from leaves - for k,v in f.items(): - if v['isnt_title']: - v['deep'] = 0 - v['nb_transitions'] = 999999999 - else: v['deep'] = None - - # Algo : dans le tableau b, on parcourt les colonnes (j) pour les fontes qui ont un deep. - # Un indice (i) de ligne pour lequel la valeur b[i][j] est non nulle signifie - # que la fonte i - # précède la fonte j b[i][j] fois. 
- # Rq : Une colonne vide pour un indice dont la ligne est non-vide est une racine - # Une ligne vide pour un indice dont la colonne est non-vide est une feuille - # On répète tant qu'on change des valeurs (c'est pas optimisé mais crotte, - # le tableau n'est pas si grand) - has_changed = True - deep_max = 0 - while has_changed: - has_changed = False - for k,v in f.items(): - if v['deep'] is not None: - for i in range(-1,len(b)-1): - if b[i][k] != 0: - if f[i]['deep'] is None: - if (b[i][k] <= NB_SUCCESSION_FOR_SAME): - f[i]['deep'] = v['deep'] - f[i]['nb_transitions'] = b[i][k] - else: - f[i]['deep'] = v['deep'] + 1 - f[i]['nb_transitions'] = b[i][k] - if f[i]['deep'] > deep_max: - deep_max = f[i]['deep'] - has_changed = True - elif f[i]['nb_transitions'] < b[i][k]: - f[i]['deep'] = v['deep'] + 1 - f[i]['nb_transitions'] = b[i][k] - has_changed = True - - # Reverse deepness value, to make it distance from root - for v in f.values(): - if (v['deep'] is not None): - v['deep'] = deep_max - v['deep'] - - # Add deep in blocks lines - for bl in blocks: - for l in bl['lines']: - if (l['flags'] & remove_flags) == FLAG_NONE: - l['deep'] = f[l['short_font']]['deep'] - else: - l['deep'] = deep_max - - #for k,v in f.items(): - # if (v['deep'] is not None): - # print("[%2d] → %d" % (k, v['deep'])) diff --git a/src/py/p2b_config.py b/src/py/p2b_config.py deleted file mode 100644 index b5f6172572ef1c4765c080d9e997ceb45a244e47..0000000000000000000000000000000000000000 --- a/src/py/p2b_config.py +++ /dev/null @@ -1,45 +0,0 @@ - - -CMD_PDFTOTEXT = '/usr/sbin/pdftotext' -CMD_PDFTOHTML = '/usr/sbin/pdftohtml' - -FLAG_NONE = 0x0000 -FLAG_SMALL_FONT = 0x0001 # Block or line has too small fontsize -FLAG_BIG_FONT = 0x0002 # Block or line has bigger then normal fontsize -FLAG_PAGE_BOTTOM = 0x0004 -FLAG_PAGE_TOP = 0x0008 -FLAG_MANY_FONTS = 0x0010 # pdftohtml return different fonts for the associated line -FLAG_BULLET = 0x0020 - -# For font guessing (see guess_fonts in p2b_blocks_utils) 
-SIMILARITY_THRESHOLD = 1.0 -LEFT_THRESHOLD = 20 # In p2b_text_utils.add_lines() : the max horizontal space - # to consider aligned items to be on the same line. - -#BOLD_FONT_THRESHOLD = 0.5 -#TITLE_MAX_MEAN_LINES = 2.0 # For each “short font”, the mean number of succesive - # lines is computed. If this mean is higher than - # TITLE_MAX_MEAN_LINES, we consider this font is not - # used for titles. -TITLE_MAX_LINES = 2 # Replaces TITLE_MAX_MEAN_LINES - -TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never - # used for more than TITLE_MIN_CHAR characters per line - # is a kind of text styling and will take the next line's font - -# Celle là est un peu compliquée : Pour détecter la structure, on compte -# le nombre de successions d'un changement de police de caractères vers -# un autre (ex : la fonte 3 succède *2* fois à la fonte 8). -# Si ce nombre est trop peu élevé (<= NB_SUCCESSION_FOR_SAME) alors -# on considère que 8 n'est pas un titre de 3, et qu'ils sont au même niveau. 
-# Sinon, on considère que 8 est un niveau au-dessus dans la hiérarchie des -# titres, sous-titres, … -NB_SUCCESSION_FOR_SAME = 0 - -# Regex -INDICES_EXPOSANTS_USUELS = [ - 'er|ère|ere', # 1er, 1ère, … - 'nde?', # 2nd - 'i?[eè]me', # 3ème, 4ieme, … - '°', -] diff --git a/src/py/p2b_file.py b/src/py/p2b_file.py deleted file mode 100644 index 60d9111eb99cf099e68a75914a5a9ecc59a83b8b..0000000000000000000000000000000000000000 --- a/src/py/p2b_file.py +++ /dev/null @@ -1,128 +0,0 @@ -import xml.etree.ElementTree as ET -import os -#import sys -import re - -# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python -import subprocess - -from p2b_config import CMD_PDFTOTEXT, CMD_PDFTOHTML, FLAG_NONE - -def get_pdftotext(filename): - # Calls pdftotext and retreive standard output in a string (o) - basename = os.path.splitext(filename)[0] - cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-'] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - o, e = proc.communicate() - if (proc.returncode != 0): - print('-S-> Command pdftotext returned an error :') - print(' ' + e.decode('utf8')) - return [] - - # Parse xml code and create block table. 
- xml = o.decode('utf8') - root = ET.fromstring(xml) - - page_num = 0 - flow_num = 0 - blocks = [] - for body in root: - if (body.tag.endswith('body')): - for doc in body: - if (doc.tag.endswith('doc')): - for page in doc: - if (page.tag.endswith('page')): - page_num += 1 - for fl in page: - if (fl.tag.endswith('flow')): - flow_num += 1 - for bloc in fl: - if (bloc.tag.endswith('block')): - bl = {'page': page_num, 'flow': flow_num, 'lines': [], - 'h_min': 150000, 'h_max': 0, - 'x_min': float(bloc.get('xMin')), - 'x_max': float(bloc.get('xMax')), - 'y_min': float(bloc.get('yMin')), - 'y_max': float(bloc.get('yMax')), - 'nb_cars': 0, 'nb_words': 0, 'flags': FLAG_NONE } - bwords = 0 - bcars = 0 - for line in bloc: - if (line.tag.endswith('line')): - h = float(line.get('yMax')) - float(line.get('yMin')) - if (h < bl['h_min']): bl['h_min'] = h - if (h > bl['h_max']): bl['h_max'] = h - li = { 'text': '', 'height': h, 'words': [], - 'nb_words': 0, 'nb_cars': 0, 'flags': FLAG_NONE } - last_nbcar = 0 - last_h = 0 - for word in line: - if (word.tag.endswith('word')): - hword = float(word.get('yMax')) - float(word.get('yMin')) - li['words'].append({'height': hword, 'text': word.text}) - if ((hword != last_h) and (last_nbcar < 2)): - # This is to avoid separation of one big capital - # letter at the beginin of a title or paragraph. 
- last_h = hword - if len(re.sub(r'\W','', li['text'])) == 0: - li['text'] = "%s %s" % (li['text'], word.text) - else: - li['text'] = "%s%s" % (li['text'], word.text) - else: - li['text'] = "%s %s" % (li['text'], word.text) - li['text'] = li['text'].strip() - last_nbcar = len(word.text) - li['nb_words'] += 1 - bl['lines'].append(li) - bl['nb_cars'] += len(li['text'].strip()) - bl['nb_words'] += li['nb_words'] - blocks.append(bl) - return blocks - - -def get_pdftohtml(filename): - basename = os.path.splitext(filename)[0] - cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - o, e = proc.communicate() - if (proc.returncode != 0): - print('-S-> Command pdftohtml returned an error :') - print(' ' + e.decode('utf8')) - return None - - # Parse xml code and create block table. - xml = o.decode('utf8') - root = ET.fromstring(xml) - - fontspec = [] - p2x_text = [] - for page in root: - if (page.tag.endswith('page')): - pg = int(page.get('number')) - for tg in page: - if (tg.tag.endswith('fontspec')): - fontspec.append({ - 'id': int(tg.get('id')), - 'size': int(tg.get('size')), - 'family': tg.get('family'), - 'color': tg.get('color') - }) - elif (tg.tag.endswith('text')): - fnt = int(tg.get('font')) - top = int(tg.get('top')) - left = int(tg.get('left')) - width = int(tg.get('width')) - height = int(tg.get('height')) - while (tg.text is None) and (len(tg) > 0): - tg = tg[0] # remove html style tags (like <b>, …) - if (tg.text is not None): - li = "%s" % (tg.text) - if (len(li.strip()) > 0): - p2x_text.append({ - 'page': pg, - 'font': fnt, - 'top': top, 'left': left, - 'width': width, 'height': height, - 'text': li.strip() - }) - return { 'fonts': fontspec, 'text': p2x_text } diff --git a/src/py/p2b_fonts_utils.py b/src/py/p2b_fonts_utils.py deleted file mode 100644 index 0cb507765b01a23a867f7f737097e69abb997da5..0000000000000000000000000000000000000000 --- 
a/src/py/p2b_fonts_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -import re -from p2b_config import * - -# Defines a 'sameAs' attribute, containing a list of similar fonts, -# considering style attributes (bold, slanted, …) not important. -def group_fonts(fonts): - for i in range(0, len(fonts) - 1): - for j in range(i+1, len(fonts)): - if (fonts[i]['size'] == fonts[j]['size']) and \ - (fonts[i]['color'] == fonts[j]['color']) and \ - (fonts[i]['family'].split(',')[0] == fonts[j]['family'].split(',')[0]): - if (fonts[i].get('sameAs') is None): - fonts[i]['sameAs'] = [j] - else: - fonts[i]['sameAs'].append(j) - if (fonts[j].get('sameAs') is None): - fonts[j]['sameAs'] = [i] - else: - fonts[j]['sameAs'].append(i) - - -# Same as group_fonts, except that every sameAs (which is named replaceWith) -# points out only one font (the first one identified 'sameAs'). For example, -# if the fonts n° 5, 8 and 12 are identified "same", it will return : -# - replaceWith is None for font n° 5 -# - replaceWith is 5 for fonts 8 and 12. -# Thus, replacing whole block fonts with replaceWith value will reduce -# the number of fonts used for the whole document. -def replace_fonts(fonts): - for i in range(0, len(fonts) - 1): - for j in range(i+1, len(fonts)): - if (fonts[j].get('replaceWith') is None): - if (fonts[j]['id'] in fonts[i]['same_line']): - if fonts[i].get('replaceWith') is None: - fonts[j]['replaceWith'] = i - else: - fonts[j]['replaceWith'] = fonts[i]['replaceWith'] -#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -#### BUG !!!!!!!!! 
-#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -#### &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& -# L'algo ne convient pas à la nouvelle forme d'unification ; -# Si on a 2 sur la même ligne que 4, puis 4 sur la même ligne que 7, alors on a : -# 2:same_line = [4], et 4:same_line = [2,7] -# et la conséquence est que l'on obtient : -# 2:replaceWith <- None, 4:replaceWith <- 2, 7:replaceWith <- 4. -# -# Rustine posée, (if fonts[i]['replaceWith'] is None) à tester. - - - ## Regroupement par famille/couleur/taille : - #if (fonts[i]['size'] == fonts[j]['size']) and \ - # (fonts[i]['color'] == fonts[j]['color']) and \ - # (fonts[i]['family'].split(',')[0] == fonts[j]['family'].split(',')[0]): - # fonts[j]['replaceWith'] = i diff --git a/src/py/p2b_text_utils.py b/src/py/p2b_text_utils.py deleted file mode 100644 index 11d5134d4fe483e4364c11f0fb0f1f2b209ade8d..0000000000000000000000000000000000000000 --- a/src/py/p2b_text_utils.py +++ /dev/null @@ -1,82 +0,0 @@ -import re -from p2b_config import * - -# Is it an indice or expoesant ? -def is_ind_exp(str): - for ie in INDICES_EXPOSANTS_USUELS: - if re.match(ie, str): - return True - return False - -def add_lines(text): - last_top = -1 - line_no = -1 - last_right = 0 - for txt in text: - if (txt['top'] == last_top) and ((txt['left'] - last_right) < LEFT_THRESHOLD): - txt['line'] = line_no - elif is_ind_exp(txt['text'].strip()): - txt['line'] = line_no - else: - line_no += 1 - txt['line'] = line_no - last_top = txt['top'] - last_right = txt['left'] + txt['width'] -# for txt in text: -# print("[%d] (%d | %d ←→ %d) %s" %(txt['line'], txt['top'], txt['left'], (txt['left'] + txt['width']), txt['text'])) - - -# Extract lines from 'text' attribute returned by get_pdftohtml and associates -# a font id (and the page number), which is the font used by the higher number -# of characters of the line. 
-def get_lines(text, fontspec): - add_lines(text) - - for f in fontspec: - if 'same_line' not in f: - f['same_line'] = [] - - lines = [] - last_line = -2 - li = '' - fnt = {} - page_num = text[0]['page'] - for txt in text: - if (txt['line'] != last_line) or (txt == text[-1]): - if (len(li.strip()) > 0): - fnt_no = -1; max_car = 0; - for f in fnt.keys(): - if (fnt[f] > max_car): - max_car = fnt[f] - fnt_no = f - lines.append({ 'text': li.strip(), - 'most_used_font': fnt_no, - 'nb_fonts': len(fnt), - 'page': page_num}) - li = txt['text'].strip() - last_line = txt['line'] - for fi1 in fnt.keys(): - for fi2 in fnt.keys(): - if fi1 != fi2: - f1 = next(it for it in fontspec if it['id'] == int(fi1)) - f2 = next(it for it in fontspec if it['id'] == int(fi2)) - if (f2['id'] not in f1['same_line']): - f1['same_line'].append(f2['id']) - f2['same_line'].append(f1['id']) - fnt = {} - fnt[txt['font']] = len(li.strip()) - else: - if (is_ind_exp(txt['text'])): - li = "%s%s" % (li, txt['text'].strip()) - else: - li = "%s %s" % (li, txt['text'].strip()) - if (fnt.get(txt['font']) is None): - fnt[txt['font']] = len(txt['text'].strip()) - else: - fnt[txt['font']] += len(txt['text'].strip()) - page_num = txt['page'] - return lines - -def print_lines(lines): - for l in lines: - print("(%d) [%d] %s" % (l['page'], l['most_used_font'], l['text'])) diff --git a/src/py/pdf2blocks.py b/src/py/pdf2blocks.py index 186ff3991d2a67541be09695a58aa88bb23e2c86..0b80c61742793d08f3a3171fa0eb02dcf133587f 100644 --- a/src/py/pdf2blocks.py +++ b/src/py/pdf2blocks.py @@ -1,63 +1,649 @@ +import xml.etree.ElementTree as ET +import os import sys +import re -from p2b_file import get_pdftotext, get_pdftohtml -from p2b_blocks_utils import * -from p2b_text_utils import * -from p2b_fonts_utils import * -from p2b_analyse import get_pdftotext_fontstats, get_pdftotext_normal_fontsize +# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python +import subprocess +from p2b_utils import 
levenshtein + +### Script pour faire tout le corpus : +# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done + + +CMD_PDFTOTEXT = '/usr/sbin/pdftotext' +CMD_PDFTOHTML = '/usr/sbin/pdftohtml' + +LEFT_THRESHOLD = 25 # In p2b_text_utils.add_lines() : the max horizontal space + # to consider aligned items to be on the same line. + +FLAG_NONE = 0x0000 +SMALL_FONT = 0x0001 +# BIG_FONT = 0x0002 -> Unused +PAGE_BOTTOM = 0x0004 +MANY_FONTS = 0x0010 +IS_BULLET = 0x0020 +DEFAULT_FONT_SIZE = 0x0040 +TITLE_SMALLER_THAN_SUBTITLE = 0x0080 + + +TITLE_MAX_LINES = 2 + +TITLE_MIN_CHAR = 2 # To avoid “styled” bullet : we consider that a font never + # used for more than TITLE_MIN_CHAR characters per line + # is a kind of text styling and will take the next line's font + +SIMILARITY_THRESHOLD = 1.0 + +# Celle là est un peu compliquée : Pour détecter la structure, on compte +# le nombre de successions d'un changement de police de caractères vers +# un autre (ex : la fonte 3 succède *2* fois à la fonte 8). +# Si ce nombre est trop peu élevé (<= NB_SUCCESSION_FOR_SAME) alors +# on considère que 8 n'est pas un titre de 3, et qu'ils sont au même niveau. 
+# Sinon, on considère que 8 est un niveau au-dessus dans la hiérarchie des +# titres, sous-titres, … +NB_SUCCESSION_FOR_SAME = 0 + +# Regex +INDICES_EXPOSANTS_USUELS = [ + 'er|ère|ere', # 1er, 1ère, … + 'nde?', # 2nd + 'i?[eè]me', # 3ème, 4ieme, … + '°', +] + + +# +--------------------------------------------------------------+ +# | get_pdftotext | +# +--------------------------------------------------------------+ +def get_pdftotext(filename): + # Calls pdftotext and retreive standard output in a string (o) + basename = os.path.splitext(filename)[0] + cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-'] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + o, e = proc.communicate() + if (proc.returncode != 0): + print('-S-> Command pdftotext returned an error :') + print(' ' + e.decode('utf8')) + return [] + + # Parse xml code and create block table. + xml = o.decode('utf8') + root = ET.fromstring(xml) + + page_num = 0 + flow_num = 0 + blocks = [] + for body in root: + if (body.tag.endswith('body')): + for doc in body: + if (doc.tag.endswith('doc')): + for page in doc: + if (page.tag.endswith('page')): + page_num += 1 + for fl in page: + if (fl.tag.endswith('flow')): + flow_num += 1 + for bloc in fl: + if (bloc.tag.endswith('block')): + bl = {'page': page_num, 'flow': flow_num, 'lines': [], + 'flags': FLAG_NONE, + 'x_min': float(bloc.get('xMin')), + 'x_max': float(bloc.get('xMax')), + 'y_min': float(bloc.get('yMin')), + 'y_max': float(bloc.get('yMax')), + } + for line in bloc: + if (line.tag.endswith('line')): + h = float(line.get('yMax')) - float(line.get('yMin')) + li = { 'text': '', 'height': h, 'words': [], + 'flags': FLAG_NONE, + 'x_min': float(bloc.get('xMin')), + 'x_max': float(bloc.get('xMax')), + 'y_min': float(bloc.get('yMin')), + 'y_max': float(bloc.get('yMax')), + } + last_nbcar = 0 + last_h = 0 + for word in line: + if (word.tag.endswith('word')): + hword = float(word.get('yMax')) - 
float(word.get('yMin')) + li['words'].append({'height': hword, 'text': word.text}) + if ((hword != last_h) and (last_nbcar < 2)): + # This is to avoid separation of one big capital + # letter at the beginin of a title or paragraph. + last_h = hword + if len(re.sub(r'\W','', li['text'])) == 0: + li['text'] = "%s %s" % (li['text'], word.text) + else: + li['text'] = "%s%s" % (li['text'], word.text) + else: + li['text'] = "%s %s" % (li['text'], word.text) + li['text'] = li['text'].strip() + last_nbcar = len(word.text) + bl['lines'].append(li) + blocks.append(bl) + return blocks + + +# +--------------------------------------------------------------+ +# | get_pdftohtml | +# +--------------------------------------------------------------+ +def get_pdftohtml(filename): + basename = os.path.splitext(filename)[0] + cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + o, e = proc.communicate() + if (proc.returncode != 0): + print('-S-> Command pdftohtml returned an error :') + print(' ' + e.decode('utf8')) + return None + + # Parse xml code and create block table. 
+ xml = o.decode('utf8') + root = ET.fromstring(xml) + + fontspec = [] + segments = [] + for page in root: + if (page.tag.endswith('page')): + pg = int(page.get('number')) + for tg in page: + if (tg.tag.endswith('fontspec')): + fontspec.append({ + 'id': int(tg.get('id')), + 'size': int(tg.get('size')), + 'family': tg.get('family'), + 'color': tg.get('color'), + 'nb_cars': 0 + }) + elif (tg.tag.endswith('text')): + fnt = int(tg.get('font')) + top = int(tg.get('top')) + left = int(tg.get('left')) + width = int(tg.get('width')) + height = int(tg.get('height')) + while (tg.text is None) and (len(tg) > 0): + tg = tg[0] # remove html style tags (like <b>, …) + if (tg.text is not None): + li = "%s" % (tg.text) + if (len(li.strip()) > 0): + segments.append({'page': pg, 'font': fnt, + 'top': top, 'left': left, + 'width': width, 'height': height, + 'text': li.strip() + }) + # Find font in fontspec + for font in fontspec: + if font['id'] == fnt: break + font['nb_cars'] += len(li.strip()) + return { 'fonts': fontspec, 'segments': segments } + + +# +--------------------------------------------------------------+ +# | get_default_font_size | +# +--------------------------------------------------------------+ +def get_default_font_size(fontspec): + sizes = {} + max_cars = 0 + size_max_cars = 42 # Doesn't matter : it'll change + for f in fontspec: + if sizes.get(f['size']) is None: + sizes[f['size']] = f['nb_cars'] + else: + sizes[f['size']] += f['nb_cars'] + if sizes[f['size']] > max_cars: + max_cars = sizes[f['size']] + size_max_cars = f['size'] + return size_max_cars + + +# +--------------------------------------------------------------+ +# | mark_small_fonts | +# +--------------------------------------------------------------+ +# RQ : Also marks bullet lines +def mark_small_fonts(blocks, default_font_size): + for b in blocks: + for l in b['lines']: + if (round(l['height']) < default_font_size): + l['flags'] |= SMALL_FONT + if len(re.sub(r'\W','', l['text'])) == 0: + l['flags'] 
|= IS_BULLET + + +# +--------------------------------------------------------------+ +# | mark_page_bottom | +# +--------------------------------------------------------------+ +def mark_page_bottom(blocks): + if (blocks[-1]['page'] == 1): return + + # Find indexes of last blocks in pages + bndx = [] + for i in range(0, len(blocks) - 1): + if (blocks[i]['page'] != blocks[i+1]['page']): + bndx.append(i) + bndx.append(len(blocks)-1) + + # Get last line indexes + lndx = [] + for i in bndx: + lndx.append(len(blocks[i]['lines'])-1) + + # Loop while finding always same characters in last lines + end = False + while not end: + txt = None + # Test if last lines characters are the same + for i,j in zip(bndx, lndx): + li = re.sub(r'[^a-zA-Z]', '', blocks[i]['lines'][j]['text']) + if txt is None: txt = li + else: end = (txt != li) + # All last line are the same, so mark them + if not end: + for i in range(0, len(bndx)): + blocks[bndx[i]]['lines'][lndx[i]]['flags'] |= PAGE_BOTTOM + lndx[i] -= 1 + if (lndx[i] < 0): + #-# blocks[bndx[i]]['flags'] |= PAGE_BOTTOM + bndx[i] -= 1 + lndx[i] = len(blocks[bndx[i]]['lines']) - 1 + end = bndx[i] < 0 + +# +--------------------------------------------------------------+ +# | is_ind_exp | +# +--------------------------------------------------------------+ +# Is it an indice or exposant ? +def is_ind_exp(str): + for ie in INDICES_EXPOSANTS_USUELS: + if re.match(ie, str): + return True + return False + +# +--------------------------------------------------------------+ +# | get_lines | +# +--------------------------------------------------------------+ +# Extract lines from 'text' attribute returned by get_pdftohtml and associates +# a font id (and the page number), which is the font used by the higher number +# of characters of the line. 
+# Does a column splitting considering the value of LEFT_THRESHOLD +def get_lines(segments, fontspec): + last_top = -1 + line_no = -1 + last_right = 0 + for txt in segments: + if (txt['top'] == last_top) and ((txt['left'] - last_right) <= LEFT_THRESHOLD): + txt['line'] = line_no + elif is_ind_exp(txt['text'].strip()): + txt['line'] = line_no + else: + line_no += 1 + txt['line'] = line_no + last_top = txt['top'] + last_right = txt['left'] + txt['width'] + + for f in fontspec: + if 'same_line' not in f: + f['same_line'] = [] + + lines = [] + last_line = -2 + li = '' + fnt = {} + page_num = segments[0]['page'] + for txt in segments: + if (txt['line'] != last_line) or (txt == segments[-1]): + if (len(li.strip()) > 0): + fnt_no = -1; max_car = 0; + for f in fnt.keys(): + if (fnt[f] > max_car): + max_car = fnt[f] + fnt_no = f + lines.append({ 'text': li.strip(), + 'most_used_font': fnt_no, + 'nb_fonts': len(fnt), + 'page': page_num}) + li = txt['text'].strip() + last_line = txt['line'] + for fi1 in fnt.keys(): + for fi2 in fnt.keys(): + if fi1 != fi2: + f1 = next(it for it in fontspec if it['id'] == int(fi1)) + f2 = next(it for it in fontspec if it['id'] == int(fi2)) + if (f2['id'] not in f1['same_line']): + f1['same_line'].append(f2['id']) + f2['same_line'].append(f1['id']) + fnt = {} + fnt[txt['font']] = len(li.strip()) + else: + if (is_ind_exp(txt['text'])): + li = "%s%s" % (li, txt['text'].strip()) + else: + li = "%s %s" % (li, txt['text'].strip()) + if (fnt.get(txt['font']) is None): + fnt[txt['font']] = len(txt['text'].strip()) + else: + fnt[txt['font']] += len(txt['text'].strip()) + page_num = txt['page'] + return lines + +# +--------------------------------------------------------------+ +# | guess_fonts | +# +--------------------------------------------------------------+ +# Tries to guess fontspec of each line into blocks list. +# It calculates the levenshtein distance with every segment of the same page +# and assigns the best matching score's font. 
+def guess_fonts(blocks, segments, fontspec): + lines = get_lines(segments, fontspec) + ndx_lines = [0,] # Indexation des indices de line par numéro de page + for ndx in range(1, len(lines)): + if (lines[ndx-1]['page'] != lines[ndx]['page']): + ndx_lines.append(ndx) + ndx_lines.append(len(lines)) + + for f in fontspec: + f['nb_lines'] = 0 + f['dist_sum'] = 0 + #f['block_pos_sum'] = 0 + + for bl in blocks: + for l in bl['lines']: + if (len(l['text']) > 0): + min_dist = len(l['text']) + min_score = 1.0 + font_sel = -1 + line_no = -1 + for i in range(ndx_lines[bl['page']-1], ndx_lines[bl['page']]): + if (len(lines[i]['text']) > 0): + d = levenshtein(l['text'], lines[i]['text']) + if (d == 0): + min_dist = 0 + min_score = 0.0 + font_sel = lines[i]['most_used_font'] + line_no = i + break; + score = float(d) / float(max(len(l['text']), len(lines[i]['text']))) + if (score <= SIMILARITY_THRESHOLD): + if (d < min_dist): + min_dist = d + min_score = score + font_sel = lines[i]['most_used_font'] + line_no = i + l['font'] = font_sel + if (font_sel >= 0): + fnt = next(it for it in fontspec if it['id'] == font_sel) + fnt['nb_lines'] +=1 + fnt['dist_sum'] += min_dist + l['score'] = min_score # For debuggin purpose + l['dist'] = min_dist # idem. + l['line_no'] = line_no # idem. Stores the "similar line" number + # print("> %s" % l['text']) + # print(" %s" % lines[line_no]['text']) + # print(" [%d]" % font_sel) + # print("") + if (lines[line_no]['nb_fonts'] > 1): + l['flags'] |= MANY_FONTS + +# +--------------------------------------------------------------+ +# | replace_block_fonts | +# +--------------------------------------------------------------+ +# Adds a 'short_font' attribute to lines which gives another font value which +# doesn't care about style (bold, …). +# RK: def_size is default_font_size, used to mark SMALL_FONT flag. 
def replace_block_fonts(blocks, fontspec, def_size):
    """Give every line a 'short_font' and flag its size against def_size.

    Fonts that were seen on a common line (their 'same_line' lists) are
    merged: the later font of fontspec receives a 'replaceWith' key pointing
    to the earlier one (or to that one's own replacement).  Each line of
    blocks then gets 'short_font' = the replacement of its 'font' (or the
    font itself), and SMALL_FONT / DEFAULT_FONT_SIZE is or-ed into its
    'flags' when the size of that short font is below / equal to def_size.
    Lines whose 'font' is negative keep it as 'short_font' and get no flag.
    """
    for i in range(len(fontspec) - 1):
        for j in range(i + 1, len(fontspec)):
            if fontspec[j].get('replaceWith') is not None:
                continue
            if fontspec[j]['id'] in fontspec[i]['same_line']:
                if fontspec[i].get('replaceWith') is None:
                    fontspec[j]['replaceWith'] = fontspec[i]['id']
                else:
                    fontspec[j]['replaceWith'] = fontspec[i]['replaceWith']

    for bl in blocks:
        for l in bl['lines']:
            if l['font'] < 0:
                f = None
            else:
                f = next(it for it in fontspec if it['id'] == l['font'])

            if f is None or f.get('replaceWith') is None:
                l['short_font'] = l['font']
            else:
                l['short_font'] = f.get('replaceWith')

            if f is not None:
                f = next(it for it in fontspec if it['id'] == l['short_font'])
                if f['size'] < def_size:
                    l['flags'] |= SMALL_FONT
                if f['size'] == def_size:
                    l['flags'] |= DEFAULT_FONT_SIZE


# +--------------------------------------------------------------+
# |                      guess_structure                         |
# +--------------------------------------------------------------+
def guess_structure(blocks, fontspec,
                    remove_flags=SMALL_FONT | PAGE_BOTTOM | IS_BULLET):
    """Derive a title depth ('deep') for every line from font transitions.

    Lines having any bit of remove_flags set are ignored by the analysis and
    simply receive the maximum depth.  For the others, the function:
      1. finds the font carrying the most characters and the fonts whose
         longest line is at most TITLE_MIN_CHAR (treated as bullets);
      2. re-assigns bullet lines to the font of the next non-bullet line
         (or to the most used font when none follows);
      3. counts the transitions font i -> font j between consecutive runs
         and propagates a depth backwards from the fonts that are not titles
         (a run longer than TITLE_MAX_LINES, or the last font used);
      4. writes 'deep' on every line and may or TITLE_SMALLER_THAN_SUBTITLE
         into its 'flags'.

    NOTE(review): 'short_font' values are used directly as list indices, so
    font ids are assumed to be 0..len(fontspec)-1; the value -1 wraps to the
    last row/column/font -- confirm this is intended.
    """
    def is_kept(line):
        return (line['flags'] & remove_flags) == FLAG_NONE

    # --- Search for the most used font --------------------------------
    # nb_cars[font]: number of characters; longest[font]: longest line.
    nb_cars = [0] * len(fontspec)
    longest = [0] * len(fontspec)
    nb_max = -1
    ndx_most_used = -1
    for bl in blocks:
        for l in bl['lines']:
            font = l['short_font']
            if font >= 0 and is_kept(l):
                lon = len(l['text'].strip())
                nb_cars[font] += lon
                if lon > longest[font]:
                    longest[font] = lon
                if nb_cars[font] > nb_max:
                    nb_max = nb_cars[font]
                    ndx_most_used = font
    # is_bullet_font[font] is True if the font seems used for bullets.
    is_bullet_font = [nb <= TITLE_MIN_CHAR for nb in longest]

    # --- Font succession: t = fonts of consecutive runs, n = run lengths
    t = []
    n = []
    for bl in blocks:
        for l in bl['lines']:
            if is_kept(l):
                if t and t[-1] == l['short_font']:
                    n[-1] += 1
                else:
                    t.append(l['short_font'])
                    n.append(1)

    # f[font]: number of runs ('nb'), of lines ('nl'), longest run ('maxl').
    f = {}
    for font, run_length in zip(t, n):
        if font not in f:
            f[font] = {'nb': 1, 'nl': run_length, 'maxl': run_length,
                       'is_bullet': is_bullet_font[font], 'flags': FLAG_NONE}
        else:
            f[font]['nb'] += 1
            f[font]['nl'] += run_length
            if run_length > f[font]['maxl']:
                f[font]['maxl'] = run_length

    for stats in f.values():
        stats['isnt_title'] = stats['maxl'] > TITLE_MAX_LINES

    # --- Replace short_font for lines considered as bullets (or styling)
    last_bullet_lines = []
    for bl in blocks:
        for l in bl['lines']:
            if is_kept(l):
                if f[l['short_font']]['is_bullet']:
                    last_bullet_lines.append(l)
                elif last_bullet_lines:
                    for last in last_bullet_lines:
                        last['short_font'] = l['short_font']
                    last_bullet_lines = []
    # Bullets with no following text fall back to the most used font.
    for last in last_bullet_lines:
        last['short_font'] = ndx_most_used

    # --- Rebuild the font succession list (not optimized but the safest)
    t = []
    for bl in blocks:
        for l in bl['lines']:
            if is_kept(l):
                if not t or t[-1] != l['short_font']:
                    t.append(l['short_font'])

    # b[i][j] = number of transitions from font i to font j (a tree of font
    # transitions).  One extra row/column so that index -1 is addressable.
    size = len(fontspec) + 1
    b = [[0 for _ in range(size)] for _ in range(size)]
    for i in range(len(t) - 1):
        if not f[t[i]]['isnt_title']:
            b[t[i]][t[i + 1]] += 1

    # Open question from the author (to be tested): forcing the last font to
    # be "not a title" is a way to make sure everything gets traversed; it
    # means the document does not end on a title.
    if t:  # nothing to force when every line was filtered out
        f[t[-1]]['isnt_title'] = True

    # Create a deep attribute in f which contains distance from leaves
    for v in f.values():
        if v['isnt_title']:
            v['deep'] = 0
            v['nb_transitions'] = 999999999
        else:
            v['deep'] = None

    # Algorithm: in table b, walk the columns (k) of the fonts that already
    # have a depth.  A row index (i) with a non-zero b[i][k] means that font
    # i precedes font k b[i][k] times.
    # Note: an empty column for an index whose row is not empty is a root;
    #       an empty row for an index whose column is not empty is a leaf.
    # Repeat as long as values change (not optimized, but the table is small).
    has_changed = True
    deep_max = 0
    while has_changed:
        has_changed = False
        for k, v in f.items():
            if v['deep'] is None:
                continue
            for i in range(-1, len(b) - 1):
                if b[i][k] == 0:
                    continue
                if f[i]['deep'] is None:
                    if b[i][k] <= NB_SUCCESSION_FOR_SAME:
                        f[i]['deep'] = v['deep']
                    else:
                        f[i]['deep'] = v['deep'] + 1
                    f[i]['nb_transitions'] = b[i][k]
                elif f[i]['nb_transitions'] < b[i][k]:
                    f[i]['deep'] = v['deep'] + 1
                    f[i]['nb_transitions'] = b[i][k]
                else:
                    continue
                # Track the maximum in both update paths, otherwise the
                # reversal below could produce negative depths.
                if f[i]['deep'] > deep_max:
                    deep_max = f[i]['deep']
                has_changed = True
                if fontspec[i]['size'] < fontspec[k]['size']:
                    f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE

    # Reverse deepness value, to make it distance from root
    for v in f.values():
        if v['deep'] is not None:
            v['deep'] = deep_max - v['deep']

    # Add deep in blocks lines
    for bl in blocks:
        for l in bl['lines']:
            if is_kept(l):
                l['deep'] = f[l['short_font']]['deep']
                if (f[l['short_font']]['flags'] & TITLE_SMALLER_THAN_SUBTITLE) != 0:
                    l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
            else:
                l['deep'] = deep_max


# +--------------------------------------------------------------+
# |                      print_block_list                        |
# +--------------------------------------------------------------+
def print_block_list(t, remove_flags = FLAG_NONE):
    """Print the blocks of t as markdown-like text on standard output.

    A page header is printed whenever block['page'] changes and an empty
    line precedes each block.  Lines having any bit of remove_flags set are
    skipped.  Prefixes: '!! ' when the line has no 'deep', '> ' for
    SMALL_FONT lines, '**...**' for titles smaller than their subtitles, and
    otherwise (deep + 1) '#' characters when the line is above the maximum
    depth (itself capped at 10).
    """
    deep_max = -1
    for bl in t:
        for l in bl['lines']:
            if l.get('deep') is not None and deep_max < l['deep']:
                deep_max = l['deep']
    if deep_max > 10:
        deep_max = 10
    ttl = "#############"

    last_page = -1
    for block in t:
        if block['page'] != last_page:
            if last_page > 0:
                print("")
            last_page = block['page']
            print("________________________________")
            print("*page %d*" % last_page)

        print("")

        for l in block['lines']:
            if (l['flags'] & remove_flags) != FLAG_NONE:
                continue
            pre = ''
            # NOTE(review): the reviewed text had its whitespace collapsed;
            # confirm how many trailing spaces this literal should hold.
            post = ' '
            if l.get('deep') is None:
                pre = '!! '
            else:
                if (l['flags'] & SMALL_FONT) != 0:
                    pre = "> %s" % pre
                if (l['flags'] & TITLE_SMALLER_THAN_SUBTITLE) != 0 and \
                        (l['flags'] & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0:
                    pre = "%s**" % (pre)
                    post = "**%s" % post
                elif l['deep'] < deep_max:
                    pre = "%s%s " % (pre, ttl[0:(l['deep'] + 1)])
            print("%s%s%s" % (pre, l['text'], post))


# +--------------------------------------------------------------+
# |                            main                              |
# +--------------------------------------------------------------+
# sys.argv[0] is the script name: the pdf file is missing when fewer than
# two items are present.
if (len(sys.argv) < 2):
    print("-U-> Usage : python pdf2blocks.py <fichier_pdf>")
    sys.exit(-1)

blocks = get_pdftotext(sys.argv[1])
p2h = get_pdftohtml(sys.argv[1])
fontspec = p2h['fonts']
segments = p2h['segments']

default_font_size = get_default_font_size(fontspec)
# mark_small_fonts(blocks, default_font_size)
mark_page_bottom(blocks)
guess_fonts(blocks, segments, fontspec)
replace_block_fonts(blocks, fontspec, default_font_size)
guess_structure(blocks, fontspec)
print_block_list(blocks, PAGE_BOTTOM | IS_BULLET)