An error occurred while loading the file. Please try again.
-
Bernard Stephan authored2ba46941
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
import xml.etree.ElementTree as ET
import os
import sys
import re
# https://unix.stackexchange.com/questions/238180/execute-shell-commands-in-python
import subprocess
from p2b_utils import levenshtein
### Shell one-liner used to process the whole corpus:
# D=~/Boulot/Ontology/BSV/tmp/Corpus/2019/Viticulture; for i in ${D}/*.pdf; do j=$( basename "$i" | sed -e 's/\.pdf//' ); echo $j; python p2b.py ${D}/$j | tee ${D}/${j}.md | markdown -o ${D}/${j}.html ; done
# Paths of the external pdftotext / pdftohtml executables run through subprocess.
CMD_PDFTOTEXT = '/usr/sbin/pdftotext'
CMD_PDFTOHTML = '/usr/sbin/pdftohtml'
LEFT_THRESHOLD = 25 # Used by get_lines() (the original note refers to
# p2b_text_utils.add_lines()): the max horizontal gap between two segments
# sharing the same 'top' for them to be considered part of the same line.
# Bit flags OR-ed into the 'flags' field of the line dictionaries.
FLAG_NONE = 0x0000
SMALL_FONT = 0x0001 # Font (or line height) smaller than the default size
# BIG_FONT = 0x0002 -> Unused
PAGE_BOTTOM = 0x0004 # Set by mark_page_bottom() on repeated page footers
MANY_FONTS = 0x0010 # Set by guess_fonts() when the matched line mixes fonts
IS_BULLET = 0x0020 # Set by mark_small_fonts() on lines without word characters
DEFAULT_FONT_SIZE = 0x0040 # Font size equal to the default size
TITLE_SMALLER_THAN_SUBTITLE = 0x0080 # Set by guess_structure() when a font
# precedes a font of bigger size in the title hierarchy
TITLE_MAX_LINES = 2 # In guess_structure(): a font seen on more than this many
# consecutive lines is not considered a title font.
TITLE_MIN_CHAR = 2 # To avoid "styled" bullets: we consider that a font never
# used for more than TITLE_MIN_CHAR characters per line
# is a kind of text styling and will take the next line's font
SIMILARITY_THRESHOLD = 1.0 # In guess_fonts(): max accepted value of
# (Levenshtein distance / longest text length). That ratio never exceeds 1.0,
# so the current value does not filter anything out.
# This one is a bit tricky: to detect the structure, we count how many times
# a change from one font to another occurs (e.g. font 3 follows font 8
# *2* times).
# If that count is too low (<= NB_SUCCESSION_FOR_SAME) then we consider
# that 8 is not a title of 3, and that both are at the same level.
# Otherwise, we consider that 8 is one level higher in the hierarchy of
# titles, subtitles, …
NB_SUCCESSION_FOR_SAME = 0
# Regular expressions (tried in order by is_ind_exp) describing the usual
# ordinal suffixes / superscripts.
INDICES_EXPOSANTS_USUELS = [
'er|ère|ere', # 1er, 1ère, …
'nde?', # 2nd
'i?[eè]me', # 3ème, 4ieme, …
'°',
]
# +--------------------------------------------------------------+
# | get_pdftotext |
# +--------------------------------------------------------------+
def _iter_tagged(parent, suffix):
    """Yield the direct children of *parent* whose tag ends with *suffix*.

    Tags are compared with ``endswith`` so that a namespace prefix in the tag
    name does not matter.
    """
    for child in parent:
        if child.tag.endswith(suffix):
            yield child


def _bounding_box(elem):
    """Return the xMin/xMax/yMin/yMax attributes of *elem* as a dict of floats."""
    return {
        'x_min': float(elem.get('xMin')),
        'x_max': float(elem.get('xMax')),
        'y_min': float(elem.get('yMin')),
        'y_max': float(elem.get('yMax')),
    }


def _parse_text_line(line):
    """Build the line dictionary (text, height, words, flags, bbox) of a <line>."""
    li = {'text': '',
          'height': float(line.get('yMax')) - float(line.get('yMin')),
          'words': [],
          'flags': FLAG_NONE}
    # The bounding box is the one of the line itself (it used to be copied
    # from the enclosing block, which gave every line of a block the same box).
    li.update(_bounding_box(line))
    last_nbcar = 0
    last_h = 0
    for word in _iter_tagged(line, 'word'):
        text = word.text or ''  # an empty <word/> has text None
        hword = float(word.get('yMax')) - float(word.get('yMin'))
        li['words'].append({'height': hword, 'text': text})
        if hword != last_h and last_nbcar < 2:
            # This is to avoid separation of one big capital
            # letter at the beginning of a title or paragraph: when the
            # height changes right after a word of at most one character,
            # the new word is glued to the text already collected (unless
            # that text holds no word character at all).
            last_h = hword
            glue = '' if re.sub(r'\W', '', li['text']) else ' '
        else:
            glue = ' '
        li['text'] = ('%s%s%s' % (li['text'], glue, text)).strip()
        last_nbcar = len(text)
    return li


def get_pdftotext(filename):
    """Run ``pdftotext -bbox-layout`` and return the text blocks of a PDF.

    The extension of *filename* (if any) is replaced by ``.pdf`` before the
    command is run, and the XML written on standard output is parsed.

    Returns a list of block dictionaries with keys ``page`` (1-based page
    counter), ``flow`` (flow counter running over the whole document),
    ``lines``, ``flags`` and the ``x_min``/``x_max``/``y_min``/``y_max``
    bounding box. Each entry of ``lines`` holds ``text``, ``height``,
    ``words`` (a list of ``{'height', 'text'}``), ``flags`` and its own
    bounding box.

    An empty list is returned (after printing the command's error output)
    when pdftotext exits with a non-zero status.
    """
    basename = os.path.splitext(filename)[0]
    cmd = [CMD_PDFTOTEXT, '-bbox-layout', '-eol', 'unix', '%s.pdf' % basename, '-']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    o, e = proc.communicate()
    if proc.returncode != 0:
        print('-S-> Command pdftotext returned an error :')
        print(' ' + e.decode('utf8'))
        return []

    # Parse xml code and create block table.
    root = ET.fromstring(o.decode('utf8'))
    page_num = 0
    flow_num = 0
    blocks = []
    for body in _iter_tagged(root, 'body'):
        for doc in _iter_tagged(body, 'doc'):
            for page in _iter_tagged(doc, 'page'):
                page_num += 1
                for flow in _iter_tagged(page, 'flow'):
                    flow_num += 1
                    for bloc in _iter_tagged(flow, 'block'):
                        bl = {'page': page_num, 'flow': flow_num,
                              'lines': [], 'flags': FLAG_NONE}
                        bl.update(_bounding_box(bloc))
                        for line in _iter_tagged(bloc, 'line'):
                            bl['lines'].append(_parse_text_line(line))
                        blocks.append(bl)
    return blocks
# +--------------------------------------------------------------+
# | get_pdftohtml |
# +--------------------------------------------------------------+
def get_pdftohtml(filename):
    """Run ``pdftohtml -xml`` and return the fonts and text segments of a PDF.

    The extension of *filename* (if any) is replaced by ``.pdf`` before the
    command is run, and the XML written on standard output is parsed.

    Returns ``{'fonts': [...], 'segments': [...]}`` where

    * each font is ``{'id', 'size', 'family', 'color', 'nb_cars'}``,
      ``nb_cars`` being the number of characters of the segments that
      reference this font id;
    * each segment is ``{'page', 'font', 'top', 'left', 'width', 'height',
      'text'}`` with stripped, non-empty text.

    ``None`` is returned (after printing the command's error output) when
    pdftohtml exits with a non-zero status.
    """
    basename = os.path.splitext(filename)[0]
    cmd = [CMD_PDFTOHTML, '-xml', '-i', '-stdout', '%s.pdf' % basename]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    o, e = proc.communicate()
    if proc.returncode != 0:
        print('-S-> Command pdftohtml returned an error :')
        print(' ' + e.decode('utf8'))
        return None

    # Parse xml code and create the font and segment tables.
    root = ET.fromstring(o.decode('utf8'))
    fontspec = []
    fonts_by_id = {}  # first declaration of each font id
    segments = []
    for page in root:
        if not page.tag.endswith('page'):
            continue
        pg = int(page.get('number'))
        for tg in page:
            if tg.tag.endswith('fontspec'):
                font = {
                    'id': int(tg.get('id')),
                    'size': int(tg.get('size')),
                    'family': tg.get('family'),
                    'color': tg.get('color'),
                    'nb_cars': 0
                }
                fontspec.append(font)
                fonts_by_id.setdefault(font['id'], font)
            elif tg.tag.endswith('text'):
                fnt = int(tg.get('font'))
                top = int(tg.get('top'))
                left = int(tg.get('left'))
                width = int(tg.get('width'))
                height = int(tg.get('height'))
                # Collect the whole text, including what is nested in or
                # follows inline style tags (<b>, <i>, <a>, …). Reading only
                # the leading text used to truncate "foo <b>bar</b> baz"
                # to "foo".
                text = ''.join(tg.itertext()).strip()
                if not text:
                    continue
                segments.append({'page': pg, 'font': fnt,
                                 'top': top, 'left': left,
                                 'width': width, 'height': height,
                                 'text': text})
                # Credit the characters to the referenced font only; an
                # unknown id no longer falls through to the last font seen.
                font = fonts_by_id.get(fnt)
                if font is not None:
                    font['nb_cars'] += len(text)
    return {'fonts': fontspec, 'segments': segments}
# +--------------------------------------------------------------+
# | get_default_font_size |
# +--------------------------------------------------------------+
def get_default_font_size(fontspec, fallback=42):
    """Return the font size used by the largest number of characters.

    *fontspec* is an iterable of font dictionaries holding at least ``size``
    and ``nb_cars``. Character counts of fonts sharing the same size are
    accumulated in iteration order, and the size whose running total first
    reaches the highest value is returned.

    *fallback* is returned when no size accumulates at least one character
    (empty *fontspec*, or every ``nb_cars`` equal to 0). It defaults to the
    value that was previously hard-coded.
    """
    chars_by_size = {}
    best_size = fallback
    best_count = 0
    for font in fontspec:
        size = font['size']
        total = chars_by_size.get(size, 0) + font['nb_cars']
        chars_by_size[size] = total
        # Strict comparison: on a tie the size that got there first wins.
        if total > best_count:
            best_count = total
            best_size = size
    return best_size
# +--------------------------------------------------------------+
# | mark_small_fonts |
# +--------------------------------------------------------------+
# NB: also marks bullet lines (lines that contain no word character)
def mark_small_fonts(blocks, default_font_size):
    """Flag small-font lines and bullet lines in place.

    A line gets SMALL_FONT when its rounded height is below
    *default_font_size*, and IS_BULLET when its text holds no word character.
    """
    for block in blocks:
        for line in block['lines']:
            is_small = round(line['height']) < default_font_size
            if is_small:
                line['flags'] |= SMALL_FONT
            # Nothing left once non-word characters are removed: bullet line.
            if not re.sub(r'\W', '', line['text']):
                line['flags'] |= IS_BULLET
# +--------------------------------------------------------------+
# | mark_page_bottom |
# +--------------------------------------------------------------+
def mark_page_bottom(blocks):
    """Flag with PAGE_BOTTOM the footer lines repeated on every page.

    The lines of each page are read from the bottom up. As long as the
    n-th line from the bottom has the same letters (everything except
    a-z / A-Z is ignored, so page numbers do not matter) on every page, it is
    flagged on every page. The scan stops at the first rank where the pages
    differ, or when the shortest page runs out of lines.

    Nothing is flagged when *blocks* is empty, when the last block is on
    page 1, or when fewer than two pages hold text.
    """
    if not blocks or blocks[-1]['page'] == 1:
        return

    # Group the lines page by page (blocks of a page are consecutive).
    pages = []
    current_page = None
    for block in blocks:
        if not pages or block['page'] != current_page:
            current_page = block['page']
            pages.append([])
        pages[-1].extend(block['lines'])

    # Bottom-up view of every page that actually holds lines.
    bottom_up = [page_lines[::-1] for page_lines in pages if page_lines]
    if len(bottom_up) < 2:
        return  # nothing to compare with

    # zip() stops with the shortest page, so the scan never leaves a page.
    for same_rank_lines in zip(*bottom_up):
        signatures = set(re.sub(r'[^a-zA-Z]', '', line['text'])
                         for line in same_rank_lines)
        if len(signatures) > 1:
            break  # at least one page differs: the footer ends here
        for line in same_rank_lines:
            line['flags'] |= PAGE_BOTTOM
# +--------------------------------------------------------------+
# | is_ind_exp |
# +--------------------------------------------------------------+
# Is it a subscript or superscript (e.g. the "er" of "1er")?
def is_ind_exp(str):
    """Tell whether *str* starts with a usual ordinal suffix / superscript.

    Each pattern of INDICES_EXPOSANTS_USUELS is tried at the beginning of
    *str*. A match only counts when it is not the beginning of a longer word:
    either the matched text does not end with a letter (e.g. "°"), or it is
    not followed by a letter. This way "er", "er," or "ème" are recognised
    while ordinary words such as "erreur" or "ndlr" are not.

    The parameter keeps its historical name (which shadows the builtin) so
    that existing callers are unaffected.
    """
    # [^\W\d_] is "any letter". The trailing group accepts the match when the
    # character just matched is not a letter, or the next one is not a letter.
    word_guard = r'(?:(?<![^\W\d_])|(?![^\W\d_]))'
    for ie in INDICES_EXPOSANTS_USUELS:
        if re.match(r'(?:%s)%s' % (ie, word_guard), str):
            return True
    return False
# +--------------------------------------------------------------+
# | get_lines |
# +--------------------------------------------------------------+
# Extract lines from 'text' attribute returned by get_pdftohtml and associates
# a font id (and the page number), which is the font used by the higher number
# of characters of the line.
# Does a column splitting considering the value of LEFT_THRESHOLD
def _close_line(lines, text, chars_per_font, page_num, fonts_by_id):
    """Append the finished line to *lines* and record which fonts shared it.

    *chars_per_font* maps the font ids used on the line to their character
    count. Every pair of these fonts is recorded in the ``same_line`` list
    of both fonts.
    """
    text = text.strip()
    if text:
        most_used, max_car = -1, 0
        for font_id, count in chars_per_font.items():
            if count > max_car:
                most_used, max_car = font_id, count
        lines.append({'text': text,
                      'most_used_font': most_used,
                      'nb_fonts': len(chars_per_font),
                      'page': page_num})
    font_ids = list(chars_per_font)
    for pos, id1 in enumerate(font_ids):
        for id2 in font_ids[pos + 1:]:
            f1 = fonts_by_id.get(id1)
            f2 = fonts_by_id.get(id2)
            if f1 is None or f2 is None:
                continue  # font id not declared in fontspec
            if id2 not in f1['same_line']:
                f1['same_line'].append(id2)
            if id1 not in f2['same_line']:
                f2['same_line'].append(id1)


def get_lines(segments, fontspec):
    """Group pdftohtml segments into lines.

    Two passes are made over *segments*:

    1. every segment receives a ``line`` number. A segment continues the
       current line when it is on the same page, has the same ``top`` as the
       previous segment and starts at most LEFT_THRESHOLD after its right
       edge (this is the column splitting), or when it is an ordinal
       suffix / superscript according to is_ind_exp();
    2. the segments of a line are joined (with a space, except in front of an
       ordinal suffix) and the characters contributed by each font counted.

    Returns a list of ``{'text', 'most_used_font', 'nb_fonts', 'page'}``
    dictionaries; ``page`` is the page of the last segment of the line.

    Side effects: a ``line`` key is added to every segment and a
    ``same_line`` list (ids of the fonts met on a same line) is added to or
    completed in every font of *fontspec*.
    """
    fonts_by_id = {}
    for f in fontspec:
        f.setdefault('same_line', [])
        fonts_by_id.setdefault(f['id'], f)
    if not segments:
        return []

    # Pass 1: number the lines.
    last_top = -1
    last_right = 0
    last_page = None
    line_no = -1
    for txt in segments:
        continues_row = (txt['page'] == last_page
                         and txt['top'] == last_top
                         and (txt['left'] - last_right) <= LEFT_THRESHOLD)
        if not continues_row and not is_ind_exp(txt['text'].strip()):
            line_no += 1
        txt['line'] = line_no
        last_top = txt['top']
        last_right = txt['left'] + txt['width']
        last_page = txt['page']

    # Pass 2: build the text of each line and its per-font character count.
    lines = []
    text = ''
    chars_per_font = {}
    current_line = None
    page_num = segments[0]['page']
    for txt in segments:
        seg_text = txt['text'].strip()
        if txt['line'] != current_line:
            _close_line(lines, text, chars_per_font, page_num, fonts_by_id)
            text = seg_text
            chars_per_font = {txt['font']: len(seg_text)}
            current_line = txt['line']
        else:
            glue = '' if is_ind_exp(txt['text']) else ' '
            text = '%s%s%s' % (text, glue, seg_text)
            chars_per_font[txt['font']] = (chars_per_font.get(txt['font'], 0)
                                           + len(seg_text))
        page_num = txt['page']
    # The last line has to be closed too: it used to be dropped, and its
    # last segment was split away from the line it belonged to.
    _close_line(lines, text, chars_per_font, page_num, fonts_by_id)
    return lines
# +--------------------------------------------------------------+
# | guess_fonts |
# +--------------------------------------------------------------+
# Tries to guess fontspec of each line into blocks list.
# It calculates the levenshtein distance with every segment of the same page
# and assigns the best matching score's font.
def guess_fonts(blocks, segments, fontspec):
    """Guess the font of every line of *blocks* (in place).

    The lines built by get_lines() from the pdftohtml *segments* are compared
    with each pdftotext line of the same page, using the Levenshtein
    distance. The closest one (distance strictly lower than the length of
    the line, normalised distance at most SIMILARITY_THRESHOLD) gives its
    most used font. An exact match stops the search.

    Every line receives ``font`` (-1 when nothing matched), ``score``,
    ``dist`` and ``line_no`` (debugging aids), and the MANY_FONTS flag when
    the matched line mixes several fonts. Every font of *fontspec* receives
    ``nb_lines`` and ``dist_sum``.
    """
    lines = get_lines(segments, fontspec)

    # Index range [start, end) of the lines of each page number. Keying by
    # page number (instead of by rank) keeps pages aligned when a page has
    # no pdftohtml text at all.
    page_ranges = {}
    start = 0
    for ndx in range(1, len(lines) + 1):
        if ndx == len(lines) or lines[ndx]['page'] != lines[start]['page']:
            page_ranges.setdefault(lines[start]['page'], (start, ndx))
            start = ndx

    fonts_by_id = {}
    for f in fontspec:
        f['nb_lines'] = 0
        f['dist_sum'] = 0
        fonts_by_id.setdefault(f['id'], f)

    for bl in blocks:
        first, last = page_ranges.get(bl['page'], (0, 0))
        for l in bl['lines']:
            min_dist = len(l['text'])
            min_score = 1.0
            font_sel = -1
            line_no = -1
            if l['text']:
                for i in range(first, last):
                    candidate = lines[i]['text']
                    if not candidate:
                        continue
                    d = levenshtein(l['text'], candidate)
                    if d == 0:
                        min_dist = 0
                        min_score = 0.0
                        font_sel = lines[i]['most_used_font']
                        line_no = i
                        break
                    score = float(d) / float(max(len(l['text']), len(candidate)))
                    if score <= SIMILARITY_THRESHOLD and d < min_dist:
                        min_dist = d
                        min_score = score
                        font_sel = lines[i]['most_used_font']
                        line_no = i
            # Always set, even for empty lines, so that later steps can rely
            # on the 'font' key.
            l['font'] = font_sel
            font = fonts_by_id.get(font_sel) if font_sel >= 0 else None
            if font is not None:
                font['nb_lines'] += 1
                font['dist_sum'] += min_dist
            l['score'] = min_score  # For debugging purpose
            l['dist'] = min_dist    # idem.
            l['line_no'] = line_no  # idem. Stores the "similar line" number
            # Only look at the matched line when there is one (index -1 used
            # to pick the very last line of the document).
            if line_no >= 0 and lines[line_no]['nb_fonts'] > 1:
                l['flags'] |= MANY_FONTS
# +--------------------------------------------------------------+
# | replace_block_fonts |
# +--------------------------------------------------------------+
# Adds a 'short_font' attribute to lines which gives another font value which
# doesn't care about style (bold, …).
# RK: def_size is default_font_size, used to mark SMALL_FONT flag.
def replace_block_fonts(blocks, fontspec, def_size):
    """Add a style-independent ``short_font`` to every line (in place).

    Fonts that were seen together on a same line (``same_line`` lists) are
    considered style variants of each other: the later one in *fontspec*
    gets a ``replaceWith`` entry pointing to the earlier one (or to the font
    the earlier one is itself replaced with).

    Each line then receives ``short_font``: the replacement of its ``font``
    when there is one, its ``font`` otherwise (-1 for a line without a known
    font). Lines with a known font also get the SMALL_FONT flag when the
    size of their short font is below *def_size*, and DEFAULT_FONT_SIZE when
    it is equal to it.
    """
    for i, base in enumerate(fontspec):
        for other in fontspec[i + 1:]:
            if other.get('replaceWith') is not None:
                continue
            if other['id'] not in base.get('same_line', ()):
                continue
            target = base.get('replaceWith')
            other['replaceWith'] = base['id'] if target is None else target

    fonts_by_id = {}
    for f in fontspec:
        fonts_by_id.setdefault(f['id'], f)

    for bl in blocks:
        for l in bl['lines']:
            # Lines that never received a font, or whose font id is not
            # declared in fontspec, are handled like unmatched lines instead
            # of raising.
            font_id = l.get('font', -1)
            font = fonts_by_id.get(font_id) if font_id >= 0 else None
            if font is None or font.get('replaceWith') is None:
                l['short_font'] = font_id
            else:
                l['short_font'] = font['replaceWith']
            if font is None:
                continue
            short = fonts_by_id.get(l['short_font'])
            if short is None:
                continue
            if short['size'] < def_size:
                l['flags'] |= SMALL_FONT
            if short['size'] == def_size:
                l['flags'] |= DEFAULT_FONT_SIZE
# +--------------------------------------------------------------+
# | guess_structure |
# +--------------------------------------------------------------+
def guess_structure(blocks, fontspec,
                    remove_flags = SMALL_FONT | PAGE_BOTTOM | IS_BULLET):
    """Guess the title hierarchy from the successions of fonts (in place).

    Lines whose flags intersect *remove_flags* are ignored by the analysis.
    Every line of *blocks* receives a ``deep`` value (ignored lines get
    ``deep_max``); lines and fonts may receive the
    TITLE_SMALLER_THAN_SUBTITLE flag, and the ``short_font`` of lines using
    a "bullet" font is replaced by the one of the following line.

    NOTE(review): ``short_font`` values are used directly as indexes of
    lists sized after *fontspec* (``t``, ``n``, ``b``, ``fontspec[i]``), so
    font ids are presumably expected to equal their position in *fontspec*,
    and -1 wraps around to the last element - confirm.
    NOTE(review): the indentation of this function was rebuilt from a
    flattened copy; the nesting inside the ``while has_changed`` loop should
    be checked against the original file.
    """
    t = [] # A list used here and there
    n = [] # Another one
    # Search for the most used font
    # Here, t will be used to count the number of characters of each font,
    # and n will be used to store the maximum line size for each font.
    for i in range(len(fontspec)):
        t.append(0)
        n.append(0)
    nb_max = -1
    ndx_most_used = -1
    for bl in blocks:
        for l in bl['lines']:
            if (l['short_font'] >= 0) and ((l['flags'] & remove_flags) == FLAG_NONE):
                lon = len(l['text'].strip())
                t[l['short_font']] += lon
                if lon > n[l['short_font']]: n[l['short_font']] = lon
                if (t[l['short_font']] > nb_max):
                    nb_max = t[l['short_font']]
                    ndx_most_used = l['short_font']
    b = [nb <= TITLE_MIN_CHAR for nb in n]
    ### ndx_most_used is the most used font number.
    ### b[font_number] is True if the font seems used for bullets.
    t = [] # We'll use it to list the fonts succession
    n = [] # Used to count the number of lines
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if t == []:
                    t.append(l['short_font'])
                    n.append(1)
                else:
                    if (t[-1] != l['short_font']):
                        t.append(l['short_font'])
                        n.append(1)
                    else:
                        n[-1] += 1
    f = {} # Will contain used font numbers and number of occurrences in t
    # Per font: 'nb' = number of runs, 'nl' = total number of lines,
    # 'maxl' = length (in lines) of the longest run.
    for i,j in zip(t,n):
        if i not in f.keys():
            f[i] = {'nb': 1, 'nl':j, 'maxl': j,
                    'is_bullet': b[i], 'flags': FLAG_NONE}
        else:
            f[i]['nb'] += 1
            f[i]['nl'] += j
            if (j > f[i]['maxl']):
                f[i]['maxl'] = j
    # A font seen on more than TITLE_MAX_LINES consecutive lines is not a title.
    for i in f.keys():
        f[i]['isnt_title'] = (f[i]['maxl'] > TITLE_MAX_LINES)
    # Replace short_font for lines considered as bullets (or text styling).
    last_bullet_lines = []
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if f[l['short_font']]['is_bullet']:
                    last_bullet_lines.append(l)
                else:
                    if (len(last_bullet_lines) > 0):
                        for last in last_bullet_lines:
                            last['short_font'] = l['short_font']
                        last_bullet_lines = []
    # Bullet lines left at the very end take the most used font.
    if (len(last_bullet_lines) > 0):
        for last in last_bullet_lines:
            last['short_font'] = ndx_most_used
    # n and b won't be used anymore I think. So they're free
    # Rebuild the font succession list (is not optimized but is the safest)
    t = []
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                if t == []: t.append(l['short_font'])
                else:
                    if (t[-1] != l['short_font']):
                        t.append(l['short_font'])
    b = [] # We'll do a 2d table with b[i][j] = number of transitions
    # from font i to font j (will be a tree of font transitions)
    for i in range(len(fontspec)+1): # Consider len+1 to have font number -1
        b.append([0 for j in range(len(fontspec)+1)])
    # Only transitions leaving a possible title font are counted.
    for i in range(len(t)-1):
        j = i+1
        if not f[t[i]]['isnt_title']:
            b[t[i]][t[j]] += 1
    # Should we do this? (to be tested)
    # A way to make sure that everything gets traversed...
    # It means that we do not end on a title.
    # NOTE(review): t[-1] raises IndexError when no line survives remove_flags.
    f[t[-1]]['isnt_title'] = True
    # Create a deep attribute in f which contains distance from leaves
    for k,v in f.items():
        if v['isnt_title']:
            v['deep'] = 0
            v['nb_transitions'] = 999999999
        else: v['deep'] = None
    # Algorithm: in table b, we scan the columns (j) of the fonts that
    # already have a deep.
    # A row index (i) for which b[i][j] is non-zero means that font i
    # precedes font j b[i][j] times.
    # Note: an empty column for an index whose row is not empty is a root.
    #       An empty row for an index whose column is not empty is a leaf.
    # We repeat as long as values change (this is not optimized, but the
    # table is not that big)
    has_changed = True
    deep_max = 0
    while has_changed:
        has_changed = False
        for k,v in f.items():
            if v['deep'] is not None:
                for i in range(-1,len(b)-1):
                    if b[i][k] != 0:
                        if f[i]['deep'] is None:
                            # With NB_SUCCESSION_FOR_SAME == 0 this first
                            # branch cannot be taken (b[i][k] is >= 1 here).
                            if (b[i][k] <= NB_SUCCESSION_FOR_SAME):
                                f[i]['deep'] = v['deep']
                                f[i]['nb_transitions'] = b[i][k]
                            else:
                                f[i]['deep'] = v['deep'] + 1
                                f[i]['nb_transitions'] = b[i][k]
                            if f[i]['deep'] > deep_max:
                                deep_max = f[i]['deep']
                            has_changed = True
                            if (fontspec[i]['size'] < fontspec[k]['size']):
                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
                        elif f[i]['nb_transitions'] < b[i][k]:
                            # Font i precedes font k more often than the font
                            # its deep was taken from: re-attach it above k.
                            # NOTE(review): deep_max is not updated in this
                            # branch - confirm that this is intended.
                            f[i]['deep'] = v['deep'] + 1
                            f[i]['nb_transitions'] = b[i][k]
                            has_changed = True
                            if (fontspec[i]['size'] < fontspec[k]['size']):
                                f[i]['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
    # Reverse deepness value, to make it distance from root
    for v in f.values():
        if (v['deep'] is not None):
            v['deep'] = deep_max - v['deep']
    # Add deep in blocks lines
    for bl in blocks:
        for l in bl['lines']:
            if (l['flags'] & remove_flags) == FLAG_NONE:
                l['deep'] = f[l['short_font']]['deep']
                if ((f[l['short_font']]['flags']) & TITLE_SMALLER_THAN_SUBTITLE != 0):
                    l['flags'] |= TITLE_SMALLER_THAN_SUBTITLE
            else:
                # Ignored lines get the deepest level.
                l['deep'] = deep_max
# +--------------------------------------------------------------+
# | print_block_list |
# +--------------------------------------------------------------+
def print_block_list(t, remove_flags = FLAG_NONE):
    """Print the blocks of *t* as Markdown on standard output.

    A rule and a ``*page N*`` marker are printed whenever the page changes.
    Lines whose flags intersect *remove_flags* are skipped. The other lines
    are prefixed according to their ``deep`` value and flags:

    * ``!! `` when the line has no ``deep``;
    * ``> `` for SMALL_FONT lines;
    * surrounded by ``**`` when flagged TITLE_SMALLER_THAN_SUBTITLE and
      either DEFAULT_FONT_SIZE or SMALL_FONT;
    * otherwise ``deep + 1`` ``#`` characters when ``deep`` is below the
      deepest level found (capped at 10).
    """
    known_depths = [line['deep']
                    for blk in t
                    for line in blk['lines']
                    if line.get('deep') is not None]
    deep_max = min(max(known_depths, default=-1), 10)
    hashes = "#############"

    current_page = -1
    for block in t:
        if block['page'] != current_page:
            if current_page > 0:
                print("")
            current_page = block['page']
            print("________________________________")
            print("*page %d*" % current_page)
            print("")
        for line in block['lines']:
            flags = line['flags']
            if (flags & remove_flags) != FLAG_NONE:
                continue
            prefix = ''
            suffix = ' '
            depth = line.get('deep')
            if depth is None:
                prefix = '!! '
            else:
                if (flags & SMALL_FONT) != 0:
                    prefix = "> %s" % prefix
                odd_title = (flags & TITLE_SMALLER_THAN_SUBTITLE) != 0
                plain_size = (flags & (DEFAULT_FONT_SIZE | SMALL_FONT)) != 0
                if odd_title and plain_size:
                    prefix = "%s**" % prefix
                    suffix = "**%s" % suffix
                elif depth < deep_max:
                    prefix = "%s%s " % (prefix, hashes[0:(depth + 1)])
            print("{}{}{}".format(prefix, line['text'], suffix))
# +--------------------------------------------------------------+
# | main |
# +--------------------------------------------------------------+
def main():
    """Convert the PDF named on the command line to Markdown on stdout.

    The script exits with status -1 when the file argument is missing or
    when nothing could be extracted from the PDF.
    """
    # sys.argv[0] is the script itself: a file argument means at least 2 items.
    if len(sys.argv) < 2:
        print("-U-> Usage : python pdf2blocks.py <fichier_pdf>")
        sys.exit(-1)
    blocks = get_pdftotext(sys.argv[1])
    p2h = get_pdftohtml(sys.argv[1])
    # get_pdftotext() returns [] and get_pdftohtml() returns None on failure;
    # the following steps cannot work on empty data.
    if not blocks or p2h is None or not p2h['segments']:
        print("-S-> No text could be extracted from the PDF file.")
        sys.exit(-1)
    fontspec = p2h['fonts']
    segments = p2h['segments']
    default_font_size = get_default_font_size(fontspec)
    # mark_small_fonts(blocks, default_font_size)
    mark_page_bottom(blocks)
    guess_fonts(blocks, segments, fontspec)
    replace_block_fonts(blocks, fontspec, default_font_size)
    guess_structure(blocks, fontspec)
    print_block_list(blocks, PAGE_BOTTOM | IS_BULLET)


if __name__ == '__main__':
    main()