Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
copain
PDF2Blocs
Commits
9a4a368d
Commit
9a4a368d
authored
Apr 02, 2020
by
Bernard Stephan
Browse files
Version légèrement améliorée, moins de <div> dans les sorties, et documentation de l'algo à jour.
parent
c250fac8
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
src/bsv.css
View file @
9a4a368d
...
...
@@ -14,7 +14,7 @@ table {
}
.
caption
{
fig
caption
{
font-size
:
small
;
font-style
:
italic
;
color
:
#555555
;
...
...
@@ -30,7 +30,7 @@ table {
padding-left
:
50px
;
}
.page_top
,
.bottom_page
{
header
,
footer
{
font-family
:
sans
;
font-size
:
small
;
text-align
:
right
;
...
...
src/py/README.md
View file @
9a4a368d
This diff is collapsed.
Click to expand it.
src/py/pdf2blocks.py
View file @
9a4a368d
...
...
@@ -15,13 +15,14 @@ from p2b_utils import levenshtein
CMD_PDFTOTEXT
=
'/usr/sbin/pdftotext'
CMD_PDFTOHTML
=
'/usr/sbin/pdftohtml'
DEBUG_PRINT
=
True
DEBUG_PRINT
=
False
PRINT_CSS
=
False
LEFT_THRESHOLD
=
25
# In p2b_text_utils.add_lines() : the max horizontal space
# to consider aligned items to be on the same line.
HORIZONTAL_ALIGMENT_THRESHOLD
=
10
VERTICAL_ALIGMENT_THRESHOLD
=
8
HORIZONTAL_ALIGMENT_THRESHOLD
=
8
VERTICAL_ALIGMENT_THRESHOLD
=
14
# Avant, c'était
8
# En dessous de ces valeurs, un bloc n'est pas considéré comme
# représentatif pour définir une colonne.
...
...
@@ -289,7 +290,7 @@ def get_pdftohtml(filename):
def
get_default_font_size
(
fontspec
):
sizes
=
{}
max_cars
=
0
size_max_cars
=
42
# Doesn't matter : it'
ll
change
size_max_cars
=
42
# Doesn't matter : it'
s going to
change
for
f
in
fontspec
:
if
sizes
.
get
(
f
[
'size'
])
is
None
:
sizes
[
f
[
'size'
]]
=
f
[
'nb_cars'
]
...
...
@@ -583,13 +584,41 @@ def get_columns(blocks, default_font_size):
return
columns
# +--------------------------------------------------------------+
# | expand_blocks |
# +--------------------------------------------------------------+
# Make the big-fonted blocks (titles?) take the most space they can, growing
# right without intersecting another block.
# NOTE: this is to be applied on a list of blocks in the SAME PAGE
def expand_blocks(page_blocks, default_font_size):
    """Widen large-font blocks horizontally as far as their neighbours allow.

    Every block whose ``font['size']`` is strictly greater than
    ``default_font_size`` gets its ``x_min`` / ``x_max`` stretched towards
    the left-most ``x_min`` / right-most ``x_max`` found on the page. The
    stretch stops at the nearest block that overlaps it vertically and
    lies entirely to its left (resp. right).

    Must be called with the blocks of a single page.

    Args:
        page_blocks: list of block dicts; each one is read through the keys
            ``'x_min'``, ``'x_max'``, ``'y_min'``, ``'y_max'`` and
            ``['font']['size']``.
        default_font_size: blocks with a font size above this value are
            the ones that get widened.

    Returns:
        None. The dicts of the widened blocks are updated in place.
    """
    # An empty page has nothing to widen (min()/max() would raise).
    if not page_blocks:
        return

    # Horizontal extent of the page, as defined by its blocks.
    # The left limit must be the *smallest* x_min: using the largest one
    # would move the left edge of a block to the right, possibly past its
    # own right edge.
    page_left = min(b['x_min'] for b in page_blocks)
    page_right = max(b['x_max'] for b in page_blocks)

    # Phase 1: compute the new bounds of every large-font block from the
    # ORIGINAL geometry of all blocks. Nothing is written yet, so that the
    # result does not depend on the order of the blocks in the list.
    new_bounds = []
    for block in page_blocks:
        if block['font']['size'] <= default_font_size:
            continue
        left, right = page_left, page_right
        for other in page_blocks:
            overlaps_vertically = (other['y_min'] < block['y_max']
                                   and other['y_max'] > block['y_min'])
            if not overlaps_vertically:
                continue
            # Neighbour entirely on the right: it caps the right edge.
            if other['x_min'] >= block['x_max'] and right > other['x_min']:
                right = other['x_min']
            # Neighbour entirely on the left: it caps the left edge.
            if other['x_max'] <= block['x_min'] and left < other['x_max']:
                left = other['x_max']
        new_bounds.append((block, left, right))

    # Phase 2: apply the computed bounds.
    for block, left, right in new_bounds:
        block['x_min'] = left
        block['x_max'] = right
# +--------------------------------------------------------------+
# | sort_blocks |
# +--------------------------------------------------------------+
def
sort_blocks
(
blocks
,
columns
,
default_font_size
):
HUGE_RESIZE_LEFT
=
False
# If FONT_HUGE, resize block as large as possible.
# But resize to left may not be a good idea
# Set this to True or False and test results.
res
=
[]
nb_blocks
=
0
for
b
in
blocks
:
...
...
@@ -602,29 +631,11 @@ def sort_blocks(blocks, columns, default_font_size):
c
=
1
while
columns
[
c
]
<=
b
[
'x_min'
]:
c
+=
1
b
[
'col_min'
]
=
c
-
1
while
columns
[
c
]
<
b
[
'x_max'
]
-
VERTIC
AL_ALIGMENT_THRESHOLD
:
c
+=
1
while
columns
[
c
]
<
b
[
'x_max'
]
-
HORIZONT
AL_ALIGMENT_THRESHOLD
:
c
+=
1
b
[
'col_max'
]
=
c
#
ATTENTION
: if block in column 1, col_min ← 0 and col_max ← 1.
#
WARNING
: if block in column 1, col_min ← 0 and col_max ← 1.
b
[
'temp_left'
]
=
b
[
'col_min'
]
# HUGE_FONT blocks are resized the largest possible.
for
b
in
blocks
:
if
not
b
[
'treat'
]:
if
b
[
'font_class'
]
==
FONT_HUGE
:
oth_bl
=
[
bl
for
bl
in
blocks
if
(
not
bl
[
'treat'
])
and
bl
!=
b
\
and
bl
[
'y_min'
]
<=
b
[
'y_max'
]
and
bl
[
'y_max'
]
>=
b
[
'y_min'
]]
if
len
(
oth_bl
)
==
0
:
if
HUGE_RESIZE_LEFT
:
b
[
'col_min'
]
=
b
[
'temp_left'
]
=
0
b
[
'col_max'
]
=
len
(
columns
)
-
1
else
:
if
HUGE_RESIZE_LEFT
:
while
len
([
bl
for
bl
in
oth_bl
if
bl
[
'col_max'
]
<
b
[
'col_min'
]])
!=
0
:
b
[
'col_min'
]
-=
1
b
[
'temp_left'
]
=
b
[
'col_min'
]
while
len
([
bl
for
bl
in
oth_bl
if
bl
[
'col_min'
]
>
b
[
'col_max'
]])
!=
0
:
b
[
'col_max'
]
+=
1
for
b
in
blocks
:
if
b
[
'class'
]
==
BL_TOP_PAGE
:
res
.
append
(
b
)
...
...
@@ -635,7 +646,7 @@ def sort_blocks(blocks, columns, default_font_size):
# "we're looking for blocks contained into columns 0 to 2" (there, min is 0
# and max is 2).
curr_col_min
=
0
curr_col_max
=
len
(
columns
)
-
1
curr_col_max
=
1
while
len
(
res
)
!=
nb_blocks
:
#&& print("===> %d / %d" % (len(res), len(blocks)))
...
...
@@ -647,7 +658,10 @@ def sort_blocks(blocks, columns, default_font_size):
# Tant qu'on n'en trouve pas, on élargit les colonnes.
# Si on n'en trouvait toujours pas c'est qu'il y a un gros bug :
# donc on choisit de faire planter
# donc on choisit de faire planter. PS : ça n'est jamais arrivé
# dans le corpus.
# WARNING : bl_top is badly named. It doesn't contain the top blocks
# but all blocks in current column(s)
bl_top
=
[
b
for
b
in
blocks
if
not
b
[
'treat'
]
and
\
b
[
'temp_left'
]
>=
curr_col_min
and
\
b
[
'temp_left'
]
<
curr_col_max
]
...
...
@@ -663,13 +677,15 @@ def sort_blocks(blocks, columns, default_font_size):
y_min
=
min
([
b
[
'y_min'
]
for
b
in
bl_top
])
y_max
=
max
([
b
[
'y_max'
]
for
b
in
bl_top
if
b
[
'y_min'
]
==
y_min
])
#&& print("----- %d %f %f" % (len(bl_top), y_min, y_max))
## y a-t-il plusieurs blocs alignés ? Si oui, on prend le plus à gauche.
aligned_bl
=
[
b
for
b
in
blocks
if
not
b
[
'treat'
]
and
\
b
[
'temp_left'
]
>=
curr_col_min
and
\
b
[
'temp_left'
]
<
curr_col_max
and
\
b
[
'y_min'
]
>=
y_min
-
VERTICAL_ALIGMENT_THRESHOLD
and
\
b
[
'y_min'
]
<
y_max
]
#abs(b['y_min'] - y_min) <= VERTICAL_ALIGMENT_THRESHOLD]
selected_bl
=
aligned_bl
[
0
]
for
b
in
aligned_bl
[
1
:]:
if
b
[
'x_min'
]
<
selected_bl
[
'x_min'
]:
selected_bl
=
b
...
...
@@ -697,15 +713,19 @@ def sort_blocks(blocks, columns, default_font_size):
if
len
(
higher_bl
)
>
0
:
# ← Case 1
#&& print("→ Case 1 : %s" % selected_bl['lines'][0]['text'])
# &&& À tester : col_max à 1 ?
curr_col_min
=
0
curr_col_max
=
len
(
columns
)
-
1
elif
selected_bl
[
'col_max'
]
>
curr_col_max
:
# ← Case 3
#&& print("→ Case 3 : %s" % selected_bl['lines'][0]['text'])
selected_bl
[
'temp_left'
]
+=
1
curr_col_max
+=
1
curr_col_min
+=
1
else
:
# ← Case 2
#&& print("→ Case 2 : %s" % selected_bl['lines'][0]['text'])
curr_col_max
=
selected_bl
[
'col_max'
]
curr_col_min
=
selected_bl
[
'col_min'
]
res
.
append
(
selected_bl
)
...
...
@@ -988,7 +1008,6 @@ def guess_structure(blocks, fontspec, def_size):
bl
[
'font'
][
'has_style'
]
=
(
len
(
bl
[
'font'
][
'family'
].
split
(
','
))
>
1
)
# Alignment
if
len
(
bl
[
'lines'
])
<=
1
:
bl
[
'alignment'
]
=
ALIGN_UNDEF
...
...
@@ -1036,9 +1055,10 @@ def guess_structure(blocks, fontspec, def_size):
# Reorder blocks
first_block_tagged
=
False
for
p
in
pages
.
values
():
expand_blocks
(
p
[
'blocks'
],
default_font
[
'size'
])
col
=
get_columns
(
p
[
'blocks'
],
default_font
[
'size'
])
col
.
append
(
max
(
b
[
'x_max'
]
for
b
in
p
[
'blocks'
]))
#
&&
print("================================================== %s" % col)
#print("================================================== %s" % col)
p
[
'blocks'
]
=
sort_blocks
(
p
[
'blocks'
],
col
,
default_font
[
'size'
])
#print(col)
if
not
first_block_tagged
:
...
...
@@ -1069,8 +1089,6 @@ def guess_structure(blocks, fontspec, def_size):
for
bl
in
p
[
'blocks'
]:
if
bl
[
'class'
]
==
BL_UNDEF
:
t
=
""
.
join
([
t
[
'text'
]
for
t
in
bl
[
'lines'
]])
# print("====> %s" % t)
# print(" --> [%s]" % re.sub(r'\W','',t).strip())
if
len
(
re
.
sub
(
r
'\W'
,
''
,
t
).
strip
())
==
0
:
bl
[
'class'
]
=
BL_IGNORE
...
...
@@ -1114,10 +1132,6 @@ def guess_structure(blocks, fontspec, def_size):
adjust_block_prototypes
(
bl
)
for
inc
in
[
-
1
,
1
]:
j
=
i
+
inc
#&& while j >= 0 and j < len(p['blocks']) and \
#&& p['blocks'][j]['class'] == BL_UNDEF and \
#&& p['blocks'][j]['font']['size'] == bl['font']['size'] and \
#&& p['blocks'][j]['max_line_size'] <= TABLE_LINE_SIZE:
while
j
>=
0
and
j
<
len
(
p
[
'blocks'
])
and
\
p
[
'blocks'
][
j
][
'class'
]
==
BL_UNDEF
and
\
p
[
'blocks'
][
j
][
'font_class'
]
<=
FONT_DEFAULT
and
\
...
...
@@ -1289,8 +1303,9 @@ def print_html(pages, fontspec):
print
(
'<!DOCTYPE html>'
)
print
(
'<html lang="fr">'
)
print
(
'<head><meta charset="utf-8">'
)
print
(
'<link rel="stylesheet" href="http://ontology.inrae.fr/bsv/html/bsv.css" />'
)
print
(
'<link rel="stylesheet" href="bsv.css" />'
)
if
PRINT_CSS
:
print
(
'<link rel="stylesheet" href="http://ontology.inrae.fr/bsv/html/bsv.css" />'
)
print
(
'<link rel="stylesheet" href="bsv.css" />'
)
if
DEBUG_PRINT
:
print
(
"<!-- Fonts :"
)
...
...
@@ -1315,9 +1330,9 @@ def print_html(pages, fontspec):
print
(
"</head>"
)
print
(
"<body>"
)
BLOCK_TAGS
=
[
'div'
,
'p'
,
'
div'
,
'div'
,
'div
'
,
'table'
,
BLOCK_TAGS
=
[
'div'
,
'p'
,
'
footer'
,
'header'
,
'figcaption
'
,
'table'
,
'a'
,
'div'
,
'h1'
,
'h2'
,
'h3'
,
'h4'
,
'h5'
,
'div'
,
'div'
,
'div'
,
'
div
'
,
''
]
'div'
,
'div'
,
'div'
,
'
p
'
,
''
]
BLOCK_ENDLINES
=
[
'<br />'
,
''
,
'<br />'
,
'<br />'
,
'<br />'
,
''
,
''
,
''
,
''
,
''
,
''
,
''
,
''
,
'<br />'
,
'<br />'
,
'<br />'
,
'<br />'
,
'<br />'
]
...
...
@@ -1346,15 +1361,24 @@ def print_html(pages, fontspec):
print
(
'</table>'
)
if
i
<
len
(
blocks
):
i
-=
1
elif
cl
!=
BL_IGNORE
:
pre
=
post
=
''
if
PRINT_CSS
:
id_cl
=
' class="%s"'
%
BLOCKS_CLASSES
[
cl
]
else
:
id_cl
=
''
if
cl
==
BL_CAPTION
:
pre
=
'<figure>'
post
=
'</figure>'
if
not
DEBUG_PRINT
:
print
(
'<%s
class="
%s
"
>'
%
(
BLOCK_TAGS
[
cl
],
BLOCKS_CLASSES
[
cl
]
))
print
(
'
%s
<%s%s>'
%
(
pre
,
BLOCK_TAGS
[
cl
],
id_
cl
))
elif
blocks
[
i
][
'score'
]
is
None
:
print
(
'<%s
class="
%s
"
font="%d">'
%
(
BLOCK_TAGS
[
cl
],
BLOCKS_CLASSES
[
cl
]
,
blocks
[
i
][
'font'
][
'id'
]))
print
(
'
%s
<%s%s font="%d">'
%
(
pre
,
BLOCK_TAGS
[
cl
],
id_
cl
,
blocks
[
i
][
'font'
][
'id'
]))
else
:
print
(
'<%s
class="
%s
"
font="%d" score="%f">'
%
(
BLOCK_TAGS
[
cl
],
BLOCKS_CLASSES
[
cl
]
,
print
(
'
%s
<%s%s font="%d" score="%f">'
%
(
pre
,
BLOCK_TAGS
[
cl
],
id_
cl
,
blocks
[
i
][
'font'
][
'id'
],
blocks
[
i
][
'score'
]))
for
l
in
blocks
[
i
][
'lines'
]:
...
...
@@ -1364,16 +1388,12 @@ def print_html(pages, fontspec):
print
(
" <br />%s"
%
l
[
'text'
])
else
:
print
(
" %s%s"
%
(
l
[
'text'
],
BLOCK_ENDLINES
[
cl
]))
print
(
"</%s>"
%
BLOCK_TAGS
[
cl
])
print
(
"</%s>
%s
"
%
(
BLOCK_TAGS
[
cl
]
,
post
)
)
else
:
# cl == BL_IGNORE:
print
(
"<!--"
)
[
print
(
" %s"
%
l
[
'text'
])
for
l
in
blocks
[
i
][
'lines'
]]
print
(
"-->"
)
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
i
+=
1
print
(
"</body>"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment