Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
copain
PDF2Blocs
Commits
b29ee44a
Commit
b29ee44a
authored
Jan 11, 2021
by
Bernard Stephan
Browse files
Corrections relatives à la lecture de caractères spéciaux dans les pdf.
parent
4fd22c7c
Changes
1
Hide whitespace changes
Inline
Side-by-side
src/py/p2b_functions.py
View file @
b29ee44a
...
...
@@ -21,13 +21,20 @@ def get_pdftotext(filename):
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
o
,
e
=
proc
.
communicate
()
if
(
proc
.
returncode
!=
0
):
print
(
'-S-> Command pdftotext returned an error :'
)
print
(
' '
+
e
.
decode
(
'utf8'
))
#
print('-S-> Command pdftotext returned an error :')
#
print(' ' + e.decode('utf8'))
return
[]
# Parse xml code and create block table.
xml
=
o
.
decode
(
'utf8'
)
root
=
ET
.
fromstring
(
xml
)
## Quelques cas particuliers déjà rencontrés :-(
xml
=
re
.
sub
(
r
">[]<"
,
'>*<'
,
xml
)
root
=
None
try
:
root
=
ET
.
fromstring
(
xml
)
except
Exception
as
e
:
return
[]
#root = ET.fromstring(xml)
page_num
=
0
flow_num
=
0
...
...
@@ -97,13 +104,17 @@ def get_pdftohtml(filename):
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
o
,
e
=
proc
.
communicate
()
if
(
proc
.
returncode
!=
0
):
print
(
'-S-> Command pdftohtml returned an error :'
)
print
(
' '
+
e
.
decode
(
'utf8'
))
#
print('-S-> Command pdftohtml returned an error :')
#
print(' ' + e.decode('utf8'))
return
None
# Parse xml code and create block table.
xml
=
o
.
decode
(
'utf8'
)
root
=
ET
.
fromstring
(
xml
)
root
=
None
try
:
root
=
ET
.
fromstring
(
xml
)
except
Exception
as
e
:
return
{
'fonts'
:
[],
'segments'
:
[]
}
fontspec
=
[]
segments
=
[]
...
...
@@ -187,21 +198,28 @@ def mark_page_btotp(pages, ndx, increment, MARK):
while
not
end
:
li
=
None
for
p
in
k
:
try
:
if
li
==
None
:
li
=
""
.
join
([
re
.
sub
(
r
'[^a-zA-Z]'
,
''
,
l
[
'text'
])
\
for
l
in
pages
[
p
][
'blocks'
][
ndx
][
'lines'
]])
else
:
end
|=
(
li
!=
""
.
join
([
re
.
sub
(
r
'[^a-zA-Z]'
,
''
,
l
[
'text'
])
\
for
l
in
pages
[
p
][
'blocks'
][
ndx
][
'lines'
]]))
except
Exception
as
e
:
end
=
True
if
not
end
:
for
p
in
k
:
# if p['blocks'][ndx]['class'] == BL_UNDEF:
pages
[
p
][
'blocks'
][
ndx
][
'class'
]
=
MARK
if
len
(
pages
)
>
2
:
if
li
==
""
.
join
([
re
.
sub
(
r
'[^a-zA-Z]'
,
''
,
l
[
'text'
])
\
for
l
in
pages
[
1
][
'blocks'
][
ndx
][
'lines'
]]):
pages
[
1
][
'blocks'
][
ndx
][
'class'
]
=
MARK
try
:
if
li
==
""
.
join
([
re
.
sub
(
r
'[^a-zA-Z]'
,
''
,
l
[
'text'
])
\
for
l
in
pages
[
1
][
'blocks'
][
ndx
][
'lines'
]]):
pages
[
1
][
'blocks'
][
ndx
][
'class'
]
=
MARK
except
Exception
as
e
:
pass
# FIXME: silently swallowing every exception here is bad practice
ndx
+=
increment
end
=
(
abs
(
ndx
)
>
min
([
len
(
p
[
'blocks'
])
for
p
in
pages
.
values
()]))
...
...
@@ -233,6 +251,8 @@ def get_lines(segments, fontspec):
last_top
=
-
1
line_no
=
-
1
last_right
=
0
if
len
(
segments
)
==
0
:
return
[]
for
txt
in
segments
:
if
(
txt
[
'top'
]
==
last_top
)
and
((
txt
[
'left'
]
-
last_right
)
<=
LEFT_THRESHOLD
):
txt
[
'line'
]
=
line_no
...
...
@@ -341,8 +361,9 @@ def guess_fonts(blocks, segments, fontspec):
l
[
'score'
]
=
min_score
# For debugging purposes
l
[
'dist'
]
=
min_dist
# idem.
l
[
'line_no'
]
=
line_no
# idem. Stores the "similar line" number
if
(
lines
[
line_no
][
'nb_fonts'
]
>
1
):
l
[
'flags'
]
|=
MANY_FONTS
if
line_no
>=
0
:
if
(
lines
[
line_no
][
'nb_fonts'
]
>
1
):
l
[
'flags'
]
|=
MANY_FONTS
if
block_fonts
.
get
(
font_sel
)
is
None
:
block_fonts
[
font_sel
]
=
len
(
l
[
'text'
])
else
:
...
...
@@ -439,6 +460,7 @@ def get_columns(blocks, default_font_size):
# NOTE: this must be applied to a list of blocks on the SAME PAGE
def
expand_blocks
(
page_blocks
,
default_font_size
):
pb
=
page_blocks
# for shorter writing
if
len
(
pb
)
==
0
:
return
max_x
=
max
([
b
[
'x_max'
]
for
b
in
pb
])
min_x
=
min
([
b
[
'x_min'
]
for
b
in
pb
])
for
b
in
pb
:
...
...
@@ -662,7 +684,8 @@ def get_closest_block_class(block):
closest_index
=
-
1
for
i
in
range
(
len
(
BLOCK_PROTOTYPES
)):
if
BLOCK_PROTOTYPES
[
i
]
is
not
None
:
if
BLOCK_PROTOTYPES
[
i
]
is
not
None
and
\
BLOCK_PROTOTYPES
[
BL_PARAGRAPH
]
is
not
None
:
# Exclude comparison with titles if font_size smaller than default's
exclude
=
block
[
'font'
][
'size'
]
<
BLOCK_PROTOTYPES
[
BL_PARAGRAPH
][
'font_size'
]
\
and
i
>=
BL_DOCUMENT_TITLE
and
i
<=
BL_TITLE_5
...
...
@@ -784,7 +807,10 @@ def guess_structure(blocks, fontspec, def_size):
c
=
(
l
[
'x_min'
]
+
l
[
'x_max'
])
/
2
if
c
<
min_center
:
min_center
=
c
if
c
>
max_center
:
max_center
=
c
last_character
=
l
[
'text'
].
strip
()[
-
1
]
if
len
(
l
[
'text'
])
>
0
:
last_character
=
l
[
'text'
].
strip
()[
-
1
]
else
:
last_character
=
'.'
if
len
(
l
[
'text'
].
split
(
':'
))
>
1
and
\
len
(
re
.
sub
(
r
'[A-ZÀÇÉÈÊÂÔÛ0-9]'
,
''
,
l
[
'text'
][
0
]))
==
0
:
l
[
'flags'
]
|=
IS_DESCRIPTION
...
...
@@ -885,8 +911,12 @@ def guess_structure(blocks, fontspec, def_size):
if
fonts
[
bl
[
'font'
][
'id'
]]
>
default_font_nb_car
:
default_font_nb_car
=
fonts
[
bl
[
'font'
][
'id'
]]
default_font
=
bl
[
'font'
]
if
default_font
is
None
:
return
pages
default_font
[
'is_default'
]
=
True
# Page top and bottom detection is better before block sorting because of
# column detection consequences...
mark_page_top
(
pages
)
...
...
@@ -895,14 +925,15 @@ def guess_structure(blocks, fontspec, def_size):
# Reorder blocks
first_block_tagged
=
False
for
p
in
pages
.
values
():
expand_blocks
(
p
[
'blocks'
],
default_font
[
'size'
])
col
=
get_columns
(
p
[
'blocks'
],
default_font
[
'size'
])
col
.
append
(
max
(
b
[
'x_max'
]
for
b
in
p
[
'blocks'
]))
p
[
'blocks'
]
=
sort_blocks
(
p
[
'blocks'
],
col
,
default_font
[
'size'
])
if
not
first_block_tagged
:
bl
=
next
(
b
for
b
in
p
[
'blocks'
]
if
b
[
'class'
]
!=
BL_TOP_PAGE
)
bl
[
'flags'
]
|=
FLAG_FIRST_BLOCK
first_block_tagged
=
True
if
len
(
p
[
'blocks'
])
>
0
:
expand_blocks
(
p
[
'blocks'
],
default_font
[
'size'
])
col
=
get_columns
(
p
[
'blocks'
],
default_font
[
'size'
])
col
.
append
(
max
(
b
[
'x_max'
]
for
b
in
p
[
'blocks'
]))
p
[
'blocks'
]
=
sort_blocks
(
p
[
'blocks'
],
col
,
default_font
[
'size'
])
if
not
first_block_tagged
:
bl
=
next
(
b
for
b
in
p
[
'blocks'
]
if
b
[
'class'
]
!=
BL_TOP_PAGE
)
bl
[
'flags'
]
|=
FLAG_FIRST_BLOCK
first_block_tagged
=
True
# - End loop on pages
# ...but sometimes it works better after blocks sorting
...
...
@@ -1273,6 +1304,8 @@ def print_html(pages, fontspec, out=sys.stdout):
def
get_pdf2html
(
filename
):
blocks
=
get_pdftotext
(
filename
)
p2h
=
get_pdftohtml
(
filename
)
if
p2h
is
None
:
return
' '
fontspec
=
p2h
[
'fonts'
]
segments
=
p2h
[
'segments'
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment