fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.

pull/3/head
Eugen Ciur 2020-07-25 10:43:50 +02:00
parent 14370d35b2
commit 8a8835d243
8 changed files with 94 additions and 3 deletions

View File

@ -1,5 +1,15 @@
# Changelog
## [1.2.2] - 25 July 2020
### Changed
- bugfix - get_pagecount handles documents that are not UTF-8 encoded (undecodable characters in the command output are replaced instead of raising an error)
### Added
- unit tests for get_pagecount
## [1.2.1] - 16 July 2020
### Added

View File

@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):
raise Exception("Error occured while getting document page count.")
lines = compl.stdout.decode('utf-8').split('\n')
lines = _split(stdout=compl.stdout)
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match(r"(\d+)", line.strip())
@ -100,11 +100,27 @@ def get_pagecount(filepath):
raise Exception("Error occured while getting document page count.")
lines = compl.stdout.decode('utf-8').split('\n')
lines = _split(stdout=compl.stdout)
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match("Pages:\W+(\d+)$", line.strip())
x = re.match(r"Pages:\W+(\d+)$", line.strip())
if x:
return int(x.group(1))
return 0
def _split(stdout):
"""
stdout is result.stdout where result
is whatever is returned by subprocess.run
"""
decoded_text = stdout.decode(
'utf-8',
# in case there are decoding issues, just replace
# problematic characters. We don't need text verbatim.
'replace'
)
lines = decoded_text.split('\n')
return lines

2
test/data/berlin.jpeg Normal file
View File

@ -0,0 +1,2 @@
I am not even binary!
The idea is to test pdfinfo.get_pagecount

1
test/data/berlin.jpg Normal file
View File

@ -0,0 +1 @@
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount

BIN
test/data/berlin.pdf Normal file

Binary file not shown.

1
test/data/berlin.png Normal file
View File

@ -0,0 +1 @@
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount

BIN
test/data/text.tiff Normal file

Binary file not shown.

61
test/test_pdfinfo.py Normal file
View File

@ -0,0 +1,61 @@
import os
import unittest
from mglib.pdfinfo import get_pagecount
# Absolute path of the directory that contains this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# The "data" directory located next to this test module.
DATA_DIR = os.path.join(BASE_DIR, "data")
def get_filepath(filename):
    """Return the path of ``filename`` inside the test data directory."""
    path_parts = (DATA_DIR, filename)
    return os.path.join(*path_parts)
class TestPDFinfo(unittest.TestCase):
    """Checks the page count reported by get_pagecount for the fixture files."""

    def _check_pagecount(self, filename, expected):
        """Assert that get_pagecount returns ``expected`` for a fixture file."""
        self.assertEqual(get_pagecount(get_filepath(filename)), expected)

    def test_basic_pdf(self):
        self._check_pagecount("berlin.pdf", 2)

    def test_basic_jpeg(self):
        self._check_pagecount("berlin.jpeg", 1)

    def test_basic_jpg(self):
        self._check_pagecount("berlin.jpg", 1)

    def test_basic_png(self):
        self._check_pagecount("berlin.png", 1)

    def test_basic_tiff(self):
        # In case the input file has a tiff extension,
        # get_pagecount will internally call the get_tiff_pagecount method.
        self._check_pagecount("text.tiff", 2)