mirror of https://github.com/papermerge/mglib
fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.
parent
14370d35b2
commit
8a8835d243
10
changelog.md
10
changelog.md
|
@ -1,5 +1,15 @@
|
|||
# Changelog
|
||||
|
||||
## [1.2.2] - 25 July 2020
|
||||
|
||||
### Changed
|
||||
|
||||
- bugfix - get_pagecount handles non-UTF-8 encoded documents
|
||||
|
||||
### Added
|
||||
|
||||
- unit tests for get_pagecount
|
||||
|
||||
## [1.2.1] - 16 July 2020
|
||||
|
||||
### Added
|
||||
|
|
|
@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):
|
|||
|
||||
raise Exception("Error occured while getting document page count.")
|
||||
|
||||
lines = compl.stdout.decode('utf-8').split('\n')
|
||||
lines = _split(stdout=compl.stdout)
|
||||
# look up for the line containing "Pages: 11"
|
||||
for line in lines:
|
||||
x = re.match(r"(\d+)", line.strip())
|
||||
|
@ -100,11 +100,27 @@ def get_pagecount(filepath):
|
|||
|
||||
raise Exception("Error occured while getting document page count.")
|
||||
|
||||
lines = compl.stdout.decode('utf-8').split('\n')
|
||||
lines = _split(stdout=compl.stdout)
|
||||
# look up for the line containing "Pages: 11"
|
||||
for line in lines:
|
||||
x = re.match("Pages:\W+(\d+)$", line.strip())
|
||||
x = re.match(r"Pages:\W+(\d+)$", line.strip())
|
||||
if x:
|
||||
return int(x.group(1))
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def _split(stdout):
|
||||
"""
|
||||
stdout is result.stdout where result
|
||||
is whatever is returned by subprocess.run
|
||||
"""
|
||||
decoded_text = stdout.decode(
|
||||
'utf-8',
|
||||
# in case there are decoding issues, just replace
|
||||
# problematic characters. We don't need text verbatim.
|
||||
'replace'
|
||||
)
|
||||
lines = decoded_text.split('\n')
|
||||
|
||||
return lines
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
I am not even binary!
|
||||
The idea is to test pdfinfo.get_pagecount
|
|
@ -0,0 +1 @@
|
|||
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
|
Binary file not shown.
|
@ -0,0 +1 @@
|
|||
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
|
Binary file not shown.
|
@ -0,0 +1,61 @@
|
|||
import os
|
||||
import unittest
|
||||
|
||||
from mglib.pdfinfo import get_pagecount
|
||||
|
||||
# Absolute path of the directory containing this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Sample documents used by the tests are looked up in the "data"
# directory that sits next to this module.
DATA_DIR = os.path.join(BASE_DIR, "data")
|
||||
|
||||
|
||||
def get_filepath(filename):
    """Return the path of ``filename`` inside the test data directory."""
    path = os.path.join(DATA_DIR, filename)
    return path
|
||||
|
||||
|
||||
class TestPDFinfo(unittest.TestCase):
    """Check get_pagecount against the sample documents in the data directory."""

    def test_basic_pdf(self):
        """The sample PDF is reported as having two pages."""
        pdf_path = get_filepath("berlin.pdf")
        self.assertEqual(get_pagecount(pdf_path), 2)

    def test_basic_jpeg(self):
        """The sample image with a .jpeg extension counts as one page."""
        jpeg_path = get_filepath("berlin.jpeg")
        self.assertEqual(get_pagecount(jpeg_path), 1)

    def test_basic_jpg(self):
        """The sample image with a .jpg extension counts as one page."""
        jpg_path = get_filepath("berlin.jpg")
        self.assertEqual(get_pagecount(jpg_path), 1)

    def test_basic_png(self):
        """The sample PNG image counts as one page."""
        png_path = get_filepath("berlin.png")
        self.assertEqual(get_pagecount(png_path), 1)

    def test_basic_tiff(self):
        """The sample TIFF document is reported as having two pages."""
        # When the input file has the tiff extension, get_pagecount
        # internally calls the get_tiff_pagecount method.
        tiff_path = get_filepath("text.tiff")
        self.assertEqual(get_pagecount(tiff_path), 2)
|
Loading…
Reference in New Issue