mirror of https://github.com/papermerge/mglib
fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.
parent
14370d35b2
commit
8a8835d243
10
changelog.md
10
changelog.md
|
@ -1,5 +1,15 @@
|
||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [1.2.2] - 25 July 2020
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- bugfix - get_pagecount handles non utf-8 encoded documents
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- unit tests for get_pagecount
|
||||||
|
|
||||||
## [1.2.1] - 16 July 2020
|
## [1.2.1] - 16 July 2020
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):
|
||||||
|
|
||||||
raise Exception("Error occured while getting document page count.")
|
raise Exception("Error occured while getting document page count.")
|
||||||
|
|
||||||
lines = compl.stdout.decode('utf-8').split('\n')
|
lines = _split(stdout=compl.stdout)
|
||||||
# look up for the line containing "Pages: 11"
|
# look up for the line containing "Pages: 11"
|
||||||
for line in lines:
|
for line in lines:
|
||||||
x = re.match(r"(\d+)", line.strip())
|
x = re.match(r"(\d+)", line.strip())
|
||||||
|
@ -100,11 +100,27 @@ def get_pagecount(filepath):
|
||||||
|
|
||||||
raise Exception("Error occured while getting document page count.")
|
raise Exception("Error occured while getting document page count.")
|
||||||
|
|
||||||
lines = compl.stdout.decode('utf-8').split('\n')
|
lines = _split(stdout=compl.stdout)
|
||||||
# look up for the line containing "Pages: 11"
|
# look up for the line containing "Pages: 11"
|
||||||
for line in lines:
|
for line in lines:
|
||||||
x = re.match("Pages:\W+(\d+)$", line.strip())
|
x = re.match(r"Pages:\W+(\d+)$", line.strip())
|
||||||
if x:
|
if x:
|
||||||
return int(x.group(1))
|
return int(x.group(1))
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _split(stdout):
|
||||||
|
"""
|
||||||
|
stdout is result.stdout where result
|
||||||
|
is whatever is returned by subprocess.run
|
||||||
|
"""
|
||||||
|
decoded_text = stdout.decode(
|
||||||
|
'utf-8',
|
||||||
|
# in case there are decoding issues, just replace
|
||||||
|
# problematic characters. We don't need text verbatim.
|
||||||
|
'replace'
|
||||||
|
)
|
||||||
|
lines = decoded_text.split('\n')
|
||||||
|
|
||||||
|
return lines
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
I am not even binary!
|
||||||
|
The idea is to test pdfinfo.get_pagecount
|
|
@ -0,0 +1 @@
|
||||||
|
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
|
Binary file not shown.
|
@ -0,0 +1 @@
|
||||||
|
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
|
Binary file not shown.
|
@ -0,0 +1,61 @@
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from mglib.pdfinfo import get_pagecount
|
||||||
|
|
||||||
|
# Absolute path of the directory that holds this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Sample documents used by the tests live in the "data" sub-directory.
DATA_DIR = os.path.join(BASE_DIR, "data")
|
||||||
|
|
||||||
|
|
||||||
|
def get_filepath(filename):
    """Return the path of ``filename`` inside ``DATA_DIR``."""
    data_file = os.path.join(DATA_DIR, filename)
    return data_file
|
||||||
|
|
||||||
|
|
||||||
|
class TestPDFinfo(unittest.TestCase):
    """Check the page count reported by get_pagecount for sample files."""

    def _pagecount_of(self, filename):
        # Resolve the sample file in the data directory and count its pages.
        return get_pagecount(get_filepath(filename))

    def test_basic_pdf(self):
        self.assertEqual(self._pagecount_of("berlin.pdf"), 2)

    def test_basic_jpeg(self):
        self.assertEqual(self._pagecount_of("berlin.jpeg"), 1)

    def test_basic_jpg(self):
        self.assertEqual(self._pagecount_of("berlin.jpg"), 1)

    def test_basic_png(self):
        self.assertEqual(self._pagecount_of("berlin.png"), 1)

    def test_basic_tiff(self):
        # in case the input file has the tiff extension,
        # get_pagecount will internally call get_tiff_pagecount
        self.assertEqual(self._pagecount_of("text.tiff"), 2)
|
Loading…
Reference in New Issue