diff --git a/changelog.md b/changelog.md index 439e5cb..2646f54 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +## [1.2.2] - 25 July 2020 + +### Changed + + - bugfix - get_pagecount handles non utf-8 encoded documents + +### Added + +- unit tests for get_pagecount + ## [1.2.1] - 16 July 2020 ### Added diff --git a/mglib/pdfinfo.py b/mglib/pdfinfo.py index e200342..997884a 100644 --- a/mglib/pdfinfo.py +++ b/mglib/pdfinfo.py @@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath): raise Exception("Error occured while getting document page count.") - lines = compl.stdout.decode('utf-8').split('\n') + lines = _split(stdout=compl.stdout) # look up for the line containing "Pages: 11" for line in lines: x = re.match(r"(\d+)", line.strip()) @@ -100,11 +100,27 @@ def get_pagecount(filepath): raise Exception("Error occured while getting document page count.") - lines = compl.stdout.decode('utf-8').split('\n') + lines = _split(stdout=compl.stdout) # look up for the line containing "Pages: 11" for line in lines: - x = re.match("Pages:\W+(\d+)$", line.strip()) + x = re.match(r"Pages:\W+(\d+)$", line.strip()) if x: return int(x.group(1)) return 0 + + +def _split(stdout): + """ + stdout is result.stdout where result + is whatever is returned by subprocess.run + """ + decoded_text = stdout.decode( + 'utf-8', + # in case there are decoding issues, just replace + # problematic characters. We don't need text verbatim. + 'replace' + ) + lines = decoded_text.split('\n') + + return lines diff --git a/test/data/berlin.jpeg b/test/data/berlin.jpeg new file mode 100644 index 0000000..c305027 --- /dev/null +++ b/test/data/berlin.jpeg @@ -0,0 +1,2 @@ +I am not even binary! +The idea is to test pdfinfo.get_pagecount \ No newline at end of file diff --git a/test/data/berlin.jpg b/test/data/berlin.jpg new file mode 100644 index 0000000..6c06761 --- /dev/null +++ b/test/data/berlin.jpg @@ -0,0 +1 @@ +well... I am text! But who cares? 
The idea is to test pdfinfo.get_pagecount \ No newline at end of file diff --git a/test/data/berlin.pdf b/test/data/berlin.pdf new file mode 100644 index 0000000..e263d19 Binary files /dev/null and b/test/data/berlin.pdf differ diff --git a/test/data/berlin.png b/test/data/berlin.png new file mode 100644 index 0000000..6c06761 --- /dev/null +++ b/test/data/berlin.png @@ -0,0 +1 @@ +well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount \ No newline at end of file diff --git a/test/data/text.tiff b/test/data/text.tiff new file mode 100644 index 0000000..a2127e7 Binary files /dev/null and b/test/data/text.tiff differ diff --git a/test/test_pdfinfo.py b/test/test_pdfinfo.py new file mode 100644 index 0000000..d3877c4 --- /dev/null +++ b/test/test_pdfinfo.py @@ -0,0 +1,61 @@ +import os +import unittest + +from mglib.pdfinfo import get_pagecount + +BASE_DIR = os.path.dirname( + os.path.abspath(__file__) +) + +DATA_DIR = os.path.join( + BASE_DIR, "data" +) + + +def get_filepath(filename): + return os.path.join(DATA_DIR, filename) + + +class TestPDFinfo(unittest.TestCase): + + def test_basic_pdf(self): + page_count = get_pagecount(get_filepath("berlin.pdf")) + + self.assertEqual( + page_count, + 2 + ) + + def test_basic_jpeg(self): + page_count = get_pagecount(get_filepath("berlin.jpeg")) + + self.assertEqual( + page_count, + 1 + ) + + def test_basic_jpg(self): + page_count = get_pagecount(get_filepath("berlin.jpg")) + + self.assertEqual( + page_count, + 1 + ) + + def test_basic_png(self): + page_count = get_pagecount(get_filepath("berlin.png")) + + self.assertEqual( + page_count, + 1 + ) + + def test_basic_tiff(self): + # in case the input file has a tiff extension + # get_pagecount will internally call the get_tiff_pagecount method + page_count = get_pagecount(get_filepath("text.tiff")) + + self.assertEqual( + page_count, + 2 + )