fix pdfinfo.get_pagecount bug. Add unit tests. Bump version.

2020-07-25 10:43:50 +02:00 · 2020-07-25 10:43:50 +02:00 · 8a8835d243
parent 14370d35b2
commit 8a8835d243
8 changed files with 94 additions and 3 deletions
--- a/changelog.md
+++ b/changelog.md
@ -1,5 +1,15 @@
 # Changelog
 ## [1.2.2] - 25 July 2020
 ### Changed
  - bugfix - get_pagecount handles non-UTF-8 encoded documents
 ### Added
 - unit tests for get_pagecount
 ## [1.2.1] - 16 July 2020
 ### Added
--- a/mglib/pdfinfo.py
+++ b/mglib/pdfinfo.py
@ -38,7 +38,7 @@ def get_tiff_pagecount(filepath):
        raise Exception("Error occured while getting document page count.")
-    lines = compl.stdout.decode('utf-8').split('\n')
+    lines = _split(stdout=compl.stdout)
    # look up for the line containing "Pages: 11"
    for line in lines:
        x = re.match(r"(\d+)", line.strip())
@ -100,11 +100,27 @@ def get_pagecount(filepath):
        raise Exception("Error occured while getting document page count.")
-    lines = compl.stdout.decode('utf-8').split('\n')
+    lines = _split(stdout=compl.stdout)
    # look up for the line containing "Pages: 11"
    for line in lines:
-        x = re.match("Pages:\W+(\d+)$", line.strip())
+        x = re.match(r"Pages:\W+(\d+)$", line.strip())
        if x:
            return int(x.group(1))
    return 0
 def _split(stdout):
     """
     Decode raw subprocess output and break it into lines.

     stdout is the ``stdout`` attribute (a bytes object) of whatever
     subprocess.run returned.

     Returns a list of str, one item per '\\n'-separated chunk; output
     ending with a newline therefore yields a trailing empty string.
     """
     # The text is not needed verbatim, so bytes that are not valid
     # UTF-8 are swapped for the replacement character instead of
     # raising UnicodeDecodeError.
     text = stdout.decode('utf-8', errors='replace')
     return text.split('\n')
--- a/test/data/berlin.jpeg
+++ b/test/data/berlin.jpeg
@ -0,0 +1,2 @@
 I am not even binary!
 The idea is to test pdfinfo.get_pagecount
--- a/test/data/berlin.jpg
+++ b/test/data/berlin.jpg
@ -0,0 +1 @@
 well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
--- a/test/data/berlin.pdf
+++ b/test/data/berlin.pdf
--- a/test/data/berlin.png
+++ b/test/data/berlin.png
@ -0,0 +1 @@
 well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
--- a/test/data/text.tiff
+++ b/test/data/text.tiff
--- a/test/test_pdfinfo.py
+++ b/test/test_pdfinfo.py
@ -0,0 +1,61 @@
 import os
 import unittest
 from mglib.pdfinfo import get_pagecount
 # Absolute path of the directory that contains this test module.
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 # Fixture files are looked up in the "data" directory next to this module.
 DATA_DIR = os.path.join(BASE_DIR, "data")
 def get_filepath(filename):
     """Return the path of ``filename`` inside the DATA_DIR directory."""
     fixture_path = os.path.join(DATA_DIR, filename)
     return fixture_path
 class TestPDFinfo(unittest.TestCase):
     """Page-count checks for get_pagecount against the fixture files."""

     def test_basic_pdf(self):
         # berlin.pdf is expected to report two pages.
         pdf_path = get_filepath("berlin.pdf")
         self.assertEqual(get_pagecount(pdf_path), 2)

     def test_basic_jpeg(self):
         # A file with the .jpeg extension is expected to count as one page.
         jpeg_path = get_filepath("berlin.jpeg")
         self.assertEqual(get_pagecount(jpeg_path), 1)

     def test_basic_jpg(self):
         # A file with the .jpg extension is expected to count as one page.
         jpg_path = get_filepath("berlin.jpg")
         self.assertEqual(get_pagecount(jpg_path), 1)

     def test_basic_png(self):
         # A file with the .png extension is expected to count as one page.
         png_path = get_filepath("berlin.png")
         self.assertEqual(get_pagecount(png_path), 1)

     def test_basic_tiff(self):
         # In case the input file has the tiff extension, get_pagecount
         # will internally call the get_tiff_pagecount method.
         tiff_path = get_filepath("text.tiff")
         self.assertEqual(get_pagecount(tiff_path), 2)
		`@ -0,0 +1,2 @@`
							`I am not even binary!`
							`The idea is to test pdfinfo.get_pagecount`
		`@ -0,0 +1 @@`
							`well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount`