# mglib/mglib/pdfinfo.py

import os
import re
import subprocess
import logging
from magic import from_file

from .conf import settings
from .exceptions import FileTypeNotSupported

"""
Uses the command line pdfinfo utility (from the poppler package) for various
small operations (e.g. get pdf page count).
"""

logger = logging.getLogger(__name__)


def get_tiff_pagecount(filepath):
    """
    Returns the number of pages of a TIFF document as integer.

    Runs the binary configured as settings.BINARY_IDENTIFY with
    ``-format "%n\\n"`` on the given file and returns the first number
    found at the beginning of an output line.
    (presumably ImageMagick's identify, where %n is the number of
    images in the file - TODO confirm)

    filepath - is filesystem path to a TIFF document

    Raises Exception if the command exits with a non-zero return code.
    Returns 0 if no output line starts with a number.
    """
    cmd = [settings.BINARY_IDENTIFY, "-format", "%n\n", filepath]
    compl = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if compl.returncode != 0:
        # keep the complete command context in the log; the exception
        # raised below carries only a generic message
        logger.error(
            "get_tiff_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
            cmd,
            compl.args,
            compl.stdout,
            compl.stderr,
            compl.returncode,
            stack_info=True
        )
        raise Exception("Error occured while getting document page count.")

    # the first output line that starts with digits holds the count
    for raw_line in _split(stdout=compl.stdout):
        found = re.match(r"(\d+)", raw_line.strip())
        if found is not None:
            return int(found.group(1))

    return 0


def get_pagecount(filepath):
    """
    Returns the number of pages in a document as integer.

    The document type is taken from the mime type reported by
    magic.from_file and, as a fallback, from the file extension:

    * png/jpeg images are always considered one page documents
    * tiff documents are delegated to get_tiff_pagecount
    * everything else is passed to the binary configured as
      settings.BINARY_PDFINFO and the number from its "Pages:" output
      line is returned

    filepath - is filesystem path to the document

    Raises ValueError if filepath is a directory or is not a file.
    Raises FileTypeNotSupported if the mime type is not application/pdf
    and the file has an extension other than .pdf.
    Raises Exception if the pdfinfo command exits with a non-zero
    return code.
    Returns 0 if the pdfinfo output contains no "Pages:" line.
    """
    # The directory check must come first: os.path.isfile() is False for
    # directories as well, so it would otherwise hide this message.
    if os.path.isdir(filepath):
        raise ValueError("Filepath %s is a directory!" % filepath)

    if not os.path.isfile(filepath):
        raise ValueError("Filepath %s is not a file" % filepath)

    ext = os.path.splitext(filepath)[1].lower()
    mime_type = from_file(filepath, mime=True)

    # In case of REST API upload (via PUT + form multipart) django saves
    # the temporary file as application/octet-stream. That is why, next to
    # the mime type, the extension is checked as an extra method of
    # finding out the correct type.

    # pure images (png, jpeg) have only one page :)
    if mime_type in ('image/png', 'image/jpeg', 'image/jpg'):
        return 1
    if ext in ('.jpeg', '.png', '.jpg'):
        return 1

    if mime_type == 'image/tiff':
        return get_tiff_pagecount(filepath)
    # both spellings of the tiff extension are accepted
    if ext in ('.tiff', '.tif'):
        return get_tiff_pagecount(filepath)

    # Files without any extension are deliberately let through here, same
    # as before; pdfinfo decides whether it can read them.
    if mime_type != 'application/pdf' and ext and ext != '.pdf':
        raise FileTypeNotSupported(
            "Only jpeg, png, pdf and tiff are handled by this"
            " method"
        )

    # pdfinfo "${PDFFILE}" | grep Pages
    cmd = [
        settings.BINARY_PDFINFO,
        filepath
    ]
    compl = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    if compl.returncode:
        # keep the complete command context in the log; the exception
        # raised below carries only a generic message
        logger.error(
            "get_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
            cmd,
            compl.args,
            compl.stdout,
            compl.stderr,
            compl.returncode,
            stack_info=True
        )

        raise Exception("Error occured while getting document page count.")

    # look up for the line containing "Pages: 11"
    for line in _split(stdout=compl.stdout):
        x = re.match(r"Pages:\W+(\d+)$", line.strip())
        if x:
            return int(x.group(1))

    return 0


def _split(stdout):
    """
    Decodes the raw bytes captured from a subprocess and returns
    them as a list of text lines.

    stdout - bytes, e.g. the stdout attribute of whatever
    subprocess.run returned

    Bytes which are not valid utf-8 are replaced instead of raising
    an error, as the text is not needed verbatim. The text is split
    on '\\n' only, so a trailing newline yields a last empty string.
    """
    return stdout.decode('utf-8', errors='replace').split('\n')
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`import os`
			`import re`
			`import subprocess`
			`import logging`
Replace extension checking with python-magic 2020-11-27 19:22:43 +01:00			`from magic import from_file`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00
make binary paths configurable 2020-08-11 19:47:35 +02:00			`from .conf import settings`
Throw a more specific excetion on unsupported file format 2020-08-24 09:14:01 +02:00			`from .exceptions import FileTypeNotSupported`
make binary paths configurable 2020-08-11 19:47:35 +02:00
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`"""`
			`Uses command line pdfinfo utility (from poppler pakage) for various`
			`small operations (e.g. get pdf page count).`
			`"""`

			`logger = logging.getLogger(__name__)`


version bump. Bring last modules from pmworker. Make mglib - tiff aware 2020-07-16 11:06:41 +02:00			`def get_tiff_pagecount(filepath):`
			`cmd = [`
make binary paths configurable 2020-08-11 19:47:35 +02:00			`settings.BINARY_IDENTIFY,`
version bump. Bring last modules from pmworker. Make mglib - tiff aware 2020-07-16 11:06:41 +02:00			`"-format",`
			`"%n\n",`
			`filepath`
			`]`
			`compl = subprocess.run(`
			`cmd,`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE`
			`)`

			`if compl.returncode:`

			`logger.error(`
			`"get_tiff_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",`
			`cmd,`
			`compl.args,`
			`compl.stdout,`
			`compl.stderr,`
			`compl.returncode,`
			`stack_info=True`
			`)`

			`raise Exception("Error occured while getting document page count.")`

fix pdfinfo.get_pagecount bug. Add unit tests. Bump version. 2020-07-25 10:43:50 +02:00			`lines = _split(stdout=compl.stdout)`
version bump. Bring last modules from pmworker. Make mglib - tiff aware 2020-07-16 11:06:41 +02:00			`# look up for the line containing "Pages: 11"`
			`for line in lines:`
			`x = re.match(r"(\d+)", line.strip())`
			`if x:`
			`return int(x.group(1))`

			`return 0`


move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`def get_pagecount(filepath):`
			`"""`
			`Returns the number of pages in a PDF document as integer.`

			`filepath - is filesystem path to a PDF document`
			`"""`
			`if not os.path.isfile(filepath):`
			`raise ValueError("Filepath %s is not a file" % filepath)`

			`if os.path.isdir(filepath):`
			`raise ValueError("Filepath %s is a directory!" % filepath)`

add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00			`base, ext = os.path.splitext(filepath)`
Replace extension checking with python-magic 2020-11-27 19:22:43 +01:00			`mime_type = from_file(filepath, mime=True)`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`# pure images (png, jpeg) have only one page :)`
add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00
Replace extension checking with python-magic 2020-11-27 19:22:43 +01:00			`if mime_type in ['image/png', 'image/jpeg', 'image/jpg']:`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`# whatever png/jpg image is there - it is`
			`# considered by default one page document.`
			`return 1`

add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00			`# In case of REST API upload (via PUT + form multipart)`
			`# django saves temporary file as application/octet-stream`
			`# Checking extentions is an extra method of finding out correct`
			`# mime type`
			`if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):`
			`return 1`

Replace extension checking with python-magic 2020-11-27 19:22:43 +01:00			`if mime_type == 'image/tiff':`
version bump. Bring last modules from pmworker. Make mglib - tiff aware 2020-07-16 11:06:41 +02:00			`return get_tiff_pagecount(filepath)`

add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00			`# In case of REST API upload (via PUT + form multipart)`
			`# django saves temporary file as application/octet-stream`
			`# Checking extentions is an extra method of finding out correct`
			`# mime type`
			`if ext and ext.lower() in ('.tiff', ):`
			`return get_tiff_pagecount(filepath)`

Formatting 2020-11-27 19:26:57 +01:00			`if mime_type != 'application/pdf':`
add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00			`# In case of REST API upload (via PUT + form multipart)`
			`# django saves temporary file as application/octet-stream`
			`# Checking extentions is an extra method of finding out correct`
			`# mime type`
			`if ext and ext.lower() != '.pdf':`
			`raise FileTypeNotSupported(`
			`"Only jpeg, png, pdf and tiff are handled by this"`
			`" method"`
			`)`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`# pdfinfo "${PDFFILE}" \| grep Pages`
make binary paths configurable 2020-08-11 19:47:35 +02:00			`cmd = [`
			`settings.BINARY_PDFINFO,`
			`filepath`
			`]`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`compl = subprocess.run(`
			`cmd,`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE`
			`)`

			`if compl.returncode:`

			`logger.error(`
			`"get_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",`
			`cmd,`
			`compl.args,`
			`compl.stdout,`
			`compl.stderr,`
			`compl.returncode,`
			`stack_info=True`
			`)`

			`raise Exception("Error occured while getting document page count.")`

fix pdfinfo.get_pagecount bug. Add unit tests. Bump version. 2020-07-25 10:43:50 +02:00			`lines = _split(stdout=compl.stdout)`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`# look up for the line containing "Pages: 11"`
			`for line in lines:`
fix pdfinfo.get_pagecount bug. Add unit tests. Bump version. 2020-07-25 10:43:50 +02:00			`x = re.match(r"Pages:\W+(\d+)$", line.strip())`
move pdftk and pdfinfo into mglib 2020-05-16 16:42:57 +02:00			`if x:`
			`return int(x.group(1))`

			`return 0`
fix pdfinfo.get_pagecount bug. Add unit tests. Bump version. 2020-07-25 10:43:50 +02:00

			`def _split(stdout):`
			`"""`
			`stdout is result.stdout where result`
			`is whatever is returned by subprocess.run`
			`"""`
			`decoded_text = stdout.decode(`
			`'utf-8',`
			`# in case there are decoding issues, just replace`
			`# problematic characters. We don't need text verbatim.`
			`'replace'`
			`)`
			`lines = decoded_text.split('\n')`

			`return lines`