mglib/mglib/pdfinfo.py

111 lines
2.7 KiB
Python

import os
import re
import subprocess
import logging
"""
Uses command line pdfinfo utility (from poppler pakage) for various
small operations (e.g. get pdf page count).
"""
logger = logging.getLogger(__name__)
def get_tiff_pagecount(filepath):
cmd = [
"/usr/bin/identify",
"-format",
"%n\n",
filepath
]
compl = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
if compl.returncode:
logger.error(
"get_tiff_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
cmd,
compl.args,
compl.stdout,
compl.stderr,
compl.returncode,
stack_info=True
)
raise Exception("Error occured while getting document page count.")
lines = compl.stdout.decode('utf-8').split('\n')
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match(r"(\d+)", line.strip())
if x:
return int(x.group(1))
return 0
def get_pagecount(filepath):
"""
Returns the number of pages in a PDF document as integer.
filepath - is filesystem path to a PDF document
"""
if not os.path.isfile(filepath):
raise ValueError("Filepath %s is not a file" % filepath)
if os.path.isdir(filepath):
raise ValueError("Filepath %s is a directory!" % filepath)
base, ext = os.path.splitext(filepath)
# pure images (png, jpeg) have only one page :)
if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
# whatever png/jpg image is there - it is
# considered by default one page document.
return 1
if ext and ext.lower() in ('.tiff', ):
return get_tiff_pagecount(filepath)
if ext and ext.lower() not in ('.pdf', '.tiff'):
raise ValueError(
"Only jpeg, png, pdf and tiff are handlerd by this"
" method"
)
# pdfinfo "${PDFFILE}" | grep Pages
cmd = ["/usr/bin/pdfinfo", filepath]
compl = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
if compl.returncode:
logger.error(
"get_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
cmd,
compl.args,
compl.stdout,
compl.stderr,
compl.returncode,
stack_info=True
)
raise Exception("Error occured while getting document page count.")
lines = compl.stdout.decode('utf-8').split('\n')
# look up for the line containing "Pages: 11"
for line in lines:
x = re.match("Pages:\W+(\d+)$", line.strip())
if x:
return int(x.group(1))
return 0