2020-05-16 16:42:57 +02:00
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
import logging
|
2020-11-27 19:22:43 +01:00
|
|
|
from magic import from_file
|
2020-05-16 16:42:57 +02:00
|
|
|
|
2020-08-11 19:47:35 +02:00
|
|
|
from .conf import settings
|
2020-08-24 09:14:01 +02:00
|
|
|
from .exceptions import FileTypeNotSupported
|
2020-08-11 19:47:35 +02:00
|
|
|
|
2020-05-16 16:42:57 +02:00
|
|
|
"""
|
|
|
|
Uses command line pdfinfo utility (from poppler package) for various
|
|
|
|
small operations (e.g. get pdf page count).
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2020-07-16 11:06:41 +02:00
|
|
|
def get_tiff_pagecount(filepath):
    """
    Returns the number of pages in a TIFF document as integer.

    filepath - is filesystem path to a TIFF document

    Runs the binary configured as settings.BINARY_IDENTIFY with
    -format "%n\\n" and returns the first number found at the beginning
    of an output line (presumably ImageMagick's identify, where %n is
    the number of images in the file - TODO confirm).
    Returns 0 if no output line starts with a number.

    Raises Exception if the command exits with non-zero code.
    """
    command = [settings.BINARY_IDENTIFY, "-format", "%n\n", filepath]
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    if result.returncode:
        # keep everything needed for troubleshooting in the log
        logger.error(
            "get_tiff_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
            command,
            result.args,
            result.stdout,
            result.stderr,
            result.returncode,
            stack_info=True
        )
        raise Exception("Error occured while getting document page count.")

    # The first output line which starts with digits holds the count.
    stripped = (text.strip() for text in _split(stdout=result.stdout))
    hits = (re.match(r"(\d+)", text) for text in stripped)
    first_hit = next((hit for hit in hits if hit), None)

    if first_hit is None:
        return 0

    return int(first_hit.group(1))
|
|
|
|
|
|
|
|
|
2020-05-16 16:42:57 +02:00
|
|
|
def get_pagecount(filepath):
    """
    Returns the number of pages in a document as integer.

    filepath - is filesystem path to a PDF, TIFF, PNG or JPEG document

    PNG/JPEG images always count as one page. TIFF files are counted by
    get_tiff_pagecount. Everything else is passed to the binary
    configured as settings.BINARY_PDFINFO and the number is read from
    the "Pages:" line of its output (0 is returned if there is no such
    line).

    File type is the mime type reported by magic's from_file. In case
    of REST API upload (via PUT + form multipart) django saves temporary
    file as application/octet-stream, so checking the file extension is
    an extra method of finding out the correct type.

    Raises ValueError if filepath is not an existing regular file (a
    directory included), FileTypeNotSupported if neither mime type nor
    extension points to a supported type and Exception if the pdfinfo
    command exits with non-zero code.
    """
    # os.path.isfile is False for directories as well, so a separate
    # os.path.isdir check would never be reached.
    if not os.path.isfile(filepath):
        raise ValueError("Filepath %s is not a file" % filepath)

    _, ext = os.path.splitext(filepath)
    ext = ext.lower()
    mime_type = from_file(filepath, mime=True)

    # pure images (png, jpeg) have only one page :)
    # whatever png/jpg image is there - it is
    # considered by default one page document.
    if mime_type in ('image/png', 'image/jpeg', 'image/jpg'):
        return 1

    if ext in ('.jpeg', '.png', '.jpg'):
        return 1

    # both usual TIFF extensions (.tiff and .tif) are accepted
    if mime_type == 'image/tiff' or ext in ('.tiff', '.tif'):
        return get_tiff_pagecount(filepath)

    # A file without any extension is not rejected here; it is handed
    # over to pdfinfo, which decides if it can read it.
    if mime_type != 'application/pdf' and ext and ext != '.pdf':
        raise FileTypeNotSupported(
            "Only jpeg, png, pdf and tiff are handled by this"
            " method"
        )

    # pdfinfo "${PDFFILE}" | grep Pages
    cmd = [
        settings.BINARY_PDFINFO,
        filepath
    ]
    compl = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    if compl.returncode:
        logger.error(
            "get_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
            cmd,
            compl.args,
            compl.stdout,
            compl.stderr,
            compl.returncode,
            stack_info=True
        )
        raise Exception("Error occured while getting document page count.")

    lines = _split(stdout=compl.stdout)
    # look up for the line containing "Pages: 11"
    for line in lines:
        x = re.match(r"Pages:\W+(\d+)$", line.strip())
        if x:
            return int(x.group(1))

    return 0
|
2020-07-25 10:43:50 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _split(stdout):
    """
    Decodes given bytes as utf-8 and returns the text as list of lines.

    stdout is result.stdout where result
    is whatever is returned by subprocess.run
    """
    # Bytes which cannot be decoded are replaced instead of raising an
    # error. We don't need text verbatim.
    return stdout.decode('utf-8', errors='replace').split('\n')
|