Compare commits

...

4 Commits

Author SHA1 Message Date
Eugen Ciur 06be42542a add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00
Eugen Ciur fe20ddd72b typo in license file 2020-12-01 07:49:57 +01:00
Eugen Ciur b7ce57b055 version inc 2020-12-01 07:47:24 +01:00
Eugen Ciur fa90e6b0a6 removed pdftk dependency 2020-12-01 07:44:57 +01:00
9 changed files with 103 additions and 541 deletions

View File

@ -1,6 +1,6 @@
Copyright 2020 Eugen Ciur <eugen@papermerge.com>
MgMail is Licensed under Apache License version 2.0
MgLib is Licensed under Apache License version 2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this software except in compliance with the License.

View File

@ -1,6 +1,19 @@
# Changelog
## [1.3.2] - 1 December 2020
### Changed
- mglib.pdfinfo.get_pagecount uses python magic + file extension to determine the correct mime type (and thus page count)
## [1.3.1] - 1 December 2020
### Changed
- pdftk module was replaced with stapler
## [1.2.8] - 24 August 2020
### Added

View File

@ -23,10 +23,6 @@ BINARY_IDENTIFY = "/usr/bin/identify"
# Used to extract text from images/PDF files.
BINARY_OCR = "/usr/bin/tesseract"
# Provided by pdftk package
# Used to reorder, cut/paste, delete pages within a PDF document
BINARY_PDFTK = "/usr/bin/pdftk"
# Provided by stapler
# Used to edit PDF documents
BINARY_STAPLER = "~/.local/bin/stapler"

View File

@ -64,22 +64,42 @@ def get_pagecount(filepath):
if os.path.isdir(filepath):
raise ValueError("Filepath %s is a directory!" % filepath)
base, ext = os.path.splitext(filepath)
mime_type = from_file(filepath, mime=True)
# pure images (png, jpeg) have only one page :)
if mime_type in ['image/png', 'image/jpeg', 'image/jpg']:
# whatever png/jpg image is there - it is
# considered by default one page document.
return 1
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking the file extension is an extra way of finding out the correct
# mime type
if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
return 1
if mime_type == 'image/tiff':
return get_tiff_pagecount(filepath)
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking the file extension is an extra way of finding out the correct
# mime type
if ext and ext.lower() in ('.tiff', ):
return get_tiff_pagecount(filepath)
if mime_type != 'application/pdf':
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handled by this"
" method"
)
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking the file extension is an extra way of finding out the correct
# mime type
if ext and ext.lower() != '.pdf':
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handled by this"
" method"
)
# pdfinfo "${PDFFILE}" | grep Pages
cmd = [
settings.BINARY_PDFINFO,

View File

@ -1,357 +0,0 @@
import logging
from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount
from .conf import settings
logger = logging.getLogger(__name__)
#
# Utilities around pdftk command line tool
#
# https://www.pdflabs.com/docs/pdftk-man-page/
#
def cat_ranges_for_reorder(page_count, new_order):
    """
    Translate a reorder request into the page sequence of the new document.

    ``new_order`` is a list of dicts, each holding ``page_num`` and
    ``page_order`` (both are converted with ``int``). The returned list
    contains, for every position 1..page_count, the ``page_num`` whose
    ``page_order`` equals that position.

    Examples (document with 4 pages):

        first and second pages swapped -> [2, 1, 3, 4]
        first and last pages swapped   -> [4, 2, 3, 1]

    Raises ValueError when ``new_order`` does not have exactly
    ``page_count`` items, and KeyError when some position in
    1..page_count is not assigned by any item of ``new_order``.
    """
    if len(new_order) != page_count:
        raise ValueError("Not enough pages specified")

    # position in the reordered document -> page number to put there
    position_to_page = {
        int(entry['page_order']): int(entry['page_num'])
        for entry in new_order
    }

    return [
        position_to_page[position]
        for position in range(1, page_count + 1)
    ]
def cat_ranges_for_delete(page_count, page_numbers):
    """
    List the pages that remain after deleting ``page_numbers``.

    ``page_numbers`` holds the numbers (starting with 1) of the pages
    to delete from a document with ``page_count`` pages. The result is
    the ascending list of page numbers 1..page_count that are not in
    ``page_numbers``.

    Examples (page_count=22):

        page_numbers=[21]       -> [1, 2, ..., 19, 20, 22]
        page_numbers=[1]        -> [2, 3, 4, ..., 22]
        page_numbers=[22]       -> [1, 2, 3, ..., 21]
        page_numbers=[1, 7, 10] -> [2, 3, 4, 5, 6, 8, 9, 11, 12, ..., 22]

    Raises ValueError when any item of ``page_numbers`` is not an int.
    """
    if not all(isinstance(number, int) for number in page_numbers):
        raise ValueError("page_numbers must be a list of ints")

    return [
        page
        for page in range(1, page_count + 1)
        if page not in page_numbers
    ]
def _is_plain_int(value):
    """Tell whether ``value`` is an int but not a bool (bools are ints too)."""
    return isinstance(value, int) and not isinstance(value, bool)


def _position_given(value):
    """
    Tell whether ``value`` designates an insert position.

    Falsy values (False, None, 0) and negative ints mean "no position".
    Anything else counts as given, so that values of a wrong type still
    reach the type check in split_ranges and raise ValueError there.
    """
    if not value:
        return False
    return not (_is_plain_int(value) and value < 0)


def split_ranges(total, after=False, before=False):
    """
    Split the page numbers 1, 2, ..., total of a document in two lists.

    The first list holds the pages that stay in front of the insert
    position, the second one the pages that follow it.

    Arguments:
        total: number of pages of the document.
        after: split after this page number.
        before: split before this page number; it has priority over
            ``after`` when both are given.

    A value of False/None/0 or a negative int means "not given". When
    neither position is given, all pages land in the first list, i.e. by
    default new pages are inserted at the end of the document.

    Examples:

        total=9, after=1       -> [1], [2, 3, 4, ..., 9]
        total=9, before=1      -> [], [1, 2, 3, 4, ..., 9]
        total=5, after=4       -> [1, 2, 3, 4], [5]
        total=5                -> [1, 2, 3, 4, 5], []
        total=5, after=-1      -> [1, 2, 3, 4, 5], []
        total=5, after=2, before=4 -> [1, 2, 3], [4, 5]

    Returns a tuple ``(list1, list2)``.

    Raises ValueError when the position that is used is not an int
    (bools are rejected as well).
    """
    if _position_given(before):
        if not _is_plain_int(before):
            raise ValueError(
                "argument 'before' is supposed to be an int"
            )
        return list(range(1, before)), list(range(before, total + 1))

    if _position_given(after):
        if not _is_plain_int(after):
            raise ValueError(
                "argument 'after' is supposed to be an int"
            )
        return (
            list(range(1, after + 1)),
            list(range(after + 1, total + 1))
        )

    # no usable position: everything stays in front, nothing follows
    return list(range(1, total + 1)), []
def paste_pages_into_existing_doc(
    src,
    dst,
    data_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Insert pages of other documents into ``src`` and write the result
    to ``dst`` by running one pdftk ``cat`` command.

    ``data_list`` is a list of dicts with the keys ``src`` (path of a
    document to take pages from) and ``page_nums`` (iterable with the
    page numbers to take from that document).

    ``after_page_number`` and ``before_page_number`` are handed to
    split_ranges, which splits the pages of ``src`` in the part placed
    in front of the inserted pages and the part placed behind them.
    """
    own_pages_before, own_pages_after = split_ranges(
        total=get_pagecount(src),
        after=after_page_number,
        before=before_page_number
    )

    # Handle A is reserved for the existing document (src); the
    # documents of data_list get B, C, ... in the order they are listed.
    handles = "BCDEFGHIJKLMNOPQRSTUVWXYZ"
    handle_assignments = [f"A={src}"]
    inserted_pages = []

    for position, entry in enumerate(data_list):
        handle = handles[position]
        entry_src = entry['src']
        entry_pages = entry['page_nums']
        handle_assignments.append(f"{handle}={entry_src}")
        inserted_pages.extend(f"{handle}{page}" for page in entry_pages)

    leading_pages = [f"A{page}" for page in own_pages_before]
    trailing_pages = [f"A{page}" for page in own_pages_after]

    # pdftk A=doc1 B=doc2 ... cat <leading> <inserted> <trailing> output dst
    # (leading and trailing parts may be empty)
    cmd = [
        settings.BINARY_PDFTK,
        *handle_assignments,
        "cat",
        *leading_pages,
        *inserted_pages,
        *trailing_pages,
        "output",
        dst,
    ]
    run(cmd)
def paste_pages(
    src,
    dst,
    data_list,
    dst_doc_is_new=True,
    after_page_number=False,
    before_page_number=False
):
    """
    Paste pages of one or more documents into the file ``dst``.

    ``data_list`` lists the documents the pages are taken from:

        [
            {
                'src': path_to_doc,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            {
                'src': path_to_doc,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]

    dst_doc_is_new = True: the destination is a newly created document
    made only of the pages listed in ``data_list``. In this case
    ``src``, ``after_page_number`` and ``before_page_number``
    are ignored.

    dst_doc_is_new = False: the pages are pasted into the existing
    document ``src``. The work is delegated to
    paste_pages_into_existing_doc; see split_ranges for how
    ``after_page_number`` and ``before_page_number`` select the
    insert position.
    """
    if not dst_doc_is_new:
        return paste_pages_into_existing_doc(
            src=src,
            dst=dst,
            data_list=data_list,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

    # Every source document gets a handle A, B, ... in data_list order.
    handles = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    handle_assignments = []
    selected_pages = []

    for position, entry in enumerate(data_list):
        handle = handles[position]
        entry_src = entry['src']
        entry_pages = entry['page_nums']
        handle_assignments.append(f"{handle}={entry_src}")
        selected_pages.extend(f"{handle}{page}" for page in entry_pages)

    # pdftk A=doc1 B=doc2 ... cat A1 B3 ... output dst
    cmd = [
        settings.BINARY_PDFTK,
        *handle_assignments,
        "cat",
        *selected_pages,
        "output",
        dst,
    ]
    run(cmd)
def reorder_pages(
    src, dst, new_order
):
    """
    Write to ``dst`` a copy of ``src`` with its pages reordered.

    ``new_order`` is a list of following format:

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    The example above describes a document of 4 pages whose first page
    was swapped with the second one:

        page_num = older page order
        page_order = current page order

    i.e. each dict reads: <page_num> now should be <page_order>.

    The page sequence is computed by cat_ranges_for_reorder and handed
    to pdftk's ``cat`` operation.
    """
    page_sequence = cat_ranges_for_reorder(
        page_count=get_pagecount(src),
        new_order=new_order
    )

    # pdftk <src> cat <page> <page> ... output <dst>
    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd.extend(str(page) for page in page_sequence)
    cmd += ["output", dst]

    run(cmd)
def delete_pages(src, dst, page_numbers):
    """
    Write to ``dst`` a copy of ``src`` without the pages ``page_numbers``.

    ``page_numbers`` is a list of page numbers (starting with 1) to
    drop. The pages to keep are computed by cat_ranges_for_delete and
    handed to pdftk's ``cat`` operation.
    """
    remaining_pages = cat_ranges_for_delete(
        get_pagecount(src),
        page_numbers
    )

    # pdftk <src> cat <page> <page> ... output <dst>
    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd.extend(str(page) for page in remaining_pages)
    cmd += ["output", dst]

    run(cmd)

View File

@ -4,7 +4,7 @@ import shutil
from os import listdir
from os.path import isdir, join
from mglib import pdftk
from mglib import stapler
from mglib.path import DocumentPath, PagePath
from mglib.step import Steps
from mglib.utils import get_assigns_after_delete, safe_to_delete
@ -209,7 +209,7 @@ class Storage:
self.abspath(dst_doc_path)
)
pdftk.reorder_pages(
stapler.reorder_pages(
src=self.abspath(src_doc_path),
dst=self.abspath(dst_doc_path),
new_order=new_order
@ -269,7 +269,7 @@ class Storage:
self.make_sure_path_exists(
self.abspath(dst_doc_path)
)
pdftk.delete_pages(
stapler.delete_pages(
self.abspath(src_doc_path),
self.abspath(dst_doc_path),
page_numbers
@ -332,7 +332,7 @@ class Storage:
self.abspath(next_ver_dp)
)
pdftk.paste_pages(
stapler.paste_pages(
src=self.abspath(dest_doc_path),
dst=self.abspath(next_ver_dp),
data_list=data_list,

View File

@ -6,7 +6,7 @@ with open("README.md", "r") as fh:
setup(
name="mglib",
version="1.3.0",
version="1.3.2",
author="Eugen Ciur",
author_email="eugen@papermerge.com",
url="https://github.com/papermerge/mglib",

View File

@ -1,145 +0,0 @@
import os
import unittest
from unittest import mock
from mglib import pdftk
from mglib.conf import settings
from mglib.runcmd import run
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
class TestPdfLib(unittest.TestCase):
    """
    Tests for the pdftk command line wrappers of mglib.pdftk.

    Tests of functions which invoke pdftk patch ``mglib.pdftk.run``, so
    only the composed command line is checked. Comparisons use unittest
    assertion methods instead of bare ``assert`` statements: they are
    not stripped when python runs with -O and they report both values
    on failure.
    """

    def test_ranges_for_reorder(self):
        actual = pdftk.cat_ranges_for_reorder(4, [
            {"page_order": 1, "page_num": 4},
            {"page_order": 2, "page_num": 3},
            {"page_order": 3, "page_num": 2},
            {"page_order": 4, "page_num": 1}
        ])
        expected = [4, 3, 2, 1]
        self.assertEqual(expected, actual)

        # number of items differs from the page count
        self.assertRaises(ValueError, pdftk.cat_ranges_for_reorder, 2, [])
        # positions 1 and 2 are not assigned
        self.assertRaises(KeyError, pdftk.cat_ranges_for_reorder, 2, [
            {"page_order": 3, "page_num": 4},
            {"page_order": 5, "page_num": 6}
        ])

    def test_delete_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.delete_pages(input_file, output_file, [1])
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    input_file,
                    "cat",
                    "2",
                    "output",
                    output_file
                ]
            )

    def test_cat_ranges_for_delete(self):
        page_count = 22
        page_numbers = range(1, 23)

        actual = pdftk.cat_ranges_for_delete(page_count, [21])
        expected = list(page_numbers)
        expected.remove(21)
        self.assertEqual(actual, expected)

        actual = pdftk.cat_ranges_for_delete(page_count, [1])
        expected = list(page_numbers)
        expected.remove(1)
        self.assertEqual(actual, expected)

        actual = pdftk.cat_ranges_for_delete(page_count, [1, 7, 10])
        expected = list(page_numbers)
        expected.remove(1)
        expected.remove(7)
        expected.remove(10)
        self.assertEqual(actual, expected)

        # page numbers must be ints, not strings
        self.assertRaises(
            ValueError,
            pdftk.cat_ranges_for_delete,
            page_count,
            ["1"]
        )

    def test_split_ranges(self):
        page_count = 9

        self.assertRaises(
            ValueError,
            pdftk.split_ranges,
            9,
            after="a",
            before=False
        )
        self.assertRaises(
            ValueError,
            pdftk.split_ranges,
            9,
            after=False,
            before=True
        )

        actual1, actual2 = pdftk.split_ranges(page_count, 1, False)
        expected1 = [1]
        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

        actual1, actual2 = pdftk.split_ranges(page_count, False, 2)
        expected1 = [1]
        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

        actual1, actual2 = pdftk.split_ranges(page_count)
        expected1 = list(range(1, page_count + 1))
        expected2 = []
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

    def test_reorder_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
        new_order = [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
        ]

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.reorder_pages(input_file, output_file, new_order)
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    input_file,
                    "cat",
                    "2",
                    "1",
                    "output",
                    output_file
                ]
            )

    def test_paste_pages_into_existing_doc(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # nothing to paste: the existing pages are kept as they are
        # (the expectations below rely on berlin.pdf having two pages)
        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file, output_file, datalist
            )
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    "A=" + input_file,
                    "cat",
                    "A1",
                    "A2",
                    "output",
                    output_file
                ]
            )

        # paste pages 3 and 4 of document B after the first page
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file,
                output_file,
                datalist,
                1
            )
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    "A=" + input_file,
                    "B=" + input_file,
                    "cat",
                    "A1",
                    "B3",
                    "B4",
                    "A2",
                    "output",
                    output_file
                ]
            )

    def test_paste_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # destination document is not new: pasting into existing doc
        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist, False)
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    "A=" + input_file,
                    "cat",
                    "A1",
                    "A2",
                    "output",
                    output_file
                ]
            )

        # destination document is new: only the pasted pages are used
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist)
            run_func.assert_called()
            run_func.assert_called_with(
                [
                    settings.BINARY_PDFTK,
                    "A=" + input_file,
                    "cat",
                    "A3",
                    "A4",
                    "output",
                    output_file
                ]
            )

View File

@ -3,7 +3,6 @@ import unittest
from unittest import mock
from mglib import stapler
from mglib.conf import settings
from mglib.runcmd import run
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@ -17,15 +16,15 @@ class TestPdfLib(unittest.TestCase):
{"page_order": 2, "page_num": 3},
{"page_order": 3, "page_num": 2},
{"page_order": 4, "page_num": 1}
])
expected = [4,3,2,1]
])
expected = [4, 3, 2, 1]
assert expected == actual
self.assertRaises(ValueError, stapler.cat_ranges_for_reorder, 2, [])
self.assertRaises(KeyError, stapler.cat_ranges_for_reorder, 2, [
{"page_order": 3, "page_num": 4},
{"page_order": 5, "page_num": 6}
])
])
def test_delete_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
@ -38,13 +37,22 @@ class TestPdfLib(unittest.TestCase):
[settings.BINARY_STAPLER, "del", input_file, "1", output_file]
)
def test_split_ranges(self):
page_count = 9
page_numbers = list(range(1, 10))
self.assertRaises(ValueError, stapler.split_ranges, 9, after="a", before=False)
self.assertRaises(ValueError, stapler.split_ranges, 9, after=False, before=True)
self.assertRaises(
ValueError,
stapler.split_ranges,
9,
after="a",
before=False
)
self.assertRaises(
ValueError,
stapler.split_ranges,
9, after=False,
before=True
)
actual1, actual2 = stapler.split_ranges(page_count, 1, False)
expected1 = [1]
@ -67,54 +75,81 @@ class TestPdfLib(unittest.TestCase):
def test_reorder_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
new_order = [
{'page_num': 2, 'page_order': 1},
{'page_num': 1, 'page_order': 2},
]
new_order = [
{'page_num': 2, 'page_order': 1},
{'page_num': 1, 'page_order': 2},
]
with mock.patch("mglib.stapler.run") as run_func:
stapler.reorder_pages(input_file, output_file, new_order)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", input_file, "2", "1", output_file]
[
settings.BINARY_STAPLER,
"sel",
input_file,
"2",
"1",
output_file
]
)
def test_paste_pages_into_existing_doc(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(input_file, output_file, datalist)
stapler.paste_pages_into_existing_doc(
input_file, output_file, datalist
)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file, "A1", "A2", output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(input_file, output_file, datalist, 1)
stapler.paste_pages_into_existing_doc(
input_file,
output_file,
datalist,
1
)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "B=" + input_file, "A1", "B3",
"B4", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file,
"B=" + input_file, "A1", "B3",
"B4", "A2", output_file
]
)
def test_paste_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist, False)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel",
"A=" + input_file,
"A1",
"A2",
output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist)