Compare commits

...

5 Commits

Author SHA1 Message Date
Eugen Ciur 9fbaaf7dfd
Merge pull request #2 from georgkrause/test_pdflib
Unit Tests for PDF lib
2020-11-27 15:41:04 +01:00
Eugen Ciur 4e28785b65
Merge pull request #5 from georgkrause/stapler
Add lib interacting with stapler
2020-11-27 15:40:45 +01:00
Georg Krause 6ddff80818
Remove blank line at end of file 2020-11-27 15:04:01 +01:00
Georg Krause 49d569ead8
Add lib interacting with stapler 2020-11-27 14:57:47 +01:00
Georg Krause 1ad7239172
Add Unit Tests for pdftk module 2020-11-24 20:28:39 +01:00
5 changed files with 583 additions and 4 deletions

View File

@ -26,3 +26,7 @@ BINARY_OCR = "/usr/bin/tesseract"
# Provided by pdftk package # Provided by pdftk package
# Used to reorder, cut/paste, delete pages withing PDF document # Used to reorder, cut/paste, delete pages withing PDF document
BINARY_PDFTK = "/usr/bin/pdftk" BINARY_PDFTK = "/usr/bin/pdftk"
# Provided by stapler
# Used to edit PDF documents
BINARY_STAPLER = "~/.local/bin/stapler"

View File

@ -31,6 +31,8 @@ def cat_ranges_for_reorder(page_count, new_order):
[4, 2, 3, 1] [4, 2, 3, 1]
""" """
if len(new_order) != page_count:
raise ValueError("Not enough pages specified")
results = [] results = []
# key = page_num # key = page_num
# value = page_order # value = page_order
@ -53,7 +55,7 @@ def cat_ranges_for_delete(page_count, page_numbers):
""" """
Returns a list of integers. Each number in the list Returns a list of integers. Each number in the list
is the number of page which will 'stay' in document. is the number of page which will 'stay' in document.
In other words, it returns a list with deleted pages. In other words, it returns a list with not deleted pages.
Examples: Examples:
@ -83,7 +85,7 @@ def cat_ranges_for_delete(page_count, page_numbers):
for check in page_numbers: for check in page_numbers:
if not isinstance(check, int): if not isinstance(check, int):
err_msg = "page_numbers must be a list of strings" err_msg = "page_numbers must be a list of ints"
raise ValueError(err_msg) raise ValueError(err_msg)
for number in range(1, page_count + 1): for number in range(1, page_count + 1):
@ -112,7 +114,7 @@ def split_ranges(total, after=False, before=False):
(it means, by default, all pages are inserted at the end of the doc) (it means, by default, all pages are inserted at the end of the doc)
""" """
if after and not before: if after and not before:
if not isinstance(after, int): if not type(after) == int:
raise ValueError( raise ValueError(
"argument 'after' is supposed to be an int" "argument 'after' is supposed to be an int"
) )
@ -121,7 +123,7 @@ def split_ranges(total, after=False, before=False):
return list1, list2 return list1, list2
if not after and before: if not after and before:
if not isinstance(before, int): if not type(before) == int:
raise ValueError( raise ValueError(
"argument 'before' is supposed to be an int" "argument 'before' is supposed to be an int"
) )

302
mglib/stapler.py Normal file
View File

@ -0,0 +1,302 @@
import logging
from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount
from .conf import settings
logger = logging.getLogger(__name__)
#
# Utilities around stapler command line tool
#
# https://github.com/hellerbarde/stapler
#
def cat_ranges_for_reorder(page_count, new_order):
    """
    Translate a reorder request into a list of source page numbers.

    new_order is a sequence of mappings, each with a 'page_num' and a
    'page_order' entry (any value accepted by int()). The returned list
    has page_count items; the item at (1-based) position i is the
    'page_num' of the entry whose 'page_order' equals i.

    Examples:
        4 page document, first and second pages swapped:
            [2, 1, 3, 4]
        4 page document, first page swapped with the last one:
            [4, 2, 3, 1]

    Raises ValueError when len(new_order) differs from page_count and
    KeyError when some position in 1..page_count has no entry.
    """
    # The message talks about "not enough" pages, but the check fires
    # for any length mismatch, i.e. also when too many are given.
    if len(new_order) != page_count:
        raise ValueError("Not enough pages specified")

    # position in the new document -> page number in the old document
    num_by_position = {}
    for entry in new_order:
        position = int(entry['page_order'])
        num_by_position[position] = int(entry['page_num'])

    return [
        num_by_position[position]
        for position in range(1, page_count + 1)
    ]
def _page_position(value, name):
    """
    Return value as a usable (positive) page number, or None if unset.

    Falsy values (False, None, 0) and non-positive ints mean
    "not specified". Any other non-int value raises ValueError.
    """
    if not value:
        return None
    # type() instead of isinstance(): bool is a subclass of int, and
    # True must be rejected instead of being read as page 1.
    if type(value) is not int:
        raise ValueError(
            f"argument '{name}' is supposed to be an int"
        )
    return value if value > 0 else None


def split_ranges(total, after=False, before=False):
    """
    Given a range 1, 2, ..., total (page numbers of a doc),
    split it in two lists.

    Example:
        Input: total = 9, after=1, before=False
        Output: list1 = [1]; list2 = [2, 3, 4, ..., 9].

        Input: total = 9; after=False, before=1
        Output: list1 = [], list2 = [1, 2, 3, 4, ..., 9]

        Input: total = 5; after=4; before=False
        Output: list1 = [1, 2, 3, 4] list2 = [5]

        Input: total = 5; after=False; before=False;
        Output: list1 = [1, 2, 3, 4, 5], list2 = []

    (it means, by default, all pages are inserted at the end of the doc)

    Only positive page numbers select a split point; zero, negative or
    falsy values are treated as "not specified". When both arguments
    select a split point, 'before' has priority over 'after'. A split
    point beyond the last page puts every page into list1.

    Returns a (list1, list2) tuple. Raises ValueError when the argument
    being consulted is truthy but not an int (bools are rejected too).
    """
    pages = list(range(1, total + 1))

    before_page = _page_position(before, "before")
    if before_page is not None:
        split_at = before_page - 1
    else:
        # 'after' is consulted only when 'before' gave no split point.
        after_page = _page_position(after, "after")
        split_at = total if after_page is None else after_page

    # Slicing clamps out-of-range split points, so neither list can
    # ever contain a page number outside 1..total.
    return pages[:split_at], pages[split_at:]
def paste_pages_into_existing_doc(
    src,
    dst,
    data_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Run a stapler 'sel' command that splices pages into document src.

    src is the existing document, registered under handle A.
    dst is passed as the last argument of the command.
    data_list is a list of following format:
        [
            {'src': path, 'page_nums': [page_num_1, page_num_2]},
            ...
        ]
    Each entry gets the next handle (B, C, ...) and every item of its
    'page_nums' is selected from it.
    after_page_number / before_page_number are handed to split_ranges
    to decide which pages of src go in front of the pasted pages and
    which go behind them.

    Raises IndexError when data_list has more than 25 entries (handles
    B..Z are exhausted).
    """
    leading_pages, trailing_pages = split_ranges(
        total=get_pagecount(src),
        after=after_page_number,
        before=before_page_number
    )

    # Letter A is missing on purpose: it is assigned to the existing
    # document (src) whose pages are in leading_pages/trailing_pages.
    # NOTE(review): the 'A=path' handle syntax mirrors the pdftk
    # command line - confirm that stapler's 'sel' accepts it.
    pasted_handles = "BCDEFGHIJKLMNOPQRSTUVWXYZ"
    handle_args = [f"A={src}"]
    pasted_args = []
    for position, entry in enumerate(data_list):
        handle = pasted_handles[position]
        pasted_path = entry['src']
        page_nums = entry['page_nums']
        handle_args.append(f"{handle}={pasted_path}")
        pasted_args.extend(f"{handle}{num}" for num in page_nums)

    run([
        settings.BINARY_STAPLER,
        "sel",
        # A=doc1_path, B=doc2_path, ...
        *handle_args,
        # existing doc pages in front of the insertion (may be empty)
        *(f"A{num}" for num in leading_pages),
        # newly inserted pages
        *pasted_args,
        # existing doc pages behind the insertion (may be empty)
        *(f"A{num}" for num in trailing_pages),
        dst,
    ])
def paste_pages(
    src,
    dst,
    data_list,
    dst_doc_is_new=True,
    after_page_number=False,
    before_page_number=False
):
    """
    Paste pages taken from one or more documents into document dst.

    data_list is a list of following format:
        [
            {
                'src': path,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            {
                'src': path,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]
    i.e. the documents the pages (with numbers page_num_1...) are
    pasted from.

    dst_doc_is_new = True: the destination document is a new one built
    only from the pages listed in data_list. In this case src,
    after_page_number and before_page_number are ignored.

    dst_doc_is_new = False: the pages are pasted into the existing
    document src; the work is delegated to
    paste_pages_into_existing_doc together with after_page_number and
    before_page_number.

    Raises IndexError when a new document is built from more than 26
    entries (handles A..Z are exhausted).
    """
    if not dst_doc_is_new:
        return paste_pages_into_existing_doc(
            src=src,
            dst=dst,
            data_list=data_list,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

    # NOTE(review): the 'A=path' handle syntax mirrors the pdftk
    # command line - confirm that stapler's 'sel' accepts it.
    handles = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    handle_args = []
    page_args = []
    for position, entry in enumerate(data_list):
        handle = handles[position]
        source_path = entry['src']
        page_nums = entry['page_nums']
        handle_args.append(f"{handle}={source_path}")
        page_args += [f"{handle}{num}" for num in page_nums]

    # stapler sel A=doc1_path B=doc2_path ... A1 A2 B1 ... dst
    run(
        [settings.BINARY_STAPLER, "sel"]
        + handle_args
        + page_args
        + [dst]
    )
def reorder_pages(
    src, dst, new_order
):
    """
    Run a stapler 'sel' command selecting the pages of src in a new
    order; dst is passed as the last argument of the command.

    new_order is a list of following format:
        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]
    Example above means that in current document of 4 pages,
    first page was swapped with second one.

    page_num = older page order
    page_order = current page order
    So in human language, each hash is read:
        <page_num> now should be <page_order>

    Exceptions raised by cat_ranges_for_reorder for an inconsistent
    new_order are not caught here.
    """
    ordered_pages = cat_ranges_for_reorder(
        page_count=get_pagecount(src),
        new_order=new_order
    )
    run(
        [settings.BINARY_STAPLER, "sel", src]
        + [str(page) for page in ordered_pages]
        + [dst]
    )
def delete_pages(src, dst, page_numbers):
    """
    Run a stapler 'del' command for document src.

    Every item of page_numbers is converted with str() and passed
    after src; dst is passed as the last argument of the command.
    """
    # NOTE(review): the page count is not used anywhere below. The call
    # is kept because it may fail for an unreadable src - confirm, and
    # drop it if it is only a leftover from the pdftk implementation.
    get_pagecount(src)

    run(
        [settings.BINARY_STAPLER, "del", src]
        + [str(page) for page in page_numbers]
        + [dst]
    )

145
test/test_pdftk.py Normal file
View File

@ -0,0 +1,145 @@
import os
import unittest
from unittest import mock
from mglib import pdftk
from mglib.conf import settings
from mglib.runcmd import run
# Absolute path of the directory containing this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Fixture directory; the tests below take berlin.pdf from here.
DATA_DIR = os.path.join(BASE_DIR, "data")
class TestPdfLib(unittest.TestCase):
    """
    Tests for the pdftk helpers in mglib.pdftk.

    mglib.pdftk.run is patched wherever a command would be executed,
    so only the generated command line is checked. The expected
    commands below imply that berlin.pdf has two pages.
    """

    def test_ranges_for_reorder(self):
        actual = pdftk.cat_ranges_for_reorder(4, [
            {"page_order": 1, "page_num": 4},
            {"page_order": 2, "page_num": 3},
            {"page_order": 3, "page_num": 2},
            {"page_order": 4, "page_num": 1}
        ])
        self.assertEqual([4, 3, 2, 1], actual)

        # fewer entries than pages
        with self.assertRaises(ValueError):
            pdftk.cat_ranges_for_reorder(2, [])
        # right amount of entries, but positions 1 and 2 are missing
        with self.assertRaises(KeyError):
            pdftk.cat_ranges_for_reorder(2, [
                {"page_order": 3, "page_num": 4},
                {"page_order": 5, "page_num": 6}
            ])

    def test_delete_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.delete_pages(input_file, output_file, [1])

        # deleting page 1 means keeping ("cat") page 2 only
        run_func.assert_called_with([
            settings.BINARY_PDFTK, input_file,
            "cat", "2",
            "output", output_file
        ])

    def test_cat_ranges_for_delete(self):
        page_count = 22
        all_pages = list(range(1, page_count + 1))

        for deleted in ([21], [1], [1, 7, 10]):
            with self.subTest(deleted=deleted):
                expected = [
                    page for page in all_pages if page not in deleted
                ]
                actual = pdftk.cat_ranges_for_delete(page_count, deleted)
                self.assertEqual(actual, expected)

        # page numbers must be ints, not strings
        with self.assertRaises(ValueError):
            pdftk.cat_ranges_for_delete(page_count, ["1"])

    def test_split_ranges(self):
        page_count = 9

        with self.assertRaises(ValueError):
            pdftk.split_ranges(9, after="a", before=False)
        # a bool is not accepted as a page number
        with self.assertRaises(ValueError):
            pdftk.split_ranges(9, after=False, before=True)

        actual1, actual2 = pdftk.split_ranges(page_count, 1, False)
        self.assertEqual(actual1, [1])
        self.assertEqual(actual2, [2, 3, 4, 5, 6, 7, 8, 9])

        # "before page 2" is the same split as "after page 1"
        actual1, actual2 = pdftk.split_ranges(page_count, False, 2)
        self.assertEqual(actual1, [1])
        self.assertEqual(actual2, [2, 3, 4, 5, 6, 7, 8, 9])

        actual1, actual2 = pdftk.split_ranges(page_count)
        self.assertEqual(actual1, list(range(1, page_count + 1)))
        self.assertEqual(actual2, [])

    def test_reorder_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
        new_order = [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
        ]

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.reorder_pages(input_file, output_file, new_order)

        run_func.assert_called_with([
            settings.BINARY_PDFTK, input_file,
            "cat", "2", "1",
            "output", output_file
        ])

    def test_paste_pages_into_existing_doc(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # nothing to paste: the document is copied page by page
        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file, output_file, datalist
            )
        run_func.assert_called_with([
            settings.BINARY_PDFTK,
            "A=" + input_file,
            "cat", "A1", "A2",
            "output", output_file
        ])

        # "34" is expected to yield pages 3 and 4 of handle B,
        # pasted after page 1 of the existing document
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file, output_file, datalist, 1
            )
        run_func.assert_called_with([
            settings.BINARY_PDFTK,
            "A=" + input_file, "B=" + input_file,
            "cat", "A1", "B3", "B4", "A2",
            "output", output_file
        ])

    def test_paste_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # dst_doc_is_new=False: paste into the existing document
        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist, False)
        run_func.assert_called_with([
            settings.BINARY_PDFTK,
            "A=" + input_file,
            "cat", "A1", "A2",
            "output", output_file
        ])

        # default dst_doc_is_new=True: only the listed pages are used
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist)
        run_func.assert_called_with([
            settings.BINARY_PDFTK,
            "A=" + input_file,
            "cat", "A3", "A4",
            "output", output_file
        ])

126
test/test_stapler.py Normal file
View File

@ -0,0 +1,126 @@
import os
import unittest
from unittest import mock
from mglib import stapler
from mglib.conf import settings
from mglib.runcmd import run
# Absolute path of the directory containing this test module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Fixture directory; the tests below take berlin.pdf from here.
DATA_DIR = os.path.join(BASE_DIR, "data")
class TestPdfLib(unittest.TestCase):
    """
    Tests for the stapler helpers in mglib.stapler.

    mglib.stapler.run is patched wherever a command would be executed,
    so only the generated command line is checked. The expected
    commands below imply that berlin.pdf has two pages.
    """

    def test_ranges_for_reorder(self):
        actual = stapler.cat_ranges_for_reorder(4, [
            {"page_order": 1, "page_num": 4},
            {"page_order": 2, "page_num": 3},
            {"page_order": 3, "page_num": 2},
            {"page_order": 4, "page_num": 1}
        ])
        self.assertEqual([4, 3, 2, 1], actual)

        # fewer entries than pages
        with self.assertRaises(ValueError):
            stapler.cat_ranges_for_reorder(2, [])
        # right amount of entries, but positions 1 and 2 are missing
        with self.assertRaises(KeyError):
            stapler.cat_ranges_for_reorder(2, [
                {"page_order": 3, "page_num": 4},
                {"page_order": 5, "page_num": 6}
            ])

    def test_delete_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        with mock.patch("mglib.stapler.run") as run_func:
            stapler.delete_pages(input_file, output_file, [1])

        run_func.assert_called_once_with(
            [settings.BINARY_STAPLER, "del", input_file, "1", output_file]
        )

    def test_split_ranges(self):
        page_count = 9

        with self.assertRaises(ValueError):
            stapler.split_ranges(9, after="a", before=False)
        # a bool is not accepted as a page number
        with self.assertRaises(ValueError):
            stapler.split_ranges(9, after=False, before=True)

        actual1, actual2 = stapler.split_ranges(page_count, 1, False)
        self.assertEqual(actual1, [1])
        self.assertEqual(actual2, [2, 3, 4, 5, 6, 7, 8, 9])

        # "before page 2" is the same split as "after page 1"
        actual1, actual2 = stapler.split_ranges(page_count, False, 2)
        self.assertEqual(actual1, [1])
        self.assertEqual(actual2, [2, 3, 4, 5, 6, 7, 8, 9])

        actual1, actual2 = stapler.split_ranges(page_count)
        self.assertEqual(actual1, list(range(1, page_count + 1)))
        self.assertEqual(actual2, [])

    def test_reorder_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
        new_order = [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
        ]

        with mock.patch("mglib.stapler.run") as run_func:
            stapler.reorder_pages(input_file, output_file, new_order)

        run_func.assert_called_once_with([
            settings.BINARY_STAPLER, "sel", input_file,
            "2", "1",
            output_file
        ])

    def test_paste_pages_into_existing_doc(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # nothing to paste: the document is copied page by page
        datalist = []
        with mock.patch("mglib.stapler.run") as run_func:
            stapler.paste_pages_into_existing_doc(
                input_file, output_file, datalist
            )
        run_func.assert_called_once_with([
            settings.BINARY_STAPLER, "sel",
            "A=" + input_file,
            "A1", "A2",
            output_file
        ])

        # page_nums is iterated item by item, so the string "34"
        # selects pages 3 and 4 of handle B, pasted after page 1
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.stapler.run") as run_func:
            stapler.paste_pages_into_existing_doc(
                input_file, output_file, datalist, 1
            )
        run_func.assert_called_once_with([
            settings.BINARY_STAPLER, "sel",
            "A=" + input_file, "B=" + input_file,
            "A1", "B3", "B4", "A2",
            output_file
        ])

    def test_paste_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # dst_doc_is_new=False: paste into the existing document
        datalist = []
        with mock.patch("mglib.stapler.run") as run_func:
            stapler.paste_pages(input_file, output_file, datalist, False)
        run_func.assert_called_once_with([
            settings.BINARY_STAPLER, "sel",
            "A=" + input_file,
            "A1", "A2",
            output_file
        ])

        # default dst_doc_is_new=True: only the listed pages are used
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.stapler.run") as run_func:
            stapler.paste_pages(input_file, output_file, datalist)
        run_func.assert_called_once_with([
            settings.BINARY_STAPLER, "sel",
            "A=" + input_file,
            "A3", "A4",
            output_file
        ])