diff --git a/mglib/conf/default_settings.py b/mglib/conf/default_settings.py
index 130b981..fd0e970 100644
--- a/mglib/conf/default_settings.py
+++ b/mglib/conf/default_settings.py
@@ -26,3 +26,9 @@ BINARY_OCR = "/usr/bin/tesseract"
 # Provided by pdftk package
 # Used to reorder, cut/paste, delete pages withing PDF document
 BINARY_PDFTK = "/usr/bin/pdftk"
+
+# Provided by stapler
+# Used to edit PDF documents
+# NOTE(review): "~" is only expanded by a shell; confirm mglib.runcmd.run
+# expands it (e.g. os.path.expanduser) or switch to an absolute path.
+BINARY_STAPLER = "~/.local/bin/stapler"
diff --git a/mglib/stapler.py b/mglib/stapler.py
new file mode 100644
index 0000000..76b3363
--- /dev/null
+++ b/mglib/stapler.py
@@ -0,0 +1,272 @@
+import logging
+
+from mglib.runcmd import run
+from mglib.pdfinfo import get_pagecount
+
+from .conf import settings
+
+logger = logging.getLogger(__name__)
+
+#
+# Utilities around stapler command line tool
+#
+# https://github.com/hellerbarde/stapler
+#
+
+
+def cat_ranges_for_reorder(page_count, new_order):
+    """
+    Returns a list of integers. Each number in the list
+    is correctly positioned (newly ordered) page.
+
+    new_order is a list of dicts with 'page_num' (old position) and
+    'page_order' (new position) keys; both values are passed to int().
+
+    Examples:
+
+    If in document with 4 pages first and second pages were
+    swapped, then returned list will be:
+
+        [2, 1, 3, 4]
+
+    If first page was swapped with last one (also 4 pages document)
+    result list will look like:
+
+        [4, 2, 3, 1]
+
+    Raises ValueError if len(new_order) differs from page_count and
+    KeyError if a position in 1..page_count has no 'page_order' entry.
+    """
+    if len(new_order) != page_count:
+        raise ValueError("Not enough pages specified")
+
+    # key = page_order (new position)
+    # value = page_num (old position)
+    page_map = {
+        int(item['page_order']): int(item['page_num'])
+        for item in new_order
+    }
+
+    return [page_map[number] for number in range(1, page_count + 1)]
+
+
+def split_ranges(total, after=False, before=False):
+    """
+    Given a range 1, 2, ..., total (page numbers of a doc).
+    Split it in two lists.
+    Example:
+    Input: total = 9, after=1, before=False
+    Output: list1 = [1]; list2 = [2, 3, 4, ..., 9].
+
+    Input: total = 9; after=False, before=1
+    Output: list1 = [], list2 = [1, 2, 3, 4, ..., 9]
+
+    Input: total = 5; after=4; before=False
+    Output: list1 = [1, 2, 3, 4] list2 = [5]
+
+    Input: total = 5; after=False; before=False;
+    Output: list1 = [1, 2, 3, 4, 5], list2 = []
+    (it means, by default, all pages are inserted at the end of the doc)
+
+    Input: total = 5; after=4; before=2
+    Output: list1 = [1], list2 = [2, 3, 4, 5]
+    (when both are given, 'before' has priority over 'after')
+
+    Falsy values (False, 0, None) mean "not given". A given value must
+    be exactly an int (bool is rejected), otherwise ValueError is raised.
+    """
+    if before:
+        # exact type check on purpose: isinstance() would accept True
+        if type(before) is not int:
+            raise ValueError(
+                "argument 'before' is supposed to be an int"
+            )
+        return list(range(1, before)), list(range(before, total + 1))
+
+    if after:
+        if type(after) is not int:
+            raise ValueError(
+                "argument 'after' is supposed to be an int"
+            )
+        return list(range(1, after + 1)), list(range(after + 1, total + 1))
+
+    return list(range(1, total + 1)), []
+
+
+def _assign_letters(letters, data_list):
+    """
+    Assigns one handle letter to each entry of data_list, in order.
+
+    Returns two lists of stapler 'sel' arguments:
+    ["<letter>=<src>", ...] and ["<letter><page>", ...].
+    Raises IndexError if data_list has more entries than letters.
+    """
+    doc_map = []
+    pages = []
+    for idx, item in enumerate(data_list):
+        letter = letters[idx]
+        doc_map.append(f"{letter}={item['src']}")
+        pages.extend(f"{letter}{p}" for p in item['page_nums'])
+
+    return doc_map, pages
+
+
+def paste_pages_into_existing_doc(
+    src,
+    dst,
+    data_list,
+    after_page_number=False,
+    before_page_number=False
+):
+    """
+    Builds and runs a stapler 'sel' command which writes to dst the pages
+    of src with pages picked from data_list documents inserted in between.
+
+    data_list has the same format as in paste_pages().
+    The insertion point is computed by split_ranges(): before
+    before_page_number, or after after_page_number, or - when neither
+    is given - at the end of the document.
+    """
+    page_count = get_pagecount(src)
+    pages_before, pages_after = split_ranges(
+        total=page_count,
+        after=after_page_number,
+        before=before_page_number
+    )
+    # notice missing A
+    # Letter A is assigned to the existing document (src), i.e. to
+    # pages from pages_before and pages_after
+    doc_map, new_pages = _assign_letters(
+        "BCDEFGHIJKLMNOPQRSTUVWXYZ", data_list
+    )
+
+    cmd = [settings.BINARY_STAPLER, "sel"]
+    # add A=doc1_path, B=doc2_path
+    cmd.append(f"A={src}")
+    cmd.extend(doc_map)
+    # existing doc pages (may be empty)
+    cmd.extend(f"A{p}" for p in pages_before)
+    # newly inserted pages
+    cmd.extend(new_pages)
+    # existing doc pages (may be empty)
+    cmd.extend(f"A{p}" for p in pages_after)
+    cmd.append(dst)
+
+    run(cmd)
+
+
+def paste_pages(
+    src,
+    dst,
+    data_list,
+    dst_doc_is_new=True,
+    after_page_number=False,
+    before_page_number=False
+):
+    """
+    Pastes pages picked from one or more documents into dst.
+
+    data_list is a list of following format:
+    [
+        {
+            'src': path_to_doc,
+            'page_nums': [page_num_1, page_num_2, page_num_3]
+        },
+        {
+            'src': path_to_doc,
+            'page_nums': [page_num_1, page_num_2, page_num_3]
+        },
+        ...
+    ]
+    i.e. a list of documents where pages
+    (with numbers page_num_1...) will be pasted from.
+
+    dst_doc_is_new = True well.. destination document was just created,
+    we are pasting here cut pages into some folder as new document.
+
+    In this case src, after_page_number and before_page_number
+    arguments are ignored.
+
+    dst_doc_is_new = False, pasting pages into existing document src.
+    If before_page_number is given - paste pages before page number
+        'before_page_number'
+    If after_page_number is given - paste pages after page number
+        'after_page_number'
+
+    before_page_number argument has priority over after_page_number.
+
+    If neither before_page_number nor after_page_number is given - just
+    paste pages at the end of the document.
+    """
+    if not dst_doc_is_new:
+        return paste_pages_into_existing_doc(
+            src=src,
+            dst=dst,
+            data_list=data_list,
+            after_page_number=after_page_number,
+            before_page_number=before_page_number
+        )
+
+    doc_map, pages = _assign_letters(
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ", data_list
+    )
+
+    cmd = [settings.BINARY_STAPLER, "sel"]
+    # add A=doc1_path, B=doc2_path
+    cmd.extend(doc_map)
+    cmd.extend(pages)
+    cmd.append(dst)
+
+    run(cmd)
+
+
+def reorder_pages(
+    src, dst, new_order
+):
+    """
+    new_order is a list of following format:
+
+        [
+            {'page_num': 2, 'page_order': 1},
+            {'page_num': 1, 'page_order': 2},
+            {'page_num': 3, 'page_order': 3},
+            {'page_num': 4, 'page_order': 4},
+        ]
+    Example above means that in current document of 4 pages,
+    first page was swapped with second one.
+    page_num    = older page order
+    page_order  = current page order
+    So in human language, each hash is read:
+        <page_num> now should be <page_order>
+
+    Raises ValueError if new_order does not have one entry per page of src.
+    """
+    page_count = get_pagecount(src)
+
+    cat_ranges = cat_ranges_for_reorder(
+        page_count=page_count,
+        new_order=new_order
+    )
+
+    cmd = [settings.BINARY_STAPLER, "sel", src]
+    cmd.extend(str(page) for page in cat_ranges)
+    cmd.append(dst)
+
+    run(cmd)
+
+
+def delete_pages(src, dst, page_numbers):
+    """
+    Runs stapler 'del' to write to dst a copy of src without the pages
+    listed in page_numbers.
+    """
+    # Return value is unused. NOTE(review): presumably kept so that an
+    # unreadable src fails before stapler is invoked - confirm.
+    get_pagecount(src)
+
+    cmd = [settings.BINARY_STAPLER, "del", src]
+    cmd.extend(str(page) for page in page_numbers)
+    cmd.append(dst)
+
+    run(cmd)
+
diff --git a/test/test_stapler.py b/test/test_stapler.py
new file mode 100644
index 0000000..eb46702
--- /dev/null
+++ b/test/test_stapler.py
@@ -0,0 +1,131 @@
+import os
+import unittest
+from unittest import mock
+from mglib import stapler
+from mglib.conf import settings
+from mglib.runcmd import run
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+DATA_DIR = os.path.join(BASE_DIR, "data")
+
+
+class TestPdfLib(unittest.TestCase):
+    def test_ranges_for_reorder(self):
+        actual = stapler.cat_ranges_for_reorder(4, [
+            {"page_order": 1, "page_num": 4},
+            {"page_order": 2, "page_num": 3},
+            {"page_order": 3, "page_num": 2},
+            {"page_order": 4, "page_num": 1}
+        ])
+        expected = [4,3,2,1]
+        assert expected == actual
+
+        self.assertRaises(ValueError, stapler.cat_ranges_for_reorder, 2, [])
+        self.assertRaises(KeyError, stapler.cat_ranges_for_reorder, 2, [
+            {"page_order": 3, "page_num": 4},
+            {"page_order": 5, "page_num": 6}
+        ])
+
+    def test_delete_pages(self):
+        input_file = os.path.join(DATA_DIR, "berlin.pdf")
+        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.delete_pages(input_file, output_file, [1])
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "del", input_file, "1", output_file]
+            )
+
+
+    def test_split_ranges(self):
+        page_count = 9
+        page_numbers = list(range(1, 10))
+
+        self.assertRaises(ValueError, stapler.split_ranges, 9, after="a", before=False)
+        self.assertRaises(ValueError, stapler.split_ranges, 9, after=False, before=True)
+
+        actual1, actual2 = stapler.split_ranges(page_count, 1, False)
+        expected1 = [1]
+        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
+        assert actual1 == expected1
+        assert actual2 == expected2
+
+        actual1, actual2 = stapler.split_ranges(page_count, False, 2)
+        expected1 = [1]
+        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
+        assert actual1 == expected1
+        assert actual2 == expected2
+
+        # 'before' takes priority when both arguments are given
+        actual1, actual2 = stapler.split_ranges(page_count, 5, 2)
+        assert actual1 == [1]
+        assert actual2 == [2, 3, 4, 5, 6, 7, 8, 9]
+
+        actual1, actual2 = stapler.split_ranges(page_count)
+        expected1 = list(range(1, page_count + 1))
+        expected2 = []
+        assert actual1 == expected1
+        assert actual2 == expected2
+
+    def test_reorder_pages(self):
+        input_file = os.path.join(DATA_DIR, "berlin.pdf")
+        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
+        new_order = [
+            {'page_num': 2, 'page_order': 1},
+            {'page_num': 1, 'page_order': 2},
+        ]
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.reorder_pages(input_file, output_file, new_order)
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "sel", input_file, "2", "1", output_file]
+            )
+
+    def test_paste_pages_into_existing_doc(self):
+        input_file = os.path.join(DATA_DIR, "berlin.pdf")
+        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
+        datalist = []
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.paste_pages_into_existing_doc(input_file, output_file, datalist)
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
+            )
+
+        datalist = [{"src": input_file, "page_nums": "34"}]
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.paste_pages_into_existing_doc(input_file, output_file, datalist, 1)
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "sel", "A=" + input_file, "B=" + input_file, "A1", "B3",
+                 "B4", "A2", output_file]
+            )
+
+
+    def test_paste_pages(self):
+        input_file = os.path.join(DATA_DIR, "berlin.pdf")
+        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
+        datalist = []
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.paste_pages(input_file, output_file, datalist, False)
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
+            )
+
+        datalist = [{"src": input_file, "page_nums": "34"}]
+
+        with mock.patch("mglib.stapler.run") as run_func:
+            stapler.paste_pages(input_file, output_file, datalist)
+            run_func.assert_called()
+            run_func.assert_called_with(
+                [settings.BINARY_STAPLER, "sel", "A=" + input_file, "A3", "A4",
+                 output_file]
+            )
+