From fa90e6b0a65cb4ca1a24acafcc1e11119d056314 Mon Sep 17 00:00:00 2001 From: Eugen Ciur Date: Tue, 1 Dec 2020 07:44:57 +0100 Subject: [PATCH] removed pdftk dependency --- mglib/conf/default_settings.py | 4 - mglib/pdftk.py | 357 --------------------------------- mglib/storage.py | 8 +- test/test_pdftk.py | 145 ------------- test/test_stapler.py | 83 +++++--- 5 files changed, 63 insertions(+), 534 deletions(-) delete mode 100644 mglib/pdftk.py delete mode 100644 test/test_pdftk.py diff --git a/mglib/conf/default_settings.py b/mglib/conf/default_settings.py index fd0e970..63f318f 100644 --- a/mglib/conf/default_settings.py +++ b/mglib/conf/default_settings.py @@ -23,10 +23,6 @@ BINARY_IDENTIFY = "/usr/bin/identify" # Used to extract text from images/PDF files. BINARY_OCR = "/usr/bin/tesseract" -# Provided by pdftk package -# Used to reorder, cut/paste, delete pages withing PDF document -BINARY_PDFTK = "/usr/bin/pdftk" - # Provided by stapler # Used to edit PDF documents BINARY_STAPLER = "~/.local/bin/stapler" diff --git a/mglib/pdftk.py b/mglib/pdftk.py deleted file mode 100644 index 1463b68..0000000 --- a/mglib/pdftk.py +++ /dev/null @@ -1,357 +0,0 @@ -import logging - -from mglib.runcmd import run -from mglib.pdfinfo import get_pagecount - -from .conf import settings - -logger = logging.getLogger(__name__) - -# -# Utilities around pdftk command line tool -# -# https://www.pdflabs.com/docs/pdftk-man-page/ -# - - -def cat_ranges_for_reorder(page_count, new_order): - """ - Returns a list of integers. Each number in the list - is correctly positioned (newly ordered) page. - - Examples: - - If in document with 4 pages first and second pages were - swapped, then returned list will be: - - [2, 1, 3, 4] - - If first page was swapped with last one (also 4 paegs document) - result list will look like: - - [4, 2, 3, 1] - """ - if len(new_order) != page_count: - raise ValueError("Not enough pages specified") - results = [] - # key = page_num - # value = page_order - page_map = {} - - for item in new_order: - k = int(item['page_order']) - v = int(item['page_num']) - page_map[k] = v - - for number in range(1, page_count + 1): - results.append( - page_map[number] - ) - - return results - - -def cat_ranges_for_delete(page_count, page_numbers): - """ - Returns a list of integers. Each number in the list - is the number of page which will 'stay' in document. - In other words, it returns a list with not deleted pages. - - Examples: - - - If document has 22 pages (page_count=22) and page number 21 is to be - deleted (i.e page_numbers = [21]) will return - - [1, 2, 3, 4, ..., 19, 20, 22] - - If page number 1 is to be deleted: - - [2, 3, 4, ..., 22] list will be returned. - - If page number is 22 is to be deleted: - - [1, 2, 3,..., 21] will be returned. - - With page_numbers=[1, 7, 10] and page_count=22 result - will be: - - (2, 3, 4, 5, 6, 8, 9, 11, 12 , 13, ..., 22) - - - page_numbers is a list of page numbers (starting with 1). - """ - results = [] - - for check in page_numbers: - if not isinstance(check, int): - err_msg = "page_numbers must be a list of ints" - raise ValueError(err_msg) - - for number in range(1, page_count + 1): - if number not in page_numbers: - results.append(number) - - return results - - -def split_ranges(total, after=False, before=False): - """ - Given a range 1, 2, ..., total (page numbers of a doc). - Split it in two lists. - Example: - Input: total = 9, after=1, before=False - Output: list1 = [1]; list2 = [2, 3, 4, ..., 9]. - - Input: total = 9; after=False, before=1 - Output: list1 = [], list2 = [1, 2, 3, 4, ..., 9] - - Input: total = 5; after=4; before=False - Output: list1 = [1, 2, 3, 4] list2 = [5] - - Input: total = 5; after=False; before=False; - Output: list1 = [1, 2, 3, 4, 5], list2 = [] - (it means, by default, all pages are inserted at the end of the doc) - """ - if after and not before: - if not type(after) == int: - raise ValueError( - "argument 'after' is supposed to be an int" - ) - list1 = list(range(1, after + 1)) - list2 = list(range(after + 1, total + 1)) - return list1, list2 - - if not after and before: - if not type(before) == int: - raise ValueError( - "argument 'before' is supposed to be an int" - ) - list1 = list(range(1, before)) - list2 = list(range(before, total + 1)) - return list1, list2 - - list1 = list(range(1, total + 1)) - list2 = [] - - return list1, list2 - - -def paste_pages_into_existing_doc( - src, - dst, - data_list, - after_page_number=False, - before_page_number=False -): - page_count = get_pagecount(src) - list1, list2 = split_ranges( - total=page_count, - after=after_page_number, - before=before_page_number - ) - # notice missing A - # Letter A is assignent to current folder and - # pages from list1 and list2 - letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ" - letters_2_doc_map = [] - letters_pages = [] - letters_pages_before = [] - letters_pages_after = [] - - letters_2_doc_map.append( - f"A={src}" - ) - - for idx in range(0, len(data_list)): - letter = letters[idx] - src = data_list[idx]['src'] - pages = data_list[idx]['page_nums'] - - letters_2_doc_map.append( - f"{letter}={src}" - ) - for p in pages: - letters_pages.append( - f"{letter}{p}" - ) - - for p in list1: - letters_pages_before.append( - f"A{p}" - ) - - for p in list2: - letters_pages_after.append( - f"A{p}" - ) - - cmd = [ - settings.BINARY_PDFTK, - ] - # add A=doc1_path, B=doc2_path - cmd.extend(letters_2_doc_map) - - cmd.append("cat") - - # existing doc pages (may be empty) - cmd.extend(letters_pages_before) - # newly inserted pages - cmd.extend(letters_pages) - # existing doc pages (may be empty) - cmd.extend(letters_pages_after) - - cmd.append("output") - - cmd.append(dst) - - run(cmd) - - -def paste_pages( - src, - dst, - data_list, - dst_doc_is_new=True, - after_page_number=False, - before_page_number=False -): - """ - dest_doc_ep = endpoint of the doc where newly created - file will be placed. - src_doc_ep_list is a list of following format: - [ - { - 'doc_ep': doc_ep, - 'page_nums': [page_num_1, page_num_2, page_num_3] - }, - { - 'doc_ep': doc_ep, - 'page_nums': [page_num_1, page_num_2, page_num_3] - }, - ... - ] - src_doc_ep_list is a list of documents where pages - (with numbers page_num_1...) will be paste from. - - dst_doc_is_new = True well.. destination document was just created, - we are pasting here cutted pages into some folder as new document. - - In this case 'after' and 'before' arguments are ignored - - dst_doc_is_new = False, pasting pages into exiting document. - If before_page_number > 0 - paste pages before page number - 'before_page_number' - If after_page_number > 0 - paste pages after page number - 'after_page_number' - - before_page_number argument has priority over after_page_number. - - If both before_page_number and after_page_number are < 0 - just paste - pages at the end of the document. - """ - if not dst_doc_is_new: - return paste_pages_into_existing_doc( - src=src, - dst=dst, - data_list=data_list, - after_page_number=after_page_number, - before_page_number=before_page_number - ) - letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - letters_2_doc_map = [] - letters_pages = [] - - for idx in range(0, len(data_list)): - letter = letters[idx] - src = data_list[idx]['src'] - pages = data_list[idx]['page_nums'] - - letters_2_doc_map.append( - f"{letter}={src}" - ) - for p in pages: - letters_pages.append( - f"{letter}{p}" - ) - - cmd = [ - settings.BINARY_PDFTK, - ] - # add A=doc1_path, B=doc2_path - cmd.extend(letters_2_doc_map) - - cmd.append("cat") - - cmd.extend(letters_pages) - - cmd.append("output") - - cmd.append(dst) - - run(cmd) - - -def reorder_pages( - src, dst, new_order -): - """ - new_order is a list of following format: - - [ - {'page_num': 2, page_order: 1}, - {'page_num': 1, page_order: 2}, - {'page_num': 3, page_order: 3}, - {'page_num': 4, page_order: 4}, - ] - Example above means that in current document of 4 pages, - first page was swapped with second one. - page_num = older page order - page_order = current page order - So in human language, each hash is read: - now should be - """ - page_count = get_pagecount(src) - - cat_ranges = cat_ranges_for_reorder( - page_count=page_count, - new_order=new_order - ) - - cmd = [ - settings.BINARY_PDFTK, - src, - "cat" - ] - for page in cat_ranges: - cmd.append( - str(page) - ) - - cmd.append("output") - cmd.append(dst) - run(cmd) - - -def delete_pages(src, dst, page_numbers): - page_count = get_pagecount(src) - - cat_ranges = cat_ranges_for_delete( - page_count, - page_numbers - ) - - cmd = [ - settings.BINARY_PDFTK, - src, - "cat" - ] - for page in cat_ranges: - cmd.append( - str(page) - ) - - cmd.append("output") - cmd.append(dst) - - run(cmd) diff --git a/mglib/storage.py b/mglib/storage.py index 24689d7..14ec3a1 100644 --- a/mglib/storage.py +++ b/mglib/storage.py @@ -4,7 +4,7 @@ import shutil from os import listdir from os.path import isdir, join -from mglib import pdftk +from mglib import stapler from mglib.path import DocumentPath, PagePath from mglib.step import Steps from mglib.utils import get_assigns_after_delete, safe_to_delete @@ -209,7 +209,7 @@ class Storage: self.abspath(dst_doc_path) ) - pdftk.reorder_pages( + stapler.reorder_pages( src=self.abspath(src_doc_path), dst=self.abspath(dst_doc_path), new_order=new_order @@ -269,7 +269,7 @@ class Storage: self.make_sure_path_exists( self.abspath(dst_doc_path) ) - pdftk.delete_pages( + stapler.delete_pages( self.abspath(src_doc_path), self.abspath(dst_doc_path), page_numbers @@ -332,7 +332,7 @@ class Storage: self.abspath(next_ver_dp) ) - pdftk.paste_pages( + stapler.paste_pages( src=self.abspath(dest_doc_path), dst=self.abspath(next_ver_dp), data_list=data_list, diff --git a/test/test_pdftk.py b/test/test_pdftk.py deleted file mode 100644 index cfb5462..0000000 --- a/test/test_pdftk.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -import unittest -from unittest import mock -from mglib import pdftk -from mglib.conf import settings -from mglib.runcmd import run - -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) - -DATA_DIR = os.path.join(BASE_DIR, "data") - - -class TestPdfLib(unittest.TestCase): - def test_ranges_for_reorder(self): - actual = pdftk.cat_ranges_for_reorder(4, [ - {"page_order": 1, "page_num": 4}, - {"page_order": 2, "page_num": 3}, - {"page_order": 3, "page_num": 2}, - {"page_order": 4, "page_num": 1} - ]) - expected = [4,3,2,1] - assert expected == actual - - self.assertRaises(ValueError, pdftk.cat_ranges_for_reorder, 2, []) - self.assertRaises(KeyError, pdftk.cat_ranges_for_reorder, 2, [ - {"page_order": 3, "page_num": 4}, - {"page_order": 5, "page_num": 6} - ]) - - def test_delete_pages(self): - input_file = os.path.join(DATA_DIR, "berlin.pdf") - output_file = os.path.join(DATA_DIR, "berlin2.pdf") - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.delete_pages(input_file, output_file, [1]) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, input_file, "cat", "2", "output", output_file] - ) - - def test_cat_ranges_for_delete(self): - page_count = 22 - page_numbers = range(1, 23) - - actual = pdftk.cat_ranges_for_delete(page_count, [21]) - expected = list(page_numbers) - expected.remove(21) - assert actual == expected - - actual = pdftk.cat_ranges_for_delete(page_count, [1]) - expected = list(page_numbers) - expected.remove(1) - assert actual == expected - - actual = pdftk.cat_ranges_for_delete(page_count, [1, 7, 10]) - expected = list(page_numbers) - expected.remove(1) - expected.remove(7) - expected.remove(10) - assert actual == expected - - self.assertRaises(ValueError, pdftk.cat_ranges_for_delete, page_count, ["1"]) - - def test_split_ranges(self): - page_count = 9 - page_numbers = list(range(1, 10)) - - self.assertRaises(ValueError, pdftk.split_ranges, 9, after="a", before=False) - self.assertRaises(ValueError, pdftk.split_ranges, 9, after=False, before=True) - - actual1, actual2 = pdftk.split_ranges(page_count, 1, False) - expected1 = [1] - expected2 = [2, 3, 4, 5, 6, 7, 8, 9] - assert actual1 == expected1 - assert actual2 == expected2 - - actual1, actual2 = pdftk.split_ranges(page_count, False, 2) - expected1 = [1] - expected2 = [2, 3, 4, 5, 6, 7, 8, 9] - assert actual1 == expected1 - assert actual2 == expected2 - - actual1, actual2 = pdftk.split_ranges(page_count) - expected1 = list(range(1, page_count + 1)) - expected2 = [] - assert actual1 == expected1 - assert actual2 == expected2 - - def test_reorder_pages(self): - input_file = os.path.join(DATA_DIR, "berlin.pdf") - output_file = os.path.join(DATA_DIR, "berlin2.pdf") - new_order = [ - {'page_num': 2, 'page_order': 1}, - {'page_num': 1, 'page_order': 2}, - ] - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.reorder_pages(input_file, output_file, new_order) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, input_file, "cat", "2", "1", "output", output_file] - ) - - def test_paste_pages_into_existing_doc(self): - input_file = os.path.join(DATA_DIR, "berlin.pdf") - output_file = os.path.join(DATA_DIR, "berlin2.pdf") - datalist = [] - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.paste_pages_into_existing_doc(input_file, output_file, datalist) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, "A=" + input_file, "cat", "A1", "A2", "output", output_file] - ) - - datalist = [{"src": input_file, "page_nums": "34"}] - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.paste_pages_into_existing_doc(input_file, output_file, datalist, 1) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, "A=" + input_file, "B=" + input_file, "cat", "A1", "B3", - "B4", "A2", "output", output_file] - ) - def test_paste_pages(self): - input_file = os.path.join(DATA_DIR, "berlin.pdf") - output_file = os.path.join(DATA_DIR, "berlin2.pdf") - datalist = [] - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.paste_pages(input_file, output_file, datalist, False) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, "A=" + input_file, "cat", "A1", "A2", "output", output_file] - ) - - datalist = [{"src": input_file, "page_nums": "34"}] - - with mock.patch("mglib.pdftk.run") as run_func: - pdftk.paste_pages(input_file, output_file, datalist) - run_func.assert_called() - run_func.assert_called_with( - [settings.BINARY_PDFTK, "A=" + input_file, "cat", "A3", "A4", - "output", output_file] - ) diff --git a/test/test_stapler.py b/test/test_stapler.py index eb46702..f87bccc 100644 --- a/test/test_stapler.py +++ b/test/test_stapler.py @@ -3,7 +3,6 @@ import unittest from unittest import mock from mglib import stapler from mglib.conf import settings -from mglib.runcmd import run BASE_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -17,15 +16,15 @@ class TestPdfLib(unittest.TestCase): {"page_order": 2, "page_num": 3}, {"page_order": 3, "page_num": 2}, {"page_order": 4, "page_num": 1} - ]) - expected = [4,3,2,1] + ]) + expected = [4, 3, 2, 1] assert expected == actual self.assertRaises(ValueError, stapler.cat_ranges_for_reorder, 2, []) self.assertRaises(KeyError, stapler.cat_ranges_for_reorder, 2, [ {"page_order": 3, "page_num": 4}, {"page_order": 5, "page_num": 6} - ]) + ]) def test_delete_pages(self): input_file = os.path.join(DATA_DIR, "berlin.pdf") @@ -38,13 +37,22 @@ class TestPdfLib(unittest.TestCase): [settings.BINARY_STAPLER, "del", input_file, "1", output_file] ) - def test_split_ranges(self): page_count = 9 - page_numbers = list(range(1, 10)) - self.assertRaises(ValueError, stapler.split_ranges, 9, after="a", before=False) - self.assertRaises(ValueError, stapler.split_ranges, 9, after=False, before=True) + self.assertRaises( + ValueError, + stapler.split_ranges, + 9, + after="a", + before=False + ) + self.assertRaises( + ValueError, + stapler.split_ranges, + 9, after=False, + before=True + ) actual1, actual2 = stapler.split_ranges(page_count, 1, False) expected1 = [1] @@ -67,54 +75,81 @@ class TestPdfLib(unittest.TestCase): def test_reorder_pages(self): input_file = os.path.join(DATA_DIR, "berlin.pdf") output_file = os.path.join(DATA_DIR, "berlin2.pdf") - new_order = [ - {'page_num': 2, 'page_order': 1}, - {'page_num': 1, 'page_order': 2}, - ] + new_order = [ + {'page_num': 2, 'page_order': 1}, + {'page_num': 1, 'page_order': 2}, + ] with mock.patch("mglib.stapler.run") as run_func: stapler.reorder_pages(input_file, output_file, new_order) run_func.assert_called() run_func.assert_called_with( - [settings.BINARY_STAPLER, "sel", input_file, "2", "1", output_file] + [ + settings.BINARY_STAPLER, + "sel", + input_file, + "2", + "1", + output_file + ] ) def test_paste_pages_into_existing_doc(self): input_file = os.path.join(DATA_DIR, "berlin.pdf") output_file = os.path.join(DATA_DIR, "berlin2.pdf") - datalist = [] + datalist = [] with mock.patch("mglib.stapler.run") as run_func: - stapler.paste_pages_into_existing_doc(input_file, output_file, datalist) + stapler.paste_pages_into_existing_doc( + input_file, output_file, datalist + ) run_func.assert_called() run_func.assert_called_with( - [settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file] + [ + settings.BINARY_STAPLER, + "sel", "A=" + input_file, "A1", "A2", output_file + ] ) - datalist = [{"src": input_file, "page_nums": "34"}] + datalist = [{"src": input_file, "page_nums": "34"}] with mock.patch("mglib.stapler.run") as run_func: - stapler.paste_pages_into_existing_doc(input_file, output_file, datalist, 1) + stapler.paste_pages_into_existing_doc( + input_file, + output_file, + datalist, + 1 + ) run_func.assert_called() run_func.assert_called_with( - [settings.BINARY_STAPLER, "sel", "A=" + input_file, "B=" + input_file, "A1", "B3", - "B4", "A2", output_file] + [ + settings.BINARY_STAPLER, + "sel", "A=" + input_file, + "B=" + input_file, "A1", "B3", + "B4", "A2", output_file + ] ) - def test_paste_pages(self): input_file = os.path.join(DATA_DIR, "berlin.pdf") output_file = os.path.join(DATA_DIR, "berlin2.pdf") - datalist = [] + datalist = [] with mock.patch("mglib.stapler.run") as run_func: stapler.paste_pages(input_file, output_file, datalist, False) run_func.assert_called() run_func.assert_called_with( - [settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file] + [ + settings.BINARY_STAPLER, + "sel", + "A=" + input_file, + "A1", + "A2", + output_file + ] ) - datalist = [{"src": input_file, "page_nums": "34"}] + datalist = [{"src": input_file, "page_nums": "34"}] with mock.patch("mglib.stapler.run") as run_func: stapler.paste_pages(input_file, output_file, datalist)