removed pdftk dependency

master
Eugen Ciur 2020-12-01 07:44:57 +01:00
parent 9e24776ba8
commit fa90e6b0a6
5 changed files with 63 additions and 534 deletions

View File

@ -23,10 +23,6 @@ BINARY_IDENTIFY = "/usr/bin/identify"
# Used to extract text from images/PDF files.
BINARY_OCR = "/usr/bin/tesseract"
# Provided by pdftk package
# Used to reorder, cut/paste, delete pages within a PDF document
BINARY_PDFTK = "/usr/bin/pdftk"
# Provided by stapler
# Used to edit PDF documents
BINARY_STAPLER = "~/.local/bin/stapler"

View File

@ -1,357 +0,0 @@
import logging
from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount
from .conf import settings
logger = logging.getLogger(__name__)
#
# Utilities around pdftk command line tool
#
# https://www.pdflabs.com/docs/pdftk-man-page/
#
def cat_ranges_for_reorder(page_count, new_order):
    """
    Return the source page numbers listed in their new order.

    ``new_order`` is a list of dicts, each with a ``page_num`` key (the
    page's number in the current document) and a ``page_order`` key (the
    position that page should occupy afterwards). Both values are passed
    through ``int()``.

    The result is a list of ``page_count`` integers; element ``i`` is the
    ``page_num`` of the entry whose ``page_order`` equals ``i + 1``.

    Examples:

        If in a document with 4 pages the first and second pages were
        swapped, the returned list will be:

            [2, 1, 3, 4]

        If the first page was swapped with the last one (also a 4 pages
        document) the resulting list will look like:

            [4, 2, 3, 1]

    Raises:
        ValueError: if ``new_order`` does not contain exactly
            ``page_count`` entries (too few *or* too many).
        KeyError: if an entry lacks one of the two keys, or if some
            position in ``1..page_count`` never appears as a
            ``page_order``.
    """
    if len(new_order) != page_count:
        # The count may be off in either direction, so report both numbers
        # instead of claiming that pages are missing.
        raise ValueError(
            f"Expected {page_count} pages in new_order, got {len(new_order)}"
        )

    # key = position in the reordered document (page_order)
    # value = page number in the current document (page_num)
    page_map = {}
    for item in new_order:
        position = int(item['page_order'])
        page_number = int(item['page_num'])
        page_map[position] = page_number

    return [page_map[position] for position in range(1, page_count + 1)]
def cat_ranges_for_delete(page_count, page_numbers):
    """
    Return the list of page numbers which will 'stay' in the document.

    In other words, it returns every page number in ``1..page_count``
    which is not listed in ``page_numbers``, in ascending order.

    ``page_numbers`` is an iterable of page numbers (starting with 1) to
    be deleted.

    Examples:

        If the document has 22 pages (page_count=22) and page number 21 is
        to be deleted (i.e. page_numbers = [21]) it will return

            [1, 2, 3, 4, ..., 19, 20, 22]

        If page number 1 is to be deleted:

            [2, 3, 4, ..., 22]

        If page number 22 is to be deleted:

            [1, 2, 3, ..., 21]

        With page_numbers=[1, 7, 10] and page_count=22 the result will be:

            [2, 3, 4, 5, 6, 8, 9, 11, 12, 13, ..., 22]

    Raises:
        ValueError: if any element of ``page_numbers`` is not an int.
    """
    # Validate and collect in a single pass: this gives O(1) membership
    # tests below and also works when page_numbers is a one-shot iterator.
    to_delete = set()
    for candidate in page_numbers:
        if not isinstance(candidate, int):
            raise ValueError("page_numbers must be a list of ints")
        to_delete.add(candidate)

    return [
        number
        for number in range(1, page_count + 1)
        if number not in to_delete
    ]
def split_ranges(total, after=False, before=False):
    """
    Split the page numbers 1, 2, ..., total of a document in two lists.

    The split point is where new pages are meant to be inserted: the
    first list holds the pages before the insertion point, the second
    list the pages after it.

    Arguments:
        total: number of pages in the document.
        after: split after this page number.
        before: split before this page number. Takes priority over
            ``after`` when both are given.

    A falsy or negative ``after``/``before`` counts as "not specified".
    When neither is specified, all pages go to the first list (i.e. new
    pages are inserted at the end of the document).

    Examples:

        Input: total = 9, after=1, before=False
        Output: list1 = [1]; list2 = [2, 3, 4, ..., 9].

        Input: total = 9; after=False, before=1
        Output: list1 = [], list2 = [1, 2, 3, 4, ..., 9]

        Input: total = 5; after=4; before=False
        Output: list1 = [1, 2, 3, 4] list2 = [5]

        Input: total = 5; after=False; before=False;
        Output: list1 = [1, 2, 3, 4, 5], list2 = []

    Returns:
        A ``(list1, list2)`` tuple of lists of ints.

    Raises:
        ValueError: if the argument that determines the split is truthy
            but not exactly of type ``int`` (``True`` is rejected too).
    """
    def _position(value, name):
        # Falsy values (False, None, 0) mean "argument not specified".
        if not value:
            return None
        # Exact type check on purpose: bool is a subclass of int, and
        # True must not be accepted as page number 1.
        if type(value) is not int:
            raise ValueError(
                f"argument '{name}' is supposed to be an int"
            )
        # A negative page number cannot be a split point; treat it as
        # "not specified" so that pages end up at the end of the document.
        return value if value > 0 else None

    before_page = _position(before, "before")
    if before_page is not None:
        list1 = list(range(1, before_page))
        list2 = list(range(before_page, total + 1))
        return list1, list2

    after_page = _position(after, "after")
    if after_page is not None:
        list1 = list(range(1, after_page + 1))
        list2 = list(range(after_page + 1, total + 1))
        return list1, list2

    return list(range(1, total + 1)), []
def paste_pages_into_existing_doc(
    src,
    dst,
    data_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Run pdftk to write into ``dst`` the pages of ``src`` with pages taken
    from other documents inserted among them.

    Arguments:
        src: path of the existing document (handle ``A`` in the command).
        dst: path passed to pdftk after the ``output`` keyword.
        data_list: list of dicts, each with a ``'src'`` key (path of a
            document to take pages from) and a ``'page_nums'`` key
            (iterable of page numbers of that document).
        after_page_number, before_page_number: forwarded to
            ``split_ranges`` to decide which pages of ``src`` go before
            and which go after the inserted pages.

    The handles for ``data_list`` entries are the letters ``B`` to ``Z``,
    one per entry, in order.
    """
    pages_before, pages_after = split_ranges(
        total=get_pagecount(src),
        after=after_page_number,
        before=before_page_number
    )

    # Notice the missing A: letter A is assigned to the existing document
    # (src); its pages are the ones in pages_before and pages_after.
    handles = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    handle_args = [f"A={src}"]
    inserted_pages = []
    for idx, entry in enumerate(data_list):
        handle = handles[idx]
        entry_path = entry['src']
        entry_pages = entry['page_nums']
        handle_args.append(f"{handle}={entry_path}")
        inserted_pages.extend(f"{handle}{p}" for p in entry_pages)

    cmd = [
        settings.BINARY_PDFTK,
        # A=doc1_path, B=doc2_path, ...
        *handle_args,
        "cat",
        # existing doc pages (may be empty)
        *[f"A{p}" for p in pages_before],
        # newly inserted pages
        *inserted_pages,
        # existing doc pages (may be empty)
        *[f"A{p}" for p in pages_after],
        "output",
        dst,
    ]
    run(cmd)
def paste_pages(
    src,
    dst,
    data_list,
    dst_doc_is_new=True,
    after_page_number=False,
    before_page_number=False
):
    """
    Run pdftk to paste pages taken from one or more documents into ``dst``.

    ``data_list`` is a list of the following format:

        [
            {
                'src': path_to_document,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            {
                'src': path_to_document,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]

    i.e. a list of documents where the pages (with numbers
    page_num_1, ...) will be pasted from.

    dst_doc_is_new = True: the destination is a brand new document built
    only from the pages listed in ``data_list``. In this case ``src``,
    ``after_page_number`` and ``before_page_number`` are not used.

    dst_doc_is_new = False: pages are pasted into an existing document.
    All arguments are forwarded unchanged to
    ``paste_pages_into_existing_doc``, which decides where the pages go
    based on ``after_page_number`` and ``before_page_number``.
    """
    if not dst_doc_is_new:
        return paste_pages_into_existing_doc(
            src=src,
            dst=dst,
            data_list=data_list,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

    handles = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    handle_args = []
    page_args = []
    for idx, entry in enumerate(data_list):
        handle = handles[idx]
        entry_path = entry['src']
        entry_pages = entry['page_nums']
        handle_args.append(f"{handle}={entry_path}")
        page_args.extend(f"{handle}{p}" for p in entry_pages)

    run([
        settings.BINARY_PDFTK,
        # A=doc1_path, B=doc2_path, ...
        *handle_args,
        "cat",
        *page_args,
        "output",
        dst,
    ])
def reorder_pages(
    src, dst, new_order
):
    """
    Run pdftk to write into ``dst`` the pages of ``src`` in a new order.

    ``new_order`` is a list of the following format:

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    The example above means that in a document of 4 pages the first page
    was swapped with the second one.

        page_num = older page order
        page_order = current page order

    So in human language, each dict is read:
        <page_num> now should be <page_order>

    The list is turned into a sequence of page numbers by
    ``cat_ranges_for_reorder``; exceptions raised there propagate.
    """
    ordered_pages = cat_ranges_for_reorder(
        page_count=get_pagecount(src),
        new_order=new_order
    )

    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd += [str(page) for page in ordered_pages]
    cmd += ["output", dst]

    run(cmd)
def delete_pages(src, dst, page_numbers):
    """
    Run pdftk to write into ``dst`` a copy of ``src`` without the pages
    listed in ``page_numbers``.

    ``page_numbers`` is a list of page numbers (starting with 1) to drop.
    The pages to keep are computed by ``cat_ranges_for_delete``;
    exceptions raised there propagate.
    """
    remaining_pages = cat_ranges_for_delete(
        get_pagecount(src),
        page_numbers
    )

    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd += [str(page) for page in remaining_pages]
    cmd += ["output", dst]

    run(cmd)

View File

@ -4,7 +4,7 @@ import shutil
from os import listdir
from os.path import isdir, join
from mglib import pdftk
from mglib import stapler
from mglib.path import DocumentPath, PagePath
from mglib.step import Steps
from mglib.utils import get_assigns_after_delete, safe_to_delete
@ -209,7 +209,7 @@ class Storage:
self.abspath(dst_doc_path)
)
pdftk.reorder_pages(
stapler.reorder_pages(
src=self.abspath(src_doc_path),
dst=self.abspath(dst_doc_path),
new_order=new_order
@ -269,7 +269,7 @@ class Storage:
self.make_sure_path_exists(
self.abspath(dst_doc_path)
)
pdftk.delete_pages(
stapler.delete_pages(
self.abspath(src_doc_path),
self.abspath(dst_doc_path),
page_numbers
@ -332,7 +332,7 @@ class Storage:
self.abspath(next_ver_dp)
)
pdftk.paste_pages(
stapler.paste_pages(
src=self.abspath(dest_doc_path),
dst=self.abspath(next_ver_dp),
data_list=data_list,

View File

@ -1,145 +0,0 @@
import os
import unittest
from unittest import mock
from mglib import pdftk
from mglib.conf import settings
from mglib.runcmd import run
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
class TestPdfLib(unittest.TestCase):
    """
    Unit tests for the helpers in ``mglib.pdftk``.

    Tests of functions which run pdftk patch ``mglib.pdftk.run`` and check
    the argument list handed to it, so pdftk itself is never executed.
    The page count lookup is not patched; the expected commands presumably
    rely on ``data/berlin.pdf`` having two pages - verify against the
    fixture.
    """

    def test_ranges_for_reorder(self):
        actual = pdftk.cat_ranges_for_reorder(4, [
            {"page_order": 1, "page_num": 4},
            {"page_order": 2, "page_num": 3},
            {"page_order": 3, "page_num": 2},
            {"page_order": 4, "page_num": 1}
        ])
        expected = [4, 3, 2, 1]
        self.assertEqual(expected, actual)

        # wrong number of entries
        self.assertRaises(ValueError, pdftk.cat_ranges_for_reorder, 2, [])
        # right number of entries, but positions 1 and 2 are missing
        self.assertRaises(KeyError, pdftk.cat_ranges_for_reorder, 2, [
            {"page_order": 3, "page_num": 4},
            {"page_order": 5, "page_num": 6}
        ])

    def test_delete_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.delete_pages(input_file, output_file, [1])

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                input_file,
                "cat",
                "2",
                "output",
                output_file
            ]
        )

    def test_cat_ranges_for_delete(self):
        page_count = 22
        page_numbers = range(1, 23)

        actual = pdftk.cat_ranges_for_delete(page_count, [21])
        expected = list(page_numbers)
        expected.remove(21)
        self.assertEqual(actual, expected)

        actual = pdftk.cat_ranges_for_delete(page_count, [1])
        expected = list(page_numbers)
        expected.remove(1)
        self.assertEqual(actual, expected)

        actual = pdftk.cat_ranges_for_delete(page_count, [1, 7, 10])
        expected = list(page_numbers)
        expected.remove(1)
        expected.remove(7)
        expected.remove(10)
        self.assertEqual(actual, expected)

        self.assertRaises(
            ValueError,
            pdftk.cat_ranges_for_delete,
            page_count,
            ["1"]
        )

    def test_split_ranges(self):
        page_count = 9

        self.assertRaises(
            ValueError,
            pdftk.split_ranges,
            9,
            after="a",
            before=False
        )
        # a bool is not accepted as a page number
        self.assertRaises(
            ValueError,
            pdftk.split_ranges,
            9,
            after=False,
            before=True
        )

        actual1, actual2 = pdftk.split_ranges(page_count, 1, False)
        expected1 = [1]
        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

        actual1, actual2 = pdftk.split_ranges(page_count, False, 2)
        expected1 = [1]
        expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

        actual1, actual2 = pdftk.split_ranges(page_count)
        expected1 = list(range(1, page_count + 1))
        expected2 = []
        self.assertEqual(actual1, expected1)
        self.assertEqual(actual2, expected2)

    def test_reorder_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")
        new_order = [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
        ]

        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.reorder_pages(input_file, output_file, new_order)

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                input_file,
                "cat",
                "2",
                "1",
                "output",
                output_file
            ]
        )

    def test_paste_pages_into_existing_doc(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file, output_file, datalist
            )

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                "A=" + input_file,
                "cat",
                "A1",
                "A2",
                "output",
                output_file
            ]
        )

        # page_nums is a string here: it is iterated character by
        # character, hence the expected "B3" and "B4" arguments.
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages_into_existing_doc(
                input_file,
                output_file,
                datalist,
                1
            )

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                "A=" + input_file,
                "B=" + input_file,
                "cat",
                "A1",
                "B3",
                "B4",
                "A2",
                "output",
                output_file
            ]
        )

    def test_paste_pages(self):
        input_file = os.path.join(DATA_DIR, "berlin.pdf")
        output_file = os.path.join(DATA_DIR, "berlin2.pdf")

        # dst_doc_is_new=False: delegates to paste_pages_into_existing_doc
        datalist = []
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist, False)

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                "A=" + input_file,
                "cat",
                "A1",
                "A2",
                "output",
                output_file
            ]
        )

        # dst_doc_is_new defaults to True: only the pasted pages are used
        datalist = [{"src": input_file, "page_nums": "34"}]
        with mock.patch("mglib.pdftk.run") as run_func:
            pdftk.paste_pages(input_file, output_file, datalist)

        run_func.assert_called_once_with(
            [
                settings.BINARY_PDFTK,
                "A=" + input_file,
                "cat",
                "A3",
                "A4",
                "output",
                output_file
            ]
        )

View File

@ -3,7 +3,6 @@ import unittest
from unittest import mock
from mglib import stapler
from mglib.conf import settings
from mglib.runcmd import run
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@ -17,15 +16,15 @@ class TestPdfLib(unittest.TestCase):
{"page_order": 2, "page_num": 3},
{"page_order": 3, "page_num": 2},
{"page_order": 4, "page_num": 1}
])
expected = [4,3,2,1]
])
expected = [4, 3, 2, 1]
assert expected == actual
self.assertRaises(ValueError, stapler.cat_ranges_for_reorder, 2, [])
self.assertRaises(KeyError, stapler.cat_ranges_for_reorder, 2, [
{"page_order": 3, "page_num": 4},
{"page_order": 5, "page_num": 6}
])
])
def test_delete_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
@ -38,13 +37,22 @@ class TestPdfLib(unittest.TestCase):
[settings.BINARY_STAPLER, "del", input_file, "1", output_file]
)
def test_split_ranges(self):
page_count = 9
page_numbers = list(range(1, 10))
self.assertRaises(ValueError, stapler.split_ranges, 9, after="a", before=False)
self.assertRaises(ValueError, stapler.split_ranges, 9, after=False, before=True)
self.assertRaises(
ValueError,
stapler.split_ranges,
9,
after="a",
before=False
)
self.assertRaises(
ValueError,
stapler.split_ranges,
9, after=False,
before=True
)
actual1, actual2 = stapler.split_ranges(page_count, 1, False)
expected1 = [1]
@ -67,54 +75,81 @@ class TestPdfLib(unittest.TestCase):
def test_reorder_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
new_order = [
{'page_num': 2, 'page_order': 1},
{'page_num': 1, 'page_order': 2},
]
new_order = [
{'page_num': 2, 'page_order': 1},
{'page_num': 1, 'page_order': 2},
]
with mock.patch("mglib.stapler.run") as run_func:
stapler.reorder_pages(input_file, output_file, new_order)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", input_file, "2", "1", output_file]
[
settings.BINARY_STAPLER,
"sel",
input_file,
"2",
"1",
output_file
]
)
def test_paste_pages_into_existing_doc(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(input_file, output_file, datalist)
stapler.paste_pages_into_existing_doc(
input_file, output_file, datalist
)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file, "A1", "A2", output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(input_file, output_file, datalist, 1)
stapler.paste_pages_into_existing_doc(
input_file,
output_file,
datalist,
1
)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "B=" + input_file, "A1", "B3",
"B4", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file,
"B=" + input_file, "A1", "B3",
"B4", "A2", output_file
]
)
def test_paste_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist, False)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "A1", "A2", output_file]
[
settings.BINARY_STAPLER,
"sel",
"A=" + input_file,
"A1",
"A2",
output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist)