# ---------------------------------------------------------------------------
# mglib/pdfinfo.py
# ---------------------------------------------------------------------------
"""
Uses command line pdfinfo utility (from poppler package) for various
small operations (e.g. get pdf page count).
"""
import os
import re
import subprocess
import logging

logger = logging.getLogger(__name__)

# Matches the "Pages:          11" line printed by pdfinfo.
# Raw string: "\W" and "\d" are invalid escapes in a normal string literal.
_PAGES_LINE = re.compile(r"Pages:\W+(\d+)$")


def get_pagecount(filepath):
    """
    Returns the number of pages in a document as integer.

    filepath - is filesystem path to a PDF document or to
    a jpeg/png image (images always count as one page).

    Paths without any extension are handed over to pdfinfo as well.

    Returns 0 if pdfinfo succeeded but its output contains no
    "Pages:" line.

    Raises ValueError if filepath is a directory, is not an existing
    file or has an extension other than pdf/jpeg/jpg/png.
    Raises Exception if pdfinfo exits with non-zero return code.
    """
    # The directory check must come first: os.path.isfile() is False for
    # a directory, so checking it first made this message unreachable.
    if os.path.isdir(filepath):
        raise ValueError("Filepath %s is a directory!" % filepath)

    if not os.path.isfile(filepath):
        raise ValueError("Filepath %s is not a file" % filepath)

    _, ext = os.path.splitext(filepath)
    ext = ext.lower()

    # pure images (png, jpeg) have only one page :)
    if ext in ('.jpeg', '.png', '.jpg'):
        return 1

    if ext and ext != '.pdf':
        raise ValueError(
            "Only jpeg, png and pdf are handlerd by this"
            " method"
        )

    # equivalent of: pdfinfo "${PDFFILE}" | grep Pages
    cmd = ["/usr/bin/pdfinfo", filepath]
    compl = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    if compl.returncode:
        logger.error(
            "get_pagecount: cmd=%s args=%s stdout=%s stderr=%s code=%s",
            cmd,
            compl.args,
            compl.stdout,
            compl.stderr,
            compl.returncode,
            stack_info=True
        )
        raise Exception("Error occured while getting document page count.")

    # Document metadata (title, author, ...) is echoed by pdfinfo and may
    # contain bytes which are not valid UTF-8; the "Pages:" line is plain
    # ASCII, so undecodable bytes are replaced instead of raising.
    output = compl.stdout.decode('utf-8', errors='replace')

    # look up for the line containing "Pages: 11"
    for line in output.split('\n'):
        found = _PAGES_LINE.match(line.strip())
        if found:
            return int(found.group(1))

    return 0


# ---------------------------------------------------------------------------
# mglib/pdftk.py
# ---------------------------------------------------------------------------
import os
import logging

from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount
logger = logging.getLogger(__name__)

#
# Utilities around pdftk command line tool
#
# https://www.pdflabs.com/docs/pdftk-man-page/
#


def cat_ranges_for_reorder(page_count, new_order):
    """
    Returns a list of integers. Each number in the list
    is correctly positioned (newly ordered) page.

    new_order is a list of dictionaries with 'page_num' and
    'page_order' keys (values are converted with int()).

    Examples:

    If in document with 4 pages first and second pages were
    swapped, then returned list will be:

        [2, 1, 3, 4]

    If first page was swapped with last one (also 4 pages document)
    result list will look like:

        [4, 2, 3, 1]

    Raises KeyError if new_order has no entry for one of the
    positions 1..page_count.
    """
    # key = page_order (position in the reordered document)
    # value = page_num (page number in the current document)
    page_at = {
        int(item['page_order']): int(item['page_num'])
        for item in new_order
    }

    return [page_at[position] for position in range(1, page_count + 1)]


def cat_ranges_for_delete(page_count, page_numbers):
    """
    Returns a list of integers. Each number in the list
    is the number of page which will 'stay' in document.
    In other words, it returns the list of pages without the
    deleted ones.

    Examples:

    If document has 22 pages (page_count=22) and page number 21 is to be
    deleted (i.e page_numbers = [21]) will return

        [1, 2, 3, 4, ..., 19, 20, 22]

    If page number 1 is to be deleted:

        [2, 3, 4, ..., 22] list will be returned.

    If page number is 22 is to be deleted:

        [1, 2, 3,..., 21] will be returned.

    With page_numbers=[1, 7, 10] and page_count=22 result
    will be:

        [2, 3, 4, 5, 6, 8, 9, 11, 12 , 13, ..., 22]

    page_numbers is a list of page numbers (starting with 1).

    Raises ValueError if page_numbers contains a non-integer.
    """
    to_delete = set()

    for number in page_numbers:
        if not isinstance(number, int):
            raise ValueError("page_numbers must be a list of integers")
        to_delete.add(number)

    return [
        number
        for number in range(1, page_count + 1)
        if number not in to_delete
    ]


def make_sure_path_exists(filepath):
    """
    Creates (if missing) the directory in which filepath is located.

    Nothing is created when filepath has no directory component
    (os.makedirs('') would raise FileNotFoundError).
    """
    logger.debug("make_sure_path_exists %s", filepath)
    dirname = os.path.dirname(filepath)
    if dirname:
        os.makedirs(
            dirname,
            exist_ok=True
        )


def split_ranges(total, after=False, before=False):
    """
    Given a range 1, 2, ..., total (page numbers of a doc).
    Split it in two lists.
    Example:
    Input: total = 9, after=1, before=False
    Output: list1 = [1]; list2 = [2, 3, 4, ..., 9].

    Input: total = 9; after=False, before=1
    Output: list1 = [], list2 = [1, 2, 3, 4, ..., 9]

    Input: total = 5; after=4; before=False
    Output: list1 = [1, 2, 3, 4] list2 = [5]

    Input: total = 5; after=False; before=False;
    Output: list1 = [1, 2, 3, 4, 5], list2 = []
    (it means, by default, all pages are inserted at the end of the doc)

    'before' has priority over 'after' when both are given.
    Values which are <= 0 are treated as "not given".
    Positions beyond the end of the document are clamped, so
    list1 + list2 is always exactly 1, 2, ..., total.

    Raises ValueError if a given 'after'/'before' is not an int.
    """
    if before and not isinstance(before, int):
        raise ValueError(
            "argument 'before' is supposed to be an int"
        )
    if after and not isinstance(after, int):
        raise ValueError(
            "argument 'after' is supposed to be an int"
        )

    # head_size = how many existing pages stay in front of the split point
    if before and before > 0:
        head_size = min(before - 1, total)
    elif after and after > 0:
        head_size = min(after, total)
    else:
        head_size = total

    pages = list(range(1, total + 1))

    return pages[:head_size], pages[head_size:]


def _pdftk_handle(index):
    """
    Returns pdftk input handle for given zero based index:
    0 -> 'A', 1 -> 'B', ..., 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', ...
    (pdftk man page: a handle is one or more upper-case letters).
    """
    letters = []
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))

    return "".join(reversed(letters))


def _collect_sources(src_doc_ep_list, first_handle_index):
    """
    Returns a tuple (handle_map, page_refs) for source documents where
    handle_map is a list like ['B=/path/doc1.pdf', 'C=/path/doc2.pdf']
    and page_refs is a list like ['B1', 'B3', 'C2'].
    """
    handle_map = []
    page_refs = []

    for offset, source in enumerate(src_doc_ep_list):
        handle = _pdftk_handle(first_handle_index + offset)
        handle_map.append(f"{handle}={source['doc_ep'].url()}")
        page_refs.extend(f"{handle}{page}" for page in source['page_nums'])

    return handle_map, page_refs


def _run_cat(input_args, page_refs, output_url):
    """
    Runs: pdftk <input_args> cat <page_refs> output <output_url>
    """
    cmd = ["pdftk", *input_args, "cat", *page_refs, "output"]
    make_sure_path_exists(output_url)
    cmd.append(output_url)

    run(cmd)


def paste_pages_into_existing_doc(
    dest_doc_ep,
    src_doc_ep_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Pastes pages from src_doc_ep_list (same format as in paste_pages)
    into already existing document dest_doc_ep.

    Result is written as a new version of dest_doc_ep (its version is
    incremented). Returns new version of dest_doc_ep.

    See split_ranges for the meaning of after_page_number and
    before_page_number.
    """
    # Everything below is read BEFORE the version is incremented:
    # handle A is the current version of destination document.
    current_url = dest_doc_ep.url()
    page_count = get_pagecount(current_url)
    pages_before, pages_after = split_ranges(
        total=page_count,
        after=after_page_number,
        before=before_page_number
    )
    # handle A (index 0) is taken by destination document,
    # source documents start with handle B (index 1)
    handle_map, pasted_refs = _collect_sources(
        src_doc_ep_list,
        first_handle_index=1
    )
    handle_map.insert(0, f"A={current_url}")

    dest_doc_ep.inc_version()

    page_refs = [f"A{page}" for page in pages_before]  # may be empty
    page_refs.extend(pasted_refs)  # newly inserted pages
    page_refs.extend(f"A{page}" for page in pages_after)  # may be empty

    _run_cat(handle_map, page_refs, dest_doc_ep.url())

    return dest_doc_ep.version


def paste_pages(
    dest_doc_ep,
    src_doc_ep_list,
    dest_doc_is_new=True,
    after_page_number=False,
    before_page_number=False
):
    """
    dest_doc_ep = endpoint of the doc where newly created
        file will be placed.
    src_doc_ep_list is a list of following format:
        [
            {
                'doc_ep': doc_ep,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            {
                'doc_ep': doc_ep,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]
    src_doc_ep_list is a list of documents where pages
    (with numbers page_num_1...) will be pasted from.

    dest_doc_is_new = True: destination document was just created,
    cut pages are pasted into some folder as a new document.
    In this case after_page_number and before_page_number
    arguments are ignored.

    dest_doc_is_new = False: pages are pasted into an existing document.
    If before_page_number > 0 - paste pages before page number
        'before_page_number'
    If after_page_number > 0 - paste pages after page number
        'after_page_number'

    before_page_number argument has priority over after_page_number.

    If neither before_page_number nor after_page_number is > 0 - just
    paste pages at the end of the document.

    Increments the version of dest_doc_ep and returns the new version.
    """
    if not dest_doc_is_new:
        return paste_pages_into_existing_doc(
            dest_doc_ep=dest_doc_ep,
            src_doc_ep_list=src_doc_ep_list,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

    # source urls are read before the version is incremented
    handle_map, page_refs = _collect_sources(
        src_doc_ep_list,
        first_handle_index=0
    )

    dest_doc_ep.inc_version()

    _run_cat(handle_map, page_refs, dest_doc_ep.url())

    return dest_doc_ep.version


def _rewrite_with_pages(doc_ep, current_url, pages):
    """
    Writes given pages of current_url (in given order) as the next
    version of doc_ep. Returns the new version of doc_ep.
    """
    doc_ep.inc_version()

    _run_cat([current_url], [str(page) for page in pages], doc_ep.url())

    return doc_ep.version


def reorder_pages(doc_ep, new_order):
    """
    new_order is a list of following format:

        [
            {'page_num': 2, page_order: 1},
            {'page_num': 1, page_order: 2},
            {'page_num': 3, page_order: 3},
            {'page_num': 4, page_order: 4},
        ]
    Example above means that in current document of 4 pages,
    first page was swapped with second one.
    page_num    = older page order
    page_order  = current page order
    So in human language, each hash is read:
        page number 'page_num' now should be at position 'page_order'

    Result is written as a new version of doc_ep.
    Returns new version of doc_ep.
    """
    current_url = doc_ep.url()
    page_count = get_pagecount(current_url)

    cat_ranges = cat_ranges_for_reorder(
        page_count=page_count,
        new_order=new_order
    )

    return _rewrite_with_pages(doc_ep, current_url, cat_ranges)


def delete_pages(doc_ep, page_numbers):
    """
    Deletes pages with given numbers (list of integers, starting
    with 1) from the document.

    Result is written as a new version of doc_ep.
    Returns new version of doc_ep.
    """
    current_url = doc_ep.url()
    page_count = get_pagecount(current_url)

    cat_ranges = cat_ranges_for_delete(
        page_count,
        page_numbers
    )
    # NOTE(review): if every page is deleted cat_ranges is empty and
    # pdftk is invoked with "cat" without page ranges - confirm what
    # callers expect in that case.

    return _rewrite_with_pages(doc_ep, current_url, cat_ranges)