diff --git a/mglib/ocrmigrate.py b/mglib/ocrmigrate.py index aa811c8..414df5c 100644 --- a/mglib/ocrmigrate.py +++ b/mglib/ocrmigrate.py @@ -18,50 +18,6 @@ OcrMigrate class takes care of this sort of txt/hocr files moves. logger = logging.getLogger(__name__) -def migrate_cutted_pages(dest_ep, src_doc_ep_list): - """ - dest_ep = destination document endpoint - src_doc_ep_list = a list of following format: - [ - { - 'doc_ep': doc_ep, - 'page_nums': [page_num_1, page_num_2, page_num_3] - }, - { - 'doc_ep': doc_ep, - 'page_nums': [page_num_1, page_num_2, page_num_3] - }, - ... - ] - with a list of source document with copied pages. - """ - dest_page_num = 1 - dest_page_count = sum([ - len(item['page_nums']) for item in src_doc_ep_list - ]) - for item in src_doc_ep_list: - src_ep = item['doc_ep'] - for page_num in item['page_nums']: - for step in Steps(): - src_page_ep = PageEp( - document_ep=src_ep, - page_num=int(page_num), - step=step, - page_count=get_pagecount(src_ep) - ) - dst_page_ep = PageEp( - document_ep=dest_ep, - page_num=dest_page_num, - step=step, - page_count=dest_page_count - ) - logger.debug(f"src={src_page_ep} dst={dst_page_ep}") - copy_page( - src_page_ep=src_page_ep, - dst_page_ep=dst_page_ep - ) - dest_page_num += 1 - class OcrMigrate: """ diff --git a/mglib/pdftk.py b/mglib/pdftk.py index 1b3891a..6f7e90f 100644 --- a/mglib/pdftk.py +++ b/mglib/pdftk.py @@ -134,12 +134,12 @@ def split_ranges(total, after=False, before=False): def paste_pages_into_existing_doc( - dest_doc_ep, - src_doc_ep_list, + dst, + data_list, after_page_number=False, before_page_number=False ): - page_count = get_pagecount(dest_doc_ep.url()) + page_count = get_pagecount(dst) list1, list2 = split_ranges( total=page_count, after=after_page_number, @@ -155,24 +155,22 @@ def paste_pages_into_existing_doc( letters_pages_after = [] letters_2_doc_map.append( - f"A={dest_doc_ep.url()}" + f"A={dst.url()}" ) - for idx in range(0, len(src_doc_ep_list)): + for idx in range(0, 
len(data_list)): letter = letters[idx] - doc_ep = src_doc_ep_list[idx]['doc_ep'] - pages = src_doc_ep_list[idx]['page_nums'] + src = data_list[idx]['src'] + pages = data_list[idx]['page_nums'] letters_2_doc_map.append( - f"{letter}={doc_ep.url()}" + f"{letter}={src}" ) for p in pages: letters_pages.append( f"{letter}{p}" ) - dest_doc_ep.inc_version() - for p in list1: letters_pages_before.append( f"A{p}" @@ -200,19 +198,15 @@ def paste_pages_into_existing_doc( cmd.append("output") - make_sure_path_exists(dest_doc_ep.url()) - - cmd.append(dest_doc_ep.url()) + cmd.append(dst) run(cmd) - return dest_doc_ep.version - def paste_pages( - dest_doc_ep, - src_doc_ep_list, - dest_doc_is_new=True, + dst, + data_list, + dst_doc_is_new=True, after_page_number=False, before_page_number=False ): @@ -234,12 +228,12 @@ def paste_pages( src_doc_ep_list is a list of documents where pages (with numbers page_num_1...) will be paste from. - dest_doc_is_new = True well.. destination document was just created, + dst_doc_is_new = True well.. destination document was just created, we are pasting here cutted pages into some folder as new document. In this case 'after' and 'before' arguments are ignored - dest_doc_is_new = False, pasting pages into exiting document. + dst_doc_is_new = False, pasting pages into existing document. If before_page_number > 0 - paste pages before page number 'before_page_number' If after_page_number > 0 - paste pages after page number @@ -250,10 +244,10 @@ def paste_pages( If both before_page_number and after_page_number are < 0 - just paste pages at the end of the document. 
""" - if not dest_doc_is_new: + if not dst_doc_is_new: return paste_pages_into_existing_doc( - dest_doc_ep=dest_doc_ep, - src_doc_ep_list=src_doc_ep_list, + dst=dst, + data_list=data_list, after_page_number=after_page_number, before_page_number=before_page_number ) @@ -261,21 +255,19 @@ def paste_pages( letters_2_doc_map = [] letters_pages = [] - for idx in range(0, len(src_doc_ep_list)): + for idx in range(0, len(data_list)): letter = letters[idx] - doc_ep = src_doc_ep_list[idx]['doc_ep'] - pages = src_doc_ep_list[idx]['page_nums'] + src = data_list[idx]['src'] + pages = data_list[idx]['page_nums'] letters_2_doc_map.append( - f"{letter}={doc_ep.url()}" + f"{letter}={src}" ) for p in pages: letters_pages.append( f"{letter}{p}" ) - dest_doc_ep.inc_version() - cmd = [ "pdftk", ] @@ -288,14 +280,10 @@ def paste_pages( cmd.append("output") - make_sure_path_exists(dest_doc_ep.url()) - - cmd.append(dest_doc_ep.url()) + cmd.append(dst) run(cmd) - return dest_doc_ep.version - def reorder_pages( src, dst, new_order diff --git a/mglib/storage.py b/mglib/storage.py index 8d67633..eb36bd3 100644 --- a/mglib/storage.py +++ b/mglib/storage.py @@ -307,7 +307,7 @@ class Storage: def paste_pages( self, dest_doc_path, - src_doc_path, + data_list, dest_doc_is_new=False, after_page_number=False, before_page_number=False @@ -317,7 +317,48 @@ class Storage: from src_doc_path. 
Both dest and src are instances of mglib.path.DocumentPath """ - pass + next_ver_dp = DocumentPath.copy_from( + dest_doc_path, + version=dest_doc_path.version + 1 + ) + self.make_sure_path_exists( + self.abspath(next_ver_dp) + ) + + pdftk.paste_pages( + dst=self.abspath(next_ver_dp), + data_list=data_list, + dst_doc_is_new=dest_doc_is_new, + after_page_number=after_page_number, + before_page_number=before_page_number + ) + + dest_page_num = 1 + dest_page_count = sum([ + len(item['page_nums']) for item in data_list + ]) + for item in data_list: + src_path = item['doc_path'] + for page_num in item['page_nums']: + for step in Steps(): + src_page_path = PagePath( + document_path=src_path, + page_num=int(page_num), + step=step, + page_count=self.get_pagecount(src_path) + ) + dst_page_path = PagePath( + document_path=next_ver_dp, + page_num=dest_page_num, + step=step, + page_count=dest_page_count + ) + logger.debug(f"src={src_page_path} dst={dst_page_path}") + self.copy_page( + src_page_path=src_page_path, + dst_page_path=dst_page_path + ) + dest_page_num += 1 class FileSystemStorage(Storage):