Refactoring page delete functionality. OcrMigrate is about to be removed.

pull/3/head
Eugen Ciur 2020-05-17 07:26:38 +02:00
parent 924dced78f
commit bf5342724a
4 changed files with 113 additions and 78 deletions

View File

@ -18,59 +18,6 @@ OcrMigrate class takes care of this sort of txt/hocr files moves.
logger = logging.getLogger(__name__)
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Map page numbers of the new document version onto the old ones.

    Given the total number of pages and a list of deleted page numbers
    (pages are numbered from 1), return a list of tuples
    ``(new_version_page_num, old_version_page_num)`` - one tuple per
    page that was not deleted, in ascending order.

    Example 1 - pages at the beginning are deleted:

    >>> get_assigns_after_delete(6, [1, 2])
    [(1, 3), (2, 4), (3, 5), (4, 6)]

    i.e. new page #1 gets its info from previous page #3, ...,
    new page #4 gets its info from previous page #6.

    Example 2 - first and last pages are deleted:

    >>> get_assigns_after_delete(5, [1, 5])
    [(1, 2), (2, 3), (3, 4)]

    Example 3 - pages in the middle are deleted:

    >>> get_assigns_after_delete(5, [2, 3])
    [(1, 1), (2, 4), (3, 5)]

    i.e. page #1 stays unaffected, page #2 gets its info from
    page #4 and page #3 gets its info from page #5.

    Raises ValueError when more pages are listed for deletion than the
    document has.
    """
    if total_pages < len(deleted_pages):
        raise ValueError("total_pages < deleted_pages")

    # A set makes each membership test O(1) instead of scanning the list
    # of deleted pages once per page of the document.
    deleted = set(deleted_pages)

    # only numbers of pages which were not deleted
    surviving = [
        page for page in range(1, total_pages + 1)
        if page not in deleted
    ]

    # New page numbers are simply 1..len(surviving), in order.
    return list(enumerate(surviving, start=1))
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
"""
dest_ep = destination document endpoint
@ -177,8 +124,3 @@ class OcrMigrate:
src_page_ep=src_page_ep,
dst_page_ep=dst_page_ep
)
def migrate_reorder(self, new_order):
    """
    Placeholder for the page-reorder migration.

    Meant to be similar to migrate_delete, with minor tweaks. The
    method has no body besides this docstring, so calling it does
    nothing and returns None.
    """

View File

@ -338,20 +338,17 @@ def reorder_pages(
run(cmd)
def delete_pages(doc_ep, page_numbers):
ep_url = doc_ep.url()
page_count = get_pagecount(ep_url)
def delete_pages(src, dst, page_numbers):
page_count = get_pagecount(src)
cat_ranges = cat_ranges_for_delete(
page_count,
page_numbers
)
doc_ep.inc_version()
cmd = [
"pdftk",
ep_url,
src,
"cat"
]
for page in cat_ranges:
@ -360,9 +357,6 @@ def delete_pages(doc_ep, page_numbers):
)
cmd.append("output")
make_sure_path_exists(doc_ep.url())
cmd.append(doc_ep.url())
cmd.append(dst)
run(cmd)
return doc_ep.version

View File

@ -4,7 +4,10 @@ from os.path import isdir, join
import logging
import shutil
from mglib.step import Steps
from mglib.utils import safe_to_delete
from mglib.utils import (
safe_to_delete,
get_assigns_after_delete
)
from mglib import pdftk
from mglib.path import PagePath, DocumentPath
@ -125,15 +128,6 @@ class Storage:
self.path(_path)
)
def delete_pages(self, doc_path, page_numers):
    """
    Stub for deleting pages of the document pointed to by doc_path.

    doc_path is an instance of mglib.path.DocumentPath.
    The intended contract is to return the document's new version on
    success; this stub performs no work and returns None.
    """
    return None
def copy_page(self, src_page_path, dst_page_path):
err_msg = "copy_page accepts only PageEp instances"
@ -252,6 +246,64 @@ class Storage:
return doc_path.version + 1
def delete_pages(self, doc_path, page_numbers):
    """
    Delete pages from the document pointed to by doc_path.

    The document without the given pages is written by pdftk to the
    path of the next version (doc_path.version + 1). Afterwards, for
    every step in Steps(), the per-page data of each remaining page is
    copied from its old page number in the source version to its new
    page number in the destination version.

    doc_path is an instance of mglib.path.DocumentPath.
    page_numbers is a list of page numbers to delete (numbered from 1,
    as expected by get_assigns_after_delete).

    Returns:
        the document's new version in case of success;
        False if page_numbers is not a list;
        None if more pages are listed than the document has.
    """
    if not isinstance(page_numbers, list):
        logger.error("Expecting list argument")
        return False

    # Validate the request BEFORE touching storage: the source document
    # is only read here, so an invalid request no longer leaves behind a
    # destination directory or a pdftk output for the new version.
    page_count = self.get_pagecount(doc_path)
    if len(page_numbers) > page_count:
        logger.error(
            f"deleted_pages({page_numbers}) > page_count({page_count})"
        )
        return None

    new_version = doc_path.version + 1
    src_doc_path = doc_path
    dst_doc_path = DocumentPath.copy_from(
        src_doc_path,
        version=new_version
    )
    self.make_sure_path_exists(
        self.abspath(dst_doc_path)
    )
    pdftk.delete_pages(
        self.abspath(src_doc_path),
        self.abspath(dst_doc_path),
        page_numbers
    )

    # (new_page_num, old_page_num) pairs for the pages that remain.
    assigns = get_assigns_after_delete(
        total_pages=page_count,
        deleted_pages=page_numbers
    )
    # Loop-invariant: page count of the destination version.
    new_page_count = page_count - len(page_numbers)

    for new_page_num, old_page_num in assigns:
        for step in Steps():
            src_page_path = PagePath(
                document_path=src_doc_path,
                page_num=old_page_num,
                step=step,
                page_count=page_count
            )
            dst_page_path = PagePath(
                document_path=dst_doc_path,
                page_num=new_page_num,
                step=step,
                page_count=new_page_count
            )
            self.copy_page(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )

    return new_version
def paste_pages(
self,
dest_doc_path,

View File

@ -37,3 +37,50 @@ def safe_to_delete(place):
return True
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Map page numbers of the new document version onto the old ones.

    Given the total number of pages and a list of deleted page numbers
    (pages are numbered from 1), return a list of tuples
    ``(new_version_page_num, old_version_page_num)`` - one tuple per
    page that was not deleted, in ascending order.

    Example 1 - pages at the beginning are deleted:

    >>> get_assigns_after_delete(6, [1, 2])
    [(1, 3), (2, 4), (3, 5), (4, 6)]

    i.e. new page #1 gets its info from previous page #3, ...,
    new page #4 gets its info from previous page #6.

    Example 2 - first and last pages are deleted:

    >>> get_assigns_after_delete(5, [1, 5])
    [(1, 2), (2, 3), (3, 4)]

    Example 3 - pages in the middle are deleted:

    >>> get_assigns_after_delete(5, [2, 3])
    [(1, 1), (2, 4), (3, 5)]

    i.e. page #1 stays unaffected, page #2 gets its info from
    page #4 and page #3 gets its info from page #5.

    Raises ValueError when more pages are listed for deletion than the
    document has.
    """
    if total_pages < len(deleted_pages):
        raise ValueError("total_pages < deleted_pages")

    # A set makes each membership test O(1) instead of scanning the list
    # of deleted pages once per page of the document.
    deleted = set(deleted_pages)

    # only numbers of pages which were not deleted
    surviving = [
        page for page in range(1, total_pages + 1)
        if page not in deleted
    ]

    # New page numbers are simply 1..len(surviving), in order.
    return list(enumerate(surviving, start=1))