mirror of https://github.com/papermerge/mglib
refactoring page delete functionality. OCRmigrate is about to be removed
parent
924dced78f
commit
bf5342724a
|
@ -18,59 +18,6 @@ OcrMigrate class takes care of this sort of txt/hocr files moves.
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Compute how surviving pages are renumbered after a page deletion.

    Given the total number of pages and a list of deleted page numbers
    (1-based), returns a list of assignations of pages:

        (new_version_page_num, old_version_page_num)

    one tuple per page that was not deleted, in ascending order.

    Example 1:
        total_pages: 6
        deleted_pages: [1, 2]
        returns: [(1, 3), (2, 4), (3, 5), (4, 6)]
            # page #1 gets info from prev page #3
            # page #2 ... #4
            # page #3 ... #5
            # page #4 ... #6

    Example 2:
        total_pages: 5
        deleted_pages: [1, 5]
        returns: [(1, 2), (2, 3), (3, 4)]

    Example 3:
        total_pages: 5
        deleted_pages: [2, 3]
        returns: [(1, 1), (2, 4), (3, 5)]
            # page #1 stays unaffected
            # page #2 gets the info from page number 4
            # page #3 gets info from page #5

    Raises ValueError if the number of deleted pages exceeds total_pages.
    """
    if total_pages < len(deleted_pages):
        raise ValueError("total_pages < deleted_pages")

    # A set gives O(1) membership tests instead of rescanning the list
    # of deleted pages once per page.
    deleted = set(deleted_pages)

    # Only numbers of pages which were not deleted, in their old order.
    surviving = (
        page for page in range(1, total_pages + 1)
        if page not in deleted
    )

    # New page numbers are simply 1..N over the surviving pages.
    return list(enumerate(surviving, start=1))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
|
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
|
||||||
"""
|
"""
|
||||||
dest_ep = destination document endpoint
|
dest_ep = destination document endpoint
|
||||||
|
@ -177,8 +124,3 @@ class OcrMigrate:
|
||||||
src_page_ep=src_page_ep,
|
src_page_ep=src_page_ep,
|
||||||
dst_page_ep=dst_page_ep
|
dst_page_ep=dst_page_ep
|
||||||
)
|
)
|
||||||
|
|
||||||
def migrate_reorder(self, new_order):
    """
    Similar to migrate_delete, with minor tweaks.

    NOTE(review): no implementation is visible here - the method body
    appears to consist of this docstring only, so new_order is unused
    and the call returns None. Confirm against the full file.
    """
|
|
||||||
|
|
|
@ -338,20 +338,17 @@ def reorder_pages(
|
||||||
run(cmd)
|
run(cmd)
|
||||||
|
|
||||||
|
|
||||||
def delete_pages(doc_ep, page_numbers):
|
def delete_pages(src, dst, page_numbers):
|
||||||
ep_url = doc_ep.url()
|
page_count = get_pagecount(src)
|
||||||
page_count = get_pagecount(ep_url)
|
|
||||||
|
|
||||||
cat_ranges = cat_ranges_for_delete(
|
cat_ranges = cat_ranges_for_delete(
|
||||||
page_count,
|
page_count,
|
||||||
page_numbers
|
page_numbers
|
||||||
)
|
)
|
||||||
|
|
||||||
doc_ep.inc_version()
|
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
"pdftk",
|
||||||
ep_url,
|
src,
|
||||||
"cat"
|
"cat"
|
||||||
]
|
]
|
||||||
for page in cat_ranges:
|
for page in cat_ranges:
|
||||||
|
@ -360,9 +357,6 @@ def delete_pages(doc_ep, page_numbers):
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd.append("output")
|
cmd.append("output")
|
||||||
make_sure_path_exists(doc_ep.url())
|
cmd.append(dst)
|
||||||
cmd.append(doc_ep.url())
|
|
||||||
|
|
||||||
run(cmd)
|
run(cmd)
|
||||||
|
|
||||||
return doc_ep.version
|
|
||||||
|
|
|
@ -4,7 +4,10 @@ from os.path import isdir, join
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
from mglib.step import Steps
|
from mglib.step import Steps
|
||||||
from mglib.utils import safe_to_delete
|
from mglib.utils import (
|
||||||
|
safe_to_delete,
|
||||||
|
get_assigns_after_delete
|
||||||
|
)
|
||||||
from mglib import pdftk
|
from mglib import pdftk
|
||||||
from mglib.path import PagePath, DocumentPath
|
from mglib.path import PagePath, DocumentPath
|
||||||
|
|
||||||
|
@ -125,15 +128,6 @@ class Storage:
|
||||||
self.path(_path)
|
self.path(_path)
|
||||||
)
|
)
|
||||||
|
|
||||||
def delete_pages(self, doc_path, page_numers):
    """
    Deletes pages in the document pointed by doc_path.
    doc_path is an instance of mglib.path.DocumentPath

    In case of success returns document's new version.

    NOTE(review): this is a placeholder - the body is only ``pass``, so
    no pages are deleted and None is returned regardless of arguments.
    """
    pass
|
|
||||||
|
|
||||||
def copy_page(self, src_page_path, dst_page_path):
|
def copy_page(self, src_page_path, dst_page_path):
|
||||||
err_msg = "copy_page accepts only PageEp instances"
|
err_msg = "copy_page accepts only PageEp instances"
|
||||||
|
|
||||||
|
@ -252,6 +246,64 @@ class Storage:
|
||||||
|
|
||||||
return doc_path.version + 1
|
return doc_path.version + 1
|
||||||
|
|
||||||
|
def delete_pages(self, doc_path, page_numbers):
    """
    Deletes pages in the document pointed by doc_path.

    doc_path is an instance of mglib.path.DocumentPath.
    page_numbers is a list of page numbers to delete.

    The source document is left in place; the result is written as a
    new document version (doc_path.version + 1), and the per-step page
    data of every surviving page is copied from its old page number to
    its new page number in that new version.

    In case of success returns document's new version.
    Returns False if page_numbers is not a list, and None if more pages
    are requested for deletion than the document contains.
    """
    if not isinstance(page_numbers, list):
        logger.error("Expecting list argument")
        return False

    # Validate against the source page count BEFORE touching storage,
    # so that invalid input does not leave a half-created new version
    # behind.
    page_count = self.get_pagecount(doc_path)
    if len(page_numbers) > page_count:
        logger.error(
            f"deleted_pages({page_numbers}) > page_count({page_count})"
        )
        return

    src_doc_path = doc_path
    dst_doc_path = DocumentPath.copy_from(
        src_doc_path,
        version=doc_path.version + 1
    )
    self.make_sure_path_exists(
        self.abspath(dst_doc_path)
    )
    # Produce the new document version without the deleted pages.
    pdftk.delete_pages(
        self.abspath(src_doc_path),
        self.abspath(dst_doc_path),
        page_numbers
    )

    # (new_page_num, old_page_num) for every page that survives.
    assigns = get_assigns_after_delete(
        total_pages=page_count,
        deleted_pages=page_numbers
    )
    # Loop invariant: page count of the new document version.
    new_page_count = page_count - len(page_numbers)

    for new_page_num, old_page_num in assigns:
        for step in Steps():
            src_page_path = PagePath(
                document_path=src_doc_path,
                page_num=old_page_num,
                step=step,
                page_count=page_count
            )
            dst_page_path = PagePath(
                document_path=dst_doc_path,
                page_num=new_page_num,
                step=step,
                page_count=new_page_count
            )
            self.copy_page(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )

    return doc_path.version + 1
|
||||||
|
|
||||||
def paste_pages(
|
def paste_pages(
|
||||||
self,
|
self,
|
||||||
dest_doc_path,
|
dest_doc_path,
|
||||||
|
|
|
@ -37,3 +37,50 @@ def safe_to_delete(place):
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Compute how surviving pages are renumbered after a page deletion.

    Given the total number of pages and a list of deleted page numbers
    (1-based), returns a list of assignations of pages:

        (new_version_page_num, old_version_page_num)

    one tuple per page that was not deleted, in ascending order.

    Example 1:
        total_pages: 6
        deleted_pages: [1, 2]
        returns: [(1, 3), (2, 4), (3, 5), (4, 6)]
            # page #1 gets info from prev page #3
            # page #2 ... #4
            # page #3 ... #5
            # page #4 ... #6

    Example 2:
        total_pages: 5
        deleted_pages: [1, 5]
        returns: [(1, 2), (2, 3), (3, 4)]

    Example 3:
        total_pages: 5
        deleted_pages: [2, 3]
        returns: [(1, 1), (2, 4), (3, 5)]
            # page #1 stays unaffected
            # page #2 gets the info from page number 4
            # page #3 gets info from page #5

    Raises ValueError if the number of deleted pages exceeds total_pages.
    """
    if total_pages < len(deleted_pages):
        raise ValueError("total_pages < deleted_pages")

    # A set gives O(1) membership tests instead of rescanning the list
    # of deleted pages once per page.
    deleted = set(deleted_pages)

    # Only numbers of pages which were not deleted, in their old order.
    surviving = (
        page for page in range(1, total_pages + 1)
        if page not in deleted
    )

    # New page numbers are simply 1..N over the surviving pages.
    return list(enumerate(surviving, start=1))
|
||||||
|
|
Loading…
Reference in New Issue