mirror of https://github.com/papermerge/mglib
refactoring
parent
d4d5bd36a3
commit
00c083ac12
|
@ -19,20 +19,6 @@ OcrMigrate class takes care of this sort of txt/hocr files moves.
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def get_pagecount(doc_ep):
|
|
||||||
"""
|
|
||||||
Returns total number of pages for this endpoint.
|
|
||||||
Total number of pages = number of page_xy.txt files
|
|
||||||
in pages_dirname folder.
|
|
||||||
"""
|
|
||||||
doc_ep_pointing_to_results = DocumentPath.copy_from(
|
|
||||||
doc_ep, aux_dir="results"
|
|
||||||
)
|
|
||||||
pages_dir = doc_ep_pointing_to_results.pages_dirname
|
|
||||||
only_dirs = [
|
|
||||||
fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
|
|
||||||
]
|
|
||||||
return len(only_dirs)
|
|
||||||
|
|
||||||
|
|
||||||
def get_assigns_after_delete(total_pages, deleted_pages):
|
def get_assigns_after_delete(total_pages, deleted_pages):
|
||||||
|
@ -83,50 +69,6 @@ def get_assigns_after_delete(total_pages, deleted_pages):
|
||||||
return list(zip(page_numbers, pages))
|
return list(zip(page_numbers, pages))
|
||||||
|
|
||||||
|
|
||||||
def copy_page(src_page_ep, dst_page_ep):
|
|
||||||
err_msg = "copy_page accepts only PageEp instances"
|
|
||||||
|
|
||||||
for inst in [src_page_ep, dst_page_ep]:
|
|
||||||
if not isinstance(inst, PagePath):
|
|
||||||
raise ValueError(err_msg)
|
|
||||||
|
|
||||||
# copy .txt file
|
|
||||||
if src_page_ep.txt_exists():
|
|
||||||
make_sure_path_exists(dst_page_ep.txt_url())
|
|
||||||
|
|
||||||
src_txt = src_page_ep.txt_url()
|
|
||||||
dst_txt = dst_page_ep.txt_url()
|
|
||||||
logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
|
|
||||||
shutil.copy(src_txt, dst_txt)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"txt does not exits {src_page_ep.txt_exists()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# hocr
|
|
||||||
if src_page_ep.hocr_exists():
|
|
||||||
make_sure_path_exists(dst_page_ep.hocr_url())
|
|
||||||
|
|
||||||
src_hocr = src_page_ep.hocr_url()
|
|
||||||
dst_hocr = dst_page_ep.hocr_url()
|
|
||||||
logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
|
|
||||||
shutil.copy(src_hocr, dst_hocr)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"hocr does not exits {src_page_ep.hocr_exists()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if src_page_ep.img_exists():
|
|
||||||
make_sure_path_exists(dst_page_ep.img_url())
|
|
||||||
|
|
||||||
src_img = src_page_ep.img_url()
|
|
||||||
dst_img = dst_page_ep.img_url()
|
|
||||||
logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
|
|
||||||
shutil.copy(src_img, dst_img)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"img does not exits {src_page_ep.img_exists()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
|
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
|
||||||
|
@ -240,29 +182,3 @@ class OcrMigrate:
|
||||||
"""
|
"""
|
||||||
Similar to migrate_delete, with minor tweaks.
|
Similar to migrate_delete, with minor tweaks.
|
||||||
"""
|
"""
|
||||||
page_count = get_pagecount(self.src_ep)
|
|
||||||
|
|
||||||
if len(new_order) > page_count:
|
|
||||||
logger.error(
|
|
||||||
f"deleted_pages({new_order}) > page_count({page_count})"
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
for item in new_order:
|
|
||||||
for step in Steps():
|
|
||||||
src_page_ep = PageEp(
|
|
||||||
document_ep=self.src_ep,
|
|
||||||
page_num=int(item['page_num']),
|
|
||||||
step=step,
|
|
||||||
page_count=len(new_order)
|
|
||||||
)
|
|
||||||
dst_page_ep = PageEp(
|
|
||||||
document_ep=self.dst_ep,
|
|
||||||
page_num=int(item['page_order']),
|
|
||||||
step=step,
|
|
||||||
page_count=len(new_order)
|
|
||||||
)
|
|
||||||
copy_page(
|
|
||||||
src_page_ep=src_page_ep,
|
|
||||||
dst_page_ep=dst_page_ep
|
|
||||||
)
|
|
||||||
|
|
|
@ -85,13 +85,33 @@ class DocumentPath:
|
||||||
def inc_version(self):
|
def inc_version(self):
|
||||||
self.version = self.version + 1
|
self.version = self.version + 1
|
||||||
|
|
||||||
def copy_from(doc_ep, aux_dir):
|
def copy_from(doc_path, **kw):
|
||||||
|
"""
|
||||||
|
Will create a copy of provided
|
||||||
|
DocumentPath (first parameter = doc_path) and replace
|
||||||
|
existing parameter of new copy with the one from kw.
|
||||||
|
|
||||||
|
kw => key/value parameters.
|
||||||
|
Keys can be one of doc_path attributes: user_id, document_id,
|
||||||
|
file_name, aux_dir, version
|
||||||
|
"""
|
||||||
|
copy_values = {
|
||||||
|
'user_id': doc_path.user_id,
|
||||||
|
'document_id': doc_path.document_id,
|
||||||
|
'file_name': doc_path.file_name,
|
||||||
|
'version': doc_path.version,
|
||||||
|
'aux_dir': doc_path.aux_dir
|
||||||
|
|
||||||
|
}
|
||||||
|
for key, value in kw.items():
|
||||||
|
copy_values[key] = kw[key]
|
||||||
|
|
||||||
return DocumentPath(
|
return DocumentPath(
|
||||||
user_id=doc_ep.user_id,
|
user_id=copy_values['user_id'],
|
||||||
document_id=doc_ep.document_id,
|
document_id=copy_values['document_id'],
|
||||||
file_name=doc_ep.file_name,
|
file_name=copy_values['file_name'],
|
||||||
version=doc_ep.version,
|
version=copy_values['version'],
|
||||||
aux_dir=aux_dir
|
aux_dir=copy_values['aux_dir']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from mglib.runcmd import run
|
from mglib.runcmd import run
|
||||||
|
@ -92,15 +91,6 @@ def cat_ranges_for_delete(page_count, page_numbers):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def make_sure_path_exists(filepath):
|
|
||||||
logger.debug(f"make_sure_path_exists {filepath}")
|
|
||||||
dirname = os.path.dirname(filepath)
|
|
||||||
os.makedirs(
|
|
||||||
dirname,
|
|
||||||
exist_ok=True
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def split_ranges(total, after=False, before=False):
|
def split_ranges(total, after=False, before=False):
|
||||||
"""
|
"""
|
||||||
Given a range 1, 2, ..., total (page numbers of a doc).
|
Given a range 1, 2, ..., total (page numbers of a doc).
|
||||||
|
@ -307,7 +297,7 @@ def paste_pages(
|
||||||
return dest_doc_ep.version
|
return dest_doc_ep.version
|
||||||
|
|
||||||
|
|
||||||
def reorder_pages(doc_ep, new_order):
|
def reorder_pages(doc_path, new_order):
|
||||||
"""
|
"""
|
||||||
new_order is a list of following format:
|
new_order is a list of following format:
|
||||||
|
|
||||||
|
@ -324,19 +314,19 @@ def reorder_pages(doc_ep, new_order):
|
||||||
So in human language, each hash is read:
|
So in human language, each hash is read:
|
||||||
<page_num> now should be <page_order>
|
<page_num> now should be <page_order>
|
||||||
"""
|
"""
|
||||||
ep_url = doc_ep.url()
|
url = doc_path.url()
|
||||||
page_count = get_pagecount(ep_url)
|
page_count = get_pagecount(url)
|
||||||
|
|
||||||
cat_ranges = cat_ranges_for_reorder(
|
cat_ranges = cat_ranges_for_reorder(
|
||||||
page_count=page_count,
|
page_count=page_count,
|
||||||
new_order=new_order
|
new_order=new_order
|
||||||
)
|
)
|
||||||
|
|
||||||
doc_ep.inc_version()
|
doc_path.inc_version()
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
"pdftk",
|
||||||
ep_url,
|
url,
|
||||||
"cat"
|
"cat"
|
||||||
]
|
]
|
||||||
for page in cat_ranges:
|
for page in cat_ranges:
|
||||||
|
@ -345,11 +335,11 @@ def reorder_pages(doc_ep, new_order):
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd.append("output")
|
cmd.append("output")
|
||||||
make_sure_path_exists(doc_ep.url())
|
make_sure_path_exists(doc_path.url())
|
||||||
cmd.append(doc_ep.url())
|
cmd.append(doc_path.url())
|
||||||
run(cmd)
|
run(cmd)
|
||||||
|
|
||||||
return doc_ep.version
|
return doc_path.version
|
||||||
|
|
||||||
|
|
||||||
def delete_pages(doc_ep, page_numbers):
|
def delete_pages(doc_ep, page_numbers):
|
||||||
|
|
148
mglib/storage.py
148
mglib/storage.py
|
@ -1,7 +1,12 @@
|
||||||
import os
|
import os
|
||||||
|
from os import listdir
|
||||||
|
from os.path import isdir, join
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
|
from mglib.step import Steps
|
||||||
from mglib.utils import safe_to_delete
|
from mglib.utils import safe_to_delete
|
||||||
|
from mglib import pdftk
|
||||||
|
from mglib.path import PagePath, DocumentPath
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -17,10 +22,46 @@ class Storage:
|
||||||
# settings.MEDIA_ROOT
|
# settings.MEDIA_ROOT
|
||||||
self._location = location
|
self._location = location
|
||||||
|
|
||||||
|
def d(self):
|
||||||
|
"""
|
||||||
|
doc_path proxy object
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def p(self):
|
||||||
|
"""
|
||||||
|
page_path proxy object
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def location(self):
|
def location(self):
|
||||||
return self._location
|
return self._location
|
||||||
|
|
||||||
|
def make_sure_path_exists(self, filepath):
|
||||||
|
logger.debug(f"make_sure_path_exists {filepath}")
|
||||||
|
dirname = os.path.dirname(filepath)
|
||||||
|
os.makedirs(
|
||||||
|
dirname,
|
||||||
|
exist_ok=True
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_pagecount(self, doc_path):
|
||||||
|
"""
|
||||||
|
Returns total number of pages for this doc_path.
|
||||||
|
Total number of pages = number of page_xy.txt files
|
||||||
|
in pages_dirname folder.
|
||||||
|
"""
|
||||||
|
doc_path_pointing_to_results = DocumentPath.copy_from(
|
||||||
|
doc_path, aux_dir="results"
|
||||||
|
)
|
||||||
|
pages_dir = doc_path_pointing_to_results.pages_dirname
|
||||||
|
|
||||||
|
only_dirs = [
|
||||||
|
fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
|
||||||
|
]
|
||||||
|
return len(only_dirs)
|
||||||
|
|
||||||
def abspath(self, _path):
|
def abspath(self, _path):
|
||||||
return os.path.join(
|
return os.path.join(
|
||||||
self.location, _path
|
self.location, _path
|
||||||
|
@ -29,7 +70,7 @@ class Storage:
|
||||||
def path(self, _path):
|
def path(self, _path):
|
||||||
return self.abspath(_path)
|
return self.abspath(_path)
|
||||||
|
|
||||||
def delete_document(self, doc_path):
|
def delete_doc(self, doc_path):
|
||||||
"""
|
"""
|
||||||
Receives a mglib.path.DocumentPath instance
|
Receives a mglib.path.DocumentPath instance
|
||||||
"""
|
"""
|
||||||
|
@ -96,14 +137,115 @@ class Storage:
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def reoder_pages(self, doc_path, new_order):
|
def copy_page(self, src_page_path, dst_page_path):
|
||||||
|
err_msg = "copy_page accepts only PageEp instances"
|
||||||
|
|
||||||
|
for inst in [src_page_path, dst_page_path]:
|
||||||
|
if not isinstance(inst, PagePath):
|
||||||
|
raise ValueError(err_msg)
|
||||||
|
|
||||||
|
# copy .txt file
|
||||||
|
if src_page_path.txt_exists():
|
||||||
|
|
||||||
|
self.make_sure_path_exists(
|
||||||
|
dst_page_path.txt_url()
|
||||||
|
)
|
||||||
|
|
||||||
|
src_txt = src_page_path.txt_url()
|
||||||
|
dst_txt = dst_page_path.txt_url()
|
||||||
|
logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
|
||||||
|
shutil.copy(src_txt, dst_txt)
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"txt does not exits {src_page_path.txt_exists()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# hocr
|
||||||
|
if src_page_path.hocr_exists():
|
||||||
|
self.make_sure_path_exists(
|
||||||
|
dst_page_path.hocr_url()
|
||||||
|
)
|
||||||
|
|
||||||
|
src_hocr = src_page_path.hocr_url()
|
||||||
|
dst_hocr = dst_page_path.hocr_url()
|
||||||
|
logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
|
||||||
|
shutil.copy(src_hocr, dst_hocr)
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"hocr does not exits {src_page_path.hocr_exists()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if src_page_path.img_exists():
|
||||||
|
self.make_sure_path_exists(
|
||||||
|
dst_page_path.img_url()
|
||||||
|
)
|
||||||
|
|
||||||
|
src_img = src_page_path.img_url()
|
||||||
|
dst_img = dst_page_path.img_url()
|
||||||
|
logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
|
||||||
|
shutil.copy(src_img, dst_img)
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"img does not exits {src_page_path.img_exists()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def reorder_pages(self, doc_path, new_order):
|
||||||
"""
|
"""
|
||||||
Reorders pages in the document pointed by doc_path.
|
Reorders pages in the document pointed by doc_path.
|
||||||
doc_path is an instance of mglib.path.DocumentPath
|
doc_path is an instance of mglib.path.DocumentPath
|
||||||
|
|
||||||
In case of success returns document's new version.
|
In case of success returns document's new version.
|
||||||
|
|
||||||
|
new_order is a list of following format:
|
||||||
|
|
||||||
|
[
|
||||||
|
{'page_num': 2, page_order: 1},
|
||||||
|
{'page_num': 1, page_order: 2},
|
||||||
|
{'page_num': 3, page_order: 3},
|
||||||
|
{'page_num': 4, page_order: 4},
|
||||||
|
]
|
||||||
|
Example above means that in current document of 4 pages,
|
||||||
|
first page was swapped with second one.
|
||||||
|
page_num = older page order
|
||||||
|
page_order = current page order
|
||||||
|
So in human language, each hash is read:
|
||||||
|
<page_num> now should be <page_order>
|
||||||
"""
|
"""
|
||||||
pass
|
new_version = pdftk.reorder_pages(doc_path, new_order)
|
||||||
|
|
||||||
|
page_count = self.get_pagecount(doc_path)
|
||||||
|
src_doc_path = doc_path
|
||||||
|
dst_doc_path = DocumentPath.copy_from(
|
||||||
|
src_doc_path,
|
||||||
|
version=new_version
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(new_order) > page_count:
|
||||||
|
logger.error(
|
||||||
|
f"deleted_pages({new_order}) > page_count({page_count})"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
for item in new_order:
|
||||||
|
for step in Steps():
|
||||||
|
src_page_path = PagePath(
|
||||||
|
document_path=src_doc_path,
|
||||||
|
page_num=int(item['page_num']),
|
||||||
|
step=step,
|
||||||
|
page_count=len(new_order)
|
||||||
|
)
|
||||||
|
dst_page_path = PagePath(
|
||||||
|
document_ep=dst_doc_path,
|
||||||
|
page_num=int(item['page_order']),
|
||||||
|
step=step,
|
||||||
|
page_count=len(new_order)
|
||||||
|
)
|
||||||
|
self.copy_page(
|
||||||
|
src_page_path=src_page_path,
|
||||||
|
dst_page_path=dst_page_path
|
||||||
|
)
|
||||||
|
|
||||||
|
return new_version
|
||||||
|
|
||||||
def paste_pages(
|
def paste_pages(
|
||||||
self,
|
self,
|
||||||
|
|
Loading…
Reference in New Issue