# Source: mglib/mglib/storage.py (Python; listed as 268 lines, 7.5 KiB)

import os
from os import listdir
from os.path import isdir, join
import logging
import shutil
from mglib.step import Steps
from mglib.utils import safe_to_delete
from mglib import pdftk
from mglib.path import PagePath, DocumentPath
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class Storage:
    """
    Default Storage class which works with DocumentPath and PagePath
    on local host filesystem.

    Relative paths passed to ``abspath``/``path``/``exists``/``copy_doc``
    are resolved against ``location``.
    """

    def __init__(self, location=None):
        # by default, this will be something like
        # settings.MEDIA_ROOT
        self._location = location

    def d(self):
        """
        doc_path proxy object.

        Placeholder: not implemented, returns None.
        """
        pass

    def p(self):
        """
        page_path proxy object.

        Placeholder: not implemented, returns None.
        """
        pass

    @property
    def location(self):
        """Root location given to the constructor (None if not set)."""
        return self._location

    def make_sure_path_exists(self, filepath):
        """
        Creates the parent directory of ``filepath`` (including any
        missing ancestors). Existing directories are left untouched.
        """
        logger.debug(f"make_sure_path_exists {filepath}")
        dirname = os.path.dirname(filepath)
        # A bare filename has no directory part; os.makedirs("") would
        # raise FileNotFoundError, and there is nothing to create anyway.
        if dirname:
            os.makedirs(dirname, exist_ok=True)

    def get_pagecount(self, doc_path):
        """
        Returns total number of pages for this doc_path.

        Total number of pages = number of sub-directories found in the
        pages_dirname folder of the "results" copy of doc_path.
        """
        doc_path_pointing_to_results = DocumentPath.copy_from(
            doc_path, aux_dir="results"
        )
        pages_dir = doc_path_pointing_to_results.pages_dirname
        # NOTE(review): pages_dir is handed to listdir as-is, i.e. it is
        # not resolved through self.abspath - confirm that pages_dirname
        # is already usable from the current working directory.
        only_dirs = [
            entry for entry in listdir(pages_dir)
            if isdir(join(pages_dir, entry))
        ]
        return len(only_dirs)

    def abspath(self, _path):
        """Returns ``_path`` joined onto ``location``."""
        return os.path.join(self.location, _path)

    def path(self, _path):
        """Alias of ``abspath``."""
        return self.abspath(_path)

    def delete_doc(self, doc_path):
        """
        Receives a mglib.path.DocumentPath instance and removes its
        documents folder and its results folder from the filesystem.

        A folder is removed only if ``safe_to_delete`` approves it.
        """
        # where original documents and their versions are stored
        abs_dirname_docs = self.path(doc_path.dirname_docs)
        # where OCRed information and generated thumbnails
        # are stored
        abs_dirname_results = self.path(doc_path.dirname_results)

        # Before recursively deleting everything in folder
        # double check (via safe_to_delete) that there are only
        # .pdf, .txt, .hocr, .jpg files.
        for abs_dirname in (abs_dirname_docs, abs_dirname_results):
            if safe_to_delete(abs_dirname):
                shutil.rmtree(abs_dirname)
                # rmtree normally removes the top directory as well;
                # this is kept as a defensive fallback.
                if os.path.exists(abs_dirname):
                    os.rmdir(abs_dirname)

    def copy_doc(self, src, dst):
        """
        copy given file src file path to destination
        as absolute doc_path

        ``dst`` is resolved against ``location``; its parent directory
        is created when missing.
        """
        abs_dst = self.abspath(dst)
        # exist_ok=True already covers the "directory is there" case, so
        # no separate (race-prone) os.path.exists check is needed.
        os.makedirs(os.path.dirname(abs_dst), exist_ok=True)
        logger.debug(
            f"copy_doc: {src} to {dst}"
        )
        shutil.copyfile(src, abs_dst)

    def exists(self, _path):
        """Returns True if ``_path`` (relative to location) exists."""
        return os.path.exists(self.path(_path))

    def delete_pages(self, doc_path, page_numers):
        """
        Deletes pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.

        Placeholder: not implemented here, returns None.
        """
        pass

    def copy_page(self, src_page_path, dst_page_path):
        """
        Copies the .txt, .hocr and image files of one page from
        src_page_path to dst_page_path. Files missing on the source
        side are skipped (a debug message is logged).

        Both arguments must be mglib.path.PagePath instances,
        otherwise ValueError is raised.
        """
        err_msg = "copy_page accepts only PagePath instances"
        for inst in (src_page_path, dst_page_path):
            if not isinstance(inst, PagePath):
                raise ValueError(err_msg)

        # same order as before: text, hocr, image
        for kind in ("txt", "hocr", "img"):
            self._copy_page_artifact(kind, src_page_path, dst_page_path)

    def _copy_page_artifact(self, kind, src_page_path, dst_page_path):
        """
        Copies a single per-page file. ``kind`` is one of "txt", "hocr"
        or "img" and selects the ``<kind>_exists()`` / ``<kind>_url()``
        accessors of the page paths.
        """
        src_exists = getattr(src_page_path, f"{kind}_exists")()
        if not src_exists:
            logger.debug(f"{kind} does not exits {src_exists}")
            return

        # NOTE(review): the *_url() values are used directly as
        # filesystem paths (not resolved via self.abspath) - confirm
        # this is what PagePath intends.
        src = getattr(src_page_path, f"{kind}_url")()
        dst = getattr(dst_page_path, f"{kind}_url")()
        self.make_sure_path_exists(dst)
        logger.debug(f"copy src_{kind}={src} dst_{kind}={dst}")
        shutil.copy(src, dst)

    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.
        Returns None (after logging an error) when new_order lists
        more pages than the document has.

        new_order is a list of following format:

            [
                {'page_num': 2, 'page_order': 1},
                {'page_num': 1, 'page_order': 2},
                {'page_num': 3, 'page_order': 3},
                {'page_num': 4, 'page_order': 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            <page_num> now should be <page_order>
        """
        # Validate before touching anything, so that a rejected request
        # does not leave a freshly generated document version behind.
        page_count = self.get_pagecount(doc_path)
        if len(new_order) > page_count:
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return None

        new_version = pdftk.reorder_pages(doc_path, new_order)

        src_doc_path = doc_path
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=new_version
        )
        total_pages = len(new_order)

        for item in new_order:
            src_page_num = int(item['page_num'])
            dst_page_num = int(item['page_order'])
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=src_page_num,
                    step=step,
                    page_count=total_pages
                )
                # Same keyword as the source path above; the previous
                # ``document_ep=`` was a leftover of an older name.
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=dst_page_num,
                    step=step,
                    page_count=total_pages
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return new_version

    def paste_pages(
        self,
        dest_doc_path,
        src_doc_path,
        dest_doc_is_new=False,
        after_page_number=False,
        before_page_number=False
    ):
        """
        Pastes pages in the document pointed by dest_doc_path
        from src_doc_path. Both dest and src are instances of
        mglib.path.DocumentPath

        Placeholder: not implemented here, returns None.
        """
        pass
class FileSystemStorage(Storage):
    """
    Storage variant for the local filesystem.

    Adds nothing on top of Storage; all behaviour is inherited.
    """