mglib/mglib/storage.py

467 lines
14 KiB
Python
Raw Normal View History

import logging
import os
import shutil
2020-05-16 17:53:15 +02:00
from os import listdir
from os.path import isdir, join
2020-12-01 07:44:57 +01:00
from mglib import stapler
from mglib.path import DocumentPath, PagePath
from mglib.step import Steps
from mglib.utils import get_assigns_after_delete, safe_to_delete
2020-05-04 16:05:23 +02:00
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
2020-05-04 12:35:08 +02:00
class Storage:
    """
    Default Storage class which works with DocumentPath and PagePath
    on local host filesystem.

    Paths handed to the methods below (plain strings, DocumentPath or
    PagePath instances) are resolved against ``location`` with
    ``abspath`` before the filesystem is touched.
    """

    def __init__(self, location=None, **kwargs):
        # By default, this will be something like settings.MEDIA_ROOT.
        # Extra keyword arguments are accepted but not used here.
        self._location = location

    @property
    def location(self):
        """Root folder, as given to the constructor, paths are joined to."""
        return self._location

    def upload(self, doc_path_url, **kwargs):
        """Do nothing; this class works on the local filesystem only."""

    def download(self, doc_path_url, **kwargs):
        """Do nothing; this class works on the local filesystem only."""

    def _s3copy(self, src, dst):
        """Do nothing; this class works on the local filesystem only."""

    @staticmethod
    def _count_subdirs(abs_dirname):
        """
        Returns the number of immediate subfolders of abs_dirname.

        Raises FileNotFoundError if abs_dirname does not exist.
        """
        return sum(
            1 for name in listdir(abs_dirname)
            if isdir(join(abs_dirname, name))
        )

    def make_sure_path_exists(self, filepath):
        """
        Creates (with all missing parents) the folder that will
        contain filepath. Does nothing if the folder already exists.
        """
        logger.debug(f"make_sure_path_exists {filepath}")
        dirname = os.path.dirname(filepath)
        # A bare file name has an empty dirname; os.makedirs('') would
        # raise, and there is nothing to create in that case anyway.
        if dirname:
            os.makedirs(dirname, exist_ok=True)

    def get_versions(self, doc_path):
        """
        Returns a list of (all) ordered versions
        of specific doc_path. Versions
        start with 0. Examples of return values:

        - [0, 1, 2, 3] = 4 versions of the document
        - [ 0 ] = only one version (original)

        To count versions it just counts number of subfolders
        in specific document folder. Versions are
        stored in subfolders named v1, v2, v3, ...
        """
        abs_dirname_docs = self.path(doc_path.dirname_docs)

        try:
            dirs_count = self._count_subdirs(abs_dirname_docs)
        except FileNotFoundError:
            # in tests, document folders are not always created.
            # If no document folder is found, just return [ 0 ]
            # i.e that document has only one single version and it
            # is the latest one.
            return [0]

        # Version 0 has no subfolder of its own, hence the "+ 1".
        return list(range(dirs_count + 1))

    def get_pagecount(self, doc_path):
        """
        Returns total number of pages for this doc_path.

        Total number of pages = number of subfolders found in the
        pages_dirname folder of the document's "results" counterpart.

        Raises FileNotFoundError if that folder does not exist.
        """
        doc_path_pointing_to_results = DocumentPath.copy_from(
            doc_path, aux_dir="results"
        )
        pages_dir = self.abspath(
            doc_path_pointing_to_results.pages_dirname()
        )
        return self._count_subdirs(pages_dir)

    def abspath(self, _path):
        """
        Returns _path joined to the storage location.

        DocumentPath and PagePath instances are converted with their
        url() method first; anything else is joined as is.
        """
        if isinstance(_path, (DocumentPath, PagePath)):
            _path = _path.url()

        return os.path.join(self.location, _path)

    def path(self, _path):
        """Alias of abspath."""
        return self.abspath(_path)

    def delete_doc(self, doc_path):
        """
        Receives a mglib.path.DocumentPath instance and removes
        both of its folders from the filesystem.
        """
        folders = (
            # where original documents and their versions are stored
            self.path(doc_path.dirname_docs),
            # where OCRed information and generated thumbnails
            # are stored
            self.path(doc_path.dirname_results),
        )

        for abs_dirname in folders:
            # Before recursively deleting everything in folder,
            # safe_to_delete is expected to double check that there
            # are only .pdf, .txt, .hocr, .jpg files.
            if safe_to_delete(abs_dirname):
                shutil.rmtree(abs_dirname)
                # Defensive: remove the (now empty) folder should
                # rmtree have left it behind.
                if os.path.exists(abs_dirname):
                    os.rmdir(abs_dirname)

    def copy_doc(self, src: DocumentPath, dst: DocumentPath):
        """
        Copies the document file pointed by src to dst.
        Both are resolved with abspath; the destination folder
        is created if missing.
        """
        abs_dst = self.abspath(dst)
        self.make_sure_path_exists(abs_dst)

        logger.debug(f"copy_doc: {src} to {dst}")
        shutil.copyfile(self.abspath(src), abs_dst)

    def exists(self, _path):
        """Returns True if _path, resolved with path(), exists on disk."""
        return os.path.exists(self.path(_path))

    def _copy_page_file(self, kind, src_url, dst_url):
        """
        Copies one page file from src_url to dst_url (both relative
        to location), creating the destination folder if missing.
        kind ('txt', 'img' or 'hocr') is used for logging only.
        """
        abs_src = self.abspath(src_url)
        abs_dst = self.abspath(dst_url)

        self.make_sure_path_exists(abs_dst)
        logger.debug(f"copy src_{kind}={abs_src} dst_{kind}={abs_dst}")
        shutil.copy(abs_src, abs_dst)

    def copy_page_txt(self, src_page_path, dst_page_path):
        """Copies the file behind txt_url() from source to destination."""
        self._copy_page_file(
            "txt", src_page_path.txt_url(), dst_page_path.txt_url()
        )

    def copy_page_img(self, src_page_path, dst_page_path):
        """Copies the file behind img_url() from source to destination."""
        self._copy_page_file(
            "img", src_page_path.img_url(), dst_page_path.img_url()
        )

    def copy_page_hocr(self, src_page_path, dst_page_path):
        """Copies the file behind hocr_url() from source to destination."""
        self._copy_page_file(
            "hocr", src_page_path.hocr_url(), dst_page_path.hocr_url()
        )

    def copy_page(self, src_page_path, dst_page_path):
        """
        Copies page data from source to destination.

        Page data are files with following extensions:
            * txt
            * hocr
            * jpeg
        they are located in media root of respective application.
        A file missing on the source side is skipped (and logged at
        debug level) instead of aborting the copy.

        Raises ValueError if either argument is not a PagePath.
        """
        for inst in (src_page_path, dst_page_path):
            if not isinstance(inst, PagePath):
                raise ValueError("copy_page accepts only PagePath instances")

        # (label used in the log, source url, method doing the copy)
        page_files = (
            ("txt", src_page_path.txt_url(), self.copy_page_txt),
            ("hocr", src_page_path.hocr_url(), self.copy_page_hocr),
            ("img", src_page_path.img_url(), self.copy_page_img),
        )

        for kind, src_url, copy_file in page_files:
            # Every kind - the image included - is copied only when
            # the source file is actually present on disk.
            if self.exists(src_url):
                copy_file(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )
            else:
                logger.debug(f"{kind} does not exits {src_url}")

    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.
        Returns None if new_order lists more pages than the
        document has.

        new_order is a list of following format:

            [
                {'page_num': 2, page_order: 1},
                {'page_num': 1, page_order: 2},
                {'page_num': 3, page_order: 3},
                {'page_num': 4, page_order: 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            <page_num> now should be <page_order>
        """
        next_version = doc_path.version + 1
        src_doc_path = doc_path
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=next_version
        )
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.reorder_pages(
            src=self.abspath(src_doc_path),
            dst=self.abspath(dst_doc_path),
            new_order=new_order
        )

        # NOTE: this check runs after the new version of the document
        # file was already written by stapler (order kept as it was).
        page_count = self.get_pagecount(doc_path)
        if len(new_order) > page_count:
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return None

        # Migrate per-page data (txt, hocr, img) to the new version.
        reordered_count = len(new_order)
        for item in new_order:
            old_page_num = int(item['page_num'])
            new_page_num = int(item['page_order'])
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=old_page_num,
                    step=step,
                    page_count=reordered_count
                )
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=new_page_num,
                    step=step,
                    page_count=reordered_count
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return next_version

    def delete_pages(
        self,
        doc_path,
        page_numbers,
        skip_migration=False
    ):
        """
        Deletes pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        page_numbers is the list of page numbers to delete.
        With skip_migration=True only the document file is rewritten;
        per-page data is not copied to the new version.

        In case of success returns document's new version.
        Returns False if page_numbers is not a list and None if it
        lists more pages than the document has.
        """
        if not isinstance(page_numbers, list):
            logger.error("Expecting list argument")
            return False

        next_version = doc_path.version + 1
        src_doc_path = doc_path
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=next_version
        )
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.delete_pages(
            self.abspath(src_doc_path),
            self.abspath(dst_doc_path),
            page_numbers
        )

        if skip_migration:
            return next_version

        page_count = self.get_pagecount(doc_path)
        if len(page_numbers) > page_count:
            logger.error(
                f"deleted_pages({page_numbers}) > page_count({page_count})"
            )
            return None

        assigns = get_assigns_after_delete(
            total_pages=page_count,
            deleted_pages=page_numbers
        )
        remaining_count = page_count - len(page_numbers)

        # For each entry, index 1 is used as the page number in the
        # source version and index 0 as the one in the new version.
        for assign in assigns:
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=assign[1],
                    step=step,
                    page_count=page_count
                )
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=assign[0],
                    step=step,
                    page_count=remaining_count
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return next_version

    def paste_pages(
        self,
        dest_doc_path,
        data_list,
        dest_doc_is_new=False,
        after_page_number=False,
        before_page_number=False
    ):
        """
        Pastes pages in the document pointed by dest_doc_path
        (an instance of mglib.path.DocumentPath).

        data_list is a list of dictionaries, one per source document:

            {'doc_path': <DocumentPath>, 'page_nums': [1, 2, ...]}

        data_list itself is left unmodified.

        Returns the version of the resulting document: 0 if
        dest_doc_is_new, otherwise dest_doc_path.version + 1.
        """
        if dest_doc_is_new:
            # document is new, start version with 0
            next_version = 0
        else:
            # destination document is not new, increment its version
            next_version = dest_doc_path.version + 1

        next_ver_dp = DocumentPath.copy_from(
            dest_doc_path,
            version=next_version
        )
        self.make_sure_path_exists(self.abspath(next_ver_dp))

        stapler.paste_pages(
            src=self.abspath(dest_doc_path),
            dst=self.abspath(next_ver_dp),
            data_list=data_list,
            dst_doc_is_new=dest_doc_is_new,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

        # Work on a copy: the caller's list must not grow an extra
        # entry as a side effect of this call.
        migration_list = list(data_list)

        if not dest_doc_is_new:
            # migrate document's own pages from previous
            # version (this differs from pasting into newly
            # created docs)
            pcount = self.get_pagecount(dest_doc_path)
            migration_list.insert(
                0,
                {
                    'doc_path': dest_doc_path,
                    'page_nums': list(range(1, pcount + 1))
                }
            )

        # NOTE(review): page data is laid out as the destination's own
        # pages followed by the pasted ones, in data_list order;
        # after_page_number/before_page_number are only forwarded to
        # stapler above - confirm that this is intended.
        dest_page_num = 1
        dest_page_count = sum(
            len(item['page_nums']) for item in migration_list
        )

        for item in migration_list:
            src_path = item['doc_path']
            page_nums = item['page_nums']
            if not page_nums:
                # nothing to copy, so do not touch the source folder
                continue

            # One directory listing per source document rather than
            # one per page and step.
            src_page_count = self.get_pagecount(src_path)

            for page_num in page_nums:
                for step in Steps():
                    src_page_path = PagePath(
                        document_path=src_path,
                        page_num=int(page_num),
                        step=step,
                        page_count=src_page_count
                    )
                    dst_page_path = PagePath(
                        document_path=next_ver_dp,
                        page_num=dest_page_num,
                        step=step,
                        page_count=dest_page_count
                    )
                    logger.debug(f"src={src_page_path} dst={dst_page_path}")
                    self.copy_page(
                        src_page_path=src_page_path,
                        dst_page_path=dst_page_path
                    )
                dest_page_num += 1

        return next_version
2020-05-17 09:32:47 +02:00
class FileSystemStorage(Storage):
    """
    Storage under another name: adds nothing and overrides nothing,
    every method is inherited unchanged.
    """