import logging
import os
import shutil
from os import listdir
from os.path import isdir, join

from mglib import stapler
from mglib.path import DocumentPath, PagePath
from mglib.step import Steps
from mglib.utils import get_assigns_after_delete, safe_to_delete

logger = logging.getLogger(__name__)


class Storage:
    """
    Default Storage class which works with DocumentPath and PagePath
    on local host filesystem
    """

    def __init__(self, location=None, **kwargs):
        """
        location is the root folder all relative paths are joined to.
        Extra keyword arguments are accepted and ignored.
        """
        # by default, this will be something like
        # settings.MEDIA_ROOT
        self._location = location

    @property
    def location(self):
        """
        Root folder given to the constructor.
        """
        return self._location

    def upload(self, doc_path_url, **kwargs):
        """
        Does nothing in this class.
        """
        pass

    def download(self, doc_path_url, **kwargs):
        """
        Does nothing in this class.
        """
        pass

    def _s3copy(self, src, dst):
        """
        Does nothing in this class.
        """
        pass

    def make_sure_path_exists(self, filepath):
        """
        Creates (if missing) the parent folder of filepath.

        Only the directory part of filepath is created; filepath itself
        is not touched.
        """
        logger.debug(f"make_sure_path_exists {filepath}")
        dirname = os.path.dirname(filepath)
        os.makedirs(dirname, exist_ok=True)

    def get_versions(self, doc_path):
        """
        Returns a list of (all) ordered versions
        of specific doc_path. Versions
        start with 0. Examples of return values:

        - [0, 1, 2, 3] = 4 versions of the document
        - [ 0 ] = only one version (original)

        To count versions it just counts number of subfolders
        in specific document folder. Versions are
        stored in subfolders named v1, v2, v3, ...
        """
        abs_dirname_docs = self.path(doc_path.dirname_docs)
        try:
            only_dirs = [
                fi for fi in listdir(abs_dirname_docs)
                if isdir(join(abs_dirname_docs, fi))
            ]
        except FileNotFoundError:
            # in tests, document folders are not always created.
            # If no document folder is found, just return [ 0 ]
            # i.e that document has only one single version and it
            # is the latest one.
            return [0]

        # version 0 has no subfolder of its own, hence the "+ 1"
        return list(range(0, len(only_dirs) + 1))

    def get_pagecount(self, doc_path):
        """
        Returns total number of pages for this doc_path.

        Total number of pages = number of subfolders found in the
        pages_dirname folder of the "results" copy of doc_path.

        Raises whatever os.listdir raises (e.g. FileNotFoundError)
        if that folder does not exist.
        """
        doc_path_pointing_to_results = DocumentPath.copy_from(
            doc_path, aux_dir="results"
        )
        pages_dir = self.abspath(
            doc_path_pointing_to_results.pages_dirname()
        )

        only_dirs = [
            fi for fi in listdir(pages_dir)
            if isdir(join(pages_dir, fi))
        ]
        return len(only_dirs)

    def abspath(self, _path):
        """
        Returns _path joined to the storage location.

        DocumentPath and PagePath instances are converted with
        their url() method; anything else is joined as is.
        """
        if isinstance(_path, (DocumentPath, PagePath)):
            return os.path.join(self.location, _path.url())

        return os.path.join(self.location, _path)

    def path(self, _path):
        """
        Same as abspath.
        """
        return self.abspath(_path)

    def delete_doc(self, doc_path):
        """
        Receives a mglib.path.DocumentPath instance and removes
        both its documents folder and its results folder.
        """
        # where original documents and their versions are stored
        abs_dirname_docs = self.path(doc_path.dirname_docs)
        # where OCRed information and generated thumbnails
        # are stored
        abs_dirname_results = self.path(doc_path.dirname_results)

        # Before recursively deleting everything in folder
        # double check that there are only
        # .pdf, .txt, .hocr, .jpg files.
        if safe_to_delete(abs_dirname_docs):
            shutil.rmtree(abs_dirname_docs)
            if os.path.exists(abs_dirname_docs):
                os.rmdir(abs_dirname_docs)

        if safe_to_delete(abs_dirname_results):
            shutil.rmtree(abs_dirname_results)
            if os.path.exists(abs_dirname_results):
                os.rmdir(abs_dirname_results)

    def copy_doc(self, src: DocumentPath, dst: DocumentPath):
        """
        copy given file src file path to destination
        as absolute doc_path

        Destination folder is created if it does not exist yet.
        """
        abs_src = self.abspath(src)
        abs_dst = self.abspath(dst)

        # exist_ok=True makes a separate existence check unnecessary
        os.makedirs(os.path.dirname(abs_dst), exist_ok=True)

        logger.debug(f"copy_doc: {src} to {dst}")
        shutil.copyfile(abs_src, abs_dst)

    def exists(self, _path):
        """
        Returns True if _path (resolved against storage location)
        exists on the filesystem.
        """
        return os.path.exists(self.path(_path))

    def copy_page_txt(self, src_page_path, dst_page_path):
        """
        Copies the file at txt_url() of src_page_path to the
        txt_url() of dst_page_path, creating destination folder
        if needed.
        """
        src_txt = self.abspath(src_page_path.txt_url())
        dst_txt = self.abspath(dst_page_path.txt_url())

        self.make_sure_path_exists(dst_txt)
        logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
        shutil.copy(src_txt, dst_txt)

    def copy_page_img(self, src_page_path, dst_page_path):
        """
        Copies the file at img_url() of src_page_path to the
        img_url() of dst_page_path, creating destination folder
        if needed.
        """
        src_img = self.abspath(src_page_path.img_url())
        dst_img = self.abspath(dst_page_path.img_url())

        self.make_sure_path_exists(dst_img)
        logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
        shutil.copy(src_img, dst_img)

    def copy_page_hocr(self, src_page_path, dst_page_path):
        """
        Copies the file at hocr_url() of src_page_path to the
        hocr_url() of dst_page_path, creating destination folder
        if needed.
        """
        src_hocr = self.abspath(src_page_path.hocr_url())
        dst_hocr = self.abspath(dst_page_path.hocr_url())

        self.make_sure_path_exists(dst_hocr)
        logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
        shutil.copy(src_hocr, dst_hocr)

    def copy_page(self, src_page_path, dst_page_path):
        """
        Copies page data from source to destination.

        Page data are files with following extentions:
            * txt
            * hocr
            * jpeg
        they are located in media root of respective application.

        Each of the three files is copied only if it exists at
        the source; a missing file is logged and skipped.

        Raises ValueError if either argument is not a PagePath.
        """
        for inst in [src_page_path, dst_page_path]:
            if not isinstance(inst, PagePath):
                raise ValueError("copy_page accepts only PagePath instances")

        # copy .txt file
        if self.exists(src_page_path.txt_url()):
            self.copy_page_txt(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )
        else:
            logger.debug(
                f"txt does not exits {src_page_path.txt_url()}"
            )

        # hocr
        if self.exists(src_page_path.hocr_url()):
            self.copy_page_hocr(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )
        else:
            logger.debug(
                f"hocr does not exits {src_page_path.hocr_url()}"
            )

        # image: besides having an url, the file must actually exist,
        # same as for txt and hocr above; otherwise shutil.copy would
        # fail on a missing source.
        img_url = src_page_path.img_url()
        if img_url and self.exists(img_url):
            self.copy_page_img(
                src_page_path=src_page_path,
                dst_page_path=dst_page_path
            )
        else:
            logger.debug(
                f"img does not exits {src_page_path.img_url()}"
            )

    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.
        Returns None (after logging an error) if new_order has more
        items than the document has pages.

        new_order is a list of following format:

            [
                {'page_num': 2, 'page_order': 1},
                {'page_num': 1, 'page_order': 2},
                {'page_num': 3, 'page_order': 3},
                {'page_num': 4, 'page_order': 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            page 'page_num' now should be at position 'page_order'
        """
        src_doc_path = doc_path
        new_version = doc_path.version + 1
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=new_version
        )
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.reorder_pages(
            src=self.abspath(src_doc_path),
            dst=self.abspath(dst_doc_path),
            new_order=new_order
        )

        page_count = self.get_pagecount(doc_path)

        if len(new_order) > page_count:
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return

        # migrate per-page data (txt, hocr, img) for every step
        # from old page position to the new one
        for item in new_order:
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=int(item['page_num']),
                    step=step,
                    page_count=len(new_order)
                )
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=int(item['page_order']),
                    step=step,
                    page_count=len(new_order)
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return new_version

    def delete_pages(
        self,
        doc_path,
        page_numbers,
        skip_migration=False
    ):
        """
        Delets pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        page_numbers is a list of page numbers to delete.
        If skip_migration is True, only the document file itself is
        processed; per-page data is not copied to the new version.

        In case of success returns document's new version.
        Returns False if page_numbers is not a list and None if
        page_numbers has more items than the document has pages
        (an error is logged in both cases).
        """
        if not isinstance(page_numbers, list):
            logger.error("Expecting list argument")
            return False

        src_doc_path = doc_path
        new_version = doc_path.version + 1
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=new_version
        )
        self.make_sure_path_exists(self.abspath(dst_doc_path))

        stapler.delete_pages(
            self.abspath(src_doc_path),
            self.abspath(dst_doc_path),
            page_numbers
        )

        if skip_migration:
            return new_version

        page_count = self.get_pagecount(doc_path)

        if len(page_numbers) > page_count:
            logger.error(
                f"deleted_pages({page_numbers}) > page_count({page_count})"
            )
            return

        assigns = get_assigns_after_delete(
            total_pages=page_count,
            deleted_pages=page_numbers
        )
        # each assignment is used as:
        #   a[0] = page number in the new version
        #   a[1] = page number in the old version
        for a in assigns:
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=a[1],
                    step=step,
                    page_count=page_count
                )
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=a[0],
                    step=step,
                    page_count=page_count - len(page_numbers)
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return new_version

    def paste_pages(
        self,
        dest_doc_path,
        data_list,
        dest_doc_is_new=False,
        after_page_number=False,
        before_page_number=False
    ):
        """
        Pastes pages in the document pointed by dest_doc_path
        from src_doc_path. Both dest and src are instances of
        mglib.path.DocumentPath

        data_list is a list of dictionaries, each with keys:
            'doc_path'  = source document path
            'page_nums' = page numbers to take from that source

        The list passed by the caller is not modified.

        Returns the version of the destination document which
        was written.
        """
        if dest_doc_is_new:
            # document is new, start version with 0
            next_version = 0
        else:
            # destination document is not new, increment its version
            next_version = dest_doc_path.version + 1

        next_ver_dp = DocumentPath.copy_from(
            dest_doc_path,
            version=next_version
        )
        self.make_sure_path_exists(self.abspath(next_ver_dp))

        stapler.paste_pages(
            src=self.abspath(dest_doc_path),
            dst=self.abspath(next_ver_dp),
            data_list=data_list,
            dst_doc_is_new=dest_doc_is_new,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

        if not dest_doc_is_new:
            # migrate document's own pages from previous
            # version (this differs from pasting into newly
            # created docs)
            pcount = self.get_pagecount(dest_doc_path)
            own_pages = {
                'doc_path': dest_doc_path,
                'page_nums': list(range(1, pcount + 1))
            }
            # build a new list instead of inserting into the
            # argument, so that caller's data_list stays untouched
            data_list = [own_pages, *data_list]

        dest_page_num = 1
        dest_page_count = sum(
            len(item['page_nums']) for item in data_list
        )
        for item in data_list:
            src_path = item['doc_path']
            for page_num in item['page_nums']:
                for step in Steps():
                    src_page_path = PagePath(
                        document_path=src_path,
                        page_num=int(page_num),
                        step=step,
                        page_count=self.get_pagecount(src_path)
                    )
                    dst_page_path = PagePath(
                        document_path=next_ver_dp,
                        page_num=dest_page_num,
                        step=step,
                        page_count=dest_page_count
                    )
                    logger.debug(f"src={src_page_path} dst={dst_page_path}")
                    self.copy_page(
                        src_page_path=src_page_path,
                        dst_page_path=dst_page_path
                    )
                # destination numbering advances once per pasted page,
                # not once per step
                dest_page_num += 1

        return next_version


class FileSystemStorage(Storage):
    """
    Subclass of Storage which adds no behaviour of its own.
    """
    pass