import os
from os import listdir
from os.path import isdir, join
import logging
import shutil

from mglib.step import Steps
from mglib.utils import safe_to_delete
from mglib import pdftk
from mglib.path import PagePath, DocumentPath

logger = logging.getLogger(__name__)


class Storage:
    """
    Default Storage class which works with DocumentPath and PagePath
    on local host filesystem.

    Relative paths passed to ``abspath``/``path``/``exists``/``copy_doc``
    are resolved against ``location``.
    """

    def __init__(self, location=None):
        # by default, this will be something like
        # settings.MEDIA_ROOT
        self._location = location

    def d(self):
        """
        doc_path proxy object (stub: currently returns None)
        """
        pass

    def p(self):
        """
        page_path proxy object (stub: currently returns None)
        """
        pass

    @property
    def location(self):
        """Root directory this storage resolves relative paths against."""
        return self._location

    def make_sure_path_exists(self, filepath):
        """
        Creates the parent directory of ``filepath`` (including any
        missing intermediate directories). Does nothing if the
        directory already exists.
        """
        logger.debug(f"make_sure_path_exists {filepath}")
        dirname = os.path.dirname(filepath)
        os.makedirs(dirname, exist_ok=True)

    def get_pagecount(self, doc_path):
        """
        Returns total number of pages for this doc_path.

        Total number of pages = number of sub-directories found in the
        pages_dirname folder of the "results" copy of doc_path.
        """
        results_doc_path = DocumentPath.copy_from(
            doc_path,
            aux_dir="results"
        )
        # NOTE(review): pages_dirname is used as-is (it is not resolved
        # through self.abspath) - confirm this is intended.
        pages_dir = results_doc_path.pages_dirname

        page_dirs = [
            entry for entry in listdir(pages_dir)
            if isdir(join(pages_dir, entry))
        ]

        return len(page_dirs)

    def abspath(self, _path):
        """Joins ``_path`` onto the storage ``location``."""
        return os.path.join(self.location, _path)

    def path(self, _path):
        """Alias of ``abspath``."""
        return self.abspath(_path)

    def delete_doc(self, doc_path):
        """
        Receives a mglib.path.DocumentPath instance and removes both
        of its directories from the storage.
        """
        # where original documents and their versions are stored
        abs_dirname_docs = self.path(doc_path.dirname_docs)
        # where OCRed information and generated thumbnails
        # are stored
        abs_dirname_results = self.path(doc_path.dirname_results)

        # Before recursively deleting everything in folder
        # double check (via safe_to_delete) that there are only
        # .pdf, .txt, .hocr, .jpg files.
        for abs_dirname in (abs_dirname_docs, abs_dirname_results):
            if safe_to_delete(abs_dirname):
                shutil.rmtree(abs_dirname)
                # rmtree normally removes the directory itself as well;
                # this is a fallback in case it is still present.
                if os.path.exists(abs_dirname):
                    os.rmdir(abs_dirname)

    def copy_doc(self, src, dst):
        """
        Copies file ``src`` (used as given) to ``dst``, where ``dst``
        is resolved against the storage location. Missing destination
        directories are created.
        """
        abs_dst = self.abspath(dst)
        # exist_ok=True makes a separate "does it exist" check redundant
        os.makedirs(os.path.dirname(abs_dst), exist_ok=True)

        logger.debug(f"copy_doc: {src} to {dst}")
        shutil.copyfile(src, abs_dst)

    def exists(self, _path):
        """Returns True if ``_path`` exists inside the storage location."""
        return os.path.exists(self.path(_path))

    def delete_pages(self, doc_path, page_numers):
        """
        Deletes pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.

        Stub: not implemented in this class (returns None).
        """
        pass

    def copy_page(self, src_page_path, dst_page_path):
        """
        Copies the .txt, .hocr and image files of one page from
        src_page_path to dst_page_path. Files missing on the source
        side are skipped (a debug message is logged).

        Both arguments must be mglib.path.PagePath instances,
        otherwise ValueError is raised.
        """
        err_msg = "copy_page accepts only PagePath instances"

        for inst in (src_page_path, dst_page_path):
            if not isinstance(inst, PagePath):
                raise ValueError(err_msg)

        # (kind, source exists?, source url, destination url)
        artifacts = (
            (
                "txt",
                src_page_path.txt_exists,
                src_page_path.txt_url,
                dst_page_path.txt_url,
            ),
            (
                "hocr",
                src_page_path.hocr_exists,
                src_page_path.hocr_url,
                dst_page_path.hocr_url,
            ),
            (
                "img",
                src_page_path.img_exists,
                src_page_path.img_url,
                dst_page_path.img_url,
            ),
        )

        for kind, src_exists, src_url, dst_url in artifacts:
            found = src_exists()
            if not found:
                logger.debug(f"{kind} does not exits {found}")
                continue

            dst_file = dst_url()
            self.make_sure_path_exists(dst_file)
            src_file = src_url()
            logger.debug(
                f"copy src_{kind}={src_file} dst_{kind}={dst_file}"
            )
            shutil.copy(src_file, dst_file)

    def reorder_pages(self, doc_path, new_order):
        """
        Reorders pages in the document pointed by doc_path.
        doc_path is an instance of mglib.path.DocumentPath

        In case of success returns document's new version.
        Returns None (after logging an error) if new_order has more
        entries than the document has pages.

        new_order is a list of following format:

            [
                {'page_num': 2, page_order: 1},
                {'page_num': 1, page_order: 2},
                {'page_num': 3, page_order: 3},
                {'page_num': 4, page_order: 4},
            ]
        Example above means that in current document of 4 pages,
        first page was swapped with second one.
        page_num    = older page order
        page_order  = current page order
        So in human language, each hash is read:
            page number page_num should now be at position page_order
        """
        new_version = pdftk.reorder_pages(doc_path, new_order)
        page_count = self.get_pagecount(doc_path)
        new_page_count = len(new_order)

        src_doc_path = doc_path
        dst_doc_path = DocumentPath.copy_from(
            src_doc_path,
            version=new_version
        )

        if new_page_count > page_count:
            # NOTE(review): pdftk.reorder_pages has already been called
            # at this point - confirm that bailing out here without
            # copying per-page results is acceptable.
            logger.error(
                f"deleted_pages({new_order}) > page_count({page_count})"
            )
            return

        for item in new_order:
            for step in Steps():
                src_page_path = PagePath(
                    document_path=src_doc_path,
                    page_num=int(item['page_num']),
                    step=step,
                    page_count=new_page_count
                )
                # Same keyword as the source PagePath above; this call
                # previously passed ``document_ep``.
                dst_page_path = PagePath(
                    document_path=dst_doc_path,
                    page_num=int(item['page_order']),
                    step=step,
                    page_count=new_page_count
                )
                self.copy_page(
                    src_page_path=src_page_path,
                    dst_page_path=dst_page_path
                )

        return new_version

    def paste_pages(
        self,
        dest_doc_path,
        src_doc_path,
        dest_doc_is_new=False,
        after_page_number=False,
        before_page_number=False
    ):
        """
        Pastes pages in the document pointed by dest_doc_path
        from src_doc_path. Both dest and src are instances of
        mglib.path.DocumentPath

        Stub: not implemented in this class (returns None).
        """
        pass


class FileSystemStorage(Storage):
    """Storage on the local filesystem; adds nothing to Storage."""
    pass