From 00c083ac12aaa7f697d169f49d1366e2dbda81d8 Mon Sep 17 00:00:00 2001 From: Eugen Ciur Date: Sat, 16 May 2020 17:53:15 +0200 Subject: [PATCH] refactoring --- mglib/ocrmigrate.py | 84 ------------------------- mglib/path.py | 32 ++++++++-- mglib/pdftk.py | 26 +++----- mglib/storage.py | 148 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 179 insertions(+), 111 deletions(-) diff --git a/mglib/ocrmigrate.py b/mglib/ocrmigrate.py index dc5d246..3feaed3 100644 --- a/mglib/ocrmigrate.py +++ b/mglib/ocrmigrate.py @@ -19,20 +19,6 @@ OcrMigrate class takes care of this sort of txt/hocr files moves. logger = logging.getLogger(__name__) -def get_pagecount(doc_ep): - """ - Returns total number of pages for this endpoint. - Total number of pages = number of page_xy.txt files - in pages_dirname folder. - """ - doc_ep_pointing_to_results = DocumentPath.copy_from( - doc_ep, aux_dir="results" - ) - pages_dir = doc_ep_pointing_to_results.pages_dirname - only_dirs = [ - fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi)) - ] - return len(only_dirs) def get_assigns_after_delete(total_pages, deleted_pages): @@ -83,50 +69,6 @@ def get_assigns_after_delete(total_pages, deleted_pages): return list(zip(page_numbers, pages)) -def copy_page(src_page_ep, dst_page_ep): - err_msg = "copy_page accepts only PageEp instances" - - for inst in [src_page_ep, dst_page_ep]: - if not isinstance(inst, PagePath): - raise ValueError(err_msg) - - # copy .txt file - if src_page_ep.txt_exists(): - make_sure_path_exists(dst_page_ep.txt_url()) - - src_txt = src_page_ep.txt_url() - dst_txt = dst_page_ep.txt_url() - logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}") - shutil.copy(src_txt, dst_txt) - else: - logger.debug( - f"txt does not exits {src_page_ep.txt_exists()}" - ) - - # hocr - if src_page_ep.hocr_exists(): - make_sure_path_exists(dst_page_ep.hocr_url()) - - src_hocr = src_page_ep.hocr_url() - dst_hocr = dst_page_ep.hocr_url() - logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}") - shutil.copy(src_hocr, dst_hocr) - else: - logger.debug( - f"hocr does not exits {src_page_ep.hocr_exists()}" - ) - - if src_page_ep.img_exists(): - make_sure_path_exists(dst_page_ep.img_url()) - - src_img = src_page_ep.img_url() - dst_img = dst_page_ep.img_url() - logger.debug(f"copy src_img={src_img} dst_img={dst_img}") - shutil.copy(src_img, dst_img) - else: - logger.debug( - f"img does not exits {src_page_ep.img_exists()}" - ) def migrate_cutted_pages(dest_ep, src_doc_ep_list): @@ -240,29 +182,3 @@ class OcrMigrate: """ Similar to migrate_delete, with minor tweaks. """ - page_count = get_pagecount(self.src_ep) - - if len(new_order) > page_count: - logger.error( - f"deleted_pages({new_order}) > page_count({page_count})" - ) - return - - for item in new_order: - for step in Steps(): - src_page_ep = PageEp( - document_ep=self.src_ep, - page_num=int(item['page_num']), - step=step, - page_count=len(new_order) - ) - dst_page_ep = PageEp( - document_ep=self.dst_ep, - page_num=int(item['page_order']), - step=step, - page_count=len(new_order) - ) - copy_page( - src_page_ep=src_page_ep, - dst_page_ep=dst_page_ep - ) diff --git a/mglib/path.py b/mglib/path.py index cb9f076..271f979 100644 --- a/mglib/path.py +++ b/mglib/path.py @@ -85,13 +85,33 @@ class DocumentPath: def inc_version(self): self.version = self.version + 1 - def copy_from(doc_ep, aux_dir): + def copy_from(doc_path, **kw): + """ + Will create a copy of provided + DocumentPath (first parameter = doc_path) and replace + existing parameter of new copy with the one from kw. + + kw => key/value parameters. + Keys can be one of doc_path attributes: user_id, document_id, + file_name, aux_dir, version + """ + copy_values = { + 'user_id': doc_path.user_id, + 'document_id': doc_path.document_id, + 'file_name': doc_path.file_name, + 'version': doc_path.version, + 'aux_dir': doc_path.aux_dir + + } + for key, value in kw.items(): + copy_values[key] = kw[key] + return DocumentPath( - user_id=doc_ep.user_id, - document_id=doc_ep.document_id, - file_name=doc_ep.file_name, - version=doc_ep.version, - aux_dir=aux_dir + user_id=copy_values['user_id'], + document_id=copy_values['document_id'], + file_name=copy_values['file_name'], + version=copy_values['version'], + aux_dir=copy_values['aux_dir'] ) diff --git a/mglib/pdftk.py b/mglib/pdftk.py index bafa12d..e148805 100644 --- a/mglib/pdftk.py +++ b/mglib/pdftk.py @@ -1,4 +1,3 @@ -import os import logging from mglib.runcmd import run @@ -92,15 +91,6 @@ def cat_ranges_for_delete(page_count, page_numbers): return results -def make_sure_path_exists(filepath): - logger.debug(f"make_sure_path_exists {filepath}") - dirname = os.path.dirname(filepath) - os.makedirs( - dirname, - exist_ok=True - ) - - def split_ranges(total, after=False, before=False): """ Given a range 1, 2, ..., total (page numbers of a doc). @@ -307,7 +297,7 @@ def paste_pages( return dest_doc_ep.version -def reorder_pages(doc_ep, new_order): +def reorder_pages(doc_path, new_order): """ new_order is a list of following format: @@ -324,19 +314,19 @@ def reorder_pages(doc_ep, new_order): So in human language, each hash is read: now should be """ - ep_url = doc_ep.url() - page_count = get_pagecount(ep_url) + url = doc_path.url() + page_count = get_pagecount(url) cat_ranges = cat_ranges_for_reorder( page_count=page_count, new_order=new_order ) - doc_ep.inc_version() + doc_path.inc_version() cmd = [ "pdftk", - ep_url, + url, "cat" ] for page in cat_ranges: @@ -345,11 +335,11 @@ def reorder_pages(doc_ep, new_order): ) cmd.append("output") - make_sure_path_exists(doc_ep.url()) - cmd.append(doc_ep.url()) + make_sure_path_exists(doc_path.url()) + cmd.append(doc_path.url()) run(cmd) - return doc_ep.version + return doc_path.version def delete_pages(doc_ep, page_numbers): diff --git a/mglib/storage.py b/mglib/storage.py index e1972e5..3ecf25c 100644 --- a/mglib/storage.py +++ b/mglib/storage.py @@ -1,7 +1,12 @@ import os +from os import listdir +from os.path import isdir, join import logging import shutil +from mglib.step import Steps from mglib.utils import safe_to_delete +from mglib import pdftk +from mglib.path import PagePath, DocumentPath logger = logging.getLogger(__name__) @@ -17,10 +22,46 @@ class Storage: # settings.MEDIA_ROOT self._location = location + def d(self): + """ + doc_path proxy object + """ + pass + + def p(self): + """ + page_path proxy object + """ + pass + @property def location(self): return self._location + def make_sure_path_exists(self, filepath): + logger.debug(f"make_sure_path_exists {filepath}") + dirname = os.path.dirname(filepath) + os.makedirs( + dirname, + exist_ok=True + ) + + def get_pagecount(self, doc_path): + """ + Returns total number of pages for this doc_path. + Total number of pages = number of page_xy.txt files + in pages_dirname folder. + """ + doc_path_pointing_to_results = DocumentPath.copy_from( + doc_path, aux_dir="results" + ) + pages_dir = doc_path_pointing_to_results.pages_dirname + + only_dirs = [ + fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi)) + ] + return len(only_dirs) + def abspath(self, _path): return os.path.join( self.location, _path @@ -29,7 +70,7 @@ class Storage: def path(self, _path): return self.abspath(_path) - def delete_document(self, doc_path): + def delete_doc(self, doc_path): """ Receives a mglib.path.DocumentPath instance """ @@ -96,14 +137,115 @@ class Storage: """ pass - def reoder_pages(self, doc_path, new_order): + def copy_page(self, src_page_path, dst_page_path): + err_msg = "copy_page accepts only PageEp instances" + + for inst in [src_page_path, dst_page_path]: + if not isinstance(inst, PagePath): + raise ValueError(err_msg) + + # copy .txt file + if src_page_path.txt_exists(): + + self.make_sure_path_exists( + dst_page_path.txt_url() + ) + + src_txt = src_page_path.txt_url() + dst_txt = dst_page_path.txt_url() + logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}") + shutil.copy(src_txt, dst_txt) + else: + logger.debug( + f"txt does not exits {src_page_path.txt_exists()}" + ) + + # hocr + if src_page_path.hocr_exists(): + self.make_sure_path_exists( + dst_page_path.hocr_url() + ) + + src_hocr = src_page_path.hocr_url() + dst_hocr = dst_page_path.hocr_url() + logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}") + shutil.copy(src_hocr, dst_hocr) + else: + logger.debug( + f"hocr does not exits {src_page_path.hocr_exists()}" + ) + + if src_page_path.img_exists(): + self.make_sure_path_exists( + dst_page_path.img_url() + ) + + src_img = src_page_path.img_url() + dst_img = dst_page_path.img_url() + logger.debug(f"copy src_img={src_img} dst_img={dst_img}") + shutil.copy(src_img, dst_img) + else: + logger.debug( + f"img does not exits {src_page_path.img_exists()}" + ) + + def reorder_pages(self, doc_path, new_order): """ Reorders pages in the document pointed by doc_path. doc_path is an instance of mglib.path.DocumentPath In case of success returns document's new version. + + new_order is a list of following format: + + [ + {'page_num': 2, page_order: 1}, + {'page_num': 1, page_order: 2}, + {'page_num': 3, page_order: 3}, + {'page_num': 4, page_order: 4}, + ] + Example above means that in current document of 4 pages, + first page was swapped with second one. + page_num = older page order + page_order = current page order + So in human language, each hash is read: + now should be """ - pass + new_version = pdftk.reorder_pages(doc_path, new_order) + + page_count = self.get_pagecount(doc_path) + src_doc_path = doc_path + dst_doc_path = DocumentPath.copy_from( + src_doc_path, + version=new_version + ) + + if len(new_order) > page_count: + logger.error( + f"deleted_pages({new_order}) > page_count({page_count})" + ) + return + + for item in new_order: + for step in Steps(): + src_page_path = PagePath( + document_path=src_doc_path, + page_num=int(item['page_num']), + step=step, + page_count=len(new_order) + ) + dst_page_path = PagePath( + document_ep=dst_doc_path, + page_num=int(item['page_order']), + step=step, + page_count=len(new_order) + ) + self.copy_page( + src_page_path=src_page_path, + dst_page_path=dst_page_path + ) + + return new_version def paste_pages( self,