From 31330c0011eb2eb71c1b9ef02aa7001bdc6f8d48 Mon Sep 17 00:00:00 2001 From: Eugen Ciur Date: Sun, 10 May 2020 16:02:58 +0200 Subject: [PATCH] refactoring, work in progress... --- mglib/ocrmigrate.py | 268 ++++++++++++++++++++++++++++++++++++++++++++ mglib/path.py | 10 +- mglib/runcmd.py | 25 +++++ mglib/shortcuts.py | 113 +++++++++++++++++++ mglib/storage.py | 28 ++++- 5 files changed, 438 insertions(+), 6 deletions(-) create mode 100644 mglib/ocrmigrate.py create mode 100644 mglib/runcmd.py create mode 100644 mglib/shortcuts.py diff --git a/mglib/ocrmigrate.py b/mglib/ocrmigrate.py new file mode 100644 index 0000000..dc5d246 --- /dev/null +++ b/mglib/ocrmigrate.py @@ -0,0 +1,268 @@ +import logging +import shutil +from os import listdir +from os.path import isdir, join + +from pmworker.pdftk import make_sure_path_exists +from mglib.path import (DocumentPath, PagePath) +from mglib.step import Steps + +""" +OCR operations are per page. Cut/Paste/Delete/Reorder are per page as well. +So it does not make sense to rerun such a heavy operation as OCR again, instead +we can do some magic tricks (copy them from one location to another) +on already extracted txt and hocr files. + +OcrMigrate class takes care of this sort of txt/hocr files moves. +""" + +logger = logging.getLogger(__name__) + + +def get_pagecount(doc_ep): + """ + Returns total number of pages for this endpoint. + Total number of pages = number of page_xy.txt files + in pages_dirname folder. + """ + doc_ep_pointing_to_results = DocumentPath.copy_from( + doc_ep, aux_dir="results" + ) + pages_dir = doc_ep_pointing_to_results.pages_dirname + only_dirs = [ + fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi)) + ] + return len(only_dirs) + + +def get_assigns_after_delete(total_pages, deleted_pages): + """ + given total pages and a list of deleted pages - returns + a list of assignations of pages: + [new_version_page_num, old_version_page_num] + Example 1: + total_pages: 6 + deleted_pages: [1, 2] + returns: [ + [(1, 3), (2, 4), (3, 5), (4, 6)] + # page #1 gets info from prev page #3 + # page #2 ... #4 + ... + # page #4 ... #6 + ] + + Example 2: + total pages: 5 + deleted_pages [1, 5] + returns: [ + [(1, 2), (2, 3), (3, 4) + ] + + Example 3: + total pages: 5 + deleted_pages [2, 3] + returns: [ + [(1, 1), (2, 4), (3, 5) + # page #1 stays unaffected + # page #2 gets the info from page number 4 + # page #3 gets info from page #5 + ] + """ + if total_pages < len(deleted_pages): + err_msg = f"total_pages < deleted_pages" + raise ValueError(err_msg) + + # only numbers of pages which were not deleted + pages = [ + page for page in list(range(1, total_pages + 1)) + if page not in deleted_pages + ] + + page_numbers = range(1, len(pages) + 1) + + return list(zip(page_numbers, pages)) + + +def copy_page(src_page_ep, dst_page_ep): + err_msg = "copy_page accepts only PageEp instances" + + for inst in [src_page_ep, dst_page_ep]: + if not isinstance(inst, PagePath): + raise ValueError(err_msg) + + # copy .txt file + if src_page_ep.txt_exists(): + make_sure_path_exists(dst_page_ep.txt_url()) + + src_txt = src_page_ep.txt_url() + dst_txt = dst_page_ep.txt_url() + logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}") + shutil.copy(src_txt, dst_txt) + else: + logger.debug( + f"txt does not exits {src_page_ep.txt_exists()}" + ) + + # hocr + if src_page_ep.hocr_exists(): + make_sure_path_exists(dst_page_ep.hocr_url()) + + src_hocr = src_page_ep.hocr_url() + dst_hocr = dst_page_ep.hocr_url() + logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}") + shutil.copy(src_hocr, dst_hocr) + else: + logger.debug( + f"hocr does not exits {src_page_ep.hocr_exists()}" + ) + + if src_page_ep.img_exists(): + make_sure_path_exists(dst_page_ep.img_url()) + + src_img = src_page_ep.img_url() + dst_img = dst_page_ep.img_url() + logger.debug(f"copy src_img={src_img} dst_img={dst_img}") + shutil.copy(src_img, dst_img) + else: + logger.debug( + f"img does not exits {src_page_ep.img_exists()}" + ) + + +def migrate_cutted_pages(dest_ep, src_doc_ep_list): + """ + dest_ep = destination document endpoint + src_doc_ep_list = a list of following format: + [ + { + 'doc_ep': doc_ep, + 'page_nums': [page_num_1, page_num_2, page_num_3] + }, + { + 'doc_ep': doc_ep, + 'page_nums': [page_num_1, page_num_2, page_num_3] + }, + ... + ] + with a list of source document with copied pages. + """ + dest_page_num = 1 + dest_page_count = sum([ + len(item['page_nums']) for item in src_doc_ep_list + ]) + for item in src_doc_ep_list: + src_ep = item['doc_ep'] + for page_num in item['page_nums']: + for step in Steps(): + src_page_ep = PageEp( + document_ep=src_ep, + page_num=int(page_num), + step=step, + page_count=get_pagecount(src_ep) + ) + dst_page_ep = PageEp( + document_ep=dest_ep, + page_num=dest_page_num, + step=step, + page_count=dest_page_count + ) + logger.debug(f"src={src_page_ep} dst={dst_page_ep}") + copy_page( + src_page_ep=src_page_ep, + dst_page_ep=dst_page_ep + ) + dest_page_num += 1 + + +class OcrMigrate: + """ + Insead of running again OCR operation on changed document AGAIN + (e.g. after pages 2 and 3 were deleted) + text files which are result of first (and only!) OCR are moved + (moved = migrated) inside new version's folder. + Basically migrate/move files instead of rerunning OCR operation. + + For each affected page (page_x), following files will need to be migrated: + * /pages/page_x.txt + * /pages/page_x/50/*.hocr + * /pages/page_x/75/*.hocr + * /pages/page_x/100/*.hocr + * /pages/page_x/125/*.hocr + from to + + Which pages are affected depends on the operation. + """ + + def __init__(self, src_ep, dst_ep): + # Both endpoints shoud be instance of DocumentEp + + for inst in [src_ep, dst_ep]: + if not isinstance(inst, DocumentEp): + raise ValueError( + "OcrMigrate args must be DocumentEp instances" + ) + + self.src_ep = src_ep + self.dst_ep = dst_ep + + def migrate_delete(self, deleted_pages): + page_count = get_pagecount(self.src_ep) + if len(deleted_pages) > page_count: + logger.error( + f"deleted_pages({deleted_pages}) > page_count({page_count})" + ) + return + + assigns = get_assigns_after_delete( + total_pages=page_count, + deleted_pages=deleted_pages + ) + for a in assigns: + for step in Steps(): + src_page_ep = PageEp( + document_ep=self.src_ep, + page_num=a[1], + step=step, + page_count=page_count + ) + dst_page_ep = PageEp( + document_ep=self.dst_ep, + page_num=a[0], + step=step, + page_count=page_count - len(deleted_pages) + ) + copy_page( + src_page_ep=src_page_ep, + dst_page_ep=dst_page_ep + ) + + def migrate_reorder(self, new_order): + """ + Similar to migrate_delete, with minor tweaks. + """ + page_count = get_pagecount(self.src_ep) + + if len(new_order) > page_count: + logger.error( + f"deleted_pages({new_order}) > page_count({page_count})" + ) + return + + for item in new_order: + for step in Steps(): + src_page_ep = PageEp( + document_ep=self.src_ep, + page_num=int(item['page_num']), + step=step, + page_count=len(new_order) + ) + dst_page_ep = PageEp( + document_ep=self.dst_ep, + page_num=int(item['page_order']), + step=step, + page_count=len(new_order) + ) + copy_page( + src_page_ep=src_page_ep, + dst_page_ep=dst_page_ep + ) diff --git a/mglib/path.py b/mglib/path.py index f02679f..cb9f076 100644 --- a/mglib/path.py +++ b/mglib/path.py @@ -102,7 +102,7 @@ class PagePath: def __init__( self, - document_ep, + document_path, page_num, page_count, step=None @@ -111,15 +111,15 @@ class PagePath: msg_err = f"PagePath.page_num must be an int. Got {page_num}." raise ValueError(msg_err) - self.document_ep = document_ep + self.document_path = document_path self.results_document_ep = DocumentPath.copy_from( - document_ep, + document_path, aux_dir=AUX_DIR_RESULTS ) self.page_count = page_count self.page_num = page_num self.step = step - self.pages = self.document_ep.pages + self.pages = self.document_path.pages @property def ppmroot(self): @@ -133,7 +133,7 @@ class PagePath: @property def pages_dirname(self): - return self.document_ep.pages_dirname + return self.document_path.pages_dirname @property def path(self): diff --git a/mglib/runcmd.py b/mglib/runcmd.py new file mode 100644 index 0000000..0aea140 --- /dev/null +++ b/mglib/runcmd.py @@ -0,0 +1,25 @@ +import logging +import subprocess + + +logger = logging.getLogger(__name__) + + +def run(cmd): + logger.debug( + f"Run:{'|'.join(cmd)}" + ) + + ret = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding="utf-8" + ) + + if ret.returncode != 0: + logger.error(( + f"returncode={ret.returncode}" + f" stdout={ret.stdout}" + f" stderr={ret.stderr}" + )) diff --git a/mglib/shortcuts.py b/mglib/shortcuts.py new file mode 100644 index 0000000..37c4f12 --- /dev/null +++ b/mglib/shortcuts.py @@ -0,0 +1,113 @@ +import os +import logging + +from mglib.runcmd import run + +logger = logging.getLogger(__name__) + + +def extract_img(page_path, media_root): + + local_abspath = os.path.join( + media_root, + page_path.document_path.url() + ) + logger.debug(f"Extracing image for {page_path.img_url()}") + + ppmroot = os.path.join(media_root, page_path.ppmroot) + ppmroot_dirname = os.path.dirname(ppmroot) + + page_num = page_path.page_num + width = page_path.step.width + + if not os.path.exists(ppmroot_dirname): + logger.debug(f"PPMROOT {ppmroot_dirname} does not exists. Creating.") + os.makedirs( + ppmroot_dirname, exist_ok=True + ) + else: + logger.debug(f"PPMROOT {ppmroot_dirname} already exists.") + cmd = ( + "pdftoppm", + "-jpeg", + "-f", + str(page_num), + "-l", # generate only one page + str(page_num), + "-scale-to-x", + str(width), + "-scale-to-y", + "-1", # it will adjust height according to img ratio + local_abspath, + # output directory path, + ppmroot + ) + + run(cmd) + + +def extract_hocr(page_url, lang, media_root): + page_abspath = os.path.join( + media_root, + page_url.img_url() + ) + + hocr_root, hocr_ext = os.path.splitext( + os.path.join(media_root, page_url.hocr_url()) + ) + cmd = ( + "tesseract", + "-l", + lang, + page_abspath, + hocr_root, + "hocr" + ) + run(cmd) + + +def extract_txt(page_url, lang, media_root): + page_abspath = os.path.join( + media_root, + page_url.img_url() + ) + txt_root, txt_ext = os.path.splitext( + os.path.join( + media_root, page_url.txt_url() + ) + ) + cmd = ( + "tesseract", + "-l", + lang, + page_abspath, + txt_root + ) + run(cmd) + + +#def text_from_pdf(filepath, lang, dry_run=False): +# +# # suffix .tiff in file name is required by conver utility, otherwise +# # it won't convert to tiff format! +# tiff = tempfile.NamedTemporaryFile(suffix=".tiff") +# conv = convert.Convert(dry_run=dry_run) +# conv(filepath=filepath, fout=tiff) +# try: +# tsact = tesseract.Tesseract() +# text = tsact(filepath=tiff.name, lang=lang) +# except subprocess.CalledProcessError as e: +# print(e) +# print(e.stderr) +# return +# +# return text +# +# +#def text_from_image(filepath, lang, dry_run=False): +# +# tsact = tesseract.Tesseract(dry_run=dry_run) +# text = tsact(filepath=filepath, lang=lang) +# +# return text +# diff --git a/mglib/storage.py b/mglib/storage.py index 0f404c7..219cde3 100644 --- a/mglib/storage.py +++ b/mglib/storage.py @@ -21,11 +21,14 @@ class Storage: def location(self): return self._location - def path(self, _path): + def abspath(self, _path): return os.path.join( self.location, _path ) + def path(self, _path): + return self.abspath(_path) + def delete_document(self, doc_path): """ Receives a mglib.path.DocumentPath instance @@ -56,6 +59,29 @@ class Storage: if os.path.exists(abs_dirname_results): os.rmdir(abs_dirname_results) + def copy_doc(self, src, dst): + """ + copy given file src file path to destination + as absolute doc_path + """ + + dirname = os.path.dirname( + self.abspath(dst) + ) + if not os.path.exists( + dirname + ): + os.makedirs( + dirname, exist_ok=True + ) + logger.debug( + f"copy_doc: {src} to {dst}" + ) + shutil.copyfile( + src, + self.abspath(dst) + ) + def exists(self, _path): return os.path.exists( self.path(_path)