mirror of https://github.com/papermerge/mglib
refactoring, work in progress...
parent
d0af1087af
commit
31330c0011
|
@ -0,0 +1,268 @@
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from os import listdir
|
||||||
|
from os.path import isdir, join
|
||||||
|
|
||||||
|
from pmworker.pdftk import make_sure_path_exists
|
||||||
|
from mglib.path import (DocumentPath, PagePath)
|
||||||
|
from mglib.step import Steps
|
||||||
|
|
||||||
|
"""
|
||||||
|
OCR operations are per page. Cut/Paste/Delete/Reorder are per page as well.
|
||||||
|
So it does not make sense to rerun such a heavy operation as OCR again, instead
|
||||||
|
we can do some magic tricks (copy them from one location to another)
|
||||||
|
on already extracted txt and hocr files.
|
||||||
|
|
||||||
|
OcrMigrate class takes care of this sort of txt/hocr files moves.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pagecount(doc_ep):
    """
    Return the total number of pages for this document endpoint.

    NOTE(review): despite the original wording ("number of page_xy.txt
    files"), the code below counts per-page *subdirectories* inside the
    results pages folder — confirm one directory == one page holds.
    """
    # OCR output lives in the "results" auxiliary tree, not next to the
    # original document; redirect the endpoint there first.
    doc_ep_pointing_to_results = DocumentPath.copy_from(
        doc_ep, aux_dir="results"
    )
    pages_dir = doc_ep_pointing_to_results.pages_dirname
    # Keep only directory entries; plain files in pages_dir are ignored.
    only_dirs = [
        fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
    ]
    return len(only_dirs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Map new page numbers to the old page numbers they inherit from,
    after the given pages were deleted.

    Args:
        total_pages (int): page count of the old document version.
        deleted_pages (list[int]): 1-based numbers of deleted pages.

    Returns:
        list[tuple[int, int]]: pairs (new_page_num, old_page_num).

    Raises:
        ValueError: if more pages are deleted than exist.

    Examples:
        total_pages=6, deleted_pages=[1, 2]
            -> [(1, 3), (2, 4), (3, 5), (4, 6)]
            # new page #1 gets info from old page #3, ..., #4 from #6
        total_pages=5, deleted_pages=[1, 5]
            -> [(1, 2), (2, 3), (3, 4)]
        total_pages=5, deleted_pages=[2, 3]
            -> [(1, 1), (2, 4), (3, 5)]
            # page #1 stays unaffected
    """
    if total_pages < len(deleted_pages):
        # Original raised an f-string with no placeholders; include the
        # actual numbers so the error is actionable.
        raise ValueError(
            f"Cannot delete {len(deleted_pages)} pages from a document "
            f"with only {total_pages} pages."
        )

    # Old page numbers which survive the deletion, in ascending order.
    surviving = [
        page for page in range(1, total_pages + 1)
        if page not in deleted_pages
    ]

    # Pair each surviving old page with its new consecutive number.
    new_numbers = range(1, len(surviving) + 1)

    return list(zip(new_numbers, surviving))
|
||||||
|
|
||||||
|
|
||||||
|
def copy_page(src_page_ep, dst_page_ep):
    """
    Copy the OCR artifacts (txt, hocr, img) of one page to another
    location, so OCR does not need to be re-run.

    Args:
        src_page_ep (PagePath): source page.
        dst_page_ep (PagePath): destination page.

    Raises:
        ValueError: if either argument is not a PagePath instance.
    """
    # Original message said "PageEp instances" while the isinstance
    # check below is against PagePath; keep message and check in sync.
    err_msg = "copy_page accepts only PagePath instances"

    for inst in [src_page_ep, dst_page_ep]:
        if not isinstance(inst, PagePath):
            raise ValueError(err_msg)

    # The three artifact kinds share identical copy logic; the original
    # repeated it inline three times.
    _copy_page_artifact(src_page_ep, dst_page_ep, "txt")
    _copy_page_artifact(src_page_ep, dst_page_ep, "hocr")
    _copy_page_artifact(src_page_ep, dst_page_ep, "img")


def _copy_page_artifact(src_page_ep, dst_page_ep, kind):
    """
    Copy one kind of per-page artifact ("txt", "hocr" or "img") from
    src to dst if the source artifact exists; log a debug note otherwise.
    """
    artifact_exists = getattr(src_page_ep, f"{kind}_exists")
    if artifact_exists():
        src_url = getattr(src_page_ep, f"{kind}_url")()
        dst_url = getattr(dst_page_ep, f"{kind}_url")()
        make_sure_path_exists(dst_url)
        logger.debug(f"copy src_{kind}={src_url} dst_{kind}={dst_url}")
        shutil.copy(src_url, dst_url)
    else:
        # Original text had a typo ("does not exits").
        logger.debug(f"{kind} does not exist for {src_page_ep}")
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
    """
    Copy OCR artifacts of cut pages from several source documents into
    the destination document, numbering them consecutively.

    Args:
        dest_ep (DocumentPath): destination document endpoint.
        src_doc_ep_list (list[dict]): items of the form
            {
                'doc_ep': <DocumentPath>,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            }
            i.e. a list of source documents with the pages to copy.
    """
    dest_page_num = 1
    # The destination's page count equals the total number of pages cut
    # from all sources.
    dest_page_count = sum(
        len(item['page_nums']) for item in src_doc_ep_list
    )
    for item in src_doc_ep_list:
        src_ep = item['doc_ep']
        # Invariant across the inner loops: compute once per source
        # document instead of once per page per step.
        src_page_count = get_pagecount(src_ep)
        for page_num in item['page_nums']:
            for step in Steps():
                # NOTE: the original referenced the undefined name
                # "PageEp" and keyword "document_ep"; PagePath (with
                # document_path) is the imported replacement.
                src_page_ep = PagePath(
                    document_path=src_ep,
                    page_num=int(page_num),
                    step=step,
                    page_count=src_page_count
                )
                dst_page_ep = PagePath(
                    document_path=dest_ep,
                    page_num=dest_page_num,
                    step=step,
                    page_count=dest_page_count
                )
                logger.debug(f"src={src_page_ep} dst={dst_page_ep}")
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
            dest_page_num += 1
|
||||||
|
|
||||||
|
|
||||||
|
class OcrMigrate:
    """
    Instead of running the heavy OCR operation AGAIN on a changed
    document (e.g. after pages 2 and 3 were deleted), the text files
    produced by the first (and only!) OCR run are moved (= migrated)
    into the new version's folder.

    For each affected page (page_x), the following files are migrated
    from <old_version> to <new_version>:
        * <version>/pages/page_x.txt
        * <version>/pages/page_x/50/*.hocr
        * <version>/pages/page_x/75/*.hocr
        * <version>/pages/page_x/100/*.hocr
        * <version>/pages/page_x/125/*.hocr

    Which pages are affected depends on the operation.
    """

    def __init__(self, src_ep, dst_ep):
        """
        Args:
            src_ep (DocumentPath): old version's endpoint.
            dst_ep (DocumentPath): new version's endpoint.

        Raises:
            ValueError: if either endpoint is not a DocumentPath.
        """
        # NOTE: the original checked against the undefined name
        # "DocumentEp"; DocumentPath is the imported replacement.
        for inst in [src_ep, dst_ep]:
            if not isinstance(inst, DocumentPath):
                raise ValueError(
                    "OcrMigrate args must be DocumentPath instances"
                )

        self.src_ep = src_ep
        self.dst_ep = dst_ep

    def migrate_delete(self, deleted_pages):
        """
        Migrate OCR artifacts after the given pages were deleted.

        Args:
            deleted_pages (list[int]): 1-based deleted page numbers.
        """
        page_count = get_pagecount(self.src_ep)
        if len(deleted_pages) > page_count:
            logger.error(
                f"deleted_pages({deleted_pages}) > page_count({page_count})"
            )
            return

        assigns = get_assigns_after_delete(
            total_pages=page_count,
            deleted_pages=deleted_pages
        )
        for new_page_num, old_page_num in assigns:
            for step in Steps():
                src_page_ep = PagePath(
                    document_path=self.src_ep,
                    page_num=old_page_num,
                    step=step,
                    page_count=page_count
                )
                dst_page_ep = PagePath(
                    document_path=self.dst_ep,
                    page_num=new_page_num,
                    step=step,
                    # The new version has fewer pages.
                    page_count=page_count - len(deleted_pages)
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )

    def migrate_reorder(self, new_order):
        """
        Migrate OCR artifacts after pages were reordered.
        Similar to migrate_delete, with minor tweaks.

        Args:
            new_order (list[dict]): items with 'page_num' (position in
                the old version) and 'page_order' (position in the new
                version).
        """
        page_count = get_pagecount(self.src_ep)

        if len(new_order) > page_count:
            # Original log message said "deleted_pages" here, which was
            # copy-pasted from migrate_delete; this is a reorder.
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return

        for item in new_order:
            for step in Steps():
                src_page_ep = PagePath(
                    document_path=self.src_ep,
                    page_num=int(item['page_num']),
                    step=step,
                    page_count=len(new_order)
                )
                dst_page_ep = PagePath(
                    document_path=self.dst_ep,
                    page_num=int(item['page_order']),
                    step=step,
                    page_count=len(new_order)
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
|
|
@ -102,7 +102,7 @@ class PagePath:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
document_ep,
|
document_path,
|
||||||
page_num,
|
page_num,
|
||||||
page_count,
|
page_count,
|
||||||
step=None
|
step=None
|
||||||
|
@ -111,15 +111,15 @@ class PagePath:
|
||||||
msg_err = f"PagePath.page_num must be an int. Got {page_num}."
|
msg_err = f"PagePath.page_num must be an int. Got {page_num}."
|
||||||
raise ValueError(msg_err)
|
raise ValueError(msg_err)
|
||||||
|
|
||||||
self.document_ep = document_ep
|
self.document_path = document_path
|
||||||
self.results_document_ep = DocumentPath.copy_from(
|
self.results_document_ep = DocumentPath.copy_from(
|
||||||
document_ep,
|
document_path,
|
||||||
aux_dir=AUX_DIR_RESULTS
|
aux_dir=AUX_DIR_RESULTS
|
||||||
)
|
)
|
||||||
self.page_count = page_count
|
self.page_count = page_count
|
||||||
self.page_num = page_num
|
self.page_num = page_num
|
||||||
self.step = step
|
self.step = step
|
||||||
self.pages = self.document_ep.pages
|
self.pages = self.document_path.pages
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ppmroot(self):
|
def ppmroot(self):
|
||||||
|
@ -133,7 +133,7 @@ class PagePath:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pages_dirname(self):
|
def pages_dirname(self):
|
||||||
return self.document_ep.pages_dirname
|
return self.document_path.pages_dirname
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self):
|
def path(self):
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run(cmd):
    """
    Run *cmd* as a subprocess, capturing stdout/stderr as text.

    A non-zero exit status is logged (deliberately best-effort: it is
    not raised), keeping the original contract.

    Args:
        cmd (list[str] | tuple[str, ...]): program and its arguments.

    Returns:
        subprocess.CompletedProcess: the finished process, so callers
        can inspect returncode/stdout/stderr (previously discarded).
    """
    logger.debug(
        f"Run:{'|'.join(cmd)}"
    )

    ret = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8"
    )

    if ret.returncode != 0:
        logger.error((
            f"returncode={ret.returncode}"
            f" stdout={ret.stdout}"
            f" stderr={ret.stderr}"
        ))

    return ret
|
|
@ -0,0 +1,113 @@
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from mglib.runcmd import run
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_img(page_path, media_root):
    """
    Render a single PDF page to a JPEG image with pdftoppm.

    Args:
        page_path (PagePath): page to render; page_path.step.width
            determines the target image width.
        media_root (str): filesystem root that page_path URLs are
            relative to.
    """
    local_abspath = os.path.join(
        media_root,
        page_path.document_path.url()
    )
    # Original log text had a typo ("Extracing").
    logger.debug(f"Extracting image for {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    ppmroot_dirname = os.path.dirname(ppmroot)

    page_num = page_path.page_num
    width = page_path.step.width

    # exist_ok=True already tolerates a pre-existing directory, so the
    # original's os.path.exists() pre-check was redundant.
    os.makedirs(ppmroot_dirname, exist_ok=True)

    cmd = (
        "pdftoppm",
        "-jpeg",
        "-f",
        str(page_num),
        "-l",  # generate only one page
        str(page_num),
        "-scale-to-x",
        str(width),
        "-scale-to-y",
        "-1",  # it will adjust height according to img ratio
        local_abspath,
        # output file prefix (ppm root)
        ppmroot
    )

    run(cmd)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_hocr(page_url, lang, media_root):
    """
    OCR the page image with tesseract and produce an HOCR file.

    Args:
        page_url (PagePath): page whose image will be OCRed.
        lang (str): tesseract language code.
        media_root (str): filesystem root the page URLs are relative to.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    # Tesseract wants the output base name without extension; strip it
    # from the configured hocr path.
    hocr_base, _hocr_ext = os.path.splitext(
        os.path.join(media_root, page_url.hocr_url())
    )

    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        hocr_base,
        "hocr"
    ))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_txt(page_url, lang, media_root):
    """
    OCR the page image with tesseract and produce a plain-text file.

    Args:
        page_url (PagePath): page whose image will be OCRed.
        lang (str): tesseract language code.
        media_root (str): filesystem root the page URLs are relative to.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    # Tesseract wants the output base name without extension.
    txt_base, _txt_ext = os.path.splitext(
        os.path.join(media_root, page_url.txt_url())
    )

    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        txt_base
    ))
|
||||||
|
|
||||||
|
|
||||||
|
#def text_from_pdf(filepath, lang, dry_run=False):
|
||||||
|
#
|
||||||
|
# # suffix .tiff in file name is required by conver utility, otherwise
|
||||||
|
# # it won't convert to tiff format!
|
||||||
|
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
||||||
|
# conv = convert.Convert(dry_run=dry_run)
|
||||||
|
# conv(filepath=filepath, fout=tiff)
|
||||||
|
# try:
|
||||||
|
# tsact = tesseract.Tesseract()
|
||||||
|
# text = tsact(filepath=tiff.name, lang=lang)
|
||||||
|
# except subprocess.CalledProcessError as e:
|
||||||
|
# print(e)
|
||||||
|
# print(e.stderr)
|
||||||
|
# return
|
||||||
|
#
|
||||||
|
# return text
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#def text_from_image(filepath, lang, dry_run=False):
|
||||||
|
#
|
||||||
|
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
||||||
|
# text = tsact(filepath=filepath, lang=lang)
|
||||||
|
#
|
||||||
|
# return text
|
||||||
|
#
|
|
@ -21,11 +21,14 @@ class Storage:
|
||||||
def location(self):
|
def location(self):
|
||||||
return self._location
|
return self._location
|
||||||
|
|
||||||
def abspath(self, _path):
    """Return *_path* resolved against the storage location."""
    return os.path.join(
        self.location, _path
    )

def path(self, _path):
    # Alias kept so existing callers of path() keep working after the
    # rename to abspath().
    return self.abspath(_path)
|
||||||
|
|
||||||
def delete_document(self, doc_path):
|
def delete_document(self, doc_path):
|
||||||
"""
|
"""
|
||||||
Receives a mglib.path.DocumentPath instance
|
Receives a mglib.path.DocumentPath instance
|
||||||
|
@ -56,6 +59,29 @@ class Storage:
|
||||||
if os.path.exists(abs_dirname_results):
|
if os.path.exists(abs_dirname_results):
|
||||||
os.rmdir(abs_dirname_results)
|
os.rmdir(abs_dirname_results)
|
||||||
|
|
||||||
|
def copy_doc(self, src, dst):
    """
    Copy the given src file path to destination *dst*, where dst is
    resolved to an absolute path against the storage location.

    Args:
        src (str): source file path.
        dst (str): destination path, relative to the storage root.
    """
    # Resolve once; the original called self.abspath(dst) twice.
    dst_abspath = self.abspath(dst)
    dirname = os.path.dirname(dst_abspath)
    # exist_ok=True already tolerates an existing directory, so the
    # original os.path.exists() pre-check was redundant; guard against
    # an empty dirname (dst without a directory part), for which
    # os.makedirs would raise.
    if dirname:
        os.makedirs(
            dirname, exist_ok=True
        )
    logger.debug(
        f"copy_doc: {src} to {dst}"
    )
    shutil.copyfile(
        src,
        dst_abspath
    )
|
||||||
|
|
||||||
def exists(self, _path):
|
def exists(self, _path):
|
||||||
return os.path.exists(
|
return os.path.exists(
|
||||||
self.path(_path)
|
self.path(_path)
|
||||||
|
|
Loading…
Reference in New Issue