mirror of https://github.com/papermerge/mglib
refactoring, work in progress...
parent
d0af1087af
commit
31330c0011
|
@ -0,0 +1,268 @@
|
|||
import logging
|
||||
import shutil
|
||||
from os import listdir
|
||||
from os.path import isdir, join
|
||||
|
||||
from pmworker.pdftk import make_sure_path_exists
|
||||
from mglib.path import (DocumentPath, PagePath)
|
||||
from mglib.step import Steps
|
||||
|
||||
"""
|
||||
OCR operations are per page. Cut/Paste/Delete/Reorder are per page as well.
|
||||
So it does not make sense to rerun such a heavy operation as OCR again, instead
|
||||
we can do some magic tricks (copy them from one location to another)
|
||||
on already extracted txt and hocr files.
|
||||
|
||||
OcrMigrate class takes care of this sort of txt/hocr files moves.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_pagecount(doc_ep):
    """
    Return the total number of pages for this endpoint.

    The count is taken as the number of per-page subdirectories found
    inside the pages_dirname folder of the "results" aux directory.
    """
    results_ep = DocumentPath.copy_from(
        doc_ep,
        aux_dir="results"
    )
    pages_dir = results_ep.pages_dirname
    # count only directory entries; plain files in pages_dir are ignored
    return sum(
        1
        for entry in listdir(pages_dir)
        if isdir(join(pages_dir, entry))
    )
|
||||
|
||||
|
||||
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Given total pages and a list of deleted page numbers, return a list
    of page assignations [(new_version_page_num, old_version_page_num)].

    Example 1:
        total_pages: 6
        deleted_pages: [1, 2]
        returns: [(1, 3), (2, 4), (3, 5), (4, 6)]
        # new page #1 gets info from old page #3
        # new page #2 gets info from old page #4
        # ...
        # new page #4 gets info from old page #6

    Example 2:
        total_pages: 5
        deleted_pages: [1, 5]
        returns: [(1, 2), (2, 3), (3, 4)]

    Example 3:
        total_pages: 5
        deleted_pages: [2, 3]
        returns: [(1, 1), (2, 4), (3, 5)]
        # page #1 stays unaffected

    Raises ValueError when more pages are deleted than exist.
    """
    if total_pages < len(deleted_pages):
        # include the actual values so the failure is diagnosable
        raise ValueError(
            f"total_pages ({total_pages}) < number of deleted pages "
            f"({len(deleted_pages)})"
        )

    # set lookup keeps the survivor scan O(total_pages)
    deleted = set(deleted_pages)

    # page numbers which survive the delete, in ascending order
    remaining = [
        page for page in range(1, total_pages + 1)
        if page not in deleted
    ]

    # new page numbers are simply 1..len(remaining)
    return list(zip(range(1, len(remaining) + 1), remaining))
|
||||
|
||||
|
||||
def _copy_if_exists(exists, src_url, dst_url, label):
    """
    Copy one OCR artifact file (txt/hocr/img) if its source exists.

    exists: result of the corresponding *_exists() check.
    src_url/dst_url: absolute-ish file locations as given by PagePath.
    label: short artifact name used only for logging.
    """
    if not exists:
        logger.debug(f"{label} does not exist: {src_url}")
        return
    # make sure destination directory structure is in place before copying
    make_sure_path_exists(dst_url)
    logger.debug(f"copy {label}: src={src_url} dst={dst_url}")
    shutil.copy(src_url, dst_url)


def copy_page(src_page_ep, dst_page_ep):
    """
    Copy the OCR artifacts (txt, hocr, img) of one page to another
    page location. Missing artifacts are skipped with a debug log.

    Both arguments must be PagePath instances; raises ValueError otherwise.
    """
    # the original message referred to the pre-rename "PageEp" type,
    # which no longer matches the isinstance check below
    for inst in (src_page_ep, dst_page_ep):
        if not isinstance(inst, PagePath):
            raise ValueError("copy_page accepts only PagePath instances")

    _copy_if_exists(
        src_page_ep.txt_exists(),
        src_page_ep.txt_url(),
        dst_page_ep.txt_url(),
        "txt"
    )
    _copy_if_exists(
        src_page_ep.hocr_exists(),
        src_page_ep.hocr_url(),
        dst_page_ep.hocr_url(),
        "hocr"
    )
    _copy_if_exists(
        src_page_ep.img_exists(),
        src_page_ep.img_url(),
        dst_page_ep.img_url(),
        "img"
    )
|
||||
|
||||
|
||||
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
    """
    Migrate OCR artifacts of pages cut from one or more source documents
    into the destination document, appending them in the order given.

    dest_ep: destination document path/endpoint.
    src_doc_ep_list: a list of the following format:
        [
            {
                'doc_ep': doc_ep,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]
    i.e. a list of source documents with the page numbers to copy.
    """
    dest_page_num = 1
    # total page count of the destination document once all cuts land
    dest_page_count = sum(
        len(item['page_nums']) for item in src_doc_ep_list
    )
    for item in src_doc_ep_list:
        src_ep = item['doc_ep']
        # hoisted out of the loops: the source page count is the same
        # for every page/step of this document, and get_pagecount hits
        # the filesystem each call
        src_page_count = get_pagecount(src_ep)
        for page_num in item['page_nums']:
            for step in Steps():
                # NOTE(review): original referenced undefined PageEp;
                # PagePath is the imported name after the Ep -> Path
                # rename — confirm the document_ep kwarg still matches
                # mglib.path.PagePath's constructor.
                src_page_ep = PagePath(
                    document_ep=src_ep,
                    page_num=int(page_num),
                    step=step,
                    page_count=src_page_count
                )
                dst_page_ep = PagePath(
                    document_ep=dest_ep,
                    page_num=dest_page_num,
                    step=step,
                    page_count=dest_page_count
                )
                logger.debug(f"src={src_page_ep} dst={dst_page_ep}")
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
            dest_page_num += 1
|
||||
|
||||
|
||||
class OcrMigrate:
    """
    Instead of running the OCR operation on a changed document AGAIN
    (e.g. after pages 2 and 3 were deleted), the text files which are
    the result of the first (and only!) OCR run are moved
    (moved = migrated) inside the new version's folder.
    Basically migrate/move files instead of rerunning the OCR operation.

    For each affected page (page_x), following files will need to be migrated:
        * <version>/pages/page_x.txt
        * <version>/pages/page_x/50/*.hocr
        * <version>/pages/page_x/75/*.hocr
        * <version>/pages/page_x/100/*.hocr
        * <version>/pages/page_x/125/*.hocr
    from <old_version> to <new_version>

    Which pages are affected depends on the operation.
    """

    def __init__(self, src_ep, dst_ep):
        """Both endpoints must be instances of DocumentPath."""
        # NOTE(review): original checked against undefined DocumentEp;
        # DocumentPath is the imported name after the Ep -> Path rename.
        for inst in (src_ep, dst_ep):
            if not isinstance(inst, DocumentPath):
                raise ValueError(
                    "OcrMigrate args must be DocumentPath instances"
                )

        self.src_ep = src_ep
        self.dst_ep = dst_ep

    def migrate_delete(self, deleted_pages):
        """
        Migrate txt/hocr files from src to dst version after the given
        pages were deleted. Logs an error and returns early when more
        pages are deleted than the source document has.
        """
        page_count = get_pagecount(self.src_ep)
        if len(deleted_pages) > page_count:
            logger.error(
                f"deleted_pages({deleted_pages}) > page_count({page_count})"
            )
            return

        assigns = get_assigns_after_delete(
            total_pages=page_count,
            deleted_pages=deleted_pages
        )
        # page count of the new (post-delete) version
        new_page_count = page_count - len(deleted_pages)
        for new_num, old_num in assigns:
            for step in Steps():
                src_page_ep = PagePath(
                    document_ep=self.src_ep,
                    page_num=old_num,
                    step=step,
                    page_count=page_count
                )
                dst_page_ep = PagePath(
                    document_ep=self.dst_ep,
                    page_num=new_num,
                    step=step,
                    page_count=new_page_count
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )

    def migrate_reorder(self, new_order):
        """
        Similar to migrate_delete, with minor tweaks.

        new_order: list of dicts with 'page_num' (position in the old
        version) and 'page_order' (position in the new version) keys.
        """
        page_count = get_pagecount(self.src_ep)

        if len(new_order) > page_count:
            # original log message mislabeled new_order as deleted_pages
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return

        for item in new_order:
            for step in Steps():
                src_page_ep = PagePath(
                    document_ep=self.src_ep,
                    page_num=int(item['page_num']),
                    step=step,
                    page_count=len(new_order)
                )
                dst_page_ep = PagePath(
                    document_ep=self.dst_ep,
                    page_num=int(item['page_order']),
                    step=step,
                    page_count=len(new_order)
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
|
|
@ -102,7 +102,7 @@ class PagePath:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
document_ep,
|
||||
document_path,
|
||||
page_num,
|
||||
page_count,
|
||||
step=None
|
||||
|
@ -111,15 +111,15 @@ class PagePath:
|
|||
msg_err = f"PagePath.page_num must be an int. Got {page_num}."
|
||||
raise ValueError(msg_err)
|
||||
|
||||
self.document_ep = document_ep
|
||||
self.document_path = document_path
|
||||
self.results_document_ep = DocumentPath.copy_from(
|
||||
document_ep,
|
||||
document_path,
|
||||
aux_dir=AUX_DIR_RESULTS
|
||||
)
|
||||
self.page_count = page_count
|
||||
self.page_num = page_num
|
||||
self.step = step
|
||||
self.pages = self.document_ep.pages
|
||||
self.pages = self.document_path.pages
|
||||
|
||||
@property
|
||||
def ppmroot(self):
|
||||
|
@ -133,7 +133,7 @@ class PagePath:
|
|||
|
||||
@property
|
||||
def pages_dirname(self):
|
||||
return self.document_ep.pages_dirname
|
||||
return self.document_path.pages_dirname
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
import logging
|
||||
import subprocess
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run(cmd):
    """
    Run an external command given as a list of arguments.

    stdout/stderr are captured as utf-8 text; a non-zero exit code is
    logged as an error (the command is NOT retried and no exception is
    raised).

    Returns the subprocess.CompletedProcess so callers can inspect
    returncode/stdout/stderr instead of the result being discarded.
    """
    logger.debug(
        f"Run:{'|'.join(cmd)}"
    )

    ret = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8"
    )

    if ret.returncode != 0:
        logger.error((
            f"returncode={ret.returncode}"
            f" stdout={ret.stdout}"
            f" stderr={ret.stderr}"
        ))

    return ret
|
|
@ -0,0 +1,113 @@
|
|||
import os
|
||||
import logging
|
||||
|
||||
from mglib.runcmd import run
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_img(page_path, media_root):
    """
    Render one page of the document to a jpeg image using pdftoppm.

    page_path: page object providing document_path, ppmroot, page_num
        and step (presumably a PagePath — TODO confirm with callers).
    media_root: filesystem root that the page/document urls are
        relative to.
    """
    local_abspath = os.path.join(
        media_root,
        page_path.document_path.url()
    )
    logger.debug(f"Extracting image for {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    ppmroot_dirname = os.path.dirname(ppmroot)

    page_num = page_path.page_num
    width = page_path.step.width

    if not os.path.exists(ppmroot_dirname):
        logger.debug(f"PPMROOT {ppmroot_dirname} does not exist. Creating.")
        os.makedirs(
            ppmroot_dirname, exist_ok=True
        )
    else:
        logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
    cmd = (
        "pdftoppm",
        "-jpeg",
        "-f",  # first page to render
        str(page_num),
        "-l",  # last page == first page, so only one page is generated
        str(page_num),
        "-scale-to-x",
        str(width),
        "-scale-to-y",
        "-1",  # -1 adjusts height according to the image aspect ratio
        local_abspath,
        # output filename prefix (PPMROOT), not a directory path
        ppmroot
    )

    run(cmd)
|
||||
|
||||
|
||||
def extract_hocr(page_url, lang, media_root):
    """
    Run tesseract in hocr mode over the page image.

    The output base name is the configured hocr url without its
    extension; tesseract appends the .hocr suffix itself.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    hocr_root, _hocr_ext = os.path.splitext(
        os.path.join(media_root, page_url.hocr_url())
    )
    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        hocr_root,
        "hocr"
    ))
|
||||
|
||||
|
||||
def extract_txt(page_url, lang, media_root):
    """
    Run tesseract in plain-text mode over the page image.

    The output base name is the configured txt url without its
    extension; tesseract appends the .txt suffix itself.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    txt_root, _txt_ext = os.path.splitext(
        os.path.join(media_root, page_url.txt_url())
    )
    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        txt_root
    ))
|
||||
|
||||
|
||||
#def text_from_pdf(filepath, lang, dry_run=False):
|
||||
#
|
||||
# # suffix .tiff in file name is required by conver utility, otherwise
|
||||
# # it won't convert to tiff format!
|
||||
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
||||
# conv = convert.Convert(dry_run=dry_run)
|
||||
# conv(filepath=filepath, fout=tiff)
|
||||
# try:
|
||||
# tsact = tesseract.Tesseract()
|
||||
# text = tsact(filepath=tiff.name, lang=lang)
|
||||
# except subprocess.CalledProcessError as e:
|
||||
# print(e)
|
||||
# print(e.stderr)
|
||||
# return
|
||||
#
|
||||
# return text
|
||||
#
|
||||
#
|
||||
#def text_from_image(filepath, lang, dry_run=False):
|
||||
#
|
||||
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
||||
# text = tsact(filepath=filepath, lang=lang)
|
||||
#
|
||||
# return text
|
||||
#
|
|
@ -21,11 +21,14 @@ class Storage:
|
|||
def location(self):
|
||||
return self._location
|
||||
|
||||
def path(self, _path):
|
||||
def abspath(self, _path):
|
||||
return os.path.join(
|
||||
self.location, _path
|
||||
)
|
||||
|
||||
def path(self, _path):
|
||||
return self.abspath(_path)
|
||||
|
||||
def delete_document(self, doc_path):
|
||||
"""
|
||||
Receives a mglib.path.DocumentPath instance
|
||||
|
@ -56,6 +59,29 @@ class Storage:
|
|||
if os.path.exists(abs_dirname_results):
|
||||
os.rmdir(abs_dirname_results)
|
||||
|
||||
def copy_doc(self, src, dst):
    """
    Copy the file at src into this storage under the destination
    doc path dst (resolved to an absolute path via abspath),
    creating any missing parent directories first.
    """
    dst_abspath = self.abspath(dst)
    dirname = os.path.dirname(dst_abspath)
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    logger.debug(
        f"copy_doc: {src} to {dst}"
    )
    shutil.copyfile(src, dst_abspath)
|
||||
|
||||
def exists(self, _path):
|
||||
return os.path.exists(
|
||||
self.path(_path)
|
||||
|
|
Loading…
Reference in New Issue