# mglib/mglib/shortcuts.py

import os
import logging

from mglib.runcmd import run

logger = logging.getLogger(__name__)


def resize_img(page_path, media_root):
    """
    Resize the image document behind ``page_path`` to the step's width.

    The source file is ``page_path.document_path.url()`` and the result is
    written to ``<page_path.ppmroot>-1.jpg``; both paths are joined onto
    ``media_root``. The directory that will hold the result is created
    when it is missing.

    The work itself is delegated to the external ``convert`` command via
    ``run``; nothing is returned, and whatever ``run`` raises propagates
    to the caller.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Resizing image {page_path.img_url()}")

    output_prefix = os.path.join(media_root, page_path.ppmroot)
    output_dir = os.path.dirname(output_prefix)
    target_width = page_path.step.width

    if os.path.exists(output_dir):
        logger.debug(f"PPMROOT {output_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {output_dir} does not exists. Creating.")
        os.makedirs(output_dir, exist_ok=True)

    # Only the width is passed in the geometry ("<width>x"); no height is
    # given. The "-1" suffix gives the output a name similar to ppmroot.
    run((
        "convert",
        "-resize", f"{target_width}x",
        source_file,
        f"{output_prefix}-1.jpg",
    ))


def extract_img(page_path, media_root):
    """
    Render a single page of a document as a JPEG using ``pdftoppm``.

    The source document is ``page_path.document_path.url()`` and the
    output prefix is ``page_path.ppmroot``; both are joined onto
    ``media_root``. The page to render is ``page_path.page_num`` and the
    requested width is ``page_path.step.width``. The directory of the
    output prefix is created when it is missing.

    Nothing is returned; whatever ``run`` raises propagates to the caller.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Extracing image for {page_path.img_url()}")

    output_prefix = os.path.join(media_root, page_path.ppmroot)
    output_dir = os.path.dirname(output_prefix)

    page = str(page_path.page_num)
    target_width = str(page_path.step.width)

    if os.path.exists(output_dir):
        logger.debug(f"PPMROOT {output_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {output_dir} does not exists. Creating.")
        os.makedirs(output_dir, exist_ok=True)

    # First (-f) and last (-l) page are the same, so only one page is
    # generated.
    page_range = ("-f", page, "-l", page)
    # A y scale of -1 lets the height follow the image ratio.
    scaling = ("-scale-to-x", target_width, "-scale-to-y", "-1")

    # The last argument is the output path prefix.
    run(("pdftoppm", "-jpeg") + page_range + scaling + (
        source_file,
        output_prefix,
    ))


def extract_hocr(page_url, lang, media_root):
    """
    Run ``tesseract`` on a page image with the ``hocr`` output config.

    The input image is ``page_url.img_url()`` and the output location is
    derived from ``page_url.hocr_url()``; both are joined onto
    ``media_root``. ``lang`` is passed to tesseract's ``-l`` option.

    Nothing is returned; whatever ``run`` raises propagates to the caller.
    """
    image_file = os.path.join(media_root, page_url.img_url())
    hocr_file = os.path.join(media_root, page_url.hocr_url())

    # tesseract is handed the output path without its extension
    # (presumably it appends the extension itself - confirm against the
    # tesseract manual).
    output_base = os.path.splitext(hocr_file)[0]

    run(("tesseract", "-l", lang, image_file, output_base, "hocr"))

    logger.debug(f"OCR for {page_url.img_url()} - Complete.")
    logger.debug(f"OCR Result {page_url.hocr_url()}.")


def extract_txt(page_url, lang, media_root):
    """
    Run ``tesseract`` on a page image with no explicit output config.

    The input image is ``page_url.img_url()`` and the output location is
    derived from ``page_url.txt_url()``; both are joined onto
    ``media_root``. ``lang`` is passed to tesseract's ``-l`` option.

    Nothing is returned; whatever ``run`` raises propagates to the caller.
    """
    image_file = os.path.join(media_root, page_url.img_url())
    txt_file = os.path.join(media_root, page_url.txt_url())

    # tesseract is handed the output path without its extension
    # (presumably it appends the extension itself - confirm against the
    # tesseract manual).
    output_base = os.path.splitext(txt_file)[0]

    run(("tesseract", "-l", lang, image_file, output_base))


#def text_from_pdf(filepath, lang, dry_run=False):
#
#    # suffix .tiff in file name is required by the convert utility, otherwise
#    # it won't convert to tiff format!
#    tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
#    conv = convert.Convert(dry_run=dry_run)
#    conv(filepath=filepath, fout=tiff)
#    try:
#        tsact = tesseract.Tesseract()
#        text = tsact(filepath=tiff.name, lang=lang)
#    except subprocess.CalledProcessError as e:
#        print(e)
#        print(e.stderr)
#        return
#
#    return text
#
#
#def text_from_image(filepath, lang, dry_run=False):
#
#    tsact = tesseract.Tesseract(dry_run=dry_run)
#    text = tsact(filepath=filepath, lang=lang)
#
#    return text
#