2020-05-10 16:02:58 +02:00
|
|
|
import os
|
|
|
|
import logging
|
|
|
|
|
|
|
|
from mglib.runcmd import run
|
2020-08-11 19:47:35 +02:00
|
|
|
from .conf import settings
|
2020-05-10 16:02:58 +02:00
|
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2020-07-16 08:25:43 +02:00
|
|
|
def resize_img(page_path, media_root):
    """Resize a document image to the width requested by ``page_path``.

    Runs ``settings.BINARY_CONVERT`` with ``-resize <width>x`` on the file
    located at ``media_root`` joined with ``page_path.document_path.url()``
    and writes the result to ``<media_root>/<page_path.ppmroot>-1.jpg``.
    The directory that will hold the output is created when missing.

    Args:
        page_path: object providing ``document_path.url()``, ``img_url()``,
            ``ppmroot`` and ``step.width``.
        media_root: base directory that the relative paths are joined to.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Resizing image {page_path.img_url()}")

    output_root = os.path.join(media_root, page_path.ppmroot)
    output_dir = os.path.dirname(output_root)
    width = page_path.step.width

    if os.path.exists(output_dir):
        logger.debug(f"PPMROOT {output_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {output_dir} does not exists. Creating.")
        os.makedirs(output_dir, exist_ok=True)

    convert_bin = settings.BINARY_CONVERT
    # A geometry of "<width>x" gives only the width to the binary.
    geometry = f"{width}x"
    # NOTE(review): the "-1.jpg" suffix presumably mirrors the file name
    # produced by extract_img for a single page -- confirm.
    target_file = f"{output_root}-1.jpg"

    run((convert_bin, "-resize", geometry, source_file, target_file))
|
|
|
|
|
|
|
|
|
2020-05-10 16:02:58 +02:00
|
|
|
def extract_img(page_path, media_root):
    """Render one page of a document to a JPEG image.

    Runs ``settings.BINARY_PDFTOPPM`` on the file located at ``media_root``
    joined with ``page_path.document_path.url()``, restricted to page
    ``page_path.page_num`` and scaled to ``page_path.step.width`` pixels
    wide. ``<media_root>/<page_path.ppmroot>`` is passed as the output
    root; its parent directory is created when missing.

    Args:
        page_path: object providing ``document_path.url()``, ``img_url()``,
            ``ppmroot``, ``page_num`` and ``step.width``.
        media_root: base directory that the relative paths are joined to.
    """
    source_file = os.path.join(media_root, page_path.document_path.url())
    logger.debug(f"Extracing image for {page_path.img_url()}")

    output_root = os.path.join(media_root, page_path.ppmroot)
    output_dir = os.path.dirname(output_root)

    page_num = page_path.page_num
    width = page_path.step.width

    if os.path.exists(output_dir):
        logger.debug(f"PPMROOT {output_dir} already exists.")
    else:
        logger.debug(f"PPMROOT {output_dir} does not exists. Creating.")
        os.makedirs(output_dir, exist_ok=True)

    cmd = (
        (settings.BINARY_PDFTOPPM, "-jpeg")
        # first page == last page, so only one page is generated
        + ("-f", str(page_num), "-l", str(page_num))
        # a height of -1 lets the height follow the image ratio
        + ("-scale-to-x", str(width), "-scale-to-y", "-1")
        # input file, then the output root path
        + (source_file, output_root)
    )
    run(cmd)
|
|
|
|
|
|
|
|
|
|
|
|
def extract_hocr(page_url, lang, media_root):
    """Run OCR on a page image, requesting ``hocr`` output.

    Invokes ``settings.BINARY_OCR`` as
    ``<binary> -l <lang> <image> <output base> hocr`` where the image is
    ``media_root`` joined with ``page_url.img_url()`` and the output base
    is ``media_root`` joined with ``page_url.hocr_url()`` with its file
    extension removed.

    Args:
        page_url: object providing ``img_url()`` and ``hocr_url()``.
        lang: language value passed to the binary after ``-l``.
        media_root: base directory that the relative paths are joined to.
    """
    image_file = os.path.join(media_root, page_url.img_url())

    hocr_file = os.path.join(media_root, page_url.hocr_url())
    # NOTE(review): the extension is dropped presumably because the OCR
    # binary appends its own -- confirm against the binary's documentation.
    output_base = os.path.splitext(hocr_file)[0]

    run((settings.BINARY_OCR, "-l", lang, image_file, output_base, "hocr"))

    logger.debug(f"OCR for {page_url.img_url()} - Complete.")
    logger.debug(f"OCR Result {page_url.hocr_url()}.")
|
2020-05-10 16:02:58 +02:00
|
|
|
|
|
|
|
|
|
|
|
def extract_txt(page_url, lang, media_root):
    """Run OCR on a page image using the binary's default output format.

    Invokes ``settings.BINARY_OCR`` as
    ``<binary> -l <lang> <image> <output base>`` where the image is
    ``media_root`` joined with ``page_url.img_url()`` and the output base
    is ``media_root`` joined with ``page_url.txt_url()`` with its file
    extension removed.

    Args:
        page_url: object providing ``img_url()`` and ``txt_url()``.
        lang: language value passed to the binary after ``-l``.
        media_root: base directory that the relative paths are joined to.
    """
    image_file = os.path.join(media_root, page_url.img_url())

    txt_file = os.path.join(media_root, page_url.txt_url())
    # NOTE(review): the extension is dropped presumably because the OCR
    # binary appends its own -- confirm against the binary's documentation.
    output_base = os.path.splitext(txt_file)[0]

    run((settings.BINARY_OCR, "-l", lang, image_file, output_base))
|