mirror of https://github.com/papermerge/mglib
149 lines
3.4 KiB
Python
149 lines
3.4 KiB
Python
import os
|
|
import logging
|
|
|
|
from mglib.runcmd import run
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def resize_img(page_path, media_root):
|
|
|
|
local_abspath = os.path.join(
|
|
media_root,
|
|
page_path.document_path.url()
|
|
)
|
|
logger.debug(f"Resizing image {page_path.img_url()}")
|
|
|
|
ppmroot = os.path.join(media_root, page_path.ppmroot)
|
|
ppmroot_dirname = os.path.dirname(ppmroot)
|
|
|
|
width = page_path.step.width
|
|
|
|
if not os.path.exists(ppmroot_dirname):
|
|
logger.debug(f"PPMROOT {ppmroot_dirname} does not exists. Creating.")
|
|
os.makedirs(
|
|
ppmroot_dirname, exist_ok=True
|
|
)
|
|
else:
|
|
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
|
|
|
cmd = (
|
|
"convert",
|
|
"-resize",
|
|
f"{width}x",
|
|
local_abspath,
|
|
# output directory path, similar to ppmroot
|
|
f"{ppmroot}-1.jpg"
|
|
)
|
|
|
|
run(cmd)
|
|
|
|
|
|
def extract_img(page_path, media_root):
|
|
|
|
local_abspath = os.path.join(
|
|
media_root,
|
|
page_path.document_path.url()
|
|
)
|
|
logger.debug(f"Extracing image for {page_path.img_url()}")
|
|
|
|
ppmroot = os.path.join(media_root, page_path.ppmroot)
|
|
ppmroot_dirname = os.path.dirname(ppmroot)
|
|
|
|
page_num = page_path.page_num
|
|
width = page_path.step.width
|
|
|
|
if not os.path.exists(ppmroot_dirname):
|
|
logger.debug(f"PPMROOT {ppmroot_dirname} does not exists. Creating.")
|
|
os.makedirs(
|
|
ppmroot_dirname, exist_ok=True
|
|
)
|
|
else:
|
|
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
|
cmd = (
|
|
"pdftoppm",
|
|
"-jpeg",
|
|
"-f",
|
|
str(page_num),
|
|
"-l", # generate only one page
|
|
str(page_num),
|
|
"-scale-to-x",
|
|
str(width),
|
|
"-scale-to-y",
|
|
"-1", # it will adjust height according to img ratio
|
|
local_abspath,
|
|
# output directory path,
|
|
ppmroot
|
|
)
|
|
|
|
run(cmd)
|
|
|
|
|
|
def extract_hocr(page_url, lang, media_root):
|
|
page_abspath = os.path.join(
|
|
media_root,
|
|
page_url.img_url()
|
|
)
|
|
|
|
hocr_root, hocr_ext = os.path.splitext(
|
|
os.path.join(media_root, page_url.hocr_url())
|
|
)
|
|
cmd = (
|
|
"tesseract",
|
|
"-l",
|
|
lang,
|
|
page_abspath,
|
|
hocr_root,
|
|
"hocr"
|
|
)
|
|
run(cmd)
|
|
logger.debug(f"OCR for {page_url.img_url()} - Complete.")
|
|
logger.debug(f"OCR Result {page_url.hocr_url()}.")
|
|
|
|
|
|
def extract_txt(page_url, lang, media_root):
|
|
page_abspath = os.path.join(
|
|
media_root,
|
|
page_url.img_url()
|
|
)
|
|
txt_root, txt_ext = os.path.splitext(
|
|
os.path.join(
|
|
media_root, page_url.txt_url()
|
|
)
|
|
)
|
|
cmd = (
|
|
"tesseract",
|
|
"-l",
|
|
lang,
|
|
page_abspath,
|
|
txt_root
|
|
)
|
|
run(cmd)
|
|
|
|
|
|
#def text_from_pdf(filepath, lang, dry_run=False):
|
|
#
|
|
# # suffix .tiff in file name is required by conver utility, otherwise
|
|
# # it won't convert to tiff format!
|
|
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
|
# conv = convert.Convert(dry_run=dry_run)
|
|
# conv(filepath=filepath, fout=tiff)
|
|
# try:
|
|
# tsact = tesseract.Tesseract()
|
|
# text = tsact(filepath=tiff.name, lang=lang)
|
|
# except subprocess.CalledProcessError as e:
|
|
# print(e)
|
|
# print(e.stderr)
|
|
# return
|
|
#
|
|
# return text
|
|
#
|
|
#
|
|
#def text_from_image(filepath, lang, dry_run=False):
|
|
#
|
|
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
|
# text = tsact(filepath=filepath, lang=lang)
|
|
#
|
|
# return text
|
|
#
|