mirror of https://github.com/papermerge/mglib
refactoring, work in progress...
parent
d0af1087af
commit
31330c0011
|
@ -0,0 +1,268 @@
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from os import listdir
|
||||||
|
from os.path import isdir, join
|
||||||
|
|
||||||
|
from pmworker.pdftk import make_sure_path_exists
|
||||||
|
from mglib.path import (DocumentPath, PagePath)
|
||||||
|
from mglib.step import Steps
|
||||||
|
|
||||||
|
"""
|
||||||
|
OCR operations are per page. Cut/Paste/Delete/Reorder are per page as well.
|
||||||
|
So it does not make sense to rerun such a heavy operation as OCR again, instead
|
||||||
|
we can do some magic tricks (copy them from one location to another)
|
||||||
|
on already extracted txt and hocr files.
|
||||||
|
|
||||||
|
OcrMigrate class takes care of this sort of txt/hocr files moves.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pagecount(doc_ep):
    """
    Return the total number of pages for this document endpoint.

    NOTE(review): despite the original wording ("number of page_xy.txt
    files"), the code below counts per-page *subdirectories* inside the
    results pages folder — confirm one directory == one page holds.
    """
    # OCR output lives in the "results" auxiliary tree, not next to the
    # original document; redirect the endpoint there first.
    doc_ep_pointing_to_results = DocumentPath.copy_from(
        doc_ep, aux_dir="results"
    )
    pages_dir = doc_ep_pointing_to_results.pages_dirname
    # Keep only directory entries; plain files in pages_dir are ignored.
    only_dirs = [
        fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
    ]
    return len(only_dirs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_assigns_after_delete(total_pages, deleted_pages):
    """
    Map new page numbers to the old page numbers they inherit from,
    after the given pages were deleted.

    Args:
        total_pages (int): page count of the old document version.
        deleted_pages (list[int]): 1-based numbers of deleted pages.

    Returns:
        list[tuple[int, int]]: pairs (new_page_num, old_page_num).

    Raises:
        ValueError: if more pages are deleted than exist.

    Examples:
        total_pages=6, deleted_pages=[1, 2]
            -> [(1, 3), (2, 4), (3, 5), (4, 6)]
            # new page #1 gets info from old page #3, ..., #4 from #6
        total_pages=5, deleted_pages=[1, 5]
            -> [(1, 2), (2, 3), (3, 4)]
        total_pages=5, deleted_pages=[2, 3]
            -> [(1, 1), (2, 4), (3, 5)]
            # page #1 stays unaffected
    """
    if total_pages < len(deleted_pages):
        # Original raised an f-string with no placeholders; include the
        # actual numbers so the error is actionable.
        raise ValueError(
            f"Cannot delete {len(deleted_pages)} pages from a document "
            f"with only {total_pages} pages."
        )

    # Old page numbers which survive the deletion, in ascending order.
    surviving = [
        page for page in range(1, total_pages + 1)
        if page not in deleted_pages
    ]

    # Pair each surviving old page with its new consecutive number.
    new_numbers = range(1, len(surviving) + 1)

    return list(zip(new_numbers, surviving))
|
||||||
|
|
||||||
|
|
||||||
|
def copy_page(src_page_ep, dst_page_ep):
    """
    Copy the OCR artifacts (txt, hocr, img) of one page to another
    location, so OCR does not need to be re-run.

    Args:
        src_page_ep (PagePath): source page.
        dst_page_ep (PagePath): destination page.

    Raises:
        ValueError: if either argument is not a PagePath instance.
    """
    # Original message said "PageEp instances" while the isinstance
    # check below is against PagePath; keep message and check in sync.
    err_msg = "copy_page accepts only PagePath instances"

    for inst in [src_page_ep, dst_page_ep]:
        if not isinstance(inst, PagePath):
            raise ValueError(err_msg)

    # The three artifact kinds share identical copy logic; the original
    # repeated it inline three times.
    _copy_page_artifact(src_page_ep, dst_page_ep, "txt")
    _copy_page_artifact(src_page_ep, dst_page_ep, "hocr")
    _copy_page_artifact(src_page_ep, dst_page_ep, "img")


def _copy_page_artifact(src_page_ep, dst_page_ep, kind):
    """
    Copy one kind of per-page artifact ("txt", "hocr" or "img") from
    src to dst if the source artifact exists; log a debug note otherwise.
    """
    artifact_exists = getattr(src_page_ep, f"{kind}_exists")
    if artifact_exists():
        src_url = getattr(src_page_ep, f"{kind}_url")()
        dst_url = getattr(dst_page_ep, f"{kind}_url")()
        make_sure_path_exists(dst_url)
        logger.debug(f"copy src_{kind}={src_url} dst_{kind}={dst_url}")
        shutil.copy(src_url, dst_url)
    else:
        # Original text had a typo ("does not exits").
        logger.debug(f"{kind} does not exist for {src_page_ep}")
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_cutted_pages(dest_ep, src_doc_ep_list):
    """
    Copy OCR artifacts of cut pages from several source documents into
    the destination document, numbering them consecutively.

    Args:
        dest_ep (DocumentPath): destination document endpoint.
        src_doc_ep_list (list[dict]): items of the form
            {
                'doc_ep': <DocumentPath>,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            }
            i.e. a list of source documents with the pages to copy.
    """
    dest_page_num = 1
    # The destination's page count equals the total number of pages cut
    # from all sources.
    dest_page_count = sum(
        len(item['page_nums']) for item in src_doc_ep_list
    )
    for item in src_doc_ep_list:
        src_ep = item['doc_ep']
        # Invariant across the inner loops: compute once per source
        # document instead of once per page per step.
        src_page_count = get_pagecount(src_ep)
        for page_num in item['page_nums']:
            for step in Steps():
                # NOTE: the original referenced the undefined name
                # "PageEp" and keyword "document_ep"; PagePath (with
                # document_path) is the imported replacement.
                src_page_ep = PagePath(
                    document_path=src_ep,
                    page_num=int(page_num),
                    step=step,
                    page_count=src_page_count
                )
                dst_page_ep = PagePath(
                    document_path=dest_ep,
                    page_num=dest_page_num,
                    step=step,
                    page_count=dest_page_count
                )
                logger.debug(f"src={src_page_ep} dst={dst_page_ep}")
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
            dest_page_num += 1
|
||||||
|
|
||||||
|
|
||||||
|
class OcrMigrate:
    """
    Instead of running the heavy OCR operation AGAIN on a changed
    document (e.g. after pages 2 and 3 were deleted), the text files
    produced by the first (and only!) OCR run are moved (= migrated)
    into the new version's folder.

    For each affected page (page_x), the following files are migrated
    from <old_version> to <new_version>:
        * <version>/pages/page_x.txt
        * <version>/pages/page_x/50/*.hocr
        * <version>/pages/page_x/75/*.hocr
        * <version>/pages/page_x/100/*.hocr
        * <version>/pages/page_x/125/*.hocr

    Which pages are affected depends on the operation.
    """

    def __init__(self, src_ep, dst_ep):
        """
        Args:
            src_ep (DocumentPath): old version's endpoint.
            dst_ep (DocumentPath): new version's endpoint.

        Raises:
            ValueError: if either endpoint is not a DocumentPath.
        """
        # NOTE: the original checked against the undefined name
        # "DocumentEp"; DocumentPath is the imported replacement.
        for inst in [src_ep, dst_ep]:
            if not isinstance(inst, DocumentPath):
                raise ValueError(
                    "OcrMigrate args must be DocumentPath instances"
                )

        self.src_ep = src_ep
        self.dst_ep = dst_ep

    def migrate_delete(self, deleted_pages):
        """
        Migrate OCR artifacts after the given pages were deleted.

        Args:
            deleted_pages (list[int]): 1-based deleted page numbers.
        """
        page_count = get_pagecount(self.src_ep)
        if len(deleted_pages) > page_count:
            logger.error(
                f"deleted_pages({deleted_pages}) > page_count({page_count})"
            )
            return

        assigns = get_assigns_after_delete(
            total_pages=page_count,
            deleted_pages=deleted_pages
        )
        for new_page_num, old_page_num in assigns:
            for step in Steps():
                src_page_ep = PagePath(
                    document_path=self.src_ep,
                    page_num=old_page_num,
                    step=step,
                    page_count=page_count
                )
                dst_page_ep = PagePath(
                    document_path=self.dst_ep,
                    page_num=new_page_num,
                    step=step,
                    # The new version has fewer pages.
                    page_count=page_count - len(deleted_pages)
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )

    def migrate_reorder(self, new_order):
        """
        Migrate OCR artifacts after pages were reordered.
        Similar to migrate_delete, with minor tweaks.

        Args:
            new_order (list[dict]): items with 'page_num' (position in
                the old version) and 'page_order' (position in the new
                version).
        """
        page_count = get_pagecount(self.src_ep)

        if len(new_order) > page_count:
            # Original log message said "deleted_pages" here, which was
            # copy-pasted from migrate_delete; this is a reorder.
            logger.error(
                f"new_order({new_order}) > page_count({page_count})"
            )
            return

        for item in new_order:
            for step in Steps():
                src_page_ep = PagePath(
                    document_path=self.src_ep,
                    page_num=int(item['page_num']),
                    step=step,
                    page_count=len(new_order)
                )
                dst_page_ep = PagePath(
                    document_path=self.dst_ep,
                    page_num=int(item['page_order']),
                    step=step,
                    page_count=len(new_order)
                )
                copy_page(
                    src_page_ep=src_page_ep,
                    dst_page_ep=dst_page_ep
                )
|
|
@ -102,7 +102,7 @@ class PagePath:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
document_ep,
|
document_path,
|
||||||
page_num,
|
page_num,
|
||||||
page_count,
|
page_count,
|
||||||
step=None
|
step=None
|
||||||
|
@ -111,15 +111,15 @@ class PagePath:
|
||||||
msg_err = f"PagePath.page_num must be an int. Got {page_num}."
|
msg_err = f"PagePath.page_num must be an int. Got {page_num}."
|
||||||
raise ValueError(msg_err)
|
raise ValueError(msg_err)
|
||||||
|
|
||||||
self.document_ep = document_ep
|
self.document_path = document_path
|
||||||
self.results_document_ep = DocumentPath.copy_from(
|
self.results_document_ep = DocumentPath.copy_from(
|
||||||
document_ep,
|
document_path,
|
||||||
aux_dir=AUX_DIR_RESULTS
|
aux_dir=AUX_DIR_RESULTS
|
||||||
)
|
)
|
||||||
self.page_count = page_count
|
self.page_count = page_count
|
||||||
self.page_num = page_num
|
self.page_num = page_num
|
||||||
self.step = step
|
self.step = step
|
||||||
self.pages = self.document_ep.pages
|
self.pages = self.document_path.pages
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ppmroot(self):
|
def ppmroot(self):
|
||||||
|
@ -133,7 +133,7 @@ class PagePath:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pages_dirname(self):
|
def pages_dirname(self):
|
||||||
return self.document_ep.pages_dirname
|
return self.document_path.pages_dirname
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self):
|
def path(self):
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run(cmd):
    """
    Run *cmd* as a subprocess, capturing stdout/stderr as text.

    A non-zero exit status is logged (deliberately best-effort: it is
    not raised), keeping the original contract.

    Args:
        cmd (list[str] | tuple[str, ...]): program and its arguments.

    Returns:
        subprocess.CompletedProcess: the finished process, so callers
        can inspect returncode/stdout/stderr (previously discarded).
    """
    logger.debug(
        f"Run:{'|'.join(cmd)}"
    )

    ret = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8"
    )

    if ret.returncode != 0:
        logger.error((
            f"returncode={ret.returncode}"
            f" stdout={ret.stdout}"
            f" stderr={ret.stderr}"
        ))

    return ret
|
|
@ -0,0 +1,113 @@
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from mglib.runcmd import run
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_img(page_path, media_root):
    """
    Render a single PDF page to a JPEG image with pdftoppm.

    Args:
        page_path (PagePath): page to render; page_path.step.width
            determines the target image width.
        media_root (str): filesystem root that page_path URLs are
            relative to.
    """
    local_abspath = os.path.join(
        media_root,
        page_path.document_path.url()
    )
    # Original log text had a typo ("Extracing").
    logger.debug(f"Extracting image for {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    ppmroot_dirname = os.path.dirname(ppmroot)

    page_num = page_path.page_num
    width = page_path.step.width

    # exist_ok=True already tolerates a pre-existing directory, so the
    # original's os.path.exists() pre-check was redundant.
    os.makedirs(ppmroot_dirname, exist_ok=True)

    cmd = (
        "pdftoppm",
        "-jpeg",
        "-f",
        str(page_num),
        "-l",  # generate only one page
        str(page_num),
        "-scale-to-x",
        str(width),
        "-scale-to-y",
        "-1",  # it will adjust height according to img ratio
        local_abspath,
        # output file prefix (ppm root)
        ppmroot
    )

    run(cmd)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_hocr(page_url, lang, media_root):
    """
    OCR the page image with tesseract and produce an HOCR file.

    Args:
        page_url (PagePath): page whose image will be OCRed.
        lang (str): tesseract language code.
        media_root (str): filesystem root the page URLs are relative to.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    # Tesseract wants the output base name without extension; strip it
    # from the configured hocr path.
    hocr_base, _hocr_ext = os.path.splitext(
        os.path.join(media_root, page_url.hocr_url())
    )

    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        hocr_base,
        "hocr"
    ))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_txt(page_url, lang, media_root):
    """
    OCR the page image with tesseract and produce a plain-text file.

    Args:
        page_url (PagePath): page whose image will be OCRed.
        lang (str): tesseract language code.
        media_root (str): filesystem root the page URLs are relative to.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())

    # Tesseract wants the output base name without extension.
    txt_base, _txt_ext = os.path.splitext(
        os.path.join(media_root, page_url.txt_url())
    )

    run((
        "tesseract",
        "-l",
        lang,
        page_abspath,
        txt_base
    ))
|
||||||
|
|
||||||
|
|
||||||
|
#def text_from_pdf(filepath, lang, dry_run=False):
|
||||||
|
#
|
||||||
|
# # suffix .tiff in file name is required by conver utility, otherwise
|
||||||
|
# # it won't convert to tiff format!
|
||||||
|
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
||||||
|
# conv = convert.Convert(dry_run=dry_run)
|
||||||
|
# conv(filepath=filepath, fout=tiff)
|
||||||
|
# try:
|
||||||
|
# tsact = tesseract.Tesseract()
|
||||||
|
# text = tsact(filepath=tiff.name, lang=lang)
|
||||||
|
# except subprocess.CalledProcessError as e:
|
||||||
|
# print(e)
|
||||||
|
# print(e.stderr)
|
||||||
|
# return
|
||||||
|
#
|
||||||
|
# return text
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#def text_from_image(filepath, lang, dry_run=False):
|
||||||
|
#
|
||||||
|
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
||||||
|
# text = tsact(filepath=filepath, lang=lang)
|
||||||
|
#
|
||||||
|
# return text
|
||||||
|
#
|
|
@ -21,11 +21,14 @@ class Storage:
|
||||||
def location(self):
|
def location(self):
|
||||||
return self._location
|
return self._location
|
||||||
|
|
||||||
def abspath(self, _path):
    """Return *_path* resolved against the storage location."""
    return os.path.join(
        self.location, _path
    )

def path(self, _path):
    # Alias kept so existing callers of path() keep working after the
    # rename to abspath().
    return self.abspath(_path)
|
||||||
|
|
||||||
def delete_document(self, doc_path):
|
def delete_document(self, doc_path):
|
||||||
"""
|
"""
|
||||||
Receives a mglib.path.DocumentPath instance
|
Receives a mglib.path.DocumentPath instance
|
||||||
|
@ -56,6 +59,29 @@ class Storage:
|
||||||
if os.path.exists(abs_dirname_results):
|
if os.path.exists(abs_dirname_results):
|
||||||
os.rmdir(abs_dirname_results)
|
os.rmdir(abs_dirname_results)
|
||||||
|
|
||||||
|
def copy_doc(self, src, dst):
    """
    Copy the given src file path to destination *dst*, where dst is
    resolved to an absolute path against the storage location.

    Args:
        src (str): source file path.
        dst (str): destination path, relative to the storage root.
    """
    # Resolve once; the original called self.abspath(dst) twice.
    dst_abspath = self.abspath(dst)
    dirname = os.path.dirname(dst_abspath)
    # exist_ok=True already tolerates an existing directory, so the
    # original os.path.exists() pre-check was redundant; guard against
    # an empty dirname (dst without a directory part), for which
    # os.makedirs would raise.
    if dirname:
        os.makedirs(
            dirname, exist_ok=True
        )
    logger.debug(
        f"copy_doc: {src} to {dst}"
    )
    shutil.copyfile(
        src,
        dst_abspath
    )
|
||||||
|
|
||||||
def exists(self, _path):
|
def exists(self, _path):
|
||||||
return os.path.exists(
|
return os.path.exists(
|
||||||
self.path(_path)
|
self.path(_path)
|
||||||
|
|
Loading…
Reference in New Issue