2020-05-04 08:10:15 +02:00
|
|
|
import logging
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)

# Top-level directory component used by default for DocumentPath.aux_dir,
# i.e. where the document files themselves are addressed.
AUX_DIR_DOCS = "docs"

# Top-level directory component that PagePath switches to when it builds
# the locations of per-page artefacts (.txt, .hocr, .jpg).
AUX_DIR_RESULTS = "results"
|
|
|
|
|
2020-05-04 08:10:15 +02:00
|
|
|
|
2020-05-04 13:29:31 +02:00
|
|
|
class DocumentPath:
    """
    Relative storage path of a document file.

    Layout built by this class:

        <aux_dir>/user_<user_id>/document_<doc_id>/[v<version>/]<file_name>

    If version = 0, the ``v<version>/`` component is not included.
    Document's version is incremented every time a pdftk operation runs
    on it (when pages are deleted, reordered, pasted).
    """

    def __init__(
        self,
        user_id,
        document_id,
        file_name,
        aux_dir=AUX_DIR_DOCS,
        version=0
    ):
        """
        Store the path components.

        user_id, document_id -- interpolated verbatim into the
            ``user_<user_id>`` and ``document_<document_id>`` components.
        file_name -- last component of the path returned by url()/path.
        aux_dir -- first component of the path (AUX_DIR_DOCS by default).
        version -- document version; by default, document has version 0.
        """
        self.user_id = user_id
        self.document_id = document_id
        self.file_name = file_name
        self.aux_dir = aux_dir
        # by default, document has version 0
        self.version = version
        # name of the sub-directory used by pages_dirname
        self.pages = "pages"

    def _base_dirname(self, aux_dir):
        """Return '<aux_dir>/user_<user_id>/document_<doc_id>/'."""
        return f"{aux_dir}/user_{self.user_id}/document_{self.document_id}/"

    def url(self):
        """Return dirname followed by file_name."""
        return f"{self.dirname}{self.file_name}"

    @property
    def path(self):
        """Same value as url(), exposed as a property."""
        return self.url()

    @property
    def dirname_docs(self):
        """
        Document directory under AUX_DIR_DOCS.

        Ignores both self.aux_dir and self.version.
        """
        return self._base_dirname(AUX_DIR_DOCS)

    @property
    def dirname_results(self):
        """
        Document directory under AUX_DIR_RESULTS.

        Ignores both self.aux_dir and self.version.
        """
        return self._base_dirname(AUX_DIR_RESULTS)

    @property
    def dirname(self):
        """
        Document directory under self.aux_dir, with a trailing slash.

        A ``v<version>/`` component is appended only when version > 0.
        """
        full_path = self._base_dirname(self.aux_dir)

        if self.version > 0:
            full_path = f"{full_path}v{self.version}/"

        return full_path

    @property
    def pages_dirname(self):
        """Return '<dirname><pages>/' (``pages`` is "pages" by default)."""
        return f"{self.dirname}{self.pages}/"

    def __repr__(self):
        message = (
            f"DocumentPath(version={self.version},"
            f"user_id={self.user_id},"
            f"document_id={self.document_id},"
            f"file_name={self.file_name})"
        )
        return message

    def inc_version(self):
        """Increment the version by one, in place."""
        self.version = self.version + 1

    @staticmethod
    def copy_from(doc_path, **kw):
        """
        Will create a copy of provided
        DocumentPath (first parameter = doc_path) and replace
        existing parameter of new copy with the one from kw.

        kw => key/value parameters.
        Keys can be one of doc_path attributes: user_id, document_id,
        file_name, aux_dir, version. Any other key is ignored.

        Declared as a static method so that it can be invoked both as
        ``DocumentPath.copy_from(doc_path, ...)`` and through an instance
        with doc_path passed explicitly.
        """
        copy_values = {
            'user_id': doc_path.user_id,
            'document_id': doc_path.document_id,
            'file_name': doc_path.file_name,
            'version': doc_path.version,
            'aux_dir': doc_path.aux_dir,
        }
        # overrides supplied by the caller win over the copied values
        copy_values.update(kw)

        return DocumentPath(
            user_id=copy_values['user_id'],
            document_id=copy_values['document_id'],
            file_name=copy_values['file_name'],
            version=copy_values['version'],
            aux_dir=copy_values['aux_dir']
        )
|
|
|
|
|
|
|
|
|
2020-05-04 13:29:31 +02:00
|
|
|
class PagePath:
    """
    Relative storage paths of the artefacts belonging to one document page.

    Text file:
        <results_dirname>pages/page_<page_num>.txt
    hOCR and image files:
        <results_dirname>pages/page_<page_num>/<step.percent>/page-<NN>.hocr
        <results_dirname>pages/page_<page_num>/<step.percent>/page-<NN>.jpg

    <results_dirname> is the ``dirname`` of a copy of the given document
    path whose aux_dir is replaced with AUX_DIR_RESULTS; <NN> is the value
    of ``ppmtopdf_formated_number``.
    """

    def __init__(
        self,
        document_path,
        page_num,
        page_count,
        step=None
    ):
        """
        document_path -- DocumentPath of the document the page belongs to.
        page_num -- page number; must be an int, otherwise ValueError
            is raised.
        page_count -- number of pages in the document; it decides the
            zero padding used by ``ppmtopdf_formated_number``.
        step -- object whose ``percent`` attribute is interpolated into
            ``ppmroot``. With the default (None) ``ppmroot``, ``hocr_url``
            and ``img_url`` fail with AttributeError.
        """
        if not isinstance(page_num, int):
            msg_err = f"PagePath.page_num must be an int. Got {page_num}."
            raise ValueError(msg_err)

        self.document_path = document_path
        # same document, but addressed under the results directory
        self.results_document_ep = DocumentPath.copy_from(
            document_path,
            aux_dir=AUX_DIR_RESULTS
        )
        self.page_count = page_count
        self.page_num = page_num
        self.step = step
        self.pages = self.document_path.pages

    @property
    def ppmroot(self):
        """
        Common prefix of the hOCR and image file names:

            <results pages_dirname>page_<page_num>/<step.percent>/page
        """
        pages_dirname = self.results_document_ep.pages_dirname
        result = (
            f"{pages_dirname}page_{self.page_num}/"
            f"{self.step.percent}/page"
        )
        return result

    @property
    def pages_dirname(self):
        """pages_dirname of the original (not the results) document path."""
        return self.document_path.pages_dirname

    @property
    def path(self):
        """Same value as url(), exposed as a property."""
        return self.url()

    def url(self):
        """Return the location of the page's text file (see txt_url)."""
        return self.txt_url()

    @property
    def txt_path(self):
        """Same value as txt_url(), exposed as a property."""
        return self.txt_url()

    def txt_url(self):
        """Return '<results pages_dirname>page_<page_num>.txt'."""
        pages_dirname = self.results_document_ep.pages_dirname
        return f"{pages_dirname}page_{self.page_num}.txt"

    @property
    def hocr_path(self):
        """Same value as hocr_url(), exposed as a property."""
        return self.hocr_url()

    def hocr_url(self):
        """Return '<ppmroot>-<NN>.hocr'."""
        url = f"{self.ppmroot}-{self.ppmtopdf_formated_number}.hocr"
        return url

    @property
    def img_path(self):
        """Same value as img_url(), exposed as a property."""
        return self.img_url()

    def img_url(self):
        """Return '<ppmroot>-<NN>.jpg'."""
        url = f"{self.ppmroot}-{self.ppmtopdf_formated_number}.jpg"
        return url

    @property
    def ppmtopdf_formated_number(self):
        """
        Return page_num as a string, zero padded according to page_count:

            page_count <= 9         -> no padding     (e.g. "7")
            10 <= page_count <= 99  -> two digits     (e.g. "07")
            page_count >= 100       -> three digits   (e.g. "007")

        A page_count of exactly 100 is handled like any other three digit
        count.
        """
        # NOTE(review): presumably this mirrors the zero padding pdftoppm
        # applies to its output file names - confirm, in particular for
        # documents with 1000 or more pages (still padded to 3 digits here).
        if self.page_count <= 9:
            width = 1
        elif self.page_count < 100:
            width = 2
        else:
            width = 3

        return f"{int(self.page_num):0{width}d}"
|