import logging

logger = logging.getLogger(__name__)

# Top-level directories: original documents vs. derived (OCR) results.
AUX_DIR_DOCS = "docs"
AUX_DIR_RESULTS = "results"


class DocumentPath:
    """
    Relative path of a stored document.

    Layout (the version segment is present only when version > 0):

        <aux_dir>/user_<user_id>/document_<document_id>/[v<version>/]<file_name>

    If version = 0, it is not included in DocumentPath.
    Document's version is incremented every time a pdftk operation runs
    on it (when pages are deleted, reordered, pasted).
    """

    def __init__(
        self,
        user_id,
        document_id,
        file_name,
        aux_dir=AUX_DIR_DOCS,
        version=0
    ):
        self.user_id = user_id
        self.document_id = document_id
        self.file_name = file_name
        self.aux_dir = aux_dir
        # by default, document has version 0
        self.version = version
        # name of the sub-directory which holds per-page files
        self.pages = "pages"

    def url(self, version=None):
        """
        Return the document file path: ``dirname(version)`` + file name.

        ``version`` may be an int or a numeric string; ``None`` means
        "use this instance's own version".
        """
        # Coercion of numeric strings is performed by dirname().
        return f"{self.dirname(version=version)}{self.file_name}"

    def path(self, version=None):
        """Same as :meth:`url`."""
        return self.url(version=version)

    @property
    def dirname_docs(self):
        """Unversioned document directory under ``AUX_DIR_DOCS``."""
        return (
            f"{AUX_DIR_DOCS}/user_{self.user_id}/"
            f"document_{self.document_id}/"
        )

    @property
    def dirname_results(self):
        """Unversioned document directory under ``AUX_DIR_RESULTS``."""
        return (
            f"{AUX_DIR_RESULTS}/user_{self.user_id}/"
            f"document_{self.document_id}/"
        )

    def dirname(self, version=None):
        """
        Return the directory (with trailing slash) of the document.

        ``version=None`` falls back to ``self.version``. An explicit
        ``version=0`` yields the unversioned directory even when the
        instance itself has a higher version. Numeric strings such
        as ``"2"`` are accepted and converted with ``int()``.
        """
        if version is None:
            version = self.version

        if version:
            # Accept numeric strings here, so that every path builder
            # (url, path, pages_dirname) treats versions the same way.
            version = int(version)

        full_path = (
            f"{self.aux_dir}/user_{self.user_id}/"
            f"document_{self.document_id}/"
        )

        if version > 0:
            full_path = f"{full_path}v{version}/"

        return full_path

    def pages_dirname(self, version=None):
        """Return the directory holding the page files of the document."""
        return f"{self.dirname(version=version)}{self.pages}/"

    def __repr__(self):
        message = (
            f"DocumentPath(version={self.version},"
            f"user_id={self.user_id},"
            f"document_id={self.document_id},"
            f"file_name={self.file_name})"
        )
        return message

    def inc_version(self):
        """Increment this instance's version by one (in place)."""
        self.version = self.version + 1

    # NOTE: deliberately left without @staticmethod. As a plain function
    # it can be called both as ``DocumentPath.copy_from(doc_path, ...)``
    # and as ``doc_path.copy_from(...)``.
    def copy_from(doc_path, **kw):
        """
        Will create a copy of provided DocumentPath
        (first parameter = doc_path) and replace
        existing parameters of the new copy with the ones from kw.

        kw => key/value parameters.
        Keys can be one of doc_path attributes:
            user_id, document_id, file_name, aux_dir, version
        Any other key is ignored.
        """
        field_names = (
            'user_id', 'document_id', 'file_name', 'version', 'aux_dir'
        )
        # A key present in kw always wins, even when its value is None.
        copy_values = {
            name: kw.get(name, getattr(doc_path, name))
            for name in field_names
        }

        return DocumentPath(**copy_values)


class PagePath:
    """
    Paths of the per-page artifacts of a document.

    All artifacts live under the results tree of the document
    (i.e. a copy of document_path with aux_dir = AUX_DIR_RESULTS):

        <results pages dir>/page_<page_num>.txt
        <results pages dir>/page_<page_num>/<step.percent>/page-<number>.jpg
        <results pages dir>/page_<page_num>/<step.percent>/page-<number>.hocr

    where <number> is page_num zero padded according to page_count
    (see ppmtopdf_formated_number).
    """

    def __init__(
        self,
        document_path,
        page_num,
        page_count,
        step=None
    ):
        if not isinstance(page_num, int):
            msg_err = f"PagePath.page_num must be an int. Got {page_num}."
            raise ValueError(msg_err)

        self.document_path = document_path
        # same document, but rooted in the results directory
        self.results_document_ep = DocumentPath.copy_from(
            document_path,
            aux_dir=AUX_DIR_RESULTS
        )
        self.page_count = page_count
        self.page_num = page_num
        # object exposing a ``percent`` attribute; required by ppmroot
        # (and therefore by the img/hocr paths), not by the txt path.
        self.step = step
        self.pages = self.document_path.pages

    @property
    def ppmroot(self):
        """
        Common prefix of the image/hocr files of this page:

            <results pages dir>/page_<page_num>/<step.percent>/page
        """
        pages_dirname = self.results_document_ep.pages_dirname()
        result = (
            f"{pages_dirname}page_{self.page_num}/"
            f"{self.step.percent}/page"
        )
        return result

    @property
    def pages_dirname(self):
        """Pages directory of the (non-results) document path."""
        return self.document_path.pages_dirname()

    @property
    def path(self):
        return self.url()

    def url(self):
        """Default page url, which is the url of the page's text file."""
        return self.txt_url()

    @property
    def txt_path(self):
        return self.txt_url()

    def txt_url(self):
        """Path of the text file of this page."""
        pages_dirname = self.results_document_ep.pages_dirname()
        return f"{pages_dirname}page_{self.page_num}.txt"

    @property
    def hocr_path(self):
        return self.hocr_url()

    def hocr_url(self):
        """Path of the hocr file of this page (requires ``step``)."""
        return f"{self.ppmroot}-{self.ppmtopdf_formated_number}.hocr"

    @property
    def img_path(self):
        return self.img_url()

    def img_url(self):
        """Path of the jpg image of this page (requires ``step``)."""
        return f"{self.ppmroot}-{self.ppmtopdf_formated_number}.jpg"

    @property
    def ppmtopdf_formated_number(self):
        """
        Page number formatted as in the generated image file names.

        Padding depends on the total page count of the document:
        no padding up to 9 pages, two digits up to 99 pages,
        three digits from 100 pages on.
        """
        if self.page_count <= 9:
            fmt_num = "{num:d}"
        elif self.page_count < 100:
            fmt_num = "{num:02d}"
        else:
            fmt_num = "{num:03d}"

        return fmt_num.format(num=int(self.page_num))