Compare commits

...

23 Commits

Author SHA1 Message Date
Eugen Ciur 00775cef7d add empty _s3copy method to the storage class 2021-09-19 08:09:30 +02:00
Eugen Ciur bea2a7dd62 Reset version to 0 for newly created documents "from paste" 2021-03-04 09:34:50 +01:00
Eugen Ciur e27107ae83 add one more test 2021-02-22 12:33:46 +01:00
Eugen Ciur c5a0be464c version bump 2021-02-21 15:23:48 +01:00
Eugen Ciur 6fecc4d60f minor fix 2021-02-21 15:22:42 +01:00
Eugen Ciur 21a9ebb57b README and setup.cfg updated 2021-01-19 12:25:21 +01:00
Eugen Ciur 2d7c96be4e update mglib 2021-01-19 12:18:29 +01:00
Eugen Ciur 90352965e6 PEP8 2021-01-19 11:30:01 +01:00
Eugen Ciur e267f7e98f adding pdfname_from_tiffname 2021-01-19 08:37:36 +01:00
Eugen Ciur 4a7099a16b fix minor bug 2021-01-18 17:49:06 +01:00
Eugen Ciur 7ddb02dcb5 PEP8 formatting 2021-01-18 07:47:08 +01:00
Eugen Ciur eb98ef1329 Add comments about Step class limitations 2021-01-18 07:42:08 +01:00
Eugen Ciur 6f7e8ba0e2 copy txt, jpg and hocr extracted into separate methods 2020-12-25 15:19:36 +01:00
Eugen Ciur 40f95466c8 add upload/download functions 2020-12-25 09:57:36 +01:00
Eugen Ciur 25e973ff79 minor fix 2020-12-24 12:00:47 +01:00
Eugen Ciur 1e6d1a10ec minor fixes, version inc 2020-12-14 07:51:02 +01:00
Eugen Ciur 535af7df83 version inc 2020-12-14 07:01:44 +01:00
Eugen Ciur e341100e69 get_versions method added 2020-12-14 07:01:20 +01:00
Eugen Ciur faba619024 version inc 2020-12-14 06:21:09 +01:00
Eugen Ciur 94a72760ca WIP: document versioning 2020-12-13 08:27:39 +01:00
Eugen Ciur c8b524910d changes to support document versioning 2020-12-11 10:45:10 +01:00
Eugen Ciur 1b86732056 change to support versioning 2020-12-11 10:44:53 +01:00
Eugen Ciur 06be42542a add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00
14 changed files with 397 additions and 97 deletions

View File

@ -1,18 +0,0 @@
MgLib
=======
Python package containing modules shared across all [Papermerge Project](https://github.com/ciur/papermerge) utilities.
## Installation
pip install mglib
## Run tests
python test/run.py
## Requirements
python >= 3.7

20
README.rst Normal file
View File

@ -0,0 +1,20 @@
MgLib
=======
Python package containing modules shared across all `Papermerge Project <https://github.com/ciur/papermerge>`_ utilities.
Installation
##############
pip install mglib
Run tests
###########
python test/run.py
Requirements
##############
python >= 3.7

View File

@ -1,6 +1,40 @@
# Changelog
## [1.3.9] - 2021-09-19
### Added
- empty \_s3copy method to mglib.storage.Storage class
## [1.3.8] - 4 March 2021
- bug fix: reset version to 0 for newly created documents "from paste"
## [1.3.5] - 14 December 2020
### Changed
- bug fixing of get_versions method
## [1.3.4] - 14 December 2020
### Changed
- mglib.storage.get_versions(self, doc_path) method added
## [1.3.3] - 14 December 2020
### Changed
- mglib.path module adjusted to accept version argument. Supports
getting/setting path to versioned documents.
## [1.3.2] - 1 December 2020
### Changed
- mglib.pdfinfo.get_pagecount uses python magic + file extension to determine the correct mime type (and thus page count)
## [1.3.1] - 1 December 2020
### Changed

View File

@ -32,12 +32,16 @@ class DocumentPath:
self.version = version
self.pages = "pages"
def url(self):
return f"{self.dirname}{self.file_name}"
def url(self, version=None):
if version:
version = int(version)
@property
def path(self):
return self.url()
return f"{self.dirname(version=version)}{self.file_name}"
def path(self, version=None):
if version:
version = int(version)
return self.url(version=version)
@property
def dirname_docs(self):
@ -57,21 +61,23 @@ class DocumentPath:
return _path
@property
def dirname(self):
def dirname(self, version=None):
if version is None:
version = self.version
full_path = (
f"{self.aux_dir}/user_{self.user_id}/"
f"document_{self.document_id}/"
)
if self.version > 0:
full_path = f"{full_path}v{self.version}/"
if version > 0:
full_path = f"{full_path}v{version}/"
return full_path
@property
def pages_dirname(self):
return f"{self.dirname}{self.pages}/"
def pages_dirname(self, version=None):
return f"{self.dirname(version=version)}{self.pages}/"
def __repr__(self):
message = (
@ -144,7 +150,7 @@ class PagePath:
@property
def ppmroot(self):
# returns schema://.../<doc_id>/pages/<page_num>/<step>/page
pages_dirname = self.results_document_ep.pages_dirname
pages_dirname = self.results_document_ep.pages_dirname()
result = (
f"{pages_dirname}page_{self.page_num}/"
f"{self.step.percent}/page"
@ -153,7 +159,7 @@ class PagePath:
@property
def pages_dirname(self):
return self.document_path.pages_dirname
return self.document_path.pages_dirname()
@property
def path(self):
@ -167,7 +173,7 @@ class PagePath:
return self.txt_url()
def txt_url(self):
pages_dirname = self.results_document_ep.pages_dirname
pages_dirname = self.results_document_ep.pages_dirname()
return f"{pages_dirname}page_{self.page_num}.txt"
@property
@ -193,7 +199,7 @@ class PagePath:
fmt_num = "{num:d}"
elif self.page_count > 9 and self.page_count < 100:
fmt_num = "{num:02d}"
elif self.page_count > 100:
elif self.page_count >= 100:
fmt_num = "{num:003d}"
return fmt_num.format(

View File

@ -64,22 +64,42 @@ def get_pagecount(filepath):
if os.path.isdir(filepath):
raise ValueError("Filepath %s is a directory!" % filepath)
base, ext = os.path.splitext(filepath)
mime_type = from_file(filepath, mime=True)
# pure images (png, jpeg) have only one page :)
if mime_type in ['image/png', 'image/jpeg', 'image/jpg']:
# whatever png/jpg image is there - it is
# considered by default one page document.
return 1
# In case of REST API upload (via PUT + form multipart)
# Django saves the temporary file as application/octet-stream.
# Checking the file extension is an extra way of finding out
# the correct mime type.
if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
return 1
if mime_type == 'image/tiff':
return get_tiff_pagecount(filepath)
# In case of REST API upload (via PUT + form multipart)
# Django saves the temporary file as application/octet-stream.
# Checking the file extension is an extra way of finding out
# the correct mime type.
if ext and ext.lower() in ('.tiff', ):
return get_tiff_pagecount(filepath)
if mime_type != 'application/pdf':
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handled by this"
" method"
)
# In case of REST API upload (via PUT + form multipart)
# Django saves the temporary file as application/octet-stream.
# Checking the file extension is an extra way of finding out
# the correct mime type.
if ext and ext.lower() != '.pdf':
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handled by this"
" method"
)
# pdfinfo "${PDFFILE}" | grep Pages
cmd = [
settings.BINARY_PDFINFO,

View File

@ -1,5 +1,43 @@
class Step:
# Q: What is ``Step`` and why it was a bad decision to introduce it?
#
# A: ``Step`` class is closely related to zooming in/zooming out
# a specific page in the document in the frontend (javascript code).
#
# When user opens the document in document viewer, he/she actually
# sees an image with text over it (text overlay). Text overlay is
# created from hocr data. Very important point here, is that
# text hocr data corresponds to (extracted, format jpeg) image of the page
# of VERY SAME width/height. Again, hocr file and respective image file
# of the page MUST HAVE SAME WIDTH AND HEIGHT.
#
# Each step is meant to be a specific zoom value of the page. Thus, step
# 2 corresponds to LIST[2] % = 75 % of the page's initial logical size
# of WIDTH_100p = 1240.
# When user zooms in/zooms out - a new hocr file is downloaded
# corresponding to that zoom step. As you may guess, user can zoom only
# 125%, 100%, 75% and 50%. Value of 10% corresponds to thumbnail of the
# document and does not count as 'real' step.
#
# Instead of doing this step thingy, it would have been better to drop
# the entire step concept. Much better solution for zoom in/zoom out would
# have been to download one SVG file for each page (instead of hocr) and
# SVG file of respective page should contain embedded image
# (binary jpeg; yes SVG format allows embedding of binary formats!) and
# correctly mapped text overlay (built from hocr file). User later
# can zoom in/zoom out using SVG transformations in frontend!
#
# The good things about SVG solutions are:
#
# * there will be 4X less OCR required (corresponding to
# hOCR of each step minus thumbnail/10% step)
# * will simplify front-end code as SVG (= hocr + jpeg) will be
# generated on the server side
# * eliminate concept of Step entirely
# (there will be only one SVG file per page)
# * increase front-end and back-end performance as only one SVG file
# will be sent back and forth (from backend to frontend)
#
# width of a document when displayed as 100%.
WIDTH_100p = 1240
PERCENT = 100

View File

@ -18,7 +18,7 @@ class Storage:
on local host filesystem
"""
def __init__(self, location=None):
def __init__(self, location=None, **kwargs):
# by default, this will be something like
# settings.MEDIA_ROOT
self._location = location
@ -27,6 +27,15 @@ class Storage:
def location(self):
return self._location
def upload(self, doc_path_url, **kwargs):
    """
    Placeholder hook: this implementation does nothing.

    Accepts a document path url plus arbitrary keyword arguments
    and returns None.
    """
    return None
def download(self, doc_path_url, **kwargs):
    """
    Placeholder hook: this implementation does nothing.

    Accepts a document path url plus arbitrary keyword arguments
    and returns None.
    """
    return None
def _s3copy(self, src, dst):
pass
def make_sure_path_exists(self, filepath):
logger.debug(f"make_sure_path_exists {filepath}")
dirname = os.path.dirname(filepath)
@ -35,6 +44,39 @@ class Storage:
exist_ok=True
)
def get_versions(self, doc_path):
    """
    Return the ordered list of version numbers of ``doc_path``.

    Numbering starts at 0. Examples of return values:

    - [0, 1, 2, 3] = 4 versions of the document
    - [0] = only one version (original)

    The list is derived purely by counting the subfolders found in
    the document's folder; versions are stored in subfolders
    named v1, v2, v3, ...
    """
    docs_dir = self.path(doc_path.dirname_docs)

    try:
        entries = listdir(docs_dir)
    except FileNotFoundError:
        # In tests, document folders are not always created.
        # A missing document folder is reported as [0], i.e. the
        # document has a single version which is also the latest one.
        return [0]

    subfolder_count = 0
    for entry in entries:
        if isdir(join(docs_dir, entry)):
            subfolder_count += 1

    return list(range(subfolder_count + 1))
def get_pagecount(self, doc_path):
"""
Returns total number of pages for this doc_path.
@ -44,7 +86,7 @@ class Storage:
doc_path_pointing_to_results = DocumentPath.copy_from(
doc_path, aux_dir="results"
)
pages_dir = self.abspath(doc_path_pointing_to_results.pages_dirname)
pages_dir = self.abspath(doc_path_pointing_to_results.pages_dirname())
only_dirs = [
fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
@ -98,7 +140,7 @@ class Storage:
if os.path.exists(abs_dirname_results):
os.rmdir(abs_dirname_results)
def copy_doc(self, src, dst):
def copy_doc(self, src: DocumentPath, dst: DocumentPath):
"""
copy given file src file path to destination
as absolute doc_path
@ -117,7 +159,7 @@ class Storage:
f"copy_doc: {src} to {dst}"
)
shutil.copyfile(
src,
self.abspath(src),
self.abspath(dst)
)
@ -126,24 +168,60 @@ class Storage:
self.path(_path)
)
def copy_page(self, src_page_path, dst_page_path):
err_msg = "copy_page accepts only PageEp instances"
def copy_page_txt(self, src_page_path, dst_page_path):
    """
    Copy the .txt file of the source page to the destination page.

    The destination path is passed to ``make_sure_path_exists``
    before the file is copied with ``shutil.copy``.
    """
    destination = self.abspath(dst_page_path.txt_url())
    self.make_sure_path_exists(destination)

    # resolve absolute locations, source first, then destination
    src_txt, dst_txt = (
        self.abspath(page_path.txt_url())
        for page_path in (src_page_path, dst_page_path)
    )
    logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
    shutil.copy(src_txt, dst_txt)
def copy_page_img(self, src_page_path, dst_page_path):
    """
    Copy the image file of the source page to the destination page.

    The destination path is passed to ``make_sure_path_exists``
    before the file is copied with ``shutil.copy``.
    """
    destination = self.abspath(dst_page_path.img_url())
    self.make_sure_path_exists(destination)

    # resolve absolute locations, source first, then destination
    src_img, dst_img = (
        self.abspath(page_path.img_url())
        for page_path in (src_page_path, dst_page_path)
    )
    logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
    shutil.copy(src_img, dst_img)
def copy_page_hocr(self, src_page_path, dst_page_path):
    """
    Copy the hocr file of the source page to the destination page.

    The destination path is passed to ``make_sure_path_exists``
    before the file is copied with ``shutil.copy``.
    """
    destination = self.abspath(dst_page_path.hocr_url())
    self.make_sure_path_exists(destination)

    # resolve absolute locations, source first, then destination
    src_hocr, dst_hocr = (
        self.abspath(page_path.hocr_url())
        for page_path in (src_page_path, dst_page_path)
    )
    logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
    shutil.copy(src_hocr, dst_hocr)
def copy_page(self, src_page_path, dst_page_path):
"""
Copies page data from source to destination.
Page data are files with following extensions:
* txt
* hocr
* jpeg
they are located in media root of respective application.
"""
for inst in [src_page_path, dst_page_path]:
if not isinstance(inst, PagePath):
raise ValueError(err_msg)
raise ValueError("copy_page accepts only PagePath instances")
# copy .txt file
if self.exists(src_page_path.txt_url()):
self.make_sure_path_exists(
self.abspath(dst_page_path.txt_url())
self.copy_page_txt(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_txt = self.abspath(src_page_path.txt_url())
dst_txt = self.abspath(dst_page_path.txt_url())
logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
shutil.copy(src_txt, dst_txt)
else:
logger.debug(
f"txt does not exits {src_page_path.txt_url()}"
@ -151,28 +229,20 @@ class Storage:
# hocr
if self.exists(src_page_path.hocr_url()):
self.make_sure_path_exists(
self.abspath(dst_page_path.hocr_url())
self.copy_page_hocr(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_hocr = self.abspath(src_page_path.hocr_url())
dst_hocr = self.abspath(dst_page_path.hocr_url())
logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
shutil.copy(src_hocr, dst_hocr)
else:
logger.debug(
f"hocr does not exits {src_page_path.hocr_url()}"
)
if src_page_path.img_url():
self.make_sure_path_exists(
self.abspath(dst_page_path.img_url())
self.copy_page_img(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_img = self.abspath(src_page_path.img_url())
dst_img = self.abspath(dst_page_path.img_url())
logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
shutil.copy(src_img, dst_img)
else:
logger.debug(
f"img does not exits {src_page_path.img_url()}"
@ -324,9 +394,17 @@ class Storage:
from src_doc_path. Both dest and src are instances of
mglib.path.DocumentPath
"""
next_version = 0
if dest_doc_is_new:
# document is new, start version with 0
next_version = 0
else:
# destination document is not new, increment its version
next_version = dest_doc_path.version + 1
next_ver_dp = DocumentPath.copy_from(
dest_doc_path,
version=dest_doc_path.version + 1
version=next_version
)
self.make_sure_path_exists(
self.abspath(next_ver_dp)
@ -381,7 +459,7 @@ class Storage:
)
dest_page_num += 1
return dest_doc_path.version + 1
return next_version
class FileSystemStorage(Storage):

View File

@ -7,16 +7,34 @@ from .conf import settings
logger = logging.getLogger(__name__)
def convert_tiff2pdf(doc_url):
def pdfname_from_tiffname(doc_url):
"""
Given tiff document url, will return
respective pdf file name. Returned
file name can be used as destination
for tiff2pdf tool.
logger.debug(f"convert_tiff2pdf for {doc_url}")
Returns a tuple (new_doc_url, new_filename).
new_doc_url - is new absolute path to the pdf file
new_filename - is new pdf filename
"""
# basename is filename + ext (no path)
basename = os.path.basename(doc_url)
base_root, base_ext = os.path.splitext(basename)
root, ext = os.path.splitext(doc_url)
new_doc_url = f"{root}.pdf"
return new_doc_url, f"{base_root}.pdf"
def convert_tiff2pdf(doc_url):
logger.debug(f"convert_tiff2pdf for {doc_url}")
new_doc_url, new_filename = pdfname_from_tiffname(
doc_url
)
logger.debug(
f"tiff2pdf source={doc_url} dest={new_doc_url}"
)
@ -30,4 +48,4 @@ def convert_tiff2pdf(doc_url):
run(cmd)
# returns new filename
return f"{base_root}.pdf"
return new_filename

View File

@ -43,7 +43,7 @@ def safe_to_delete(place):
for root, dirs, files in os.walk(place):
for name in files:
base, ext = os.path.splitext(name)
if ext not in SAFE_EXTENSIONS:
if ext.lower() not in SAFE_EXTENSIONS:
logger.warning(
f"Trying to delete unsefe location: "
f"extention={ext} not found in {SAFE_EXTENSIONS}"

16
setup.cfg Normal file
View File

@ -0,0 +1,16 @@
[metadata]
name = mglib
version = 1.3.9
description = Common code used across all Papermerge project utilities
long_description = file: README.rst
url = https://www.papermerge.com/
author = Eugen Ciur
author_email = eugen@papermerge.com
keywords= common, package, shared, papermerge, pdf, ocr, dms
license = Apache 2.0 License
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.7
License :: OSI Approved :: Apache Software License
Operating System :: OS Independent

View File

@ -1,25 +1,6 @@
from setuptools import find_packages, setup
with open("README.md", "r") as fh:
long_description = fh.read()
setup(
name="mglib",
version="1.3.1",
author="Eugen Ciur",
author_email="eugen@papermerge.com",
url="https://github.com/papermerge/mglib",
description="Common code used across all Papermerge project utilities",
long_description=long_description,
long_description_content_type="text/markdown",
license="Apache 2.0 License",
keywords="common, package, shared, papermerge, pdf, ocr, dms",
packages=find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
],
python_requires='>=3.7',
)

View File

@ -21,3 +21,16 @@ class TestConvert(unittest.TestCase):
self.assertTrue(
mime_type.is_pdf()
)
def test_get_mime_type(self):
    """
    ``mime.Mime.guess()`` must report "application/pdf" for the
    berlin.pdf fixture located in DATA_DIR.
    """
    file_path = os.path.join(DATA_DIR, "berlin.pdf")
    mime_type = mime.Mime(filepath=file_path)

    # assertEqual instead of the deprecated assertEquals alias,
    # which was removed from unittest in Python 3.12.
    self.assertEqual(
        mime_type.guess(),
        "application/pdf"
    )

View File

@ -19,6 +19,47 @@ class TestDocumentPath(unittest.TestCase):
"docs/user_1/document_3/x.pdf"
)
def test_document_url_with_another_version(self):
    """
    An explicit ``version`` argument passed to url() selects the
    matching vN folder in the returned path.
    """
    doc_ep = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")

    expected_by_version = (
        (3, "docs/user_1/document_15/v3/x.pdf"),
        (2, "docs/user_1/document_15/v2/x.pdf"),
    )
    for requested, expected_url in expected_by_version:
        self.assertEqual(doc_ep.url(version=requested), expected_url)
def test_document_url_none_vs_0(self):
    """
    url(version=None) and url(version=0) are not the same thing:
    None falls back to the document's current version, while 0
    explicitly asks for the original document.
    """
    doc_ep = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")

    # three increments, after which the current version is 3
    for _ in range(3):
        doc_ep.inc_version()

    # with version == None, latest version of the document
    # will be returned, which is 3
    latest_url = doc_ep.url(version=None)
    self.assertEqual(latest_url, "docs/user_1/document_15/v3/x.pdf")

    # with version == 0, version 0 will be provided
    # i.e. version=0 returns original doc.
    original_url = doc_ep.url(version=0)
    self.assertEqual(original_url, "docs/user_1/document_15/x.pdf")
def test_inc_version(self):
"""
Document endpoints are now versioned.
@ -48,6 +89,13 @@ class TestDocumentPath(unittest.TestCase):
"docs/user_1/document_3/v2/x.pdf"
)
# however, explicit version can be forced
# by providing an argument to url method.
self.assertEqual(
doc_ep.url(version=1),
"docs/user_1/document_3/v1/x.pdf"
)
def test_dirname(self):
ep = DocumentPath(
user_id=1,
@ -56,7 +104,7 @@ class TestDocumentPath(unittest.TestCase):
file_name="x.pdf"
)
self.assertEqual(
ep.dirname,
ep.dirname(),
"results/user_1/document_3/"
)
@ -68,7 +116,7 @@ class TestDocumentPath(unittest.TestCase):
file_name="x.pdf"
)
self.assertEqual(
ep.pages_dirname,
ep.pages_dirname(),
"results/user_1/document_3/pages/"
)

View File

@ -41,3 +41,49 @@ class TestStorage(unittest.TestCase):
f1.exists()
)
def test_get_versions_1(self):
# Checks that get_versions() returns [0, 1, 2] when the document
# folder docs/user_1/document_2 contains two subfolders (v1 and v2).
# NOTE(review): indentation is lost in this view, so it is unclear
# which statements belong to the ``with`` body - confirm against the file.
storage = FileSystemStorage(location=MEDIA_ROOT)
with TemporaryNode(MEDIA_ROOT) as media_root:
docs = media_root.add_folder("docs")
res = media_root.add_folder("results")
f1 = docs.add_folder("user_1/document_2")
f1.add_file("doku.pdf")
# simulate 2 versions of the document.
f1.add_folder("v1")
f1.add_folder("v2")
res.add_folder("user_1/document_2/pages")
doc_path = DocumentPath(
user_id=1,
document_id=2,
file_name='doku.pdf',
version=2
)
versions = storage.get_versions(doc_path)
# original document (0) plus one entry per version subfolder
self.assertEqual(
versions, [0, 1, 2]
)
def test_get_versions_2(self):
# Checks that get_versions() returns [0] when the document folder
# holds only the file itself and no version subfolders, even though
# the DocumentPath below is built with version=2.
# NOTE(review): indentation is lost in this view, so it is unclear
# which statements belong to the ``with`` body - confirm against the file.
storage = FileSystemStorage(location=MEDIA_ROOT)
with TemporaryNode(MEDIA_ROOT) as media_root:
docs = media_root.add_folder("docs")
f1 = docs.add_folder("user_1/document_2")
f1.add_file("doku.pdf")
doc_path = DocumentPath(
user_id=1,
document_id=2,
file_name='doku.pdf',
version=2
)
versions = storage.get_versions(doc_path)
# document has only one version - the latest
self.assertEqual(
versions, [0]
)