Compare commits

...

48 Commits

Author SHA1 Message Date
Eugen Ciur 00775cef7d add empty _s3copy method to the storage class 2021-09-19 08:09:30 +02:00
Eugen Ciur bea2a7dd62 Reset version to 0 for newly created documents "from paste" 2021-03-04 09:34:50 +01:00
Eugen Ciur e27107ae83 add one more test 2021-02-22 12:33:46 +01:00
Eugen Ciur c5a0be464c version bump 2021-02-21 15:23:48 +01:00
Eugen Ciur 6fecc4d60f minor fix 2021-02-21 15:22:42 +01:00
Eugen Ciur 21a9ebb57b README and setup.cfg updated 2021-01-19 12:25:21 +01:00
Eugen Ciur 2d7c96be4e update mglib 2021-01-19 12:18:29 +01:00
Eugen Ciur 90352965e6 PEP8 2021-01-19 11:30:01 +01:00
Eugen Ciur e267f7e98f adding pdfname_from_tiffname 2021-01-19 08:37:36 +01:00
Eugen Ciur 4a7099a16b fix minor bug 2021-01-18 17:49:06 +01:00
Eugen Ciur 7ddb02dcb5 PEP8 formatting 2021-01-18 07:47:08 +01:00
Eugen Ciur eb98ef1329 Add comments about Step class limitations 2021-01-18 07:42:08 +01:00
Eugen Ciur 6f7e8ba0e2 copy txt, jpg and hocr extracted into separate methods 2020-12-25 15:19:36 +01:00
Eugen Ciur 40f95466c8 add upload/download functions 2020-12-25 09:57:36 +01:00
Eugen Ciur 25e973ff79 minor fix 2020-12-24 12:00:47 +01:00
Eugen Ciur 1e6d1a10ec minor fixes, version inc 2020-12-14 07:51:02 +01:00
Eugen Ciur 535af7df83 version inc 2020-12-14 07:01:44 +01:00
Eugen Ciur e341100e69 get_versions method added 2020-12-14 07:01:20 +01:00
Eugen Ciur faba619024 version inc 2020-12-14 06:21:09 +01:00
Eugen Ciur 94a72760ca WIP: document versioning 2020-12-13 08:27:39 +01:00
Eugen Ciur c8b524910d changes to support document versioning 2020-12-11 10:45:10 +01:00
Eugen Ciur 1b86732056 change to support versioning 2020-12-11 10:44:53 +01:00
Eugen Ciur 06be42542a add extra checks for mime type, inc version, fix failing tests 2020-12-01 11:40:51 +01:00
Eugen Ciur fe20ddd72b typo in license file 2020-12-01 07:49:57 +01:00
Eugen Ciur b7ce57b055 version inc 2020-12-01 07:47:24 +01:00
Eugen Ciur fa90e6b0a6 removed pdftk dependency 2020-12-01 07:44:57 +01:00
Eugen Ciur 9e24776ba8 typo in setup.py 2020-11-29 09:44:51 +01:00
Eugen Ciur 47ed3e0d94 typo 2020-11-29 09:37:12 +01:00
Eugen Ciur 11bdcd25c2 try with python setup develop instead of installing mglib 2020-11-29 09:35:27 +01:00
Eugen Ciur f72496264a fix failing tests 2020-11-29 09:33:28 +01:00
Eugen Ciur fe35b3c333 use python3.7 2020-11-29 08:58:37 +01:00
Eugen Ciur f02db89086 PEP8 fixes 2020-11-29 08:54:51 +01:00
Eugen Ciur 7fe9928d74 run flake8 + tests 2020-11-29 08:50:46 +01:00
Eugen Ciur 7982243dda increment mglib version 2020-11-29 08:38:14 +01:00
Eugen Ciur 8d5077933f
Merge pull request #6 from francescocarzaniga/master
Replace extension checking with python-magic
2020-11-29 08:35:29 +01:00
francesco.carzaniga 0bf3789dca PNG needs to be more complete to work 2020-11-28 20:37:38 +01:00
francesco.carzaniga b0fbd06a25 Changed test file to include magic bytes 2020-11-28 20:22:02 +01:00
francesco.carzaniga 030df1e049 Modified tests 2020-11-28 20:06:05 +01:00
francesco.carzaniga ae93586a37 Formatting 2020-11-27 19:26:57 +01:00
francesco.carzaniga 0b80c1f446 Replace extension checking with python-magic 2020-11-27 19:22:43 +01:00
Eugen Ciur 9fbaaf7dfd
Merge pull request #2 from georgkrause/test_pdflib
Unit Tests for PDF lib
2020-11-27 15:41:04 +01:00
Eugen Ciur 4e28785b65
Merge pull request #5 from georgkrause/stapler
Add lib interacting with stapler
2020-11-27 15:40:45 +01:00
Georg Krause 6ddff80818
Remove blank line at end of file 2020-11-27 15:04:01 +01:00
Georg Krause 49d569ead8
Add lib interacting with stapler 2020-11-27 14:57:47 +01:00
Eugen Ciur 74cefa5d3c
Merge pull request #4 from georgkrause/github-actions
Introduce Github Actions
2020-11-25 10:41:54 +01:00
Georg Krause 5a46fdaef3
Introduce Github Actions 2020-11-24 21:37:03 +01:00
Georg Krause 1ad7239172
Add Unit Tests for pdftk module 2020-11-24 20:28:39 +01:00
Eugen Ciur 38d3efad11 typo 2020-08-24 09:26:09 +02:00
23 changed files with 635 additions and 180 deletions

35
.github/workflows/python-app.yml vendored Normal file
View File

@ -0,0 +1,35 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: mglib
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
test:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8
python setup.py develop
if [ -f requirements/base.txt ]; then pip install -r requirements/base.txt; fi
sudo apt install poppler-utils pdftk
- name: Lint with flake8
run: |
flake8 mglib
- name: Run tests
run: |
python test/run.py

View File

@ -1,6 +1,6 @@
Copyright 2020 Eugen Ciur <eugen@papermerge.com>
MgMail is Licensed under Apache License version 2.0
MgLib is Licensed under Apache License version 2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this software except in compliance with the License.

View File

@ -1,13 +0,0 @@
MgLib
=======
Python Package containing modules shared across all [Papermerge Project](https://github.com/ciur/papermerge) project.
## Installation
pip install mglib
## Run tests
python test/run.py

20
README.rst Normal file
View File

@ -0,0 +1,20 @@
MgLib
=======
Python package containing modules shared across all `Papermerge Project <https://github.com/ciur/papermerge>`_ components.
Installation
##############
pip install mglib
Run tests
###########
python test/run.py
Requirements
##############
python >= 3.7

View File

@ -1,6 +1,47 @@
# Changelog
## [1.3.9] - 2021-09-19
### Added
- empty \_s3copy method to mglib.storage.Storage class
## [1.3.8] - 4 March 2021
- bug fix: reset version to 0 for newly created documents "from paste"
## [1.3.5] - 14 December 2020
### Changed
- bug fixing of get_versions method
## [1.3.4] - 14 December 2020
### Changed
- mglib.storage.get_versions(self, doc_path) method added
## [1.3.3] - 14 December 2020
### Changed
- mglib.path module adjusted to accept version argument. Supports
getting/setting path to versioned documents.
## [1.3.2] - 1 December 2020
### Changed
- mglib.pdfinfo.get_pagecount uses python-magic + file extension to determine the correct mime type (and thus page count)
## [1.3.1] - 1 December 2020
### Changed
- pdftk module was replaced with stapler
## [1.2.8] - 24 August 2020
### Added

View File

@ -23,6 +23,6 @@ BINARY_IDENTIFY = "/usr/bin/identify"
# Used to extract text from images/PDF files.
BINARY_OCR = "/usr/bin/tesseract"
# Provided by pdftk package
# Used to reorder, cut/paste, delete pages withing PDF document
BINARY_PDFTK = "/usr/bin/pdftk"
# Provided by stapler
# Used to edit PDF documents
BINARY_STAPLER = "~/.local/bin/stapler"

View File

@ -32,12 +32,16 @@ class DocumentPath:
self.version = version
self.pages = "pages"
def url(self):
return f"{self.dirname}{self.file_name}"
def url(self, version=None):
if version:
version = int(version)
@property
def path(self):
return self.url()
return f"{self.dirname(version=version)}{self.file_name}"
def path(self, version=None):
if version:
version = int(version)
return self.url(version=version)
@property
def dirname_docs(self):
@ -57,21 +61,23 @@ class DocumentPath:
return _path
@property
def dirname(self):
def dirname(self, version=None):
if version is None:
version = self.version
full_path = (
f"{self.aux_dir}/user_{self.user_id}/"
f"document_{self.document_id}/"
)
if self.version > 0:
full_path = f"{full_path}v{self.version}/"
if version > 0:
full_path = f"{full_path}v{version}/"
return full_path
@property
def pages_dirname(self):
return f"{self.dirname}{self.pages}/"
def pages_dirname(self, version=None):
return f"{self.dirname(version=version)}{self.pages}/"
def __repr__(self):
message = (
@ -144,7 +150,7 @@ class PagePath:
@property
def ppmroot(self):
# returns schema://.../<doc_id>/pages/<page_num>/<step>/page
pages_dirname = self.results_document_ep.pages_dirname
pages_dirname = self.results_document_ep.pages_dirname()
result = (
f"{pages_dirname}page_{self.page_num}/"
f"{self.step.percent}/page"
@ -153,7 +159,7 @@ class PagePath:
@property
def pages_dirname(self):
return self.document_path.pages_dirname
return self.document_path.pages_dirname()
@property
def path(self):
@ -167,7 +173,7 @@ class PagePath:
return self.txt_url()
def txt_url(self):
pages_dirname = self.results_document_ep.pages_dirname
pages_dirname = self.results_document_ep.pages_dirname()
return f"{pages_dirname}page_{self.page_num}.txt"
@property
@ -193,7 +199,7 @@ class PagePath:
fmt_num = "{num:d}"
elif self.page_count > 9 and self.page_count < 100:
fmt_num = "{num:02d}"
elif self.page_count > 100:
elif self.page_count >= 100:
fmt_num = "{num:003d}"
return fmt_num.format(

View File

@ -2,6 +2,7 @@ import os
import re
import subprocess
import logging
from magic import from_file
from .conf import settings
from .exceptions import FileTypeNotSupported
@ -64,21 +65,41 @@ def get_pagecount(filepath):
raise ValueError("Filepath %s is a directory!" % filepath)
base, ext = os.path.splitext(filepath)
mime_type = from_file(filepath, mime=True)
# pure images (png, jpeg) have only one page :)
if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
if mime_type in ['image/png', 'image/jpeg', 'image/jpg']:
# whatever png/jpg image is there - it is
# considered by default one page document.
return 1
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking extensions is an extra method of finding out correct
# mime type
if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
return 1
if mime_type == 'image/tiff':
return get_tiff_pagecount(filepath)
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking extensions is an extra method of finding out correct
# mime type
if ext and ext.lower() in ('.tiff', ):
return get_tiff_pagecount(filepath)
if ext and ext.lower() not in ('.pdf', '.tiff'):
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handlerd by this"
" method"
)
if mime_type != 'application/pdf':
# In case of REST API upload (via PUT + form multipart)
# django saves temporary file as application/octet-stream
# Checking extensions is an extra method of finding out correct
# mime type
if ext and ext.lower() != '.pdf':
raise FileTypeNotSupported(
"Only jpeg, png, pdf and tiff are handled by this"
" method"
)
# pdfinfo "${PDFFILE}" | grep Pages
cmd = [
settings.BINARY_PDFINFO,

View File

@ -8,9 +8,9 @@ from .conf import settings
logger = logging.getLogger(__name__)
#
# Utilities around pdftk command line tool
# Utilities around stapler command line tool
#
# https://www.pdflabs.com/docs/pdftk-man-page/
# https://github.com/hellerbarde/stapler
#
@ -31,6 +31,8 @@ def cat_ranges_for_reorder(page_count, new_order):
[4, 2, 3, 1]
"""
if len(new_order) != page_count:
raise ValueError("Not enough pages specified")
results = []
# key = page_num
# value = page_order
@ -49,50 +51,6 @@ def cat_ranges_for_reorder(page_count, new_order):
return results
def cat_ranges_for_delete(page_count, page_numbers):
"""
Returns a list of integers. Each number in the list
is the number of page which will 'stay' in document.
In other words, it returns a list with deleted pages.
Examples:
If document has 22 pages (page_count=22) and page number 21 is to be
deleted (i.e page_numbers = [21]) will return
[1, 2, 3, 4, ..., 19, 20, 22]
If page number 1 is to be deleted:
[2, 3, 4, ..., 22] list will be returned.
If page number is 22 is to be deleted:
[1, 2, 3,..., 21] will be returned.
With page_numbers=[1, 7, 10] and page_count=22 result
will be:
(2, 3, 4, 5, 6, 8, 9, 11, 12 , 13, ..., 22)
page_numbers is a list of page numbers (starting with 1).
"""
results = []
for check in page_numbers:
if not isinstance(check, int):
err_msg = "page_numbers must be a list of strings"
raise ValueError(err_msg)
for number in range(1, page_count + 1):
if number not in page_numbers:
results.append(number)
return results
def split_ranges(total, after=False, before=False):
"""
Given a range 1, 2, ..., total (page numbers of a doc).
@ -112,7 +70,7 @@ def split_ranges(total, after=False, before=False):
(it means, by default, all pages are inserted at the end of the doc)
"""
if after and not before:
if not isinstance(after, int):
if not type(after) == int:
raise ValueError(
"argument 'after' is supposed to be an int"
)
@ -121,7 +79,7 @@ def split_ranges(total, after=False, before=False):
return list1, list2
if not after and before:
if not isinstance(before, int):
if not type(before) == int:
raise ValueError(
"argument 'before' is supposed to be an int"
)
@ -185,13 +143,13 @@ def paste_pages_into_existing_doc(
)
cmd = [
settings.BINARY_PDFTK,
settings.BINARY_STAPLER,
]
cmd.append("sel")
# add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map)
cmd.append("cat")
# existing doc pages (may be empty)
cmd.extend(letters_pages_before)
# newly inserted pages
@ -199,8 +157,6 @@ def paste_pages_into_existing_doc(
# existing doc pages (may be empty)
cmd.extend(letters_pages_after)
cmd.append("output")
cmd.append(dst)
run(cmd)
@ -274,17 +230,15 @@ def paste_pages(
)
cmd = [
settings.BINARY_PDFTK,
settings.BINARY_STAPLER,
]
cmd.append("sel")
# add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map)
cmd.append("cat")
cmd.extend(letters_pages)
cmd.append("output")
cmd.append(dst)
run(cmd)
@ -317,39 +271,30 @@ def reorder_pages(
)
cmd = [
settings.BINARY_PDFTK,
src,
"cat"
settings.BINARY_STAPLER,
"sel",
src
]
for page in cat_ranges:
cmd.append(
str(page)
)
cmd.append("output")
cmd.append(dst)
run(cmd)
def delete_pages(src, dst, page_numbers):
page_count = get_pagecount(src)
cat_ranges = cat_ranges_for_delete(
page_count,
page_numbers
)
cmd = [
settings.BINARY_PDFTK,
src,
"cat"
settings.BINARY_STAPLER,
"del",
src
]
for page in cat_ranges:
for page in page_numbers:
cmd.append(
str(page)
)
cmd.append("output")
cmd.append(dst)
run(cmd)

View File

@ -1,5 +1,43 @@
class Step:
# Q: What is ``Step`` and why was it a bad decision to introduce it?
#
# A: ``Step`` class is closely related to zooming in/zooming out
# a specific page in the document in the frontend (javascript code).
#
# When user opens the document in document viewer, he/she actually
# sees an image with text over it (text overlay). Text overlay is
# created from hocr data. Very important point here, is that
# text hocr data corresponds to (extracted, format jpeg) image of the page
# of VERY SAME width/height. Again, hocr file and respective image file
# of the page MUST HAVE SAME WIDTH AND HEIGHT.
#
# Each step is meant to be a specific zoom value of the page. Thus, step
# 2 corresponds to LIST[2] % = 75 % of the page initial logical size
# of WIDTH_100p = 1240.
# When user zooms in/zooms out - a new hocr file is downloaded
# corresponding to that zoom step. As you may guess, user can zoom only
# 125%, 100%, 75% and 50%. Value of 10% corresponds to thumbnail of the
# document and does not count as 'real' step.
#
# Instead of doing this step thingy, it would have been better to drop
# the entire step concept. Much better solution for zoom in/zoom out would
# have been to download one SVG file for each page (instead of hocr) and
# SVG file of respective page should contain embedded image
# (binary jpeg; yes SVG format allows embedding of binary formats!) and
# correctly mapped text overlay (built from hocr file). User later
# can zoom in/zoom out using SVG transformations in the frontend!
#
# The good things about SVG solutions are:
#
# * there will be 4X less OCR required (corresponding to
# hOCR of each step minus thumbnail/10% step)
# * will simplify front-end code as SVG (= hocr + jpeg) will be
# generated on the server side
# * eliminate concept of Step entirely
# (there will be only one SVG file per page)
# * increase front-end and back-end performance as only one SVG file
# will be sent back and forth (from backend to frontend)
#
# width of a document when displayed as 100%.
WIDTH_100p = 1240
PERCENT = 100

View File

@ -4,7 +4,7 @@ import shutil
from os import listdir
from os.path import isdir, join
from mglib import pdftk
from mglib import stapler
from mglib.path import DocumentPath, PagePath
from mglib.step import Steps
from mglib.utils import get_assigns_after_delete, safe_to_delete
@ -18,7 +18,7 @@ class Storage:
on local host filesystem
"""
def __init__(self, location=None):
def __init__(self, location=None, **kwargs):
# by default, this will be something like
# settings.MEDIA_ROOT
self._location = location
@ -27,6 +27,15 @@ class Storage:
def location(self):
return self._location
def upload(self, doc_path_url, **kwargs):
pass
def download(self, doc_path_url, **kwargs):
pass
def _s3copy(self, src, dst):
pass
def make_sure_path_exists(self, filepath):
logger.debug(f"make_sure_path_exists {filepath}")
dirname = os.path.dirname(filepath)
@ -35,6 +44,39 @@ class Storage:
exist_ok=True
)
def get_versions(self, doc_path):
"""
Returns a list of (all) ordered versions
of specific doc_path. Versions
start with 0. Examples of return values:
- [0, 1, 2, 3] = 4 versions of the document
- [ 0 ] = only one version (original)
To count versions it just counts number of subfolders
in specific document folder. Versions are
stored in subfolders named v1, v2, v3, ...
"""
abs_dirname_docs = self.path(
doc_path.dirname_docs
)
try:
only_dirs = [
fi for fi in listdir(abs_dirname_docs) if isdir(
join(abs_dirname_docs, fi)
)
]
except FileNotFoundError:
# in tests, document folders are not always created.
# If no document folder is found, just return [ 0 ]
# i.e that document has only one single version and it
# is the latest one.
return [0]
dirs_count = len(only_dirs)
return list(range(0, dirs_count + 1))
def get_pagecount(self, doc_path):
"""
Returns total number of pages for this doc_path.
@ -44,7 +86,7 @@ class Storage:
doc_path_pointing_to_results = DocumentPath.copy_from(
doc_path, aux_dir="results"
)
pages_dir = self.abspath(doc_path_pointing_to_results.pages_dirname)
pages_dir = self.abspath(doc_path_pointing_to_results.pages_dirname())
only_dirs = [
fi for fi in listdir(pages_dir) if isdir(join(pages_dir, fi))
@ -98,7 +140,7 @@ class Storage:
if os.path.exists(abs_dirname_results):
os.rmdir(abs_dirname_results)
def copy_doc(self, src, dst):
def copy_doc(self, src: DocumentPath, dst: DocumentPath):
"""
copy given file src file path to destination
as absolute doc_path
@ -117,7 +159,7 @@ class Storage:
f"copy_doc: {src} to {dst}"
)
shutil.copyfile(
src,
self.abspath(src),
self.abspath(dst)
)
@ -126,24 +168,60 @@ class Storage:
self.path(_path)
)
def copy_page(self, src_page_path, dst_page_path):
err_msg = "copy_page accepts only PageEp instances"
def copy_page_txt(self, src_page_path, dst_page_path):
self.make_sure_path_exists(
self.abspath(dst_page_path.txt_url())
)
src_txt = self.abspath(src_page_path.txt_url())
dst_txt = self.abspath(dst_page_path.txt_url())
logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
shutil.copy(src_txt, dst_txt)
def copy_page_img(self, src_page_path, dst_page_path):
self.make_sure_path_exists(
self.abspath(dst_page_path.img_url())
)
src_img = self.abspath(src_page_path.img_url())
dst_img = self.abspath(dst_page_path.img_url())
logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
shutil.copy(src_img, dst_img)
def copy_page_hocr(self, src_page_path, dst_page_path):
self.make_sure_path_exists(
self.abspath(dst_page_path.hocr_url())
)
src_hocr = self.abspath(src_page_path.hocr_url())
dst_hocr = self.abspath(dst_page_path.hocr_url())
logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
shutil.copy(src_hocr, dst_hocr)
def copy_page(self, src_page_path, dst_page_path):
"""
Copies page data from source to destination.
Page data are files with the following extensions:
* txt
* hocr
* jpeg
they are located in media root of respective application.
"""
for inst in [src_page_path, dst_page_path]:
if not isinstance(inst, PagePath):
raise ValueError(err_msg)
raise ValueError("copy_page accepts only PagePath instances")
# copy .txt file
if self.exists(src_page_path.txt_url()):
self.make_sure_path_exists(
self.abspath(dst_page_path.txt_url())
self.copy_page_txt(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_txt = self.abspath(src_page_path.txt_url())
dst_txt = self.abspath(dst_page_path.txt_url())
logger.debug(f"copy src_txt={src_txt} dst_txt={dst_txt}")
shutil.copy(src_txt, dst_txt)
else:
logger.debug(
f"txt does not exits {src_page_path.txt_url()}"
@ -151,28 +229,20 @@ class Storage:
# hocr
if self.exists(src_page_path.hocr_url()):
self.make_sure_path_exists(
self.abspath(dst_page_path.hocr_url())
self.copy_page_hocr(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_hocr = self.abspath(src_page_path.hocr_url())
dst_hocr = self.abspath(dst_page_path.hocr_url())
logger.debug(f"copy src_hocr={src_hocr} dst_hocr={dst_hocr}")
shutil.copy(src_hocr, dst_hocr)
else:
logger.debug(
f"hocr does not exits {src_page_path.hocr_url()}"
)
if src_page_path.img_url():
self.make_sure_path_exists(
self.abspath(dst_page_path.img_url())
self.copy_page_img(
src_page_path=src_page_path,
dst_page_path=dst_page_path
)
src_img = self.abspath(src_page_path.img_url())
dst_img = self.abspath(dst_page_path.img_url())
logger.debug(f"copy src_img={src_img} dst_img={dst_img}")
shutil.copy(src_img, dst_img)
else:
logger.debug(
f"img does not exits {src_page_path.img_url()}"
@ -209,7 +279,7 @@ class Storage:
self.abspath(dst_doc_path)
)
pdftk.reorder_pages(
stapler.reorder_pages(
src=self.abspath(src_doc_path),
dst=self.abspath(dst_doc_path),
new_order=new_order
@ -269,7 +339,7 @@ class Storage:
self.make_sure_path_exists(
self.abspath(dst_doc_path)
)
pdftk.delete_pages(
stapler.delete_pages(
self.abspath(src_doc_path),
self.abspath(dst_doc_path),
page_numbers
@ -324,15 +394,23 @@ class Storage:
from src_doc_path. Both dest and src are instances of
mglib.path.DocumentPath
"""
next_version = 0
if dest_doc_is_new:
# document is new, start version with 0
next_version = 0
else:
# destination document is not new, increment its version
next_version = dest_doc_path.version + 1
next_ver_dp = DocumentPath.copy_from(
dest_doc_path,
version=dest_doc_path.version + 1
version=next_version
)
self.make_sure_path_exists(
self.abspath(next_ver_dp)
)
pdftk.paste_pages(
stapler.paste_pages(
src=self.abspath(dest_doc_path),
dst=self.abspath(next_ver_dp),
data_list=data_list,
@ -381,7 +459,7 @@ class Storage:
)
dest_page_num += 1
return dest_doc_path.version + 1
return next_version
class FileSystemStorage(Storage):

View File

@ -7,16 +7,34 @@ from .conf import settings
logger = logging.getLogger(__name__)
def convert_tiff2pdf(doc_url):
def pdfname_from_tiffname(doc_url):
"""
Given tiff document url, will return
respective pdf file name. Returned
file name can be used as destination
for tiff2pdf tool.
logger.debug(f"convert_tiff2pdf for {doc_url}")
Returns a tuple (new_doc_url, new_filename).
new_doc_url - is new absolute path to the pdf file
new_filename - is new pdf filename
"""
# basename is filename + ext (no path)
basename = os.path.basename(doc_url)
base_root, base_ext = os.path.splitext(basename)
root, ext = os.path.splitext(doc_url)
new_doc_url = f"{root}.pdf"
return new_doc_url, f"{base_root}.pdf"
def convert_tiff2pdf(doc_url):
logger.debug(f"convert_tiff2pdf for {doc_url}")
new_doc_url, new_filename = pdfname_from_tiffname(
doc_url
)
logger.debug(
f"tiff2pdf source={doc_url} dest={new_doc_url}"
)
@ -30,4 +48,4 @@ def convert_tiff2pdf(doc_url):
run(cmd)
# returns new filename
return f"{base_root}.pdf"
return new_filename

View File

@ -43,7 +43,7 @@ def safe_to_delete(place):
for root, dirs, files in os.walk(place):
for name in files:
base, ext = os.path.splitext(name)
if ext not in SAFE_EXTENSIONS:
if ext.lower() not in SAFE_EXTENSIONS:
logger.warning(
f"Trying to delete unsefe location: "
f"extention={ext} not found in {SAFE_EXTENSIONS}"
@ -87,7 +87,7 @@ def get_assigns_after_delete(total_pages, deleted_pages):
]
"""
if total_pages < len(deleted_pages):
err_msg = f"total_pages < deleted_pages"
err_msg = "total_pages < deleted_pages"
raise ValueError(err_msg)
# only numbers of pages which were not deleted
@ -136,7 +136,7 @@ def try_load_config(config_locations, config_env_var_name):
try:
cfg_papermerge, cfg_file_found = load_config(config_file)
except FileNotFoundError:
err_msg = f"Failed attempted to read" +\
err_msg = "Failed attempted to read" +\
f" configuration file '{config_file}'" +\
f" pointed by environment variable '{config_env_var_name}'"
raise FileNotFoundError(err_msg)

1
requirements/base.txt Normal file
View File

@ -0,0 +1 @@
python-magic

16
setup.cfg Normal file
View File

@ -0,0 +1,16 @@
[metadata]
name = mglib
version = 1.3.9
description = Common code used across all Papermerge project utilities
long_description = file: README.rst
url = https://www.papermerge.com/
author = Eugen Ciur
author_email = eugen@papermerge.com
keywords= common, package, shared, papermerge, pdf, ocr, dms
license = Apache 2.0 License
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.7
License :: OSI Approved :: Apache Software License
Operating System :: OS Independent

View File

@ -1,25 +1,6 @@
from setuptools import find_packages, setup
with open("README.md", "r") as fh:
long_description = fh.read()
setup(
name="mglib",
version="1.2.8",
author="Eugen Ciur",
author_email="eugen@papermerge.com",
url="https://github.com/papermerge/mglib",
description="Common code used across all Papermerge project utilities",
long_description=long_description,
long_description_content_type="text/markdown",
license="Apache 2.0 License",
keywords="common, pacakge, shared, papermerge",
packages=find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
],
python_requires='>=3.7',
)

View File

@ -1,2 +1,2 @@
I am not even binary!
ÿØÿØI am not even binary!
The idea is to test pdfinfo.get_pagecount

Before

Width:  |  Height:  |  Size: 63 B

After

Width:  |  Height:  |  Size: 67 B

View File

@ -1 +1 @@
well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
ÿØÿîwell... I am text! But who cares? The idea is to test pdfinfo.get_pagecount

Before

Width:  |  Height:  |  Size: 75 B

After

Width:  |  Height:  |  Size: 79 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 75 B

After

Width:  |  Height:  |  Size: 1.3 KiB

View File

@ -21,3 +21,16 @@ class TestConvert(unittest.TestCase):
self.assertTrue(
mime_type.is_pdf()
)
def test_get_mime_type(self):
file_path = os.path.join(
DATA_DIR,
"berlin.pdf"
)
mime_type = mime.Mime(filepath=file_path)
self.assertEquals(
mime_type.guess(),
"application/pdf"
)

View File

@ -19,6 +19,47 @@ class TestDocumentPath(unittest.TestCase):
"docs/user_1/document_3/x.pdf"
)
def test_document_url_with_another_version(self):
doc_ep = DocumentPath(
user_id=1,
document_id=15,
file_name="x.pdf"
)
self.assertEqual(
doc_ep.url(version=3),
"docs/user_1/document_15/v3/x.pdf"
)
self.assertEqual(
doc_ep.url(version=2),
"docs/user_1/document_15/v2/x.pdf"
)
def test_document_url_none_vs_0(self):
doc_ep = DocumentPath(
user_id=1,
document_id=15,
file_name="x.pdf"
)
doc_ep.inc_version() # current version = 1
doc_ep.inc_version() # current version = 2
doc_ep.inc_version() # current version = 3
self.assertEqual(
# with version == None, latest version of the document
# will be returned, which is 3
doc_ep.url(version=None),
"docs/user_1/document_15/v3/x.pdf"
)
self.assertEqual(
# with version == 0, version 0 will be provided
# i.e. version=0 returns original doc.
doc_ep.url(version=0),
"docs/user_1/document_15/x.pdf"
)
def test_inc_version(self):
"""
Document endpoints are now versioned.
@ -48,6 +89,13 @@ class TestDocumentPath(unittest.TestCase):
"docs/user_1/document_3/v2/x.pdf"
)
# however, explicit version can be forced
# by providing an argument to url method.
self.assertEqual(
doc_ep.url(version=1),
"docs/user_1/document_3/v1/x.pdf"
)
def test_dirname(self):
ep = DocumentPath(
user_id=1,
@ -56,7 +104,7 @@ class TestDocumentPath(unittest.TestCase):
file_name="x.pdf"
)
self.assertEqual(
ep.dirname,
ep.dirname(),
"results/user_1/document_3/"
)
@ -68,7 +116,7 @@ class TestDocumentPath(unittest.TestCase):
file_name="x.pdf"
)
self.assertEqual(
ep.pages_dirname,
ep.pages_dirname(),
"results/user_1/document_3/pages/"
)

161
test/test_stapler.py Normal file
View File

@ -0,0 +1,161 @@
import os
import unittest
from unittest import mock
from mglib import stapler
from mglib.conf import settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
class TestPdfLib(unittest.TestCase):
def test_ranges_for_reorder(self):
actual = stapler.cat_ranges_for_reorder(4, [
{"page_order": 1, "page_num": 4},
{"page_order": 2, "page_num": 3},
{"page_order": 3, "page_num": 2},
{"page_order": 4, "page_num": 1}
])
expected = [4, 3, 2, 1]
assert expected == actual
self.assertRaises(ValueError, stapler.cat_ranges_for_reorder, 2, [])
self.assertRaises(KeyError, stapler.cat_ranges_for_reorder, 2, [
{"page_order": 3, "page_num": 4},
{"page_order": 5, "page_num": 6}
])
def test_delete_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
with mock.patch("mglib.stapler.run") as run_func:
stapler.delete_pages(input_file, output_file, [1])
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "del", input_file, "1", output_file]
)
def test_split_ranges(self):
page_count = 9
self.assertRaises(
ValueError,
stapler.split_ranges,
9,
after="a",
before=False
)
self.assertRaises(
ValueError,
stapler.split_ranges,
9, after=False,
before=True
)
actual1, actual2 = stapler.split_ranges(page_count, 1, False)
expected1 = [1]
expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
assert actual1 == expected1
assert actual2 == expected2
actual1, actual2 = stapler.split_ranges(page_count, False, 2)
expected1 = [1]
expected2 = [2, 3, 4, 5, 6, 7, 8, 9]
assert actual1 == expected1
assert actual2 == expected2
actual1, actual2 = stapler.split_ranges(page_count)
expected1 = list(range(1, page_count + 1))
expected2 = []
assert actual1 == expected1
assert actual2 == expected2
def test_reorder_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
new_order = [
{'page_num': 2, 'page_order': 1},
{'page_num': 1, 'page_order': 2},
]
with mock.patch("mglib.stapler.run") as run_func:
stapler.reorder_pages(input_file, output_file, new_order)
run_func.assert_called()
run_func.assert_called_with(
[
settings.BINARY_STAPLER,
"sel",
input_file,
"2",
"1",
output_file
]
)
def test_paste_pages_into_existing_doc(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(
input_file, output_file, datalist
)
run_func.assert_called()
run_func.assert_called_with(
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file, "A1", "A2", output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages_into_existing_doc(
input_file,
output_file,
datalist,
1
)
run_func.assert_called()
run_func.assert_called_with(
[
settings.BINARY_STAPLER,
"sel", "A=" + input_file,
"B=" + input_file, "A1", "B3",
"B4", "A2", output_file
]
)
def test_paste_pages(self):
input_file = os.path.join(DATA_DIR, "berlin.pdf")
output_file = os.path.join(DATA_DIR, "berlin2.pdf")
datalist = []
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist, False)
run_func.assert_called()
run_func.assert_called_with(
[
settings.BINARY_STAPLER,
"sel",
"A=" + input_file,
"A1",
"A2",
output_file
]
)
datalist = [{"src": input_file, "page_nums": "34"}]
with mock.patch("mglib.stapler.run") as run_func:
stapler.paste_pages(input_file, output_file, datalist)
run_func.assert_called()
run_func.assert_called_with(
[settings.BINARY_STAPLER, "sel", "A=" + input_file, "A3", "A4",
output_file]
)

View File

@ -41,3 +41,49 @@ class TestStorage(unittest.TestCase):
f1.exists()
)
def test_get_versions_1(self):
storage = FileSystemStorage(location=MEDIA_ROOT)
with TemporaryNode(MEDIA_ROOT) as media_root:
docs = media_root.add_folder("docs")
res = media_root.add_folder("results")
f1 = docs.add_folder("user_1/document_2")
f1.add_file("doku.pdf")
# simulate 2 versions of the document.
f1.add_folder("v1")
f1.add_folder("v2")
res.add_folder("user_1/document_2/pages")
doc_path = DocumentPath(
user_id=1,
document_id=2,
file_name='doku.pdf',
version=2
)
versions = storage.get_versions(doc_path)
self.assertEqual(
versions, [0, 1, 2]
)
def test_get_versions_2(self):
storage = FileSystemStorage(location=MEDIA_ROOT)
with TemporaryNode(MEDIA_ROOT) as media_root:
docs = media_root.add_folder("docs")
f1 = docs.add_folder("user_1/document_2")
f1.add_file("doku.pdf")
doc_path = DocumentPath(
user_id=1,
document_id=2,
file_name='doku.pdf',
version=2
)
versions = storage.get_versions(doc_path)
# document has only one version - the latest
self.assertEqual(
versions, [0]
)