mirror of https://github.com/papermerge/mglib
make binary paths configurable
parent
ac7f2ae37b
commit
5edd196aaa
|
@ -1,5 +1,14 @@
|
||||||
|
|
||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
|
||||||
|
## [1.2.6] - 11 August 2020
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- mglib.conf.settings module. The point is to get rid of hardcoded binary paths. Binary paths are now provided as configuration settings.
|
||||||
|
|
||||||
|
|
||||||
## [1.2.3] - 25 July 2020
|
## [1.2.3] - 25 July 2020
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
from .settings import (
|
||||||
|
DefaultSettings,
|
||||||
|
MgLibSettings
|
||||||
|
)
|
||||||
|
|
||||||
|
settings = MgLibSettings(
|
||||||
|
DefaultSettings()
|
||||||
|
)
|
|
@ -0,0 +1,28 @@
|
||||||
|
|
||||||
|
|
||||||
|
# file utility used to find out mime type of a file
|
||||||
|
BINARY_FILE = "/usr/bin/file"
|
||||||
|
|
||||||
|
# Provided by ImageMagick package.
|
||||||
|
# Used for resizing images.
|
||||||
|
BINARY_CONVERT = "/usr/bin/convert"
|
||||||
|
|
||||||
|
# Provided by Poppler Utils.
|
||||||
|
# Used to extract images from PDF file.
|
||||||
|
BINARY_PDFTOPPM = "/usr/bin/pdftoppm"
|
||||||
|
|
||||||
|
# Provided by Poppler Utils.
|
||||||
|
# used to get page count in PDF file
|
||||||
|
BINARY_PDFINFO = "/usr/bin/pdfinfo"
|
||||||
|
|
||||||
|
# Provided by ImageMagick package.
|
||||||
|
# Used to get number of pages in TIFF file.
|
||||||
|
BINARY_IDENTIFY = "/usr/bin/identify"
|
||||||
|
|
||||||
|
# Provided by tesseract package.
|
||||||
|
# Used to extract text from images/PDF files.
|
||||||
|
BINARY_OCR = "/usr/bin/tesseract"
|
||||||
|
|
||||||
|
# Provided by pdftk package
|
||||||
|
# Used to reorder, cut/paste, delete pages within a PDF document
|
||||||
|
BINARY_PDFTK = "/usr/bin/pdftk"
|
|
@ -0,0 +1,55 @@
|
||||||
|
import importlib
|
||||||
|
|
||||||
|
try:
|
||||||
|
from django.conf import settings as django_settings
|
||||||
|
except ImportError:
|
||||||
|
# Operating outside django, use own settings module
|
||||||
|
django_settings = None
|
||||||
|
|
||||||
|
|
||||||
|
class DefaultSettings:
    """Settings container populated from a Python module.

    On construction the module named by ``settings_module`` is imported
    and each of its upper-case attributes is copied onto the instance.
    """

    def __init__(
        self,
        settings_module="mglib.conf.default_settings"
    ):
        # Remember which module the values were loaded from.
        self.SETTINGS_MODULE = settings_module
        source = importlib.import_module(self.SETTINGS_MODULE)

        # Only upper-case names are treated as settings.
        upper_names = (key for key in dir(source) if key.isupper())
        for key in upper_names:
            setattr(self, key, getattr(source, key))

    def configure(self, **options):
        """Set every keyword argument as an attribute of this instance."""
        for option_name in options:
            setattr(self, option_name, options[option_name])
|
||||||
|
|
||||||
|
|
||||||
|
class MgLibSettings:
    """Read-only facade used by mglib to look up its settings.

    Upper-case attribute lookups are resolved in this order:

    1. the django settings object, when django could be imported
       (module-level ``django_settings`` is not ``None``);
    2. the ``default_settings`` object passed to the constructor.

    Any other attribute name raises ``AttributeError``.
    """

    def __init__(
        self, default_settings
    ):
        # Object holding fallback values; it must offer
        # ``configure(**options)`` for ``configure`` below to work.
        self.default_settings = default_settings

    def __getattr__(self, name):
        """Return the value of setting ``name``.

        Raises ``AttributeError`` when ``name`` is not upper-case or when
        neither django settings nor the default settings define it.
        """
        if not name.isupper():
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        # When operating within django, prefer configuration from
        # django settings.
        if django_settings is not None:
            try:
                return getattr(django_settings, name)
            except AttributeError:
                # The django project does not define this setting;
                # fall back to the library defaults instead of failing.
                pass

        return getattr(self.default_settings, name)

    def configure(self, **options):
        """Override default values with the given keyword arguments.

        The call is forwarded to ``default_settings.configure``; values
        defined in django settings still take precedence on lookup.
        """
        self.default_settings.configure(
            **options
        )
|
|
@ -1,13 +1,14 @@
|
||||||
import logging
|
import logging
|
||||||
from . import wrapper
|
from . import wrapper
|
||||||
|
|
||||||
|
from .conf import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Mime(wrapper.Wrapper):
|
class Mime(wrapper.Wrapper):
|
||||||
def __init__(self, filepath):
|
def __init__(self, filepath):
|
||||||
super().__init__(exec_name="file")
|
super().__init__(exec_name=settings.BINARY_FILE)
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
|
|
||||||
def get_cmd(self):
|
def get_cmd(self):
|
||||||
|
|
|
@ -3,6 +3,8 @@ import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from .conf import settings
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Uses command line pdfinfo utility (from poppler pakage) for various
|
Uses command line pdfinfo utility (from poppler pakage) for various
|
||||||
small operations (e.g. get pdf page count).
|
small operations (e.g. get pdf page count).
|
||||||
|
@ -13,7 +15,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def get_tiff_pagecount(filepath):
|
def get_tiff_pagecount(filepath):
|
||||||
cmd = [
|
cmd = [
|
||||||
"/usr/bin/identify",
|
settings.BINARY_IDENTIFY,
|
||||||
"-format",
|
"-format",
|
||||||
"%n\n",
|
"%n\n",
|
||||||
filepath
|
filepath
|
||||||
|
@ -76,10 +78,11 @@ def get_pagecount(filepath):
|
||||||
"Only jpeg, png, pdf and tiff are handlerd by this"
|
"Only jpeg, png, pdf and tiff are handlerd by this"
|
||||||
" method"
|
" method"
|
||||||
)
|
)
|
||||||
|
|
||||||
# pdfinfo "${PDFFILE}" | grep Pages
|
# pdfinfo "${PDFFILE}" | grep Pages
|
||||||
|
cmd = [
|
||||||
cmd = ["/usr/bin/pdfinfo", filepath]
|
settings.BINARY_PDFINFO,
|
||||||
|
filepath
|
||||||
|
]
|
||||||
compl = subprocess.run(
|
compl = subprocess.run(
|
||||||
cmd,
|
cmd,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
|
|
|
@ -3,6 +3,8 @@ import logging
|
||||||
from mglib.runcmd import run
|
from mglib.runcmd import run
|
||||||
from mglib.pdfinfo import get_pagecount
|
from mglib.pdfinfo import get_pagecount
|
||||||
|
|
||||||
|
from .conf import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -183,7 +185,7 @@ def paste_pages_into_existing_doc(
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
settings.BINARY_PDFTK,
|
||||||
]
|
]
|
||||||
# add A=doc1_path, B=doc2_path
|
# add A=doc1_path, B=doc2_path
|
||||||
cmd.extend(letters_2_doc_map)
|
cmd.extend(letters_2_doc_map)
|
||||||
|
@ -272,7 +274,7 @@ def paste_pages(
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
settings.BINARY_PDFTK,
|
||||||
]
|
]
|
||||||
# add A=doc1_path, B=doc2_path
|
# add A=doc1_path, B=doc2_path
|
||||||
cmd.extend(letters_2_doc_map)
|
cmd.extend(letters_2_doc_map)
|
||||||
|
@ -315,7 +317,7 @@ def reorder_pages(
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
settings.BINARY_PDFTK,
|
||||||
src,
|
src,
|
||||||
"cat"
|
"cat"
|
||||||
]
|
]
|
||||||
|
@ -338,7 +340,7 @@ def delete_pages(src, dst, page_numbers):
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"pdftk",
|
settings.BINARY_PDFTK,
|
||||||
src,
|
src,
|
||||||
"cat"
|
"cat"
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,6 +2,7 @@ import os
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from mglib.runcmd import run
|
from mglib.runcmd import run
|
||||||
|
from .conf import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -28,7 +29,7 @@ def resize_img(page_path, media_root):
|
||||||
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
||||||
|
|
||||||
cmd = (
|
cmd = (
|
||||||
"convert",
|
settings.BINARY_CONVERT,
|
||||||
"-resize",
|
"-resize",
|
||||||
f"{width}x",
|
f"{width}x",
|
||||||
local_abspath,
|
local_abspath,
|
||||||
|
@ -61,7 +62,7 @@ def extract_img(page_path, media_root):
|
||||||
else:
|
else:
|
||||||
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
||||||
cmd = (
|
cmd = (
|
||||||
"pdftoppm",
|
settings.BINARY_PDFTOPPM,
|
||||||
"-jpeg",
|
"-jpeg",
|
||||||
"-f",
|
"-f",
|
||||||
str(page_num),
|
str(page_num),
|
||||||
|
@ -89,7 +90,7 @@ def extract_hocr(page_url, lang, media_root):
|
||||||
os.path.join(media_root, page_url.hocr_url())
|
os.path.join(media_root, page_url.hocr_url())
|
||||||
)
|
)
|
||||||
cmd = (
|
cmd = (
|
||||||
"tesseract",
|
settings.BINARY_OCR,
|
||||||
"-l",
|
"-l",
|
||||||
lang,
|
lang,
|
||||||
page_abspath,
|
page_abspath,
|
||||||
|
@ -112,37 +113,10 @@ def extract_txt(page_url, lang, media_root):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cmd = (
|
cmd = (
|
||||||
"tesseract",
|
settings.BINARY_OCR,
|
||||||
"-l",
|
"-l",
|
||||||
lang,
|
lang,
|
||||||
page_abspath,
|
page_abspath,
|
||||||
txt_root
|
txt_root
|
||||||
)
|
)
|
||||||
run(cmd)
|
run(cmd)
|
||||||
|
|
||||||
|
|
||||||
#def text_from_pdf(filepath, lang, dry_run=False):
|
|
||||||
#
|
|
||||||
# # suffix .tiff in file name is required by conver utility, otherwise
|
|
||||||
# # it won't convert to tiff format!
|
|
||||||
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
|
||||||
# conv = convert.Convert(dry_run=dry_run)
|
|
||||||
# conv(filepath=filepath, fout=tiff)
|
|
||||||
# try:
|
|
||||||
# tsact = tesseract.Tesseract()
|
|
||||||
# text = tsact(filepath=tiff.name, lang=lang)
|
|
||||||
# except subprocess.CalledProcessError as e:
|
|
||||||
# print(e)
|
|
||||||
# print(e.stderr)
|
|
||||||
# return
|
|
||||||
#
|
|
||||||
# return text
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#def text_from_image(filepath, lang, dry_run=False):
|
|
||||||
#
|
|
||||||
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
|
||||||
# text = tsact(filepath=filepath, lang=lang)
|
|
||||||
#
|
|
||||||
# return text
|
|
||||||
#
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ import os
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from mglib.runcmd import run
|
from mglib.runcmd import run
|
||||||
|
from .conf import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -21,7 +22,7 @@ def convert_tiff2pdf(doc_url):
|
||||||
)
|
)
|
||||||
|
|
||||||
cmd = (
|
cmd = (
|
||||||
"convert",
|
settings.BINARY_CONVERT,
|
||||||
doc_url,
|
doc_url,
|
||||||
new_doc_url,
|
new_doc_url,
|
||||||
)
|
)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -6,7 +6,7 @@ with open("README.md", "r") as fh:
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="mglib",
|
name="mglib",
|
||||||
version="1.2.5",
|
version="1.2.6",
|
||||||
author="Eugen Ciur",
|
author="Eugen Ciur",
|
||||||
author_email="eugen@papermerge.com",
|
author_email="eugen@papermerge.com",
|
||||||
url="https://github.com/papermerge/mglib",
|
url="https://github.com/papermerge/mglib",
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from mglib.conf.settings import (
|
||||||
|
MgLibSettings,
|
||||||
|
DefaultSettings
|
||||||
|
)
|
||||||
|
|
||||||
|
DATA_DIR = os.path.join(
|
||||||
|
Path(__file__).parent,
|
||||||
|
'data'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMgLibSettings(unittest.TestCase):
    """Checks for MgLibSettings wrapping a freshly built DefaultSettings."""

    def setUp(self):
        defaults = DefaultSettings()
        self.settings = MgLibSettings(defaults)

    def test_settings_outside_django_should_work(self):
        """
        Without django there should be default values
        for settings
        """
        # pdfinfo must resolve to its default path
        expected = "/usr/bin/pdfinfo"
        self.assertEqual(expected, self.settings.BINARY_PDFINFO)

    def test_settings_are_configurable(self):
        """
        User should be able to reconfigure mglibsettings
        on the go (i.e. change default values).
        """
        # override the pdfinfo path, then read it back
        override = "/usr/bin/xyz"
        self.settings.configure(BINARY_PDFINFO=override)
        self.assertEqual(override, self.settings.BINARY_PDFINFO)
|
||||||
|
|
Loading…
Reference in New Issue