From 5edd196aaa1fe710ac5742048b1e252a38e3587c Mon Sep 17 00:00:00 2001 From: Eugen Ciur Date: Tue, 11 Aug 2020 19:47:35 +0200 Subject: [PATCH] make binary paths configurable --- changelog.md | 9 ++++++ mglib/conf/__init__.py | 8 +++++ mglib/conf/default_settings.py | 28 +++++++++++++++++ mglib/conf/settings.py | 55 ++++++++++++++++++++++++++++++++++ mglib/mime.py | 3 +- mglib/pdfinfo.py | 11 ++++--- mglib/pdftk.py | 10 ++++--- mglib/shortcuts.py | 36 ++++------------------ mglib/tiff.py | 3 +- setup.py | 2 +- test/test_settings.py | 45 ++++++++++++++++++++++++++++ 11 files changed, 168 insertions(+), 42 deletions(-) create mode 100644 mglib/conf/__init__.py create mode 100644 mglib/conf/default_settings.py create mode 100644 mglib/conf/settings.py create mode 100644 test/test_settings.py diff --git a/changelog.md b/changelog.md index b86f96d..9917620 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,14 @@ + # Changelog + +## [1.2.6] - 11 August 2020 + +### Added + +- mglib.conf.settings module. The point is to get rid of hardcoded binary paths. Binary paths are now provided as configurations. + + ## [1.2.3] - 25 July 2020 ### Changed diff --git a/mglib/conf/__init__.py b/mglib/conf/__init__.py new file mode 100644 index 0000000..ef600b7 --- /dev/null +++ b/mglib/conf/__init__.py @@ -0,0 +1,8 @@ +from .settings import ( + DefaultSettings, + MgLibSettings +) + +settings = MgLibSettings( + DefaultSettings() +) diff --git a/mglib/conf/default_settings.py b/mglib/conf/default_settings.py new file mode 100644 index 0000000..130b981 --- /dev/null +++ b/mglib/conf/default_settings.py @@ -0,0 +1,28 @@ + + +# file utility used to find out mime type of a file +BINARY_FILE = "/usr/bin/file" + +# Provided by ImageMagick package. +# Used for resizing images. +BINARY_CONVERT = "/usr/bin/convert" + +# Provided by Poppler Utils. +# Used to extract images from PDF file. +BINARY_PDFTOPPM = "/usr/bin/pdftoppm" + +# Provided by Poppler Utils. 
+# used to get page count in PDF file +BINARY_PDFINFO = "/usr/bin/pdfinfo" + +# Provided by ImageMagick package. +# Used to get number of pages in TIFF file. +BINARY_IDENTIFY = "/usr/bin/identify" + +# Provided by tesseract package. +# Used to extract text from images/PDF files. +BINARY_OCR = "/usr/bin/tesseract" + +# Provided by pdftk package +# Used to reorder, cut/paste, delete pages within PDF document +BINARY_PDFTK = "/usr/bin/pdftk" diff --git a/mglib/conf/settings.py b/mglib/conf/settings.py new file mode 100644 index 0000000..63c3f58 --- /dev/null +++ b/mglib/conf/settings.py @@ -0,0 +1,55 @@ +import importlib + +try: + from django.conf import settings as django_settings +except ImportError: + # Operating outside django, use own settings module + django_settings = None + + +class DefaultSettings: + + def __init__( + self, + settings_module="mglib.conf.default_settings" + ): + self.SETTINGS_MODULE = settings_module + + mod = importlib.import_module( + self.SETTINGS_MODULE + ) + + for setting in dir(mod): + if setting.isupper(): + setting_value = getattr(mod, setting) + setattr(self, setting, setting_value) + + def configure(self, **options): + for name, value in options.items(): + setattr(self, name, value) + + +class MgLibSettings: + + def __init__( + self, default_settings + ): + self.default_settings = default_settings + + def __getattr__(self, name): + # When operating within django, + # get configuration from django settings + if not name.isupper(): + raise AttributeError + + if django_settings: + val = getattr(django_settings, name) + return val + + val = getattr(self.default_settings, name) + return val + + def configure(self, **options): + self.default_settings.configure( + **options + ) diff --git a/mglib/mime.py b/mglib/mime.py index 7e7845d..22c1269 100644 --- a/mglib/mime.py +++ b/mglib/mime.py @@ -1,13 +1,14 @@ import logging from . 
import wrapper +from .conf import settings logger = logging.getLogger(__name__) class Mime(wrapper.Wrapper): def __init__(self, filepath): - super().__init__(exec_name="file") + super().__init__(exec_name=settings.BINARY_FILE) self.filepath = filepath def get_cmd(self): diff --git a/mglib/pdfinfo.py b/mglib/pdfinfo.py index 997884a..7aba959 100644 --- a/mglib/pdfinfo.py +++ b/mglib/pdfinfo.py @@ -3,6 +3,8 @@ import re import subprocess import logging +from .conf import settings + """ Uses command line pdfinfo utility (from poppler pakage) for various small operations (e.g. get pdf page count). @@ -13,7 +15,7 @@ logger = logging.getLogger(__name__) def get_tiff_pagecount(filepath): cmd = [ - "/usr/bin/identify", + settings.BINARY_IDENTIFY, "-format", "%n\n", filepath @@ -76,10 +78,11 @@ def get_pagecount(filepath): "Only jpeg, png, pdf and tiff are handlerd by this" " method" ) - # pdfinfo "${PDFFILE}" | grep Pages - - cmd = ["/usr/bin/pdfinfo", filepath] + cmd = [ + settings.BINARY_PDFINFO, + filepath + ] compl = subprocess.run( cmd, stdout=subprocess.PIPE, diff --git a/mglib/pdftk.py b/mglib/pdftk.py index 6384bc5..4840d85 100644 --- a/mglib/pdftk.py +++ b/mglib/pdftk.py @@ -3,6 +3,8 @@ import logging from mglib.runcmd import run from mglib.pdfinfo import get_pagecount +from .conf import settings + logger = logging.getLogger(__name__) # @@ -183,7 +185,7 @@ def paste_pages_into_existing_doc( ) cmd = [ - "pdftk", + settings.BINARY_PDFTK, ] # add A=doc1_path, B=doc2_path cmd.extend(letters_2_doc_map) @@ -272,7 +274,7 @@ def paste_pages( ) cmd = [ - "pdftk", + settings.BINARY_PDFTK, ] # add A=doc1_path, B=doc2_path cmd.extend(letters_2_doc_map) @@ -315,7 +317,7 @@ def reorder_pages( ) cmd = [ - "pdftk", + settings.BINARY_PDFTK, src, "cat" ] @@ -338,7 +340,7 @@ def delete_pages(src, dst, page_numbers): ) cmd = [ - "pdftk", + settings.BINARY_PDFTK, src, "cat" ] diff --git a/mglib/shortcuts.py b/mglib/shortcuts.py index efa23f9..bde1c64 100644 --- a/mglib/shortcuts.py 
+++ b/mglib/shortcuts.py @@ -2,6 +2,7 @@ import os import logging from mglib.runcmd import run +from .conf import settings logger = logging.getLogger(__name__) @@ -28,7 +29,7 @@ def resize_img(page_path, media_root): logger.debug(f"PPMROOT {ppmroot_dirname} already exists.") cmd = ( - "convert", + settings.BINARY_CONVERT, "-resize", f"{width}x", local_abspath, @@ -61,7 +62,7 @@ def extract_img(page_path, media_root): else: logger.debug(f"PPMROOT {ppmroot_dirname} already exists.") cmd = ( - "pdftoppm", + settings.BINARY_PDFTOPPM, "-jpeg", "-f", str(page_num), @@ -89,7 +90,7 @@ def extract_hocr(page_url, lang, media_root): os.path.join(media_root, page_url.hocr_url()) ) cmd = ( - "tesseract", + settings.BINARY_OCR, "-l", lang, page_abspath, @@ -112,37 +113,10 @@ def extract_txt(page_url, lang, media_root): ) ) cmd = ( - "tesseract", + settings.BINARY_OCR, "-l", lang, page_abspath, txt_root ) run(cmd) - - -#def text_from_pdf(filepath, lang, dry_run=False): -# -# # suffix .tiff in file name is required by conver utility, otherwise -# # it won't convert to tiff format! 
-# tiff = tempfile.NamedTemporaryFile(suffix=".tiff") -# conv = convert.Convert(dry_run=dry_run) -# conv(filepath=filepath, fout=tiff) -# try: -# tsact = tesseract.Tesseract() -# text = tsact(filepath=tiff.name, lang=lang) -# except subprocess.CalledProcessError as e: -# print(e) -# print(e.stderr) -# return -# -# return text -# -# -#def text_from_image(filepath, lang, dry_run=False): -# -# tsact = tesseract.Tesseract(dry_run=dry_run) -# text = tsact(filepath=filepath, lang=lang) -# -# return text -# diff --git a/mglib/tiff.py b/mglib/tiff.py index fc1298c..3756235 100644 --- a/mglib/tiff.py +++ b/mglib/tiff.py @@ -2,6 +2,7 @@ import os import logging from mglib.runcmd import run +from .conf import settings logger = logging.getLogger(__name__) @@ -21,7 +22,7 @@ def convert_tiff2pdf(doc_url): ) cmd = ( - "convert", + settings.BINARY_CONVERT, doc_url, new_doc_url, ) diff --git a/setup.py b/setup.py index 953afa1..19a4e59 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open("README.md", "r") as fh: setup( name="mglib", - version="1.2.5", + version="1.2.6", author="Eugen Ciur", author_email="eugen@papermerge.com", url="https://github.com/papermerge/mglib", diff --git a/test/test_settings.py b/test/test_settings.py new file mode 100644 index 0000000..1edd713 --- /dev/null +++ b/test/test_settings.py @@ -0,0 +1,45 @@ +import os +from pathlib import Path +import unittest + +from mglib.conf.settings import ( + MgLibSettings, + DefaultSettings +) + +DATA_DIR = os.path.join( + Path(__file__).parent, + 'data' +) + + +class TestMgLibSettings(unittest.TestCase): + + def setUp(self): + self.settings = MgLibSettings(DefaultSettings()) + + def test_settings_outside_django_should_work(self): + """ + Without django there should be default values + for settings + """ + # check default value for pdfinfo + self.assertEqual( + "/usr/bin/pdfinfo", + self.settings.BINARY_PDFINFO + ) + + def test_settings_are_configurable(self): + """ + User should be able to reconfigure 
mglibsettings + on the go (i.e. change default values). + """ + # check default value for pdfinfo + self.settings.configure( + BINARY_PDFINFO="/usr/bin/xyz" + ) + self.assertEqual( + "/usr/bin/xyz", + self.settings.BINARY_PDFINFO + ) +