make binary paths configurable

pull/3/head
Eugen Ciur 2020-08-11 19:47:35 +02:00
parent ac7f2ae37b
commit 5edd196aaa
11 changed files with 168 additions and 42 deletions

View File

@ -1,5 +1,14 @@
# Changelog
## [1.2.6] - 11 August 2020
### Added
- mglib.conf.settings module. The point is to get rid of hardcoded binary paths. Binary paths are now provided as configuration settings.
## [1.2.3] - 25 July 2020
### Changed

8
mglib/conf/__init__.py Normal file
View File

@ -0,0 +1,8 @@
from .settings import (
DefaultSettings,
MgLibSettings
)
# Package-wide settings object, backed by a DefaultSettings instance
# built with its default settings module.
settings = MgLibSettings(DefaultSettings())

View File

@ -0,0 +1,28 @@
# Default absolute paths of the external command line binaries.
# Each value can be replaced through the settings object.

# `file` utility, used to find out the mime type of a file.
BINARY_FILE = "/usr/bin/file"
# Provided by ImageMagick package.
# Used for resizing images.
BINARY_CONVERT = "/usr/bin/convert"
# Provided by Poppler Utils.
# Used to extract images from a PDF file.
BINARY_PDFTOPPM = "/usr/bin/pdftoppm"
# Provided by Poppler Utils.
# Used to get the page count of a PDF file.
BINARY_PDFINFO = "/usr/bin/pdfinfo"
# Provided by ImageMagick package.
# Used to get the number of pages in a TIFF file.
BINARY_IDENTIFY = "/usr/bin/identify"
# Provided by tesseract package.
# Used to extract text from images/PDF files.
BINARY_OCR = "/usr/bin/tesseract"
# Provided by pdftk package.
# Used to reorder, cut/paste and delete pages within a PDF document.
BINARY_PDFTK = "/usr/bin/pdftk"

55
mglib/conf/settings.py Normal file
View File

@ -0,0 +1,55 @@
import importlib
try:
from django.conf import settings as django_settings
except ImportError:
# Operating outside django, use own settings module
django_settings = None
class DefaultSettings:
    """Settings container populated from a Python module.

    On construction the module named by ``settings_module`` is imported
    and every attribute whose name satisfies ``str.isupper()`` is copied
    onto the instance under the same name. The module path itself is
    kept in ``SETTINGS_MODULE``.
    """

    def __init__(
        self,
        settings_module="mglib.conf.default_settings"
    ):
        self.SETTINGS_MODULE = settings_module
        module = importlib.import_module(self.SETTINGS_MODULE)
        # Only upper-case names are treated as settings; everything else
        # in the module (imports, helpers, dunder names) is ignored.
        for name in (attr for attr in dir(module) if attr.isupper()):
            setattr(self, name, getattr(module, name))

    def configure(self, **options):
        """Set or override settings from keyword arguments.

        Each keyword becomes an instance attribute with the given value.
        """
        for key in options:
            setattr(self, key, options[key])
class MgLibSettings:
    """Settings facade that prefers Django settings over library defaults.

    Attribute access for upper-case names is resolved in this order:

    1. the Django settings object, when ``django_settings`` is not
       ``None`` (i.e. the Django import at module level succeeded) and
       it defines the requested name;
    2. the wrapped ``default_settings`` object.

    Falling back to the defaults means a Django project only has to
    define the settings it actually wants to override.
    """

    def __init__(
        self, default_settings
    ):
        """Store ``default_settings``, the object used as fallback."""
        self.default_settings = default_settings

    def __getattr__(self, name):
        """Return the value of setting ``name``.

        Python calls this only when regular attribute lookup fails, so
        ``default_settings`` and methods never reach this code.

        Raises:
            AttributeError: if ``name`` is not upper-case, or if neither
                Django settings nor the default settings define it.
        """
        if not name.isupper():
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        # When operating within django, get configuration from django
        # settings first.
        if django_settings is not None:
            try:
                return getattr(django_settings, name)
            except AttributeError:
                # Setting not defined in the Django project: use the
                # library default instead of failing.
                pass

        return getattr(self.default_settings, name)

    def configure(self, **options):
        """Override default settings; delegates to ``default_settings``."""
        self.default_settings.configure(
            **options
        )

View File

@ -1,13 +1,14 @@
import logging
from . import wrapper
from .conf import settings
logger = logging.getLogger(__name__)
class Mime(wrapper.Wrapper):
def __init__(self, filepath):
super().__init__(exec_name="file")
super().__init__(exec_name=settings.BINARY_FILE)
self.filepath = filepath
def get_cmd(self):

View File

@ -3,6 +3,8 @@ import re
import subprocess
import logging
from .conf import settings
"""
Uses command line pdfinfo utility (from poppler pakage) for various
small operations (e.g. get pdf page count).
@ -13,7 +15,7 @@ logger = logging.getLogger(__name__)
def get_tiff_pagecount(filepath):
cmd = [
"/usr/bin/identify",
settings.BINARY_IDENTIFY,
"-format",
"%n\n",
filepath
@ -76,10 +78,11 @@ def get_pagecount(filepath):
"Only jpeg, png, pdf and tiff are handlerd by this"
" method"
)
# pdfinfo "${PDFFILE}" | grep Pages
cmd = ["/usr/bin/pdfinfo", filepath]
cmd = [
settings.BINARY_PDFINFO,
filepath
]
compl = subprocess.run(
cmd,
stdout=subprocess.PIPE,

View File

@ -3,6 +3,8 @@ import logging
from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount
from .conf import settings
logger = logging.getLogger(__name__)
#
@ -183,7 +185,7 @@ def paste_pages_into_existing_doc(
)
cmd = [
"pdftk",
settings.BINARY_PDFTK,
]
# add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map)
@ -272,7 +274,7 @@ def paste_pages(
)
cmd = [
"pdftk",
settings.BINARY_PDFTK,
]
# add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map)
@ -315,7 +317,7 @@ def reorder_pages(
)
cmd = [
"pdftk",
settings.BINARY_PDFTK,
src,
"cat"
]
@ -338,7 +340,7 @@ def delete_pages(src, dst, page_numbers):
)
cmd = [
"pdftk",
settings.BINARY_PDFTK,
src,
"cat"
]

View File

@ -2,6 +2,7 @@ import os
import logging
from mglib.runcmd import run
from .conf import settings
logger = logging.getLogger(__name__)
@ -28,7 +29,7 @@ def resize_img(page_path, media_root):
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
cmd = (
"convert",
settings.BINARY_CONVERT,
"-resize",
f"{width}x",
local_abspath,
@ -61,7 +62,7 @@ def extract_img(page_path, media_root):
else:
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
cmd = (
"pdftoppm",
settings.BINARY_PDFTOPPM,
"-jpeg",
"-f",
str(page_num),
@ -89,7 +90,7 @@ def extract_hocr(page_url, lang, media_root):
os.path.join(media_root, page_url.hocr_url())
)
cmd = (
"tesseract",
settings.BINARY_OCR,
"-l",
lang,
page_abspath,
@ -112,37 +113,10 @@ def extract_txt(page_url, lang, media_root):
)
)
cmd = (
"tesseract",
settings.BINARY_OCR,
"-l",
lang,
page_abspath,
txt_root
)
run(cmd)
#def text_from_pdf(filepath, lang, dry_run=False):
#
# # suffix .tiff in file name is required by conver utility, otherwise
# # it won't convert to tiff format!
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
# conv = convert.Convert(dry_run=dry_run)
# conv(filepath=filepath, fout=tiff)
# try:
# tsact = tesseract.Tesseract()
# text = tsact(filepath=tiff.name, lang=lang)
# except subprocess.CalledProcessError as e:
# print(e)
# print(e.stderr)
# return
#
# return text
#
#
#def text_from_image(filepath, lang, dry_run=False):
#
# tsact = tesseract.Tesseract(dry_run=dry_run)
# text = tsact(filepath=filepath, lang=lang)
#
# return text
#

View File

@ -2,6 +2,7 @@ import os
import logging
from mglib.runcmd import run
from .conf import settings
logger = logging.getLogger(__name__)
@ -21,7 +22,7 @@ def convert_tiff2pdf(doc_url):
)
cmd = (
"convert",
settings.BINARY_CONVERT,
doc_url,
new_doc_url,
)

View File

@ -6,7 +6,7 @@ with open("README.md", "r") as fh:
setup(
name="mglib",
version="1.2.5",
version="1.2.6",
author="Eugen Ciur",
author_email="eugen@papermerge.com",
url="https://github.com/papermerge/mglib",

45
test/test_settings.py Normal file
View File

@ -0,0 +1,45 @@
import os
from pathlib import Path
import unittest
from mglib.conf.settings import (
MgLibSettings,
DefaultSettings
)
# Path of the "data" directory located next to this test module.
DATA_DIR = os.path.join(Path(__file__).parent, 'data')
class TestMgLibSettings(unittest.TestCase):
    """Tests for MgLibSettings wrapping a fresh DefaultSettings."""

    def setUp(self):
        # A new settings object per test, so that values set through
        # configure() in one test are not seen by another.
        defaults = DefaultSettings()
        self.settings = MgLibSettings(defaults)

    def test_settings_outside_django_should_work(self):
        """
        Without django there should be default values
        for settings
        """
        # The default value for pdfinfo is expected here.
        expected = "/usr/bin/pdfinfo"
        self.assertEqual(expected, self.settings.BINARY_PDFINFO)

    def test_settings_are_configurable(self):
        """
        User should be able to reconfigure mglibsettings
        on the go (i.e. change default values).
        """
        # Override the pdfinfo path, then read it back.
        overrides = {"BINARY_PDFINFO": "/usr/bin/xyz"}
        self.settings.configure(**overrides)
        self.assertEqual("/usr/bin/xyz", self.settings.BINARY_PDFINFO)