make binary paths configurable

pull/3/head
Eugen Ciur 2020-08-11 19:47:35 +02:00
parent ac7f2ae37b
commit 5edd196aaa
11 changed files with 168 additions and 42 deletions

View File

@ -1,5 +1,14 @@
# Changelog # Changelog
## [1.2.6] - 11 August 2020
### Added
- mglib.conf.settings module. The point is to get rid of hardcoded binary paths. Binary paths are now provided as configuration settings.
## [1.2.3] - 25 July 2020 ## [1.2.3] - 25 July 2020
### Changed ### Changed

8
mglib/conf/__init__.py Normal file
View File

@ -0,0 +1,8 @@
from .settings import (
DefaultSettings,
MgLibSettings
)
# Package-wide settings object: an MgLibSettings wrapper built around a
# freshly created DefaultSettings instance.
settings = MgLibSettings(DefaultSettings())

View File

@ -0,0 +1,28 @@
# Default locations of the external binaries.
# Each value is an absolute path under /usr/bin.

# The `file` utility.
# Used to find out the mime type of a file.
BINARY_FILE = "/usr/bin/file"
# Provided by the ImageMagick package.
# Used for resizing images.
BINARY_CONVERT = "/usr/bin/convert"
# Provided by Poppler Utils.
# Used to extract images from a PDF file.
BINARY_PDFTOPPM = "/usr/bin/pdftoppm"
# Provided by Poppler Utils.
# Used to get the page count of a PDF file.
BINARY_PDFINFO = "/usr/bin/pdfinfo"
# Provided by the ImageMagick package.
# Used to get the number of pages in a TIFF file.
BINARY_IDENTIFY = "/usr/bin/identify"
# Provided by the tesseract package.
# Used to extract text from images/PDF files.
BINARY_OCR = "/usr/bin/tesseract"
# Provided by the pdftk package.
# Used to reorder, cut/paste and delete pages within a PDF document.
BINARY_PDFTK = "/usr/bin/pdftk"

55
mglib/conf/settings.py Normal file
View File

@ -0,0 +1,55 @@
import importlib
try:
from django.conf import settings as django_settings
except ImportError:
# Operating outside django, use own settings module
django_settings = None
class DefaultSettings:
    """
    Settings container populated from a plain Python module.

    On construction the module named by ``settings_module`` is imported
    and every attribute whose name satisfies ``str.isupper()`` is copied
    onto the instance. The module name itself is kept in
    ``SETTINGS_MODULE``.
    """

    def __init__(self, settings_module="mglib.conf.default_settings"):
        self.SETTINGS_MODULE = settings_module
        loaded = importlib.import_module(settings_module)

        # Only upper-case names are treated as settings; everything else
        # in the module (imports, helpers, dunders) is ignored.
        for attr_name in filter(str.isupper, dir(loaded)):
            setattr(self, attr_name, getattr(loaded, attr_name))

    def configure(self, **options):
        """
        Set (or override) settings from keyword arguments.

        Each keyword becomes an instance attribute with the given value.
        """
        for key in options:
            setattr(self, key, options[key])
class MgLibSettings:
    """
    Settings facade used throughout the package.

    Upper-case attribute lookups are resolved in this order:

    1. the Django settings object, if ``django.conf`` could be imported
       and it defines the requested name;
    2. the wrapped ``default_settings`` object otherwise.

    Any other attribute name raises ``AttributeError``.
    """

    def __init__(
        self, default_settings
    ):
        # Object that holds fallback values and exposes ``configure()``.
        self.default_settings = default_settings

    def __getattr__(self, name):
        """
        Return the value of setting ``name``.

        Only invoked for attributes not found by normal lookup, so
        ``default_settings`` and ``configure`` never pass through here.

        Raises:
            AttributeError: if ``name`` is not upper-case, or if neither
                the Django settings nor the defaults define it.
        """
        if not name.isupper():
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        # When operating within Django, prefer the Django configuration.
        # Compare against None explicitly rather than relying on the
        # truthiness of the settings object.
        if django_settings is not None:
            try:
                return getattr(django_settings, name)
            except AttributeError:
                # Django does not define this setting: fall through to
                # the defaults instead of failing. Any other exception
                # raised by the Django settings object propagates.
                pass

        return getattr(self.default_settings, name)

    def configure(self, **options):
        """
        Override default values; delegates to ``default_settings.configure``.

        Note that a value defined in Django settings still takes
        precedence over anything configured here.
        """
        self.default_settings.configure(**options)

View File

@ -1,13 +1,14 @@
import logging import logging
from . import wrapper from . import wrapper
from .conf import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class Mime(wrapper.Wrapper): class Mime(wrapper.Wrapper):
def __init__(self, filepath): def __init__(self, filepath):
super().__init__(exec_name="file") super().__init__(exec_name=settings.BINARY_FILE)
self.filepath = filepath self.filepath = filepath
def get_cmd(self): def get_cmd(self):

View File

@ -3,6 +3,8 @@ import re
import subprocess import subprocess
import logging import logging
from .conf import settings
""" """
Uses command line pdfinfo utility (from poppler pakage) for various Uses command line pdfinfo utility (from poppler pakage) for various
small operations (e.g. get pdf page count). small operations (e.g. get pdf page count).
@ -13,7 +15,7 @@ logger = logging.getLogger(__name__)
def get_tiff_pagecount(filepath): def get_tiff_pagecount(filepath):
cmd = [ cmd = [
"/usr/bin/identify", settings.BINARY_IDENTIFY,
"-format", "-format",
"%n\n", "%n\n",
filepath filepath
@ -76,10 +78,11 @@ def get_pagecount(filepath):
"Only jpeg, png, pdf and tiff are handlerd by this" "Only jpeg, png, pdf and tiff are handlerd by this"
" method" " method"
) )
# pdfinfo "${PDFFILE}" | grep Pages # pdfinfo "${PDFFILE}" | grep Pages
cmd = [
cmd = ["/usr/bin/pdfinfo", filepath] settings.BINARY_PDFINFO,
filepath
]
compl = subprocess.run( compl = subprocess.run(
cmd, cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,

View File

@ -3,6 +3,8 @@ import logging
from mglib.runcmd import run from mglib.runcmd import run
from mglib.pdfinfo import get_pagecount from mglib.pdfinfo import get_pagecount
from .conf import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# #
@ -183,7 +185,7 @@ def paste_pages_into_existing_doc(
) )
cmd = [ cmd = [
"pdftk", settings.BINARY_PDFTK,
] ]
# add A=doc1_path, B=doc2_path # add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map) cmd.extend(letters_2_doc_map)
@ -272,7 +274,7 @@ def paste_pages(
) )
cmd = [ cmd = [
"pdftk", settings.BINARY_PDFTK,
] ]
# add A=doc1_path, B=doc2_path # add A=doc1_path, B=doc2_path
cmd.extend(letters_2_doc_map) cmd.extend(letters_2_doc_map)
@ -315,7 +317,7 @@ def reorder_pages(
) )
cmd = [ cmd = [
"pdftk", settings.BINARY_PDFTK,
src, src,
"cat" "cat"
] ]
@ -338,7 +340,7 @@ def delete_pages(src, dst, page_numbers):
) )
cmd = [ cmd = [
"pdftk", settings.BINARY_PDFTK,
src, src,
"cat" "cat"
] ]

View File

@ -2,6 +2,7 @@ import os
import logging import logging
from mglib.runcmd import run from mglib.runcmd import run
from .conf import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -28,7 +29,7 @@ def resize_img(page_path, media_root):
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.") logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
cmd = ( cmd = (
"convert", settings.BINARY_CONVERT,
"-resize", "-resize",
f"{width}x", f"{width}x",
local_abspath, local_abspath,
@ -61,7 +62,7 @@ def extract_img(page_path, media_root):
else: else:
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.") logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
cmd = ( cmd = (
"pdftoppm", settings.BINARY_PDFTOPPM,
"-jpeg", "-jpeg",
"-f", "-f",
str(page_num), str(page_num),
@ -89,7 +90,7 @@ def extract_hocr(page_url, lang, media_root):
os.path.join(media_root, page_url.hocr_url()) os.path.join(media_root, page_url.hocr_url())
) )
cmd = ( cmd = (
"tesseract", settings.BINARY_OCR,
"-l", "-l",
lang, lang,
page_abspath, page_abspath,
@ -112,37 +113,10 @@ def extract_txt(page_url, lang, media_root):
) )
) )
cmd = ( cmd = (
"tesseract", settings.BINARY_OCR,
"-l", "-l",
lang, lang,
page_abspath, page_abspath,
txt_root txt_root
) )
run(cmd) run(cmd)
#def text_from_pdf(filepath, lang, dry_run=False):
#
# # suffix .tiff in file name is required by conver utility, otherwise
# # it won't convert to tiff format!
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
# conv = convert.Convert(dry_run=dry_run)
# conv(filepath=filepath, fout=tiff)
# try:
# tsact = tesseract.Tesseract()
# text = tsact(filepath=tiff.name, lang=lang)
# except subprocess.CalledProcessError as e:
# print(e)
# print(e.stderr)
# return
#
# return text
#
#
#def text_from_image(filepath, lang, dry_run=False):
#
# tsact = tesseract.Tesseract(dry_run=dry_run)
# text = tsact(filepath=filepath, lang=lang)
#
# return text
#

View File

@ -2,6 +2,7 @@ import os
import logging import logging
from mglib.runcmd import run from mglib.runcmd import run
from .conf import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,7 +22,7 @@ def convert_tiff2pdf(doc_url):
) )
cmd = ( cmd = (
"convert", settings.BINARY_CONVERT,
doc_url, doc_url,
new_doc_url, new_doc_url,
) )

View File

@ -6,7 +6,7 @@ with open("README.md", "r") as fh:
setup( setup(
name="mglib", name="mglib",
version="1.2.5", version="1.2.6",
author="Eugen Ciur", author="Eugen Ciur",
author_email="eugen@papermerge.com", author_email="eugen@papermerge.com",
url="https://github.com/papermerge/mglib", url="https://github.com/papermerge/mglib",

45
test/test_settings.py Normal file
View File

@ -0,0 +1,45 @@
import os
from pathlib import Path
import unittest
from mglib.conf.settings import (
MgLibSettings,
DefaultSettings
)
# Path of the 'data' directory that sits next to this test module.
DATA_DIR = os.path.join(Path(__file__).parent, 'data')
class TestMgLibSettings(unittest.TestCase):
    """Checks for MgLibSettings wrapped around a DefaultSettings instance."""

    def setUp(self):
        # Every test starts from a freshly loaded set of defaults.
        defaults = DefaultSettings()
        self.settings = MgLibSettings(defaults)

    def test_settings_outside_django_should_work(self):
        """
        Without django, settings should expose their default values.
        """
        # BINARY_PDFINFO must carry its packaged default.
        pdfinfo_path = self.settings.BINARY_PDFINFO
        self.assertEqual("/usr/bin/pdfinfo", pdfinfo_path)

    def test_settings_are_configurable(self):
        """
        Users should be able to reconfigure the settings on the go,
        i.e. replace default values.
        """
        # Override the pdfinfo path, then read it back.
        overrides = {"BINARY_PDFINFO": "/usr/bin/xyz"}
        self.settings.configure(**overrides)

        pdfinfo_path = self.settings.BINARY_PDFINFO
        self.assertEqual("/usr/bin/xyz", pdfinfo_path)