mirror of https://github.com/papermerge/mglib
make binary paths configurable
parent
ac7f2ae37b
commit
5edd196aaa
|
@ -1,5 +1,14 @@
|
|||
|
||||
# Changelog
|
||||
|
||||
|
||||
## [1.2.6] - 11 August 2020
|
||||
|
||||
### Added
|
||||
|
||||
- mglib.conf.settings module. The point is to get rid of hardcoded binary paths. Binary paths are now provided as configuration settings.
|
||||
|
||||
|
||||
## [1.2.3] - 25 July 2020
|
||||
|
||||
### Changed
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
from .settings import (
|
||||
DefaultSettings,
|
||||
MgLibSettings
|
||||
)
|
||||
|
||||
# Package-wide settings object, backed by the packaged default values.
settings = MgLibSettings(DefaultSettings())
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
|
||||
# file utility used to find out mime type of a file
|
||||
BINARY_FILE = "/usr/bin/file"
|
||||
|
||||
# Provided by ImageMagick package.
|
||||
# Used for resizing images.
|
||||
BINARY_CONVERT = "/usr/bin/convert"
|
||||
|
||||
# Provided by Poppler Utils.
|
||||
# Used to extract images from PDF file.
|
||||
BINARY_PDFTOPPM = "/usr/bin/pdftoppm"
|
||||
|
||||
# Provided by Poppler Utils.
|
||||
# used to get page count in PDF file
|
||||
BINARY_PDFINFO = "/usr/bin/pdfinfo"
|
||||
|
||||
# Provided by ImageMagick package.
|
||||
# Used to get number of pages in TIFF file.
|
||||
BINARY_IDENTIFY = "/usr/bin/identify"
|
||||
|
||||
# Provided by tesseract package.
|
||||
# Used to extract text from images/PDF files.
|
||||
BINARY_OCR = "/usr/bin/tesseract"
|
||||
|
||||
# Provided by pdftk package
|
||||
# Used to reorder, cut/paste, delete pages within a PDF document
|
||||
BINARY_PDFTK = "/usr/bin/pdftk"
|
|
@ -0,0 +1,55 @@
|
|||
import importlib
|
||||
|
||||
try:
|
||||
from django.conf import settings as django_settings
|
||||
except ImportError:
|
||||
# Operating outside django, use own settings module
|
||||
django_settings = None
|
||||
|
||||
|
||||
class DefaultSettings:
    """Settings container pre-populated from a Python settings module.

    Every upper-case attribute of the module named by ``settings_module``
    is copied onto the instance. ``configure()`` can then override or add
    values.
    """

    def __init__(
        self,
        settings_module="mglib.conf.default_settings"
    ):
        """Import ``settings_module`` and copy its upper-case attributes."""
        self.SETTINGS_MODULE = settings_module
        module = importlib.import_module(self.SETTINGS_MODULE)

        # Only upper-case names are treated as settings.
        upper_names = [attr for attr in dir(module) if attr.isupper()]
        for attr in upper_names:
            setattr(self, attr, getattr(module, attr))

    def configure(self, **options):
        """Set each keyword argument as an attribute of this instance."""
        for key in options:
            setattr(self, key, options[key])
|
||||
|
||||
|
||||
class MgLibSettings:
    """Settings lookup object used by mglib.

    Upper-case attribute names are resolved from Django settings when
    Django is importable. If Django is absent, or the Django project does
    not define the requested name, the value comes from the wrapped
    ``default_settings`` object.
    """

    def __init__(
        self, default_settings
    ):
        """Store the object that supplies fallback values.

        ``default_settings`` must expose settings as attributes and provide
        a ``configure(**options)`` method.
        """
        self.default_settings = default_settings

    def __getattr__(self, name):
        """Return the value of setting ``name``.

        Raises:
            AttributeError: if ``name`` is not upper-case, or if neither
                Django settings nor the defaults define it.
        """
        # __getattr__ only runs after normal attribute lookup has failed.
        # Rejecting non-upper-case names here also stops a lookup of
        # ``default_settings`` on a half-built instance from recursing.
        if not name.isupper():
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        # When operating within django, prefer configuration from
        # django settings.
        if django_settings is not None:
            try:
                return getattr(django_settings, name)
            except AttributeError:
                # The Django project does not define this setting:
                # fall through to the packaged default.
                pass

        return getattr(self.default_settings, name)

    def configure(self, **options):
        """Forward ``options`` to ``default_settings.configure``."""
        self.default_settings.configure(**options)
|
|
@ -1,13 +1,14 @@
|
|||
import logging
|
||||
from . import wrapper
|
||||
|
||||
from .conf import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Mime(wrapper.Wrapper):
|
||||
def __init__(self, filepath):
|
||||
super().__init__(exec_name="file")
|
||||
super().__init__(exec_name=settings.BINARY_FILE)
|
||||
self.filepath = filepath
|
||||
|
||||
def get_cmd(self):
|
||||
|
|
|
@ -3,6 +3,8 @@ import re
|
|||
import subprocess
|
||||
import logging
|
||||
|
||||
from .conf import settings
|
||||
|
||||
"""
|
||||
Uses command line pdfinfo utility (from poppler pakage) for various
|
||||
small operations (e.g. get pdf page count).
|
||||
|
@ -13,7 +15,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
def get_tiff_pagecount(filepath):
|
||||
cmd = [
|
||||
"/usr/bin/identify",
|
||||
settings.BINARY_IDENTIFY,
|
||||
"-format",
|
||||
"%n\n",
|
||||
filepath
|
||||
|
@ -76,10 +78,11 @@ def get_pagecount(filepath):
|
|||
"Only jpeg, png, pdf and tiff are handlerd by this"
|
||||
" method"
|
||||
)
|
||||
|
||||
# pdfinfo "${PDFFILE}" | grep Pages
|
||||
|
||||
cmd = ["/usr/bin/pdfinfo", filepath]
|
||||
cmd = [
|
||||
settings.BINARY_PDFINFO,
|
||||
filepath
|
||||
]
|
||||
compl = subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
|
|
|
@ -3,6 +3,8 @@ import logging
|
|||
from mglib.runcmd import run
|
||||
from mglib.pdfinfo import get_pagecount
|
||||
|
||||
from .conf import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#
|
||||
|
@ -183,7 +185,7 @@ def paste_pages_into_existing_doc(
|
|||
)
|
||||
|
||||
cmd = [
|
||||
"pdftk",
|
||||
settings.BINARY_PDFTK,
|
||||
]
|
||||
# add A=doc1_path, B=doc2_path
|
||||
cmd.extend(letters_2_doc_map)
|
||||
|
@ -272,7 +274,7 @@ def paste_pages(
|
|||
)
|
||||
|
||||
cmd = [
|
||||
"pdftk",
|
||||
settings.BINARY_PDFTK,
|
||||
]
|
||||
# add A=doc1_path, B=doc2_path
|
||||
cmd.extend(letters_2_doc_map)
|
||||
|
@ -315,7 +317,7 @@ def reorder_pages(
|
|||
)
|
||||
|
||||
cmd = [
|
||||
"pdftk",
|
||||
settings.BINARY_PDFTK,
|
||||
src,
|
||||
"cat"
|
||||
]
|
||||
|
@ -338,7 +340,7 @@ def delete_pages(src, dst, page_numbers):
|
|||
)
|
||||
|
||||
cmd = [
|
||||
"pdftk",
|
||||
settings.BINARY_PDFTK,
|
||||
src,
|
||||
"cat"
|
||||
]
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import logging
|
||||
|
||||
from mglib.runcmd import run
|
||||
from .conf import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -28,7 +29,7 @@ def resize_img(page_path, media_root):
|
|||
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
||||
|
||||
cmd = (
|
||||
"convert",
|
||||
settings.BINARY_CONVERT,
|
||||
"-resize",
|
||||
f"{width}x",
|
||||
local_abspath,
|
||||
|
@ -61,7 +62,7 @@ def extract_img(page_path, media_root):
|
|||
else:
|
||||
logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
|
||||
cmd = (
|
||||
"pdftoppm",
|
||||
settings.BINARY_PDFTOPPM,
|
||||
"-jpeg",
|
||||
"-f",
|
||||
str(page_num),
|
||||
|
@ -89,7 +90,7 @@ def extract_hocr(page_url, lang, media_root):
|
|||
os.path.join(media_root, page_url.hocr_url())
|
||||
)
|
||||
cmd = (
|
||||
"tesseract",
|
||||
settings.BINARY_OCR,
|
||||
"-l",
|
||||
lang,
|
||||
page_abspath,
|
||||
|
@ -112,37 +113,10 @@ def extract_txt(page_url, lang, media_root):
|
|||
)
|
||||
)
|
||||
cmd = (
|
||||
"tesseract",
|
||||
settings.BINARY_OCR,
|
||||
"-l",
|
||||
lang,
|
||||
page_abspath,
|
||||
txt_root
|
||||
)
|
||||
run(cmd)
|
||||
|
||||
|
||||
#def text_from_pdf(filepath, lang, dry_run=False):
|
||||
#
|
||||
# # suffix .tiff in file name is required by conver utility, otherwise
|
||||
# # it won't convert to tiff format!
|
||||
# tiff = tempfile.NamedTemporaryFile(suffix=".tiff")
|
||||
# conv = convert.Convert(dry_run=dry_run)
|
||||
# conv(filepath=filepath, fout=tiff)
|
||||
# try:
|
||||
# tsact = tesseract.Tesseract()
|
||||
# text = tsact(filepath=tiff.name, lang=lang)
|
||||
# except subprocess.CalledProcessError as e:
|
||||
# print(e)
|
||||
# print(e.stderr)
|
||||
# return
|
||||
#
|
||||
# return text
|
||||
#
|
||||
#
|
||||
#def text_from_image(filepath, lang, dry_run=False):
|
||||
#
|
||||
# tsact = tesseract.Tesseract(dry_run=dry_run)
|
||||
# text = tsact(filepath=filepath, lang=lang)
|
||||
#
|
||||
# return text
|
||||
#
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import logging
|
||||
|
||||
from mglib.runcmd import run
|
||||
from .conf import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -21,7 +22,7 @@ def convert_tiff2pdf(doc_url):
|
|||
)
|
||||
|
||||
cmd = (
|
||||
"convert",
|
||||
settings.BINARY_CONVERT,
|
||||
doc_url,
|
||||
new_doc_url,
|
||||
)
|
||||
|
|
2
setup.py
2
setup.py
|
@ -6,7 +6,7 @@ with open("README.md", "r") as fh:
|
|||
|
||||
setup(
|
||||
name="mglib",
|
||||
version="1.2.5",
|
||||
version="1.2.6",
|
||||
author="Eugen Ciur",
|
||||
author_email="eugen@papermerge.com",
|
||||
url="https://github.com/papermerge/mglib",
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import unittest
|
||||
|
||||
from mglib.conf.settings import (
|
||||
MgLibSettings,
|
||||
DefaultSettings
|
||||
)
|
||||
|
||||
DATA_DIR = os.path.join(
|
||||
Path(__file__).parent,
|
||||
'data'
|
||||
)
|
||||
|
||||
|
||||
class TestMgLibSettings(unittest.TestCase):
    """Checks for MgLibSettings wrapping a DefaultSettings instance."""

    def setUp(self):
        # Build a new DefaultSettings for every test so that values changed
        # through configure() do not carry over between tests.
        defaults = DefaultSettings()
        self.settings = MgLibSettings(defaults)

    def test_settings_outside_django_should_work(self):
        """
        Outside django, settings should expose
        the default values.
        """
        # check default value for pdfinfo
        pdfinfo_path = self.settings.BINARY_PDFINFO
        self.assertEqual("/usr/bin/pdfinfo", pdfinfo_path)

    def test_settings_are_configurable(self):
        """
        configure() should let the user replace
        default values at runtime.
        """
        # override the pdfinfo path, then read it back
        overrides = {"BINARY_PDFINFO": "/usr/bin/xyz"}
        self.settings.configure(**overrides)
        self.assertEqual("/usr/bin/xyz", self.settings.BINARY_PDFINFO)
|
||||
|
Loading…
Reference in New Issue