Merge pull request #6 from francescocarzaniga/master

Replace extension checking with python-magic
2020-11-29 08:35:29 +01:00 · 2020-11-29 08:35:29 +01:00 · 8d5077933f
parent 9fbaaf7dfd 0bf3789dca
commit 8d5077933f
6 changed files with 9 additions and 7 deletions
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -24,7 +24,7 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        pip install pycodestyle pytest coverage
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        if [ -f requirements/base.txt ]; then pip install -r requirements/base.txt; fi
        sudo apt install poppler-utils pdftk
    - name: Lint with pycodestyle
      run: |
--- a/mglib/pdfinfo.py
+++ b/mglib/pdfinfo.py
@@ -2,6 +2,7 @@ import os
 import re
 import subprocess
 import logging
+from magic import from_file

 from .conf import settings
 from .exceptions import FileTypeNotSupported
@@ -63,18 +64,18 @@ def get_pagecount(filepath):
    if os.path.isdir(filepath):
        raise ValueError("Filepath %s is a directory!" % filepath)

-    base, ext = os.path.splitext(filepath)
+    mime_type = from_file(filepath, mime=True)

    # pure images (png, jpeg) have only one page :)
-    if ext and ext.lower() in ('.jpeg', '.png', '.jpg'):
+    if mime_type in ['image/png', 'image/jpeg', 'image/jpg']:
        # whatever png/jpg image is there - it is
        # considered by default one page document.
        return 1

-    if ext and ext.lower() in ('.tiff', ):
+    if mime_type == 'image/tiff':
        return get_tiff_pagecount(filepath)

-    if ext and ext.lower() not in ('.pdf', '.tiff'):
+    if mime_type != 'application/pdf':
        raise FileTypeNotSupported(
            "Only jpeg, png, pdf and tiff are handled by this"
            " method"
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -0,0 +1 @@
+python-magic
--- a/test/data/berlin.jpeg
+++ b/test/data/berlin.jpeg
@@ -1,2 +1,2 @@
-I am not even binary!
+ÿØÿØI am not even binary!
 The idea is to test pdfinfo.get_pagecount
--- a/test/data/berlin.jpg
+++ b/test/data/berlin.jpg
@@ -1 +1 @@
-well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
+ÿØÿîwell... I am text! But who cares? The idea is to test pdfinfo.get_pagecount
--- a/test/data/berlin.png
+++ b/test/data/berlin.png