From 0b80c1f446e52a00552f2f2aa3f351ab9d374d68 Mon Sep 17 00:00:00 2001 From: "francesco.carzaniga" Date: Fri, 27 Nov 2020 19:22:43 +0100 Subject: [PATCH 1/5] Replace extension checking with python-magic --- mglib/pdfinfo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mglib/pdfinfo.py b/mglib/pdfinfo.py index 123f231..6b4c1f5 100644 --- a/mglib/pdfinfo.py +++ b/mglib/pdfinfo.py @@ -2,6 +2,7 @@ import os import re import subprocess import logging +from magic import from_file from .conf import settings from .exceptions import FileTypeNotSupported @@ -63,18 +64,18 @@ def get_pagecount(filepath): if os.path.isdir(filepath): raise ValueError("Filepath %s is a directory!" % filepath) - base, ext = os.path.splitext(filepath) + mime_type = from_file(filepath, mime=True) # pure images (png, jpeg) have only one page :) - if ext and ext.lower() in ('.jpeg', '.png', '.jpg'): + if mime_type in ['image/png', 'image/jpeg', 'image/jpg']: # whatever png/jpg image is there - it is # considered by default one page document. return 1 - if ext and ext.lower() in ('.tiff', ): + if mime_type == 'image/tiff': return get_tiff_pagecount(filepath) - if ext and ext.lower() not in ('.pdf', '.tiff'): + if mime_type != 'application/pdf' : raise FileTypeNotSupported( "Only jpeg, png, pdf and tiff are handled by this" " method" From ae93586a3780d92614c8f39c90735cb29b3673dd Mon Sep 17 00:00:00 2001 From: "francesco.carzaniga" Date: Fri, 27 Nov 2020 19:26:57 +0100 Subject: [PATCH 2/5] Formatting --- mglib/pdfinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mglib/pdfinfo.py b/mglib/pdfinfo.py index 6b4c1f5..ef82c6b 100644 --- a/mglib/pdfinfo.py +++ b/mglib/pdfinfo.py @@ -75,7 +75,7 @@ def get_pagecount(filepath): if mime_type == 'image/tiff': return get_tiff_pagecount(filepath) - if mime_type != 'application/pdf' : + if mime_type != 'application/pdf': raise FileTypeNotSupported( "Only jpeg, png, pdf and tiff are handled by this" " method" From 030df1e049fbdfda881cac70aaa1db2b62be21b1 Mon Sep 17 00:00:00 2001 From: "francesco.carzaniga" Date: Sat, 28 Nov 2020 20:06:05 +0100 Subject: [PATCH 3/5] Modified tests --- .github/workflows/python-app.yml | 2 +- requirements/base.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 requirements/base.txt diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 8443664..610f9a2 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -24,7 +24,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pycodestyle pytest coverage - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + if [ -f requirements/base.txt ]; then pip install -r requirements/base.txt; fi sudo apt install poppler-utils pdftk - name: Lint with pycodestyle run: | diff --git a/requirements/base.txt b/requirements/base.txt new file mode 100644 index 0000000..aee0e39 --- /dev/null +++ b/requirements/base.txt @@ -0,0 +1 @@ +python-magic \ No newline at end of file From b0fbd06a25c6d56260fbf70db7038229e0339c1d Mon Sep 17 00:00:00 2001 From: "francesco.carzaniga" Date: Sat, 28 Nov 2020 20:22:02 +0100 Subject: [PATCH 4/5] Changed test file to include magic bytes --- test/data/berlin.jpeg | 2 +- test/data/berlin.jpg | 2 +- test/data/berlin.png | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/data/berlin.jpeg b/test/data/berlin.jpeg index c305027..2605b81 100644 --- a/test/data/berlin.jpeg +++ b/test/data/berlin.jpeg @@ -1,2 +1,2 @@ -I am not even binary! +ÿØÿØI am not even binary! The idea is to test pdfinfo.get_pagecount \ No newline at end of file diff --git a/test/data/berlin.jpg b/test/data/berlin.jpg index 6c06761..8e5a3dd 100644 --- a/test/data/berlin.jpg +++ b/test/data/berlin.jpg @@ -1 +1 @@ -well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount \ No newline at end of file +ÿØÿîwell... I am text! But who cares? The idea is to test pdfinfo.get_pagecount \ No newline at end of file diff --git a/test/data/berlin.png b/test/data/berlin.png index 6c06761..8114ee1 100644 --- a/test/data/berlin.png +++ b/test/data/berlin.png @@ -1 +1,3 @@ +‰PNG + well... I am text! But who cares? The idea is to test pdfinfo.get_pagecount \ No newline at end of file From 0bf3789dcadbc002901680420696caa64f13c10f Mon Sep 17 00:00:00 2001 From: "francesco.carzaniga" Date: Sat, 28 Nov 2020 20:37:38 +0100 Subject: [PATCH 5/5] PNG needs to be more complete to work --- test/data/berlin.png | Bin 83 -> 1374 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test/data/berlin.png b/test/data/berlin.png index 8114ee1063ed2777dd117a602f428765b89c4ecc..df88485e4d66203dfc0b46b78a57c5a481bd58f3 100644 GIT binary patch literal 1374 zcmV-k1)=(hP)a7- zM4)VQNfjAPsDKD@V1zgb?@}O-)UHncNY`V=nw3 zml3dN0JAb}Zf=(4FnA3&q*dl3vLHueoym%VN3U0xgdl99HMp^9Aw)-C5#*{7i zKJOY1=Uzh2)2k$w6p@OrvW}xIzYps*+i{|B!0#G@7HzlNq1WqSv)NElQQ`NtpE>up zU0GR)?(S~r^?Ep+PT55RfMaA5rFDzZ-FFTfG&6dTy^NZs&R@W`Tm#21OgXR;7RcmT+{`x7D)-8r})E#g=L5n_m z^eA|qM_XGPv|4RQMQgQMw70i|=XtcYwmwwRE!XN1kuiyun_pvTOrFHDV~g;^<_qZF zI*3DtAF+^6m)Nn(&G@inFLoLCL7VkF_MY7$i+Sfi+-rt%>EE!qP9j;Qk=T+L9a?Wr zhF)3?F&`XO1&J2Rc=2515{Rs>Da(~JPhr0HBC6}@K7 z8Yq=Yl$V#Ir>93^gM)+6>2%;Y4rOI!vWo8M>GAM;n4%4-8xcwKSeN!Te(mp)7zZVU z7&{Vjc)V}e?curO6*HpKIkf-Y2=m#Guq!~7`Z961Zdu}Ic@*44NUKfUkhdn$m-}(ig=X{5s*ZqNAuO2|n$#-QjfBC)V=8HR! z8&iTKD|*2>RdBfMe%C7GBL#0x|ZnsNpro7+YoqqWq@~v^07!trnQixi_f{CD} za@-SPzbyb1piHDOF(4p*IR^l6-W4%1C_r=(FiHxlg$%ejIw%{qxg?L7+un1*NicrH z4cetZMe#s|DjlKXh+)Pi|R6If?T?n5L3ZjD~Y?vtA^ zq`vC&xX0pMN2Vzn@yiwSlqRq;P0;`auqq1tGYnWl`8e0pvEbpS@bnFBB z!bNd9)?@A~C+%VhOVf0-APCiuDwrIFYpJspdpD++9vK;Vk71Z^D2f`J??6wQXctQ& g?ku=W?ukYJ0Byu2Tb{63;Q#;t07*qoM6N<$g62qkJOBUy literal 83 zcmeAS@N?(olHw{)&B@Wz(^K$NNX%6zNv$YRRB$RSQ7F&IS4d7QN-efm2+2rQ$V^F1 lRLCq=D9Hz^DK1ecNJ-1gOUu_wPc4ZrNK8*n&M(a?0RS`E8`l5;