aboutsummaryrefslogtreecommitdiff
path: root/textproc/py-ocrmypdf
diff options
context:
space:
mode:
Diffstat (limited to 'textproc/py-ocrmypdf')
-rw-r--r--textproc/py-ocrmypdf/Makefile7
-rw-r--r--textproc/py-ocrmypdf/distinfo6
-rw-r--r--textproc/py-ocrmypdf/files/patch-src_ocrmypdf_optimize.py66
3 files changed, 73 insertions, 6 deletions
diff --git a/textproc/py-ocrmypdf/Makefile b/textproc/py-ocrmypdf/Makefile
index faba4fd54e22..157d71bad57d 100644
--- a/textproc/py-ocrmypdf/Makefile
+++ b/textproc/py-ocrmypdf/Makefile
@@ -1,5 +1,5 @@
PORTNAME= ocrmypdf
-DISTVERSION= 16.10.4
+DISTVERSION= 16.11.1
CATEGORIES= textproc python
MASTER_SITES= PYPI
PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
@@ -28,12 +28,13 @@ TEST_DEPENDS= ${PYTHON_PKGNAMEPREFIX}hypothesis>=6.36.0:devel/py-hypothesis@${PY
${PYTHON_PKGNAMEPREFIX}python-xmp-toolkit>=2.0.1:textproc/py-python-xmp-toolkit@${PY_FLAVOR} \
${PYTHON_PKGNAMEPREFIX}reportlab>=3.6.8:print/py-reportlab@${PY_FLAVOR}
-USES= ghostscript:run python:3.10+ shebangfix
+USES= ghostscript:run python shebangfix
USE_PYTHON= autoplist concurrent pep517 pytest
# Skip some checks as they yield wrong results if run with the root account
+# "test_watcher" requires additional deps used by the "watcher" feature
PYTEST_IGNORED_TESTS= test_chmod \
test_input_file_not_readable \
- test_malformed_docinfo # leads to an internal pytest error
+ test_watcher
SHEBANG_FILES= src/ocrmypdf/__main__.py \
src/ocrmypdf/pdfinfo/__init__.py
diff --git a/textproc/py-ocrmypdf/distinfo b/textproc/py-ocrmypdf/distinfo
index ff8097787e66..582ec949cdca 100644
--- a/textproc/py-ocrmypdf/distinfo
+++ b/textproc/py-ocrmypdf/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1753074854
-SHA256 (ocrmypdf-16.10.4.tar.gz) = de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1
-SIZE (ocrmypdf-16.10.4.tar.gz) = 7003649
+TIMESTAMP = 1763048154
+SHA256 (ocrmypdf-16.11.1.tar.gz) = 838ab69e0ee0f04feea0d5861a17badecab6d3beaed0e29a97058eadda58cbb1
+SIZE (ocrmypdf-16.11.1.tar.gz) = 7015278
diff --git a/textproc/py-ocrmypdf/files/patch-src_ocrmypdf_optimize.py b/textproc/py-ocrmypdf/files/patch-src_ocrmypdf_optimize.py
new file mode 100644
index 000000000000..34e6453d57df
--- /dev/null
+++ b/textproc/py-ocrmypdf/files/patch-src_ocrmypdf_optimize.py
@@ -0,0 +1,66 @@
+From: "James R. Barlow" <james@purplerock.ca>
+Date: Sun, 9 Nov 2025 15:43:36 -0800
+Subject: [PATCH] Work around Ghostscript 10.6.0 JPEG encoding issue by forcing
+ optimization.
+
+Not an ideal fix, but it mitigates an issue affecting numerous users.
+
+Fixes 1585.
+
+Obtained from:
+
+https://github.com/ocrmypdf/OCRmyPDF/commit/f4c6c8121ba8178ff3a1cb8f70037bbc3a31391b.patch
+
+--- src/ocrmypdf/optimize.py.orig 2020-02-02 00:00:00 UTC
++++ src/ocrmypdf/optimize.py
+@@ -17,6 +17,7 @@ import img2pdf
+ from zlib import compress
+
+ import img2pdf
++from packaging.version import Version
+ from pikepdf import (
+ Dictionary,
+ Name,
+@@ -32,7 +33,7 @@ from ocrmypdf._concurrent import Executor, SerialExecu
+ from PIL import Image
+
+ from ocrmypdf._concurrent import Executor, SerialExecutor
+-from ocrmypdf._exec import jbig2enc, pngquant
++from ocrmypdf._exec import ghostscript, jbig2enc, pngquant
+ from ocrmypdf._jobcontext import PdfContext
+ from ocrmypdf._progressbar import ProgressBar
+ from ocrmypdf.exceptions import OutputFileAccessError
+@@ -189,6 +190,16 @@ def extract_image_jbig2(
+ return None
+
+
++def _should_optimize_jpeg(options, filtdp):
++ if options.optimize >= 2:
++ return True
++ if options.optimize < 2 and ghostscript.version() >= Version('10.6.0'):
++ # Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.
++ # To resolve this, re-optimize the JPEG anyway.
++ return True
++ return False
++
++
+ def extract_image_generic(
+ *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
+ ) -> XrefExt | None:
+@@ -202,15 +213,7 @@ def extract_image_generic(
+ if pim.bits_per_component == 1:
+ return None
+
+- if filtdp[0] == Name.DCTDecode and options.optimize >= 2:
+- # This is a simple heuristic derived from some training data, that has
+- # about a 70% chance of guessing whether the JPEG is high quality,
+- # and possibly recompressible, or not. The number itself doesn't mean
+- # anything.
+- # bytes_per_pixel = int(raw_jpeg.Length) / (w * h)
+- # jpeg_quality_estimate = 117.0 * (bytes_per_pixel ** 0.213)
+- # if jpeg_quality_estimate < 65:
+- # return None
++ if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):
+ try:
+ imgname = root / f'{xref:08d}'
+ with imgname.open('wb') as f: