From 22d777eb5116c4835c13b12d71067a18b35c1290 Mon Sep 17 00:00:00 2001 From: Benjamin Franzke <bfr@qbus.de> Date: Tue, 13 Dec 2022 09:14:33 +0100 Subject: [PATCH] [BUGFIX] Avoid double UTF-8 encoded PDF metadata in file indexer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are different versions of pdfinfo available and used by different providers/distributions. a) Debian/Fedora use pdfinfo (>v20) from the poppler-utils package. Also hosters like Hetzner use this version. This variant defaults to UTF-8 output for metadata: https://linux.die.net/man/1/pdfinfo > -enc encoding-name Sets the encoding to use for text output. This defaults to "UTF-8". pdfinfo -v pdfinfo version 21.08.0 Copyright 2005-2021 The Poppler Developers - http://poppler.freedesktop.org Copyright 1996-2011 Glyph & Cog, LLC b) Older servers and hosters with legacy software (Mittwald, Domainfactory) use pdfinfo v3. This one defaults to Latin1 output: https://www.xpdfreader.com/pdfinfo-man.html > −enc encoding-name > Sets the encoding to use for text output. […] > This defaults to "Latin1" pdfinfo -v pdfinfo version 3.02 Copyright 1996-2007 Glyph & Cog, LLC Both versions support an -enc UTF-8 option, which is now used to circumvent the differences between these tools, instead of implying Latin1 output (as done in #80085) which breaks variant a) by interpreting valid UTF-8 as ISO-8859-1 and thus applying a double encoding. 
Resolves: #99352 Related: #80085 Releases: main, 11.5, 10.4 Change-Id: Ib8f7ae742c5edc73036afcb7d2608cd01f4176fd Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/77081 Reviewed-by: Benni Mack <benni@typo3.org> Tested-by: Benjamin Franzke <bfr@qbus.de> Tested-by: Benni Mack <benni@typo3.org> Reviewed-by: Stefan Bürk <stefan@buerk.tech> Tested-by: Stefan Bürk <stefan@buerk.tech> Reviewed-by: Benjamin Franzke <bfr@qbus.de> Tested-by: core-ci <typo3@b13.com> --- typo3/sysext/indexed_search/Classes/FileContentParser.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/typo3/sysext/indexed_search/Classes/FileContentParser.php b/typo3/sysext/indexed_search/Classes/FileContentParser.php index 4fe93590d5f3..25c0fc96a8b7 100644 --- a/typo3/sysext/indexed_search/Classes/FileContentParser.php +++ b/typo3/sysext/indexed_search/Classes/FileContentParser.php @@ -473,7 +473,7 @@ class FileContentParser if ($this->app['pdfinfo']) { $this->setLocaleForServerFileSystem(); // Getting pdf-info: - $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile); + $cmd = $this->app['pdfinfo'] . ' -enc UTF-8 ' . escapeshellarg($absFile); CommandUtility::exec($cmd, $res); $pdfInfo = $this->splitPdfInfo($res); unset($res); @@ -796,7 +796,7 @@ class FileContentParser foreach ($pdfInfoArray as $line) { $parts = explode(':', $line, 2); if (count($parts) > 1 && trim($parts[0])) { - $res[strtolower(trim($parts[0]))] = mb_convert_encoding(trim($parts[1]), 'UTF-8', 'ISO-8859-1'); + $res[strtolower(trim($parts[0]))] = trim($parts[1]); } } } -- GitLab