From 22d777eb5116c4835c13b12d71067a18b35c1290 Mon Sep 17 00:00:00 2001
From: Benjamin Franzke <bfr@qbus.de>
Date: Tue, 13 Dec 2022 09:14:33 +0100
Subject: [PATCH] [BUGFIX] Avoid double UTF-8 encoded PDF metadata in file
 indexer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are different versions of pdfinfo available and used
by different providers/distributions.

a) Debian/Fedora use pdfinfo (>v20) from the poppler-utils package.
   Also hosters like Hetzner use this version.
   This variant defaults to UTF-8 output for metadata:
   https://linux.die.net/man/1/pdfinfo
   > -enc encoding-name
   > Sets the encoding to use for text output. This defaults to "UTF-8".

   pdfinfo -v
   pdfinfo version 21.08.0
   Copyright 2005-2021 The Poppler Developers -
                       http://poppler.freedesktop.org
   Copyright 1996-2011 Glyph & Cog, LLC

b) Older servers and hosters with legacy software (Mittwald,
   Domainfactory) use pdfinfo v3. This one defaults to Latin1 output:
   https://www.xpdfreader.com/pdfinfo-man.html
   > -enc encoding-name
   > Sets the encoding to use for text output. […]
   > This defaults to "Latin1"

   pdfinfo -v
   pdfinfo version 3.02
   Copyright 1996-2007 Glyph & Cog, LLC

Both versions support an -enc UTF-8 option, which is now used to
circumvent the differences between these tools, instead of implying
Latin1 output (as done in #80085) which breaks variant a) by
interpreting valid UTF-8 as ISO-8859-1 and thus applying
a double encoding.

Resolves: #99352
Related: #80085
Releases: main, 11.5, 10.4
Change-Id: Ib8f7ae742c5edc73036afcb7d2608cd01f4176fd
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/77081
Reviewed-by: Benni Mack <benni@typo3.org>
Tested-by: Benjamin Franzke <bfr@qbus.de>
Tested-by: Benni Mack <benni@typo3.org>
Reviewed-by: Stefan Bürk <stefan@buerk.tech>
Tested-by: Stefan Bürk <stefan@buerk.tech>
Reviewed-by: Benjamin Franzke <bfr@qbus.de>
Tested-by: core-ci <typo3@b13.com>
---
 typo3/sysext/indexed_search/Classes/FileContentParser.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/typo3/sysext/indexed_search/Classes/FileContentParser.php b/typo3/sysext/indexed_search/Classes/FileContentParser.php
index 4fe93590d5f3..25c0fc96a8b7 100644
--- a/typo3/sysext/indexed_search/Classes/FileContentParser.php
+++ b/typo3/sysext/indexed_search/Classes/FileContentParser.php
@@ -473,7 +473,7 @@ class FileContentParser
                 if ($this->app['pdfinfo']) {
                     $this->setLocaleForServerFileSystem();
                     // Getting pdf-info:
-                    $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
+                    $cmd = $this->app['pdfinfo'] . ' -enc UTF-8 ' . escapeshellarg($absFile);
                     CommandUtility::exec($cmd, $res);
                     $pdfInfo = $this->splitPdfInfo($res);
                     unset($res);
@@ -796,7 +796,7 @@ class FileContentParser
             foreach ($pdfInfoArray as $line) {
                 $parts = explode(':', $line, 2);
                 if (count($parts) > 1 && trim($parts[0])) {
-                    $res[strtolower(trim($parts[0]))] = mb_convert_encoding(trim($parts[1]), 'UTF-8', 'ISO-8859-1');
+                    $res[strtolower(trim($parts[0]))] = trim($parts[1]);
                 }
             }
         }
-- 
GitLab