From a543c479f6a988bc4d81d568bb3a1aed900d7c41 Mon Sep 17 00:00:00 2001
From: Benni Mack <benni@typo3.org>
Date: Sat, 16 Jan 2016 14:33:34 +0100
Subject: [PATCH] [!!!][TASK] Remove charset functionality for locales
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TYPO3 frontend resolves
config.locale_all (e.g. set to de_AT.UTF-8) and stores
it in $TSFE->localeCharset by using a "best guess" based
on decade-old mappings.

$TSFE->localeCharset is only used in stdWrap.strftime
to convert the result from the localeCharset to the ->renderCharset.

However, such a charset mismatch is obviously a misconfiguration of the system, which happens only when
config.locale_all = de_AT@iso-8859-15
config.renderCharset = utf-8
is set. This exception now needs to be configured manually
by setting stdWrap.strftime.charset = iso-8859-15

Resolves: #72826
Releases: master
Change-Id: I1bba231879ebaf8e8700099bb87a03aba5d1b562
Reviewed-on: https://review.typo3.org/46011
Reviewed-by: Georg Ringer <georg.ringer@gmail.com>
Tested-by: Georg Ringer <georg.ringer@gmail.com>
Reviewed-by: Frank Nägler <frank.naegler@typo3.org>
Tested-by: Frank Nägler <frank.naegler@typo3.org>
---
 .../core/Classes/Charset/CharsetConverter.php | 281 ------------------
 ...edCustomCharsetConfigurationForLocales.rst |  43 +++
 .../ContentObject/ContentObjectRenderer.php   |   6 +-
 .../TypoScriptFrontendController.php          |   7 -
 4 files changed, 45 insertions(+), 292 deletions(-)
 create mode 100644 typo3/sysext/core/Documentation/Changelog/master/Breaking-72826-RemovedCustomCharsetConfigurationForLocales.rst

diff --git a/typo3/sysext/core/Classes/Charset/CharsetConverter.php b/typo3/sysext/core/Classes/Charset/CharsetConverter.php
index be81939a8778..efd6c8d2d942 100644
--- a/typo3/sysext/core/Classes/Charset/CharsetConverter.php
+++ b/typo3/sysext/core/Classes/Charset/CharsetConverter.php
@@ -220,245 +220,6 @@ class CharsetConverter implements SingletonInterface
         'ucs4' => 'ucs-4'
     );
 
-    /**
-     * Mapping of iso-639-1 language codes to script names
-     *
-     * @var array
-     */
-    public $lang_to_script = array(
-        // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
-        'af' => 'west_european', // Afrikaans
-        'ar' => 'arabic',
-        'bg' => 'cyrillic', // Bulgarian
-        'bs' => 'east_european', // Bosnian
-        'cs' => 'east_european', // Czech
-        'da' => 'west_european', // Danish
-        'de' => 'west_european', // German
-        'es' => 'west_european', // Spanish
-        'et' => 'estonian',
-        'eo' => 'unicode', // Esperanto
-        'eu' => 'west_european', // Basque
-        'fa' => 'arabic', // Persian
-        'fi' => 'west_european', // Finish
-        'fo' => 'west_european', // Faroese
-        'fr' => 'west_european', // French
-        'ga' => 'west_european', // Irish
-        'gl' => 'west_european', // Galician
-        'gr' => 'greek',
-        'he' => 'hebrew', // Hebrew (since 1998)
-        'hi' => 'unicode', // Hindi
-        'hr' => 'east_european', // Croatian
-        'hu' => 'east_european', // Hungarian
-        'iw' => 'hebrew', // Hebrew (til 1998)
-        'is' => 'west_european', // Icelandic
-        'it' => 'west_european', // Italian
-        'ja' => 'japanese',
-        'ka' => 'unicode', // Georgian
-        'kl' => 'west_european', // Greenlandic
-        'km' => 'unicode', // Khmer
-        'ko' => 'korean',
-        'lt' => 'lithuanian',
-        'lv' => 'west_european', // Latvian/Lettish
-        'nl' => 'west_european', // Dutch
-        'no' => 'west_european', // Norwegian
-        'nb' => 'west_european', // Norwegian Bokmal
-        'nn' => 'west_european', // Norwegian Nynorsk
-        'pl' => 'east_european', // Polish
-        'pt' => 'west_european', // Portuguese
-        'ro' => 'east_european', // Romanian
-        'ru' => 'cyrillic', // Russian
-        'sk' => 'east_european', // Slovak
-        'sl' => 'east_european', // Slovenian
-        'sr' => 'cyrillic', // Serbian
-        'sv' => 'west_european', // Swedish
-        'sq' => 'albanian', // Albanian
-        'th' => 'thai',
-        'uk' => 'cyrillic', // Ukranian
-        'vi' => 'vietnamese',
-        'zh' => 'chinese',
-
-        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
-        // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
-        'afk' => 'west_european', // Afrikaans
-        'ara' => 'arabic',
-        'bgr' => 'cyrillic', // Bulgarian
-        'cat' => 'west_european', // Catalan
-        'chs' => 'simpl_chinese',
-        'cht' => 'trad_chinese',
-        'csy' => 'east_european', // Czech
-        'dan' => 'west_european', // Danish
-        'deu' => 'west_european', // German
-        'dea' => 'west_european', // German (Austrian)
-        'des' => 'west_european', // German (Swiss)
-        'ena' => 'west_european', // English (Australian)
-        'enc' => 'west_european', // English (Canadian)
-        'eng' => 'west_european', // English
-        'enz' => 'west_european', // English (New Zealand)
-        'enu' => 'west_european', // English (United States)
-        'euq' => 'west_european', // Basque
-        'fos' => 'west_european', // Faroese
-        'far' => 'arabic', // Persian
-        'fin' => 'west_european', // Finish
-        'fra' => 'west_european', // French
-        'frb' => 'west_european', // French (Belgian)
-        'frc' => 'west_european', // French (Canadian)
-        'frs' => 'west_european', // French (Swiss)
-        'geo' => 'unicode', // Georgian
-        'glg' => 'west_european', // Galician
-        'ell' => 'greek',
-        'heb' => 'hebrew',
-        'hin' => 'unicode', // Hindi
-        'hun' => 'east_european', // Hungarian
-        'isl' => 'west_european', // Icelandic
-        'ita' => 'west_european', // Italian
-        'its' => 'west_european', // Italian (Swiss)
-        'jpn' => 'japanese',
-        'khm' => 'unicode', // Khmer
-        'kor' => 'korean',
-        'lth' => 'lithuanian',
-        'lvi' => 'west_european', // Latvian/Lettish
-        'msl' => 'west_european', // Malay
-        'nlb' => 'west_european', // Dutch (Belgian)
-        'nld' => 'west_european', // Dutch
-        'nor' => 'west_european', // Norwegian (bokmal)
-        'non' => 'west_european', // Norwegian (nynorsk)
-        'plk' => 'east_european', // Polish
-        'ptg' => 'west_european', // Portuguese
-        'ptb' => 'west_european', // Portuguese (Brazil)
-        'rom' => 'east_european', // Romanian
-        'rus' => 'cyrillic', // Russian
-        'slv' => 'east_european', // Slovenian
-        'sky' => 'east_european', // Slovak
-        'srl' => 'east_european', // Serbian (Latin)
-        'srb' => 'cyrillic', // Serbian (Cyrillic)
-        'esp' => 'west_european', // Spanish (trad. sort)
-        'esm' => 'west_european', // Spanish (Mexican)
-        'esn' => 'west_european', // Spanish (internat. sort)
-        'sve' => 'west_european', // Swedish
-        'sqi' => 'albanian', // Albanian
-        'tha' => 'thai',
-        'trk' => 'turkish',
-        'ukr' => 'cyrillic', // Ukrainian
-
-        // English language names
-        'afrikaans' => 'west_european',
-        'albanian' => 'albanian',
-        'arabic' => 'arabic',
-        'basque' => 'west_european',
-        'bosnian' => 'east_european',
-        'bulgarian' => 'east_european',
-        'catalan' => 'west_european',
-        'croatian' => 'east_european',
-        'czech' => 'east_european',
-        'danish' => 'west_european',
-        'dutch' => 'west_european',
-        'english' => 'west_european',
-        'esperanto' => 'unicode',
-        'estonian' => 'estonian',
-        'faroese' => 'west_european',
-        'farsi' => 'arabic',
-        'finnish' => 'west_european',
-        'french' => 'west_european',
-        'galician' => 'west_european',
-        'georgian' => 'unicode',
-        'german' => 'west_european',
-        'greek' => 'greek',
-        'greenlandic' => 'west_european',
-        'hebrew' => 'hebrew',
-        'hindi' => 'unicode',
-        'hungarian' => 'east_european',
-        'icelandic' => 'west_european',
-        'italian' => 'west_european',
-        'khmer' => 'unicode',
-        'latvian' => 'west_european',
-        'lettish' => 'west_european',
-        'lithuanian' => 'lithuanian',
-        'malay' => 'west_european',
-        'norwegian' => 'west_european',
-        'persian' => 'arabic',
-        'polish' => 'east_european',
-        'portuguese' => 'west_european',
-        'russian' => 'cyrillic',
-        'romanian' => 'east_european',
-        'serbian' => 'cyrillic',
-        'slovak' => 'east_european',
-        'slovenian' => 'east_european',
-        'spanish' => 'west_european',
-        'svedish' => 'west_european',
-        'that' => 'thai',
-        'turkish' => 'turkish',
-        'ukrainian' => 'cyrillic'
-    );
-
-    /**
-     * Mapping of language (family) names to charsets on Unix
-     *
-     * @var array
-     */
-    public $script_to_charset_unix = array(
-        'west_european' => 'iso-8859-1',
-        'estonian' => 'iso-8859-1',
-        'east_european' => 'iso-8859-2',
-        'baltic' => 'iso-8859-4',
-        'cyrillic' => 'iso-8859-5',
-        'arabic' => 'iso-8859-6',
-        'greek' => 'iso-8859-7',
-        'hebrew' => 'iso-8859-8',
-        'turkish' => 'iso-8859-9',
-        'thai' => 'iso-8859-11', // = TIS-620
-        'lithuanian' => 'iso-8859-13',
-        'chinese' => 'gb2312', // = euc-cn
-        'japanese' => 'euc-jp',
-        'korean' => 'euc-kr',
-        'simpl_chinese' => 'gb2312',
-        'trad_chinese' => 'big5',
-        'vietnamese' => '',
-        'unicode' => 'utf-8',
-        'albanian' => 'utf-8'
-    );
-
-    /**
-     * Mapping of language (family) names to charsets on Windows
-     *
-     * @var array
-     */
-    public $script_to_charset_windows = array(
-        'east_european' => 'windows-1250',
-        'cyrillic' => 'windows-1251',
-        'west_european' => 'windows-1252',
-        'greek' => 'windows-1253',
-        'turkish' => 'windows-1254',
-        'hebrew' => 'windows-1255',
-        'arabic' => 'windows-1256',
-        'baltic' => 'windows-1257',
-        'estonian' => 'windows-1257',
-        'lithuanian' => 'windows-1257',
-        'vietnamese' => 'windows-1258',
-        'thai' => 'cp874',
-        'korean' => 'cp949',
-        'chinese' => 'gb2312',
-        'japanese' => 'shift_jis',
-        'simpl_chinese' => 'gb2312',
-        'trad_chinese' => 'big5',
-        'albanian' => 'windows-1250',
-        'unicode' => 'utf-8'
-    );
-
-    /**
-     * Mapping of locale names to charsets
-     *
-     * @var array
-     */
-    public $locale_to_charset = array(
-        'japanese.euc' => 'euc-jp',
-        'ja_jp.ujis' => 'euc-jp',
-        'korean.euc' => 'euc-kr',
-        'sr@Latn' => 'iso-8859-2',
-        'zh_cn' => 'gb2312',
-        'zh_hk' => 'big5',
-        'zh_tw' => 'big5'
-    );
-
     /**
      * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
      * Empty values means "utf-8"
@@ -549,48 +310,6 @@ class CharsetConverter implements SingletonInterface
         return $charset;
     }
 
-    /**
-     * Get the charset of a locale.
-     *
-     * ln      language
-     * ln_CN     language / country
-     * ln_CN.cs    language / country / charset
-     * ln_CN.cs@mod  language / country / charset / modifier
-     *
-     * @param string $locale Locale string
-     * @return string Charset resolved for locale string
-     */
-    public function get_locale_charset($locale)
-    {
-        $locale = strtolower($locale);
-        // Exact locale specific charset?
-        if (isset($this->locale_to_charset[$locale])) {
-            return $this->locale_to_charset[$locale];
-        }
-        // Get modifier
-        list($locale, $modifier) = explode('@', $locale);
-        // Locale contains charset: use it
-        list($locale, $charset) = explode('.', $locale);
-        if ($charset) {
-            return $this->parse_charset($charset);
-        }
-        // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
-        if ($modifier === 'euro') {
-            return 'iso-8859-15';
-        }
-        // Get language
-        list($language, ) = explode('_', $locale);
-        if (isset($this->lang_to_script[$language])) {
-            $script = $this->lang_to_script[$language];
-        }
-        if (TYPO3_OS === 'WIN') {
-            $cs = $this->script_to_charset_windows[$script] ?: 'windows-1252';
-        } else {
-            $cs = $this->script_to_charset_unix[$script] ?: 'utf-8';
-        }
-        return $cs;
-    }
-
     /********************************************
      *
      * Charset Conversion functions
diff --git a/typo3/sysext/core/Documentation/Changelog/master/Breaking-72826-RemovedCustomCharsetConfigurationForLocales.rst b/typo3/sysext/core/Documentation/Changelog/master/Breaking-72826-RemovedCustomCharsetConfigurationForLocales.rst
new file mode 100644
index 000000000000..95a1a46416c1
--- /dev/null
+++ b/typo3/sysext/core/Documentation/Changelog/master/Breaking-72826-RemovedCustomCharsetConfigurationForLocales.rst
@@ -0,0 +1,43 @@
+===================================================================
+Breaking: #72826 - Removed custom charset configuration for locales
+===================================================================
+
+Description
+===========
+
+The TYPO3 Frontend resolves the TypoScript option ``config.locale_all`` and stores the charset part within ``$TSFE->localeCharset``. If the option ``locale_all`` does not provide a charset (e.g. when it is set to ``de_AT`` instead of ``de_AT.UTF-8``), a "best guess" was done based on a static list set up in 2004.
+
+The option ``$TSFE->localeCharset`` was removed, along with the following calculation options and methods
+available in the CharsetConverter class:
+
+    * CharsetConverter->lang_to_script
+    * CharsetConverter->script_to_charset_unix
+    * CharsetConverter->script_to_charset_windows
+    * CharsetConverter->locale_to_charset
+    * CharsetConverter->get_locale_charset()
+
+The localeCharset option was solely used within the TypoScript functionality ``stdWrap.strftime``: when no
+custom character set was given, a character set conversion was done from the "localeCharset" (based on the best guess
+or explicitly set via ``config.locale_all = de_AT.UTF-8``) if it was different from the renderCharset option of the TYPO3 Frontend.
+
+
+Impact
+======
+
+Custom locales configured in TypoScript which are not present on the server, a character set of ``config.locale_all`` that differs from ``config.renderCharset``, or a ``config.locale_all`` value without a character set could lead to unexpected output in the TYPO3 Frontend.
+
+
+Affected Installations
+======================
+
+Instances where the character set given in ``config.locale_all`` differs from the one set via ``config.renderCharset``, or
+servers that don't have the charset of the locale available while the output is expected in a certain character set that is not given explicitly.
+
+
+Migration
+=========
+
+As this is a misconfiguration and only necessary if e.g. the server can not handle UTF-8 locales, ``config.locale_all`` can explicitly be set to ``de_AT@iso-8859-15`` and the output should be in renderCharset. On instances where ``stdWrap.strftime`` is used, the subproperty ``charset`` can be set to the custom character set (e.g. ``iso-8859-15``).
+
+In any case, the ``config.locale_all`` option should be configured with a character set, to avoid
+any side-effects with the TypoScript stdWrap option ``strftime``.
\ No newline at end of file
diff --git a/typo3/sysext/frontend/Classes/ContentObject/ContentObjectRenderer.php b/typo3/sysext/frontend/Classes/ContentObject/ContentObjectRenderer.php
index 164c189df9fd..ef2b26ed60ab 100644
--- a/typo3/sysext/frontend/Classes/ContentObject/ContentObjectRenderer.php
+++ b/typo3/sysext/frontend/Classes/ContentObject/ContentObjectRenderer.php
@@ -2728,10 +2728,8 @@ class ContentObjectRenderer
         // Check for zero length string to mimic default case of strtime/gmstrftime
         $content = (string)$content === '' ? $GLOBALS['EXEC_TIME'] : (int)$content;
         $content = $conf['strftime.']['GMT'] ? gmstrftime($conf['strftime'], $content) : strftime($conf['strftime'], $content);
-        $tsfe = $this->getTypoScriptFrontendController();
-        $tmp_charset = $conf['strftime.']['charset'] ? $conf['strftime.']['charset'] : $tsfe->localeCharset;
-        if ($tmp_charset) {
-            $content = $tsfe->csConv($content, $tmp_charset);
+        if (!empty($conf['strftime.']['charset'])) {
+            $content = $this->getTypoScriptFrontendController()->csConv($content, $conf['strftime.']['charset']);
         }
         return $content;
     }
diff --git a/typo3/sysext/frontend/Classes/Controller/TypoScriptFrontendController.php b/typo3/sysext/frontend/Classes/Controller/TypoScriptFrontendController.php
index 898a2e81ba86..7880d77b002c 100644
--- a/typo3/sysext/frontend/Classes/Controller/TypoScriptFrontendController.php
+++ b/typo3/sysext/frontend/Classes/Controller/TypoScriptFrontendController.php
@@ -747,12 +747,6 @@ class TypoScriptFrontendController
      */
     public $metaCharset = 'utf-8';
 
-    /**
-     * Assumed charset of locale strings.
-     * @var string
-     */
-    public $localeCharset = '';
-
     /**
      * Set to the system language key (used on the site)
      * @var string
@@ -2763,7 +2757,6 @@ class TypoScriptFrontendController
                 setlocale(LC_CTYPE, $this->config['config']['locale_all']);
                 setlocale(LC_MONETARY, $this->config['config']['locale_all']);
                 setlocale(LC_TIME, $this->config['config']['locale_all']);
-                $this->localeCharset = $this->csConvObj->get_locale_charset($this->config['config']['locale_all']);
             } else {
                 $this->getTimeTracker()->setTSlogMessage('Locale "' . htmlspecialchars($this->config['config']['locale_all']) . '" not found.', 3);
             }
-- 
GitLab