From 6d954e14ac3d08c8ebff83bcacec4357f7feb10f Mon Sep 17 00:00:00 2001
From: Benni Mack <benni@typo3.org>
Date: Thu, 9 Feb 2023 23:36:32 +0100
Subject: [PATCH] [BUGFIX] Sort and limit available locales in SiteLanguage
 selection

When showing the available locales in the SiteLanguage
configuration, the selection is now sorted and filtered:
each locale is listed only once (e.g. "de_DE"), and further
entries that only add a codeset or modifier, such as
"de_DE.UTF-8" or "de_DE.ISO8859-1", are skipped in order to
make life easier for integrators.

Resolves: #99923
Related: #93651
Releases: main, 11.5
Change-Id: I03b38bb19207615e82a7f7917037aad0ac26b997
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/77756
Tested-by: Benni Mack <benni@typo3.org>
Tested-by: core-ci <typo3@b13.com>
Reviewed-by: Benni Mack <benni@typo3.org>
---
 .../Configuration/TCA/UserFunctions.php       |  15 +-
 .../core/Classes/Localization/Locale.php      | 181 ++++++++++++++++++
 .../Tests/Unit/Localization/LocaleTest.php    | 125 ++++++++++++
 3 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 typo3/sysext/core/Classes/Localization/Locale.php
 create mode 100644 typo3/sysext/core/Tests/Unit/Localization/LocaleTest.php

diff --git a/typo3/sysext/backend/Classes/Configuration/TCA/UserFunctions.php b/typo3/sysext/backend/Classes/Configuration/TCA/UserFunctions.php
index 82a0965e83b3..d64c8c5b6ab8 100644
--- a/typo3/sysext/backend/Classes/Configuration/TCA/UserFunctions.php
+++ b/typo3/sysext/backend/Classes/Configuration/TCA/UserFunctions.php
@@ -18,6 +18,7 @@ declare(strict_types=1);
 namespace TYPO3\CMS\Backend\Configuration\TCA;
 
 use TYPO3\CMS\Core\Localization\LanguageService;
+use TYPO3\CMS\Core\Localization\Locale;
 use TYPO3\CMS\Core\Utility\CommandUtility;
 use TYPO3\CMS\Core\Utility\GeneralUtility;
 
@@ -109,10 +110,22 @@ class UserFunctions
         $rawOutput = [];
         CommandUtility::exec('locale -a', $rawOutput);
 
-        ksort($rawOutput, SORT_NATURAL);
+        sort($rawOutput, SORT_NATURAL);
         $locales = [];
+        $usedLocales = [];
         foreach ($rawOutput as $item) {
+            // do not show C/POSIX in the list of locales, as this is the default anyway
+            $obj = new Locale($item);
+            if ($obj->getPosixCodeSet() === 'C' || $obj->getPosixCodeSet() === 'POSIX') {
+                continue;
+            }
+            // Skip locales that only differ by an appended codeset or modifier (e.g. "de_DE.UTF-8", "de_DE.ISO8859-1").
+            // The user should only choose "de_DE".
+            if (in_array($obj->getName(), $usedLocales, true)) {
+                continue;
+            }
             $locales[] = [$item, $item];
+            $usedLocales[] = $obj->getName();
         }
 
         return $locales;
diff --git a/typo3/sysext/core/Classes/Localization/Locale.php b/typo3/sysext/core/Classes/Localization/Locale.php
new file mode 100644
index 000000000000..69c4772f8282
--- /dev/null
+++ b/typo3/sysext/core/Classes/Localization/Locale.php
@@ -0,0 +1,209 @@
+<?php
+
+declare(strict_types=1);
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+namespace TYPO3\CMS\Core\Localization;
+
+/**
+ * A representation of a locale, consisting of
+ *   - the language key (based on ISO 639-1 / ISO 639-2)
+ *   - the optional four-letter script code that can follow the language code according to the Unicode ISO 15924 Registry (e.g. Hans in zh_Hans)
+ *   - the optional region / country (based on ISO 3166-1)
+ * separated with a "-".
+ *
+ * This conforms to IETF - RFC 5646 (see https://datatracker.ietf.org/doc/rfc5646/) in a simplified form.
+ *
+ * POSIX-style input such as "de_DE.UTF-8@euro" is accepted as well: the codeset ("UTF-8") and the
+ * modifier ("euro") are stored separately and are not part of the normalized name.
+ */
+class Locale implements \Stringable
+{
+    /** Normalized name built from language, script and country, e.g. "zh-Hans-CN". */
+    protected string $locale;
+    protected string $languageCode;
+    protected ?string $languageScript = null;
+    protected ?string $countryCode = null;
+    /** POSIX codeset (the part after "."), or "C" / "POSIX" for those two locales. */
+    protected ?string $codeSet = null;
+    /** POSIX modifier (the part after "@"), see https://wiki.archlinux.org/title/locale#Generating_locales */
+    protected ?string $charsetModifier = null;
+
+    // taken from https://meta.wikimedia.org/wiki/Template:List_of_language_names_ordered_by_code
+    protected const RIGHT_TO_LEFT_LANGUAGE_CODES = [
+        'ar', // Arabic
+        'arc', // Aramaic
+        'arz', // Egyptian Arabic
+        'ckb', // Kurdish (Sorani)
+        'dv', // Divehi
+        'fa', // Persian
+        'ha', // Hausa
+        'he', // Hebrew
+        'khw', // Khowar
+        'ks', // Kashmiri
+        'ps', // Pashto
+        'sd', // Sindhi
+        'ur', // Urdu
+        'uz-AF', // Uzbeki Afghanistan
+        'yi', // Yiddish
+    ];
+
+    /**
+     * List of language dependencies for an actual language. This setting is used for local variants of a language
+     * that depend on their "main" language, like Brazilian Portuguese or Canadian French.
+     *
+     * @var array<int, string>
+     */
+    protected array $dependencies = [];
+
+    /**
+     * @param string $locale Locale in IETF ("de-DE") or POSIX ("de_DE.UTF-8@euro") notation, "default" means "en"
+     * @param array<int, string> $dependencies Fallback locales, each one is passed through normalize()
+     */
+    public function __construct(
+        string $locale = 'en',
+        array $dependencies = []
+    ) {
+        // Split off the POSIX modifier and codeset BEFORE normalizing, so that an underscore inside
+        // them (e.g. the codeset "ISO_8859-1") is kept as is instead of being rewritten to a dash.
+        if (str_contains($locale, '@')) {
+            [$locale, $this->charsetModifier] = explode('@', $locale);
+        }
+        if (str_contains($locale, '.')) {
+            [$locale, $this->codeSet] = explode('.', $locale);
+        }
+        $locale = $this->normalize($locale);
+        // "C" and "POSIX" carry no language: remember them as codeset (replacing e.g. the "UTF-8" of "C.UTF-8")
+        // and use "en" as language.
+        if (strtolower($locale) === 'c') {
+            $this->codeSet = 'C';
+            $locale = 'en';
+        } elseif (strtolower($locale) === 'posix') {
+            $this->codeSet = 'POSIX';
+            $locale = 'en';
+        }
+        if (str_contains($locale, '-')) {
+            [$this->languageCode, $tail] = explode('-', $locale, 2);
+            if (str_contains($tail, '-')) {
+                // Two parts after the language are read as "script-country".
+                // NOTE(review): variant subtags (e.g. "ca-ES-valencia") are not modelled and end up here as well.
+                [$this->languageScript, $this->countryCode] = explode('-', $tail);
+            } elseif (strlen($tail) === 4) {
+                // A single four-letter part is a script code (e.g. "Hans").
+                $this->languageScript = $tail;
+            } else {
+                $this->countryCode = $tail ?: null;
+            }
+            $this->languageCode = strtolower($this->languageCode);
+            $this->languageScript = $this->languageScript ? ucfirst(strtolower($this->languageScript)) : null;
+            $this->countryCode = $this->countryCode ? strtoupper($this->countryCode) : null;
+        } else {
+            $this->languageCode = strtolower($locale);
+        }
+
+        $this->locale = $this->languageCode . ($this->languageScript ? '-' . $this->languageScript : '') . ($this->countryCode ? '-' . $this->countryCode : '');
+        $this->dependencies = array_map(fn ($dep) => $this->normalize($dep), $dependencies);
+    }
+
+    /**
+     * Returns the normalized name, e.g. "de-DE" (without codeset and modifier).
+     */
+    public function getName(): string
+    {
+        return $this->locale;
+    }
+
+    public function getLanguageCode(): string
+    {
+        return $this->languageCode;
+    }
+
+    /**
+     * Checks the language code and the full name against the list of right-to-left languages.
+     */
+    public function isRightToLeftLanguageDirection(): bool
+    {
+        return in_array($this->languageCode, self::RIGHT_TO_LEFT_LANGUAGE_CODES, true) || in_array($this->locale, self::RIGHT_TO_LEFT_LANGUAGE_CODES, true);
+    }
+
+    public function getLanguageScriptCode(): ?string
+    {
+        return $this->languageScript;
+    }
+
+    public function getCountryCode(): ?string
+    {
+        return $this->countryCode;
+    }
+
+    /**
+     * Return the locale as ISO/IEC 15897 format, including a possible POSIX charset
+     * "cs_CZ.UTF-8"
+     * see https://en.wikipedia.org/wiki/ISO/IEC_15897
+     * https://en.wikipedia.org/wiki/Locale_(computer_software)#POSIX_platforms
+     *
+     * The script code is not part of the result, "C" and "POSIX" are returned without a language.
+     *
+     * @internal
+     */
+    public function posixFormatted(): string
+    {
+        $charsetModifier = $this->charsetModifier ? '@' . $this->charsetModifier : '';
+        if ($this->codeSet === 'C' || $this->codeSet === 'POSIX') {
+            return $this->codeSet . $charsetModifier;
+        }
+        $formatted = $this->languageCode;
+        if ($this->countryCode) {
+            $formatted .= '_' . $this->countryCode;
+        }
+        if ($this->codeSet) {
+            $formatted .= '.' . $this->codeSet;
+        }
+        return $formatted . $charsetModifier;
+    }
+
+    /**
+     * @internal
+     */
+    public function getPosixCodeSet(): ?string
+    {
+        return $this->codeSet;
+    }
+
+    public function getDependencies(): array
+    {
+        return $this->dependencies;
+    }
+
+    /**
+     * Maps "default" to "en" and replaces underscores with dashes ("de_DE" becomes "de-DE").
+     */
+    protected function normalize(string $locale): string
+    {
+        if ($locale === 'default') {
+            return 'en';
+        }
+        if (str_contains($locale, '_')) {
+            $locale = str_replace('_', '-', $locale);
+        }
+
+        return $locale;
+    }
+
+    public function __toString(): string
+    {
+        return $this->locale;
+    }
+}
diff --git a/typo3/sysext/core/Tests/Unit/Localization/LocaleTest.php b/typo3/sysext/core/Tests/Unit/Localization/LocaleTest.php
new file mode 100644
index 000000000000..5309c9012bc4
--- /dev/null
+++ b/typo3/sysext/core/Tests/Unit/Localization/LocaleTest.php
@@ -0,0 +1,138 @@
+<?php
+
+declare(strict_types=1);
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+namespace TYPO3\CMS\Core\Tests\Unit\Localization;
+
+use TYPO3\CMS\Core\Localization\Locale;
+use TYPO3\TestingFramework\Core\Unit\UnitTestCase;
+
+/**
+ * Covers the parsing and normalization done by Locale.
+ */
+class LocaleTest extends UnitTestCase
+{
+    /**
+     * Language-only input, the POSIX "C" locale as well as codeset and modifier suffixes are parsed.
+     *
+     * @test
+     */
+    public function localeWithJustLanguageCodeSanitizesIncomingValuesProperly(): void
+    {
+        $subject = new Locale('en');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertNull($subject->getCountryCode());
+        self::assertSame('en', $subject->getLanguageCode());
+        self::assertSame('en', (string)$subject);
+
+        $subject = new Locale('C');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertNull($subject->getCountryCode());
+        self::assertSame('en', $subject->getLanguageCode());
+        self::assertSame('C', $subject->getPosixCodeSet());
+        self::assertSame('C', $subject->posixFormatted());
+        self::assertSame('en', (string)$subject);
+
+        $subject = new Locale('de_DE.UTF-8');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('DE', $subject->getCountryCode());
+        self::assertSame('de', $subject->getLanguageCode());
+        self::assertSame('de-DE', (string)$subject);
+        self::assertSame('de_DE.UTF-8', $subject->posixFormatted());
+
+        $subject = new Locale('de_DE@euro');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('DE', $subject->getCountryCode());
+        self::assertSame('de', $subject->getLanguageCode());
+        self::assertSame('de-DE', (string)$subject);
+        self::assertSame('de_DE@euro', $subject->posixFormatted());
+
+        // Mixed-case input is lowercased
+        $subject = new Locale('eN');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertNull($subject->getCountryCode());
+        self::assertSame('en', $subject->getLanguageCode());
+        self::assertSame('en', (string)$subject);
+    }
+
+    /**
+     * A four-letter second part is treated as script code and gets a leading capital letter.
+     *
+     * @test
+     */
+    public function localeWithLanguageAndScriptCodeSanitizesIncomingValuesProperly(): void
+    {
+        $subject = new Locale('zh_HANS');
+        self::assertSame('Hans', $subject->getLanguageScriptCode());
+        self::assertNull($subject->getCountryCode());
+        self::assertSame('zh', $subject->getLanguageCode());
+        self::assertSame('zh-Hans', (string)$subject);
+    }
+
+    /**
+     * Language, script and country are all recognized and normalized in their casing.
+     *
+     * @test
+     */
+    public function localeWithLanguageAndScriptCodeAndCountryCodeSanitizesIncomingValuesProperly(): void
+    {
+        $subject = new Locale('zh_HANS_CN');
+        self::assertSame('Hans', $subject->getLanguageScriptCode());
+        self::assertSame('CN', $subject->getCountryCode());
+        self::assertSame('zh', $subject->getLanguageCode());
+        self::assertSame('zh-Hans-CN', (string)$subject);
+    }
+
+    /**
+     * Underscore and dash notation both yield language and country.
+     *
+     * @test
+     */
+    public function variousCombinationsOfLanguageAndCountryCodeReturnsSanitizedValues(): void
+    {
+        $subject = new Locale('fr_CA');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('CA', $subject->getCountryCode());
+        self::assertSame('fr', $subject->getLanguageCode());
+        self::assertSame('fr-CA', (string)$subject);
+        $subject = new Locale('de-AT');
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('AT', $subject->getCountryCode());
+        self::assertSame('de', $subject->getLanguageCode());
+        self::assertSame('de-AT', (string)$subject);
+    }
+
+    /**
+     * Dependencies handed to the constructor are available via getDependencies().
+     *
+     * @test
+     */
+    public function dependenciesAreSetAndRetrievedCorrectly(): void
+    {
+        $subject = new Locale('fr_CA', ['fr', 'en']);
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('CA', $subject->getCountryCode());
+        self::assertSame('fr', $subject->getLanguageCode());
+        self::assertSame(['fr', 'en'], $subject->getDependencies());
+        self::assertSame('fr-CA', (string)$subject);
+        $subject = new Locale('en-US', ['en-UK', 'en']);
+        self::assertNull($subject->getLanguageScriptCode());
+        self::assertSame('US', $subject->getCountryCode());
+        self::assertSame('en', $subject->getLanguageCode());
+        self::assertSame(['en-UK', 'en'], $subject->getDependencies());
+        self::assertSame('en-US', (string)$subject);
+    }
+}
-- 
GitLab