From 4579bbecb0fd3007bfa3cc79eda73459c42e9fad Mon Sep 17 00:00:00 2001 From: Steffen Ritter <info@rs-websystems.de> Date: Sun, 19 Feb 2012 12:40:55 +0100 Subject: [PATCH] [TASK] Change various functions to use utf-8 by default Many functions in TYPO3core expect a character set parameter to be defined. Their default was iso-8859-1 in the past and is changed to utf-8 now. Change-Id: I9c228821e95167b67811c8475880707d5c77bdb7 Resolves: #34094 Releases: 4.7 Reviewed-on: http://review.typo3.org/9101 Reviewed-by: Michael Stucki Tested-by: Michael Stucki --- t3lib/class.t3lib_cs.php | 36 +++++++++++-------- t3lib/class.t3lib_div.php | 10 +++--- .../parser/class.t3lib_l10n_parser_llphp.php | 6 ++-- tests/t3lib/class.t3lib_pagerendererTest.php | 2 +- typo3/sysext/cms/tslib/class.tslib_fe.php | 4 +-- typo3/sysext/impexp/class.tx_impexp.php | 2 +- .../indexed_search/class.external_parser.php | 5 +-- .../rtehtmlarea/class.tx_rtehtmlarea_base.php | 2 +- .../class.tx_rtehtmlarea_browse_links.php | 2 +- .../mod6/class.tx_rtehtmlarea_parse_html.php | 2 +- .../pi2/class.tx_rtehtmlarea_pi2.php | 2 +- typo3/sysext/t3editor/res/tsref/tsref.xml | 2 +- typo3/template.php | 2 +- 13 files changed, 42 insertions(+), 35 deletions(-) diff --git a/t3lib/class.t3lib_cs.php b/t3lib/class.t3lib_cs.php index a49a9ea2795a..712c5006d602 100644 --- a/t3lib/class.t3lib_cs.php +++ b/t3lib/class.t3lib_cs.php @@ -572,7 +572,7 @@ class t3lib_cs { if (TYPO3_OS == 'WIN') { $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252'; } else { - $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1'; + $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'utf-8'; } return $cs; @@ -814,26 +814,32 @@ class t3lib_cs { * @param boolean If set, then all string-HTML entities (like & or £ will be converted as well) * @return string Output string */ - function entities_to_utf8($str, $alsoStdHtmlEnt = 0) { + function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) { if ($alsoStdHtmlEnt) { - $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below. + $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')); } $token = md5(microtime()); $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str)); foreach ($parts as $k => $v) { - if ($k % 2) { - if (substr($v, 0, 1) == '#') { // Dec or hex entities: - if (substr($v, 1, 1) == 'x') { - $parts[$k] = $this->UnumberToChar(hexdec(substr($v, 2))); - } else { - $parts[$k] = $this->UnumberToChar(substr($v, 1)); - } - } elseif ($alsoStdHtmlEnt && $trans_tbl['&' . $v . ';']) { // Other entities: - $parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1'); - } else { // No conversion: - $parts[$k] = '&' . $v . ';'; + // only take every second element + if ($k % 2 === 0) { + continue; + } + + $position = 0; + if (substr($v, $position, 1) == '#') { // Dec or hex entities: + $position++; + if (substr($v, $position, 1) == 'x') { + $v = hexdec(substr($v, ++$position)); + } else { + $v = substr($v, $position); } + $parts[$k] = $this->UnumberToChar($v); + } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities: + $parts[$k] = $trans_tbl['&' . $v . ';']; + } else { // No conversion: + $parts[$k] = '&' . $v . ';'; } } @@ -2346,4 +2352,4 @@ if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLA include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']); } -?> \ No newline at end of file +?> diff --git a/t3lib/class.t3lib_div.php b/t3lib/class.t3lib_div.php index 86186d8f7c28..67d6be15b7e3 100644 --- a/t3lib/class.t3lib_div.php +++ b/t3lib/class.t3lib_div.php @@ -375,7 +375,7 @@ final class t3lib_div { } else { // this case should not happen $csConvObj = self::makeInstance('t3lib_cs'); - return $csConvObj->crop('iso-8859-1', $string, $chars, $appendString); + return $csConvObj->crop('utf-8', $string, $chars, $appendString); } } @@ -4189,7 +4189,7 @@ final class t3lib_div { if (@is_file($fileRef) && $langKey) { // Set charsets: - $sourceCharset = $csConvObj->parse_charset($csConvObj->charSetArray[$langKey] ? $csConvObj->charSetArray[$langKey] : 'iso-8859-1'); + $sourceCharset = $csConvObj->parse_charset($csConvObj->charSetArray[$langKey] ? $csConvObj->charSetArray[$langKey] : 'utf-8'); if ($charset) { $targetCharset = $csConvObj->parse_charset($charset); } else { @@ -4216,9 +4216,9 @@ final class t3lib_div { // converting the default language (English) // this needs to be done for a few accented loan words and extension names - if (is_array($LOCAL_LANG['default']) && $targetCharset != 'iso-8859-1') { + if (is_array($LOCAL_LANG['default']) && $targetCharset != 'utf-8') { foreach ($LOCAL_LANG['default'] as &$labelValue) { - $labelValue = $csConvObj->conv($labelValue, 'iso-8859-1', $targetCharset); + $labelValue = $csConvObj->conv($labelValue, 'utf-8', $targetCharset); } unset($labelValue); } @@ -5190,7 +5190,7 @@ final class t3lib_div { * @param string $charset Charset used for encoding * @return string The encoded string */ - public static function encodeHeader($line, $enc = 'quoted-printable', $charset = 'iso-8859-1') { + public static function encodeHeader($line, $enc = 'quoted-printable', $charset = 'utf-8') { // Avoid problems if "###" is found in $line (would conflict with the placeholder which is used below) if (strpos($line, '###') !== FALSE) { return $line; diff --git a/t3lib/l10n/parser/class.t3lib_l10n_parser_llphp.php b/t3lib/l10n/parser/class.t3lib_l10n_parser_llphp.php index 953f495520a0..497c7ac35aea 100644 --- a/t3lib/l10n/parser/class.t3lib_l10n_parser_llphp.php +++ b/t3lib/l10n/parser/class.t3lib_l10n_parser_llphp.php @@ -140,9 +140,9 @@ class t3lib_l10n_parser_Llphp implements t3lib_l10n_parser { // Converting the default language (English) // This needs to be done for a few accented loan words and extension names - if (is_array($LOCAL_LANG['default']) && $this->targetCharset !== 'iso-8859-1') { + if (is_array($LOCAL_LANG['default']) && $this->targetCharset !== 'utf-8') { foreach ($LOCAL_LANG['default'] as &$labelValue) { - $labelValue = $this->csConvObj->conv($labelValue, 'iso-8859-1', $this->targetCharset); + $labelValue = $this->csConvObj->conv($labelValue, 'utf-8', $this->targetCharset); } unset($labelValue); } @@ -211,7 +211,7 @@ class t3lib_l10n_parser_Llphp implements t3lib_l10n_parser { */ protected function setCharsets($languageKey, $charset) { $this->sourceCharset = $this->csConvObj->parse_charset($this->csConvObj->charSetArray[$languageKey] - ? $this->csConvObj->charSetArray[$languageKey] : 'iso-8859-1'); + ? $this->csConvObj->charSetArray[$languageKey] : 'utf-8'); if ($charset) { $this->targetCharset = $this->csConvObj->parse_charset($charset); } else { diff --git a/tests/t3lib/class.t3lib_pagerendererTest.php b/tests/t3lib/class.t3lib_pagerendererTest.php index 60290cac2cc4..775b4bc0aa56 100644 --- a/tests/t3lib/class.t3lib_pagerendererTest.php +++ b/tests/t3lib/class.t3lib_pagerendererTest.php @@ -766,7 +766,7 @@ class t3lib_PageRendererTest extends tx_phpunit_testcase { public function isInlineLanguageLabelDeliveredWithNonUTF8() { $testPrefix = uniqid('test'); $this->fixture->loadExtCore(); - $this->fixture->setCharSet('iso-8859-1'); + $this->fixture->setCharSet('utf-8'); $this->fixture->addInlineLanguageLabel($testPrefix, $testPrefix . "_\xd8"); $out = $this->fixture->render(); diff --git a/typo3/sysext/cms/tslib/class.tslib_fe.php b/typo3/sysext/cms/tslib/class.tslib_fe.php index d5f2fd519899..446f0f98893a 100644 --- a/typo3/sysext/cms/tslib/class.tslib_fe.php +++ b/typo3/sysext/cms/tslib/class.tslib_fe.php @@ -213,7 +213,7 @@ * @var t3lib_cs */ var $csConvObj; - var $defaultCharSet = 'iso-8859-1'; // The default charset used in the frontend if nothing else is set. + var $defaultCharSet = 'utf-8'; // The default charset used in the frontend if nothing else is set. var $renderCharset=''; // Internal charset of the frontend during rendering. (Default: UTF-8) var $metaCharset=''; // Output charset of the websites content. This is the charset found in the header, meta tag etc. If different from $renderCharset a conversion happens before output to browser. Defaults to ->renderCharset if not set. var $localeCharset=''; // Assumed charset of locale strings. @@ -4788,7 +4788,7 @@ if (version == "n3") { /** * Converts the charset of the input string if applicable. - * The "to" charset is determined by the currently used charset for the page which is "iso-8859-1" by default or set by $GLOBALS['TSFE']->config['config']['renderCharset'] + * The "to" charset is determined by the currently used charset for the page which is "utf-8" by default or set by $GLOBALS['TSFE']->config['config']['renderCharset'] * Only if there is a difference between the two charsets will a conversion be made * The conversion is done real-time - no caching for performance at this point! * diff --git a/typo3/sysext/impexp/class.tx_impexp.php b/typo3/sysext/impexp/class.tx_impexp.php index 762657152491..c67a1459cb3a 100755 --- a/typo3/sysext/impexp/class.tx_impexp.php +++ b/typo3/sysext/impexp/class.tx_impexp.php @@ -965,7 +965,7 @@ class tx_impexp { ); // Creating XML file from $outputArray: - $charset = $this->dat['header']['charset'] ? $this->dat['header']['charset'] : 'iso-8859-1'; + $charset = $this->dat['header']['charset'] ? $this->dat['header']['charset'] : 'utf-8'; $XML = '<?xml version="1.0" encoding="'.$charset.'" standalone="yes" ?>'.LF; $XML.= t3lib_div::array2xml($this->dat,'',0,'T3RecordDocument',0,$options); diff --git a/typo3/sysext/indexed_search/class.external_parser.php b/typo3/sysext/indexed_search/class.external_parser.php index b37f0450a9d7..118d237f0ff1 100755 --- a/typo3/sysext/indexed_search/class.external_parser.php +++ b/typo3/sysext/indexed_search/class.external_parser.php @@ -493,8 +493,9 @@ class tx_indexed_search_extparse { case 'txt': case 'csv': // Raw text $content = t3lib_div::getUrl($absFile); - // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...) - $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1'); + // TODO: Implement auto detection of charset (currently assuming utf-8) + $contentCharset = 'utf-8'; + $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset); $contentArr = $this->pObj->splitRegularContent($content); $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path! break; diff --git a/typo3/sysext/rtehtmlarea/class.tx_rtehtmlarea_base.php b/typo3/sysext/rtehtmlarea/class.tx_rtehtmlarea_base.php index 7172785d3fef..74c12c17e747 100644 --- a/typo3/sysext/rtehtmlarea/class.tx_rtehtmlarea_base.php +++ b/typo3/sysext/rtehtmlarea/class.tx_rtehtmlarea_base.php @@ -328,7 +328,7 @@ class tx_rtehtmlarea_base extends t3lib_rteapi { $this->OutputCharset = $this->charset; $this->contentCharset = $LANG->csConvObj->charSetArray[$this->contentTypo3Language]; - $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'iso-8859-1'; + $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'utf-8'; $this->origContentCharSet = $this->contentCharset; $this->contentCharset = 'utf-8'; diff --git a/typo3/sysext/rtehtmlarea/mod3/class.tx_rtehtmlarea_browse_links.php b/typo3/sysext/rtehtmlarea/mod3/class.tx_rtehtmlarea_browse_links.php index 4012ae025f31..e8ea0291c729 100644 --- a/typo3/sysext/rtehtmlarea/mod3/class.tx_rtehtmlarea_browse_links.php +++ b/typo3/sysext/rtehtmlarea/mod3/class.tx_rtehtmlarea_browse_links.php @@ -1139,7 +1139,7 @@ class tx_rtehtmlarea_browse_links extends browse_links { $LANG->lang = $this->contentTypo3Language; $LANG->origCharSet = $LANG->csConvObj->charSetArray[$this->contentTypo3Language]; - $LANG->origCharSet = $LANG->origCharSet ? $LANG->origCharSet : 'iso-8859-1'; + $LANG->origCharSet = $LANG->origCharSet ? $LANG->origCharSet : 'utf-8'; $LANG->charSet = $this->contentTypo3Charset; $LLString = $LANG->sL($string); diff --git a/typo3/sysext/rtehtmlarea/mod6/class.tx_rtehtmlarea_parse_html.php b/typo3/sysext/rtehtmlarea/mod6/class.tx_rtehtmlarea_parse_html.php index 82a0656ef630..20d03aecd8f0 100644 --- a/typo3/sysext/rtehtmlarea/mod6/class.tx_rtehtmlarea_parse_html.php +++ b/typo3/sysext/rtehtmlarea/mod6/class.tx_rtehtmlarea_parse_html.php @@ -78,7 +78,7 @@ class tx_rtehtmlarea_parse_html { $clientInfo = t3lib_div::clientInfo(); // the charset of the content element, possibly overidden by forceCharset - $toCharSet = t3lib_div::_GP('charset')?t3lib_div::_GP('charset'):'iso-8859-1'; + $toCharSet = t3lib_div::_GP('charset')?t3lib_div::_GP('charset'):'utf-8'; // IE wants it back in utf-8 if ( $clientInfo['BROWSER']= 'msie') { $toCharSet = 'utf-8'; diff --git a/typo3/sysext/rtehtmlarea/pi2/class.tx_rtehtmlarea_pi2.php b/typo3/sysext/rtehtmlarea/pi2/class.tx_rtehtmlarea_pi2.php index 9de5fe366996..87edeeccf8b4 100644 --- a/typo3/sysext/rtehtmlarea/pi2/class.tx_rtehtmlarea_pi2.php +++ b/typo3/sysext/rtehtmlarea/pi2/class.tx_rtehtmlarea_pi2.php @@ -169,7 +169,7 @@ class tx_rtehtmlarea_pi2 extends tx_rtehtmlarea_base { // Set the charset of the content $this->contentCharset = $TSFE->csConvObj->charSetArray[$this->contentTypo3Language]; - $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'iso-8859-1'; + $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'utf-8'; $this->contentCharset = trim($TSFE->config['config']['metaCharset']) ? trim($TSFE->config['config']['metaCharset']) : $this->contentCharset; /* ======================================= diff --git a/typo3/sysext/t3editor/res/tsref/tsref.xml b/typo3/sysext/t3editor/res/tsref/tsref.xml index 866497b064c4..d42e4ecc5dd9 100644 --- a/typo3/sysext/t3editor/res/tsref/tsref.xml +++ b/typo3/sysext/t3editor/res/tsref/tsref.xml @@ -774,7 +774,7 @@ If this property is set, images are not allowed to be scaled up in size. This pa </property> <property name="notification_email_charset" type="string"> <description><![CDATA[Alternative charset for the notification mails.]]></description> - <default><![CDATA[ISO-8859-1]]></default> + <default><![CDATA[utf-8]]></default> </property> <property name="notification_email_encoding" type="string"> <description><![CDATA[This sets the encoding of plaintext emails (notification messages). The default encoding is "quoted-printable". But setting this to eg. "base64" will encode the content with base64 encoding. diff --git a/typo3/template.php b/typo3/template.php index 42116aa35d4d..8b64af318bf6 100644 --- a/typo3/template.php +++ b/typo3/template.php @@ -128,7 +128,7 @@ class template { var $parseTimeFlag = 0; // Will output the parsetime of the scripts in milliseconds (for admin-users). Set this to FALSE when releasing TYPO3. Only for dev. // INTERNAL - var $charset = 'iso-8859-1'; // Default charset. see function initCharset() + var $charset = 'utf-8'; // Default charset. see function initCharset() var $sectionFlag=0; // Internal: Indicates if a <div>-output section is open var $divClass = ''; // (Default) Class for wrapping <DIV>-tag of page. Is set in class extensions. -- GitLab