From 11e1d82d9848a96a5b292c1a4a97a65167a84205 Mon Sep 17 00:00:00 2001
From: Benni Mack <benni@typo3.org>
Date: Tue, 12 Jan 2016 23:39:50 +0100
Subject: [PATCH] [!!!][TASK] Remove obsolete HtmlParser methods

Some now unused methods (after the XHTML cleanup)
within HtmlParser can be removed as well.

Resolves: #72667
Releases: master
Change-Id: I164505001d76db9b3e111d466f7b823e5cfe6b65
Reviewed-on: https://review.typo3.org/45854
Reviewed-by: Wouter Wolters <typo3@wouterwolters.nl>
Tested-by: Wouter Wolters <typo3@wouterwolters.nl>
Reviewed-by: Georg Ringer <georg.ringer@gmail.com>
Tested-by: Georg Ringer <georg.ringer@gmail.com>
---
 typo3/sysext/core/Classes/Html/HtmlParser.php | 179 +-----------------
 .../core/Classes/Html/RteHtmlParser.php       |  18 +-
 ...-72667-RTEUnusedInternalMethodsRemoved.rst |  30 +++
 3 files changed, 43 insertions(+), 184 deletions(-)
 create mode 100644 typo3/sysext/core/Documentation/Changelog/master/Breaking-72667-RTEUnusedInternalMethodsRemoved.rst

diff --git a/typo3/sysext/core/Classes/Html/HtmlParser.php b/typo3/sysext/core/Classes/Html/HtmlParser.php
index 3fdf78324a0f..a5d049eaa5a0 100644
--- a/typo3/sysext/core/Classes/Html/HtmlParser.php
+++ b/typo3/sysext/core/Classes/Html/HtmlParser.php
@@ -183,21 +183,20 @@ class HtmlParser
 
     /**
      * Returns an array with either tag or non-tag content of the result from ->splitIntoBlock()/->splitTags()
+     * Does not include the tags in the tag parts.
      *
      * @param array $parts Parts generated by ->splitIntoBlock() or >splitTags()
      * @param bool $tag_parts Whether to return the tag-parts (default,TRUE) or what was outside the tags.
-     * @param bool $include_tag Whether to include the tags in the tag-parts (most useful for input made by ->splitIntoBlock())
      * @return array Tag-parts/Non-tag-parts depending on input argument settings
      * @see splitIntoBlock(), splitTags()
+     * @private Currently only used to extract the content of table cells inside RteHtmlParser
      */
-    public function getAllParts($parts, $tag_parts = true, $include_tag = true)
+    public function getAllParts($parts, $tag_parts = true)
     {
         $newParts = array();
         foreach ($parts as $k => $v) {
             if (($k + ($tag_parts ? 0 : 1)) % 2) {
-                if (!$include_tag) {
-                    $v = $this->removeFirstAndLastTag($v);
-                }
+                $v = $this->removeFirstAndLastTag($v);
                 $newParts[] = $v;
             }
         }
@@ -332,66 +331,6 @@ class HtmlParser
         return array($value, $metaValue);
     }
 
-    /**
-     * Checks whether block/solo tags are found in the correct amounts in HTML content
-     * Block tags are tags which are required to have an equal amount of start and end tags, eg. "<table>...</table>"
-     * Solo tags are tags which are required to have ONLY start tags (possibly with an XHTML ending like ".../>")
-     * NOTICE: Correct XHTML might actually fail since "<br></br>" is allowed as well as "<br/>". However only the LATTER is accepted by this function (with "br" in the "solo-tag" list), the first example will result in a warning.
-     * NOTICE: Correct XHTML might actually fail since "<p/>" is allowed as well as "<p></p>". However only the LATTER is accepted by this function (with "p" in the "block-tag" list), the first example will result in an ERROR!
-     * NOTICE: Correct HTML version "something" allows eg. <p> and <li> to be NON-ended (implicitly ended by other tags). However this is NOT accepted by this function (with "p" and "li" in the block-tag list) and it will result in an ERROR!
-     *
-     * @param string $content HTML content to analyze
-     * @param string $blockTags Tag names for block tags (eg. table or div or p) in lowercase, commalist (eg. "table,div,p")
-     * @param string $soloTags Tag names for solo tags (eg. img, br or input) in lowercase, commalist ("img,br,input")
-     * @return array Analyse data.
-     */
-    public function checkTagTypeCounts($content, $blockTags = 'a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags = 'br,hr,img,input,area')
-    {
-        $content = strtolower($content);
-        $analyzedOutput = array();
-        // Counts appearances of start-tags
-        $analyzedOutput['counts'] = array();
-        // Lists ERRORS
-        $analyzedOutput['errors'] = array();
-        // Lists warnings.
-        $analyzedOutput['warnings'] = array();
-        // Lists stats for block-tags
-        $analyzedOutput['blocks'] = array();
-        // Lists stats for solo-tags
-        $analyzedOutput['solo'] = array();
-        // Block tags, must have endings...
-        $blockTags = explode(',', $blockTags);
-        foreach ($blockTags as $tagName) {
-            $countBegin = count(preg_split(('/\\<' . preg_quote($tagName, '/') . '(\\s|\\>)/s'), $content)) - 1;
-            $countEnd = count(preg_split(('/\\<\\/' . preg_quote($tagName, '/') . '(\\s|\\>)/s'), $content)) - 1;
-            $analyzedOutput['blocks'][$tagName] = array($countBegin, $countEnd, $countBegin - $countEnd);
-            if ($countBegin) {
-                $analyzedOutput['counts'][$tagName] = $countBegin;
-            }
-            if ($countBegin - $countEnd) {
-                if ($countBegin - $countEnd > 0) {
-                    $analyzedOutput['errors'][$tagName] = 'There were more start-tags (' . $countBegin . ') than end-tags (' . $countEnd . ') for the element "' . $tagName . '". There should be an equal amount!';
-                } else {
-                    $analyzedOutput['warnings'][$tagName] = 'There were more end-tags (' . $countEnd . ') than start-tags (' . $countBegin . ') for the element "' . $tagName . '". There should be an equal amount! However the problem is not fatal.';
-                }
-            }
-        }
-        // Solo tags, must NOT have endings...
-        $soloTags = explode(',', $soloTags);
-        foreach ($soloTags as $tagName) {
-            $countBegin = count(preg_split(('/\\<' . preg_quote($tagName, '/') . '(\\s|\\>)/s'), $content)) - 1;
-            $countEnd = count(preg_split(('/\\<\\/' . preg_quote($tagName, '/') . '(\\s|\\>)/s'), $content)) - 1;
-            $analyzedOutput['solo'][$tagName] = array($countBegin, $countEnd);
-            if ($countBegin) {
-                $analyzedOutput['counts'][$tagName] = $countBegin;
-            }
-            if ($countEnd) {
-                $analyzedOutput['warnings'][$tagName] = 'There were end-tags found (' . $countEnd . ') for the element "' . $tagName . '". This was not expected (although XHTML technically allows it).';
-            }
-        }
-        return $analyzedOutput;
-    }
-
     /*********************************
      *
      * Clean HTML code
@@ -869,45 +808,6 @@ class HtmlParser
         return $srcVal;
     }
 
-    /**
-     * Cleans up the input $value for fonttags.
-     * If keepFace,-Size and -Color is set then font-tags with an allowed property is kept. Else deleted.
-     *
-     * @param string HTML content with font-tags inside to clean up.
-     * @param bool If set, keep "face" attribute
-     * @param bool If set, keep "size" attribute
-     * @param bool If set, keep "color" attribute
-     * @return string Processed HTML content
-     */
-    public function cleanFontTags($value, $keepFace = 0, $keepSize = 0, $keepColor = 0)
-    {
-        // ,1 ?? - could probably be more stable if splitTags() was used since this depends on end-tags being properly set!
-        $fontSplit = $this->splitIntoBlock('font', $value);
-        foreach ($fontSplit as $k => $v) {
-            // Font
-            if ($k % 2) {
-                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v));
-                $newAttribs = array();
-                if ($keepFace && $attribArray['face']) {
-                    $newAttribs[] = 'face="' . $attribArray['face'] . '"';
-                }
-                if ($keepSize && $attribArray['size']) {
-                    $newAttribs[] = 'size="' . $attribArray['size'] . '"';
-                }
-                if ($keepColor && $attribArray['color']) {
-                    $newAttribs[] = 'color="' . $attribArray['color'] . '"';
-                }
-                $innerContent = $this->cleanFontTags($this->removeFirstAndLastTag($v), $keepFace, $keepSize, $keepColor);
-                if (!empty($newAttribs)) {
-                    $fontSplit[$k] = '<font ' . implode(' ', $newAttribs) . '>' . $innerContent . '</font>';
-                } else {
-                    $fontSplit[$k] = $innerContent;
-                }
-            }
-        }
-        return implode('', $fontSplit);
-    }
-
     /**
      * This is used to map certain tag-names into other names.
      *
@@ -925,44 +825,6 @@ class HtmlParser
         return $value;
     }
 
-    /**
-     * This converts htmlspecialchar()'ed tags (from $tagList) back to real tags. Eg. '&lt;strong&gt' would be converted back to '<strong>' if found in $tagList
-     *
-     * @param string $content HTML content
-     * @param string $tagList Tag list, separated by comma. Lowercase!
-     * @return string Processed HTML content
-     */
-    public function unprotectTags($content, $tagList = '')
-    {
-        $tagsArray = GeneralUtility::trimExplode(',', $tagList, true);
-        $contentParts = explode('&lt;', $content);
-        // bypass the first
-        $contentPartsSliced = array_slice($contentParts, 1, null, true);
-        foreach ($contentPartsSliced as $k => $tok) {
-            $firstChar = $tok[0];
-            if (trim($firstChar) !== '') {
-                $subparts = explode('&gt;', $tok, 2);
-                $tagEnd = strlen($subparts[0]);
-                if (strlen($tok) != $tagEnd) {
-                    $endTag = $firstChar == '/' ? 1 : 0;
-                    $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
-                    $tagParts = preg_split('/\\s+/s', $tagContent, 2);
-                    $tagName = strtolower($tagParts[0]);
-                    if ((string)$tagList === '' || in_array($tagName, $tagsArray)) {
-                        $contentParts[$k] = '<' . $subparts[0] . '>' . $subparts[1];
-                    } else {
-                        $contentParts[$k] = '&lt;' . $tok;
-                    }
-                } else {
-                    $contentParts[$k] = '&lt;' . $tok;
-                }
-            } else {
-                $contentParts[$k] = '&lt;' . $tok;
-            }
-        }
-        return implode('', $contentParts);
-    }
-
     /**
      * Internal function for case shifting of a string or whole array
      *
@@ -1017,39 +879,6 @@ class HtmlParser
         return implode(' ', $accu);
     }
 
-    /**
-     * Get tag attributes, the classic version (which had some limitations?)
-     *
-     * @param string $tag The tag
-     * @param bool $deHSC De-htmlspecialchar flag.
-     * @return array
-     * @access private
-     */
-    public function get_tag_attributes_classic($tag, $deHSC = 0)
-    {
-        $attr = $this->get_tag_attributes($tag, $deHSC);
-        return is_array($attr[0]) ? $attr[0] : array();
-    }
-
-    /**
-     * Indents input content with $number instances of $indentChar
-     *
-     * @param string $content Content string, multiple lines.
-     * @param int $number Number of indents
-     * @param string $indentChar Indent character/string
-     * @return string Indented code (typ. HTML)
-     */
-    public function indentLines($content, $number = 1, $indentChar = TAB)
-    {
-        $preTab = str_pad('', $number * strlen($indentChar), $indentChar);
-        $lines = explode(LF, str_replace(CR, '', $content));
-        foreach ($lines as &$line) {
-            $line = $preTab . $line;
-        }
-        unset($line);
-        return implode(LF, $lines);
-    }
-
     /**
      * Converts TSconfig into an array for the HTMLcleaner function.
      *
diff --git a/typo3/sysext/core/Classes/Html/RteHtmlParser.php b/typo3/sysext/core/Classes/Html/RteHtmlParser.php
index 2680257e6cea..f63c5fbf7ef3 100644
--- a/typo3/sysext/core/Classes/Html/RteHtmlParser.php
+++ b/typo3/sysext/core/Classes/Html/RteHtmlParser.php
@@ -330,7 +330,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
                 // Image found, do processing:
                 if ($k % 2) {
                     // Get attributes
-                    $attribArray = $this->get_tag_attributes_classic($v, 1);
+                    list($attribArray) = $this->get_tag_attributes($v, true);
                     // It's always an absolute URL coming from the RTE into the Database.
                     $absoluteUrl = trim($attribArray['src']);
                     // Make path absolute if it is relative and we have a site path which is not '/'
@@ -479,7 +479,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
                 // Image found
                 if ($k % 2) {
                     // Get the attributes of the img tag
-                    $attribArray = $this->get_tag_attributes_classic($v, 1);
+                    list($attribArray) = $this->get_tag_attributes($v, true);
                     $absoluteUrl = trim($attribArray['src']);
                     // Transform the src attribute into an absolute url, if it not already
                     if (strtolower(substr($absoluteUrl, 0, 4)) !== 'http') {
@@ -521,7 +521,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
                 foreach ($blockSplit as $k => $v) {
                     // Block
                     if ($k % 2) {
-                        $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v), 1);
+                        list($attribArray) = $this->get_tag_attributes($this->getFirstTag($v), true);
                         // If the url is local, remove url-prefix
                         if ($siteURL && substr($attribArray['href'], 0, strlen($siteURL)) == $siteURL) {
                             $attribArray['href'] = $this->relBackPath . substr($attribArray['href'], strlen($siteURL));
@@ -553,7 +553,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
         foreach ($blockSplit as $k => $v) {
             // If an A-tag was found:
             if ($k % 2) {
-                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v), 1);
+                list($attribArray) = $this->get_tag_attributes($this->getFirstTag($v), true);
                 $info = $this->urlInfoForLinkTags($attribArray['href']);
                 // Check options:
                 $attribArray_copy = $attribArray;
@@ -799,7 +799,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
         foreach ($blockSplit as $k => $v) {
             // Block
             if ($k % 2) {
-                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v));
+                list($attribArray) = $this->get_tag_attributes($this->getFirstTag($v));
                 if ($attribArray['specialtag']) {
                     $theTag = rawurldecode($attribArray['specialtag']);
                     $theTagName = $this->getFirstTagName($theTag);
@@ -913,7 +913,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
 
                     case 'h6':
                         if (!$css) {
-                            $attribArray = $this->get_tag_attributes_classic($tag);
+                            list($attribArray) = $this->get_tag_attributes($tag);
                             // Processing inner content here:
                             $innerContent = $this->HTMLcleaner_db($this->removeFirstAndLastTag($blockSplit[$k]));
                             $blockSplit[$k] = '<' . $tagName . ($attribArray['align'] ? ' align="' . htmlspecialchars($attribArray['align']) . '"' : '') . ($attribArray['class'] ? ' class="' . htmlspecialchars($attribArray['class']) . '"' : '') . '>' . $innerContent . '</' . $tagName . '>' . $lastBR;
@@ -959,7 +959,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
         foreach ($blockSplit as $k => $v) {
             // If an A-tag was found
             if ($k % 2) {
-                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v), 1);
+                list($attribArray) = $this->get_tag_attributes($this->getFirstTag($v), true);
                 // If "style" attribute is set and rteerror is not set!
                 if ($attribArray['style'] && !$attribArray['rteerror']) {
                     $attribArray_copy['style'] = $attribArray['style'];
@@ -1416,7 +1416,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
                 $rowSplit = $this->splitIntoBlock('tr', $v);
                 foreach ($rowSplit as $k2 => $v2) {
                     if ($k2 % 2) {
-                        $cellSplit = $this->getAllParts($this->splitIntoBlock('td', $v2), 1, 0);
+                        $cellSplit = $this->getAllParts($this->splitIntoBlock('td', $v2));
                         foreach ($cellSplit as $k3 => $v3) {
                             $tableSplit[$k] .= $v3 . $breakChar;
                         }
@@ -1559,7 +1559,7 @@ class RteHtmlParser extends \TYPO3\CMS\Core\Html\HtmlParser
         foreach ($blockSplit as $k => $v) {
             // Block
             if ($k % 2) {
-                $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v), 1);
+                list($attribArray) = $this->get_tag_attributes($this->getFirstTag($v), true);
                 // Checking if there is a scheme, and if not, prepend the current url.
                 // ONLY do this if href has content - the <a> tag COULD be an anchor and if so, it should be preserved...
                 if ($attribArray['href'] !== '') {
diff --git a/typo3/sysext/core/Documentation/Changelog/master/Breaking-72667-RTEUnusedInternalMethodsRemoved.rst b/typo3/sysext/core/Documentation/Changelog/master/Breaking-72667-RTEUnusedInternalMethodsRemoved.rst
new file mode 100644
index 000000000000..37b0bcc1dd22
--- /dev/null
+++ b/typo3/sysext/core/Documentation/Changelog/master/Breaking-72667-RTEUnusedInternalMethodsRemoved.rst
@@ -0,0 +1,30 @@
+=======================================================
+Breaking: #72667 - RTE: Unused internal methods removed
+=======================================================
+
+Description
+===========
+
+The HTML parsing functionality of the Rich Text Editor related to xhtml_cleaning was removed. The following methods, which are now obsolete, are
+removed as well:
+
+* ``HtmlParser->checkTagTypeCounts()``
+* ``HtmlParser->unprotectTags()``
+* ``HtmlParser->get_tag_attributes_classic()``
+* ``HtmlParser->cleanFontTags()``
+* ``HtmlParser->indentLines()``
+
+Additionally, the third parameter of the method ``HtmlParser->getAllParts()`` was removed as well, so that the method now always strips
+the enclosing tags from the tag parts in the result set.
+
+
+Impact
+======
+
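+Calling any of the removed methods will result in a fatal PHP error. A third argument passed to ``HtmlParser->getAllParts()`` is silently ignored, and the enclosing tags are always stripped from the returned tag parts.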
+
+
+Affected Installations
+======================
+
+Any installation which uses a third-party extension that modifies the HtmlParsing via PHP.
\ No newline at end of file
-- 
GitLab