From 03bd2256e3d35c556ac1bad50a4554eeb32f1098 Mon Sep 17 00:00:00 2001 From: Alexander Stehlik <alexander.stehlik@gmail.com> Date: Mon, 2 Mar 2015 20:07:34 +0100 Subject: [PATCH] [FEATURE] Allow stripping of empty tags in HtmlParser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a stripEmptyTags method to the HtmlParser It can be enabled by TypoScript or TSConfig: HTMLparser.stripEmptyTags = 1 HTMLparser.stripEmptyTags.tags = h2, h3 HTMLparser.stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1 Resolves: #20555 Releases: master Change-Id: I640486e9f32da6ac1eba05e3c38d15a0aba41055 Reviewed-on: http://review.typo3.org/16975 Reviewed-by: Frank Nägler <typo3@naegler.net> Tested-by: Frank Nägler <typo3@naegler.net> Reviewed-by: Anja Leichsenring <aleichsenring@ab-softlab.de> Tested-by: Anja Leichsenring <aleichsenring@ab-softlab.de> --- typo3/sysext/core/Classes/Html/HtmlParser.php | 66 ++++++++++++++++++- .../Feature-20555-StripEmptyHtmlTags.rst | 60 +++++++++++++++++ .../core/Tests/Unit/Html/HtmlParserTest.php | 47 +++++++++++++ 3 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst diff --git a/typo3/sysext/core/Classes/Html/HtmlParser.php b/typo3/sysext/core/Classes/Html/HtmlParser.php index 7bcc385a5b96..150a9b0572d4 100644 --- a/typo3/sysext/core/Classes/Html/HtmlParser.php +++ b/typo3/sysext/core/Classes/Html/HtmlParser.php @@ -982,7 +982,9 @@ class HtmlParser { unset($newContent[$pKey]); } } - return implode('', $newContent); + $newContent = implode('', $newContent); + $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig); + return $newContent; } /** @@ -1410,6 +1412,12 @@ class HtmlParser { if ($TSconfig['xhtml_cleaning']) { $addConfig['xhtml'] = 1; } + if (isset($TSconfig['stripEmptyTags'])) { + $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags']; + if (isset($TSconfig['stripEmptyTags.'])) { + 
$addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.']; + } + } return array( $keepTags, '' . $TSconfig['keepNonMatchedTags'], @@ -1523,4 +1531,60 @@ class HtmlParser { return $value; } + /** + * Strips empty tags from HTML. + * + * @param string $content The content to be stripped of empty tags + * @param string $tagList The comma separated list of tags to be stripped. + * If empty, all empty tags will be stripped + * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty. + * @return string the stripped content + */ + public function stripEmptyTags($content, $tagList = NULL, $treatNonBreakingSpaceAsEmpty = FALSE) { + $tagRegEx = '[^ >]+'; // all characters until you reach a > or space; + if ($tagList) { + $tags = preg_split('/,/', $tagList); + $tagRegEx = preg_replace('/ */', '', join('|', $tags)); + } + $count = 1; + $nbspRegex = $treatNonBreakingSpaceAsEmpty ? '|( )' : ''; + while ($count != 0) { + $content = preg_replace(sprintf('/<(%s)[^>]*>( %s)*<\/\\1[^>]*>/i', $tagRegEx, $nbspRegex), '', $content, -1, $count); + } + return $content; + } + + /** + * Strips the configured empty tags from the HTML code. 
+ * + * @param string $value + * @param array $configuration + * @return string + */ + protected function stripEmptyTagsIfConfigured($value, $configuration) { + + if (isset($configuration['stripEmptyTags']) && $configuration['stripEmptyTags']) { + + $tags = NULL; + if ( + isset($configuration['stripEmptyTags.']['tags']) + && $configuration['stripEmptyTags.']['tags'] !== '' + ) { + $tags = $configuration['stripEmptyTags.']['tags']; + } + + $treatNonBreakingSpaceAsEmpty = FALSE; + if ( + isset($configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty']) + && $configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty'] + ) { + $treatNonBreakingSpaceAsEmpty = (bool)$configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty']; + } + + + $value = $this->stripEmptyTags($value, $tags, $treatNonBreakingSpaceAsEmpty); + } + + return $value; + } } diff --git a/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst b/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst new file mode 100644 index 000000000000..5c6e15168a60 --- /dev/null +++ b/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst @@ -0,0 +1,60 @@ +===================================================== +Feature: #20555 - Strip empty HTML tags in HtmlParser +===================================================== + +Description +=========== + +A new functionality is introduced in the HtmlParser that allows the stripping of empty HTML tags. + +It can be used in the Frontend by using the :ref:`HTMLparser<t3tsref:htmlparser>` TypoScript +configuration of :ref:`stdWrap<t3tsref:stdwrap-htmlparser>`: + +.. code-block:: typoscript + + stdWrap { + + // If this is set all empty tags are stripped, unless a list of tags is provided below. + HTMLparser.stripEmptyTags = 1 + + // This setting can be used to filter the tags that should be stripped if they are empty. 
+ HTMLparser.stripEmptyTags.tags = h2, h3 + } + +It is also possible to use it in the +:ref:`HTMLparser_rte or HTMLparser_db<transformations-tsconfig-processing-htmlparser>` +in Page TSconfig: + +.. code-block:: typoscript + + // For rtehtmlarea we need to use the entry parser because otherwise the p tags will + // be converted to linebreaks during the RTE transformation. + RTE.default.proc.entryHTMLparser_db { + stripEmptyTags = 1 + stripEmptyTags.tags = p + + // Since rtehtmlarea adds non-breaking spaces in empty <p> tags we need to + // tell the parser that &nbsp; should be treated as an empty string: + stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1 + } + +**Hint!** Please note that the HTMLparser will strip all unknown tags by default. If you **only** want +to strip empty tags, you need to set ``keepNonMatchedTags`` to TRUE or configure the allowed tags: + +.. code-block:: typoscript + + stdWrap { + HTMLparser.keepNonMatchedTags = 1 + HTMLparser.stripEmptyTags = 1 + HTMLparser.stripEmptyTags.tags = h2, h3 + } + + +Impact +====== + +If the configuration is not set, the HtmlParser behaves as before so there is no +impact on existing systems (unless they already have used the stripEmptyTags setting +for whatever reason). 
+ + diff --git a/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php b/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php index 70249b4f1ce7..49fba8fff315 100644 --- a/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php +++ b/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php @@ -267,4 +267,51 @@ Value 2.2 $this->assertSame($expected, $result); } + /** + * @return array + */ + public function emptyTagsDataProvider() { + return array( + array(0 , NULL, FALSE, '<h1></h1>', '<h1></h1>'), + array(1 , NULL, FALSE, '<h1></h1>', ''), + array(1 , NULL, FALSE, '<h1>hallo</h1>', '<h1>hallo</h1>'), + array(1 , NULL, FALSE, '<h1 class="something"></h1>', ''), + array(1 , NULL, FALSE, '<h1 class="something"></h1><h2></h2>', ''), + array(1 , 'h2', FALSE, '<h1 class="something"></h1><h2></h2>', '<h1 class="something"></h1>'), + array(1 , 'h2, h1', FALSE, '<h1 class="something"></h1><h2></h2>', ''), + array(1 , NULL, FALSE, '<div><p></p></div>', ''), + array(1 , NULL, FALSE, '<div><p> </p></div>', '<div><p> </p></div>'), + array(1 , NULL, TRUE, '<div><p> </p></div>', ''), + array(1 , NULL, TRUE, '<div> <p></p></div>', ''), + array(1 , NULL, FALSE, '<div>Some content<p></p></div>', '<div>Some content</div>'), + array(1 , NULL, TRUE, '<div>Some content<p></p></div>', '<div>Some content</div>'), + array(1 , NULL, FALSE, '<div>Some content</div>', '<div>Some content</div>'), + array(1 , NULL, TRUE, '<div>Some content</div>', '<div>Some content</div>'), + array(1 , NULL, FALSE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'), + array(1 , NULL, TRUE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'), + ); + } + + /** + * @test + * @dataProvider emptyTagsDataProvider + * @param bool $stripOn TRUE if stripping should be activated. + * @param string $tagList Comma separated list of tags that should be stripped. + * @param bool $treatNonBreakingSpaceAsEmpty If TRUE &nbsp; will be considered empty. 
+ * @param string $content The HTML code that should be modified. + * @param string $expectedResult The expected HTML code result. + */ + public function stripEmptyTags($stripOn, $tagList, $treatNonBreakingSpaceAsEmpty, $content, $expectedResult) { + $tsConfig = array( + 'keepNonMatchedTags' => 1, + 'stripEmptyTags' => $stripOn, + 'stripEmptyTags.' => array( + 'tags' => $tagList, + 'treatNonBreakingSpaceAsEmpty' => $treatNonBreakingSpaceAsEmpty + ), + ); + $config = $this->subject->HTMLparserConfig($tsConfig); + $result = $this->subject->HTMLcleaner($content, $config[0], $config[1], $config[2], $config[3]); + $this->assertEquals($expectedResult, $result); + } } -- GitLab