From 03bd2256e3d35c556ac1bad50a4554eeb32f1098 Mon Sep 17 00:00:00 2001
From: Alexander Stehlik <alexander.stehlik@gmail.com>
Date: Mon, 2 Mar 2015 20:07:34 +0100
Subject: [PATCH] [FEATURE] Allow stripping of empty tags in HtmlParser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a stripEmptyTags method to the HtmlParser
It can be enabled by TypoScript or TSConfig:

HTMLparser.stripEmptyTags = 1
HTMLparser.stripEmptyTags.tags = h2, h3
HTMLparser.stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1

Resolves: #20555
Releases: master
Change-Id: I640486e9f32da6ac1eba05e3c38d15a0aba41055
Reviewed-on: http://review.typo3.org/16975
Reviewed-by: Frank Nägler <typo3@naegler.net>
Tested-by: Frank Nägler <typo3@naegler.net>
Reviewed-by: Anja Leichsenring <aleichsenring@ab-softlab.de>
Tested-by: Anja Leichsenring <aleichsenring@ab-softlab.de>
---
 typo3/sysext/core/Classes/Html/HtmlParser.php | 66 ++++++++++++++++++-
 .../Feature-20555-StripEmptyHtmlTags.rst      | 60 +++++++++++++++++
 .../core/Tests/Unit/Html/HtmlParserTest.php   | 47 +++++++++++++
 3 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst

diff --git a/typo3/sysext/core/Classes/Html/HtmlParser.php b/typo3/sysext/core/Classes/Html/HtmlParser.php
index 7bcc385a5b96..150a9b0572d4 100644
--- a/typo3/sysext/core/Classes/Html/HtmlParser.php
+++ b/typo3/sysext/core/Classes/Html/HtmlParser.php
@@ -982,7 +982,9 @@ class HtmlParser {
 				unset($newContent[$pKey]);
 			}
 		}
-		return implode('', $newContent);
+		$newContent = implode('', $newContent);
+		$newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig);
+		return $newContent;
 	}
 
 	/**
@@ -1410,6 +1412,12 @@ class HtmlParser {
 		if ($TSconfig['xhtml_cleaning']) {
 			$addConfig['xhtml'] = 1;
 		}
+		if (isset($TSconfig['stripEmptyTags'])) {
+			$addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
+			if (isset($TSconfig['stripEmptyTags.'])) {
+				$addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
+			}
+		}
 		return array(
 			$keepTags,
 			'' . $TSconfig['keepNonMatchedTags'],
@@ -1523,4 +1531,60 @@ class HtmlParser {
 		return $value;
 	}
 
+	/**
+	 * Strips empty tags from HTML, repeating the removal until no empty tag is left.
+	 *
+	 * @param string $content The content to be stripped of empty tags
+	 * @param string $tagList The comma separated list of tags to be stripped; names are taken literally and matched
+	 *                        as whole tag names. If empty (or holding only blank entries), all empty tags are stripped
+	 * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty.
+	 * @return string the stripped content; after a PCRE failure, the content as it was before the failing pass
+	 */
+	public function stripEmptyTags($content, $tagList = NULL, $treatNonBreakingSpaceAsEmpty = FALSE) {
+		$tags = array_filter(array_map('trim', explode(',', (string)$tagList)), 'strlen'); // a blank entry would match any tag
+		$quoteTag = function ($tag) { return preg_quote($tag, '/'); };
+		$tagRegEx = $tags ? implode('|', array_map($quoteTag, $tags)) : '[^ >]+'; // default: any tag name
+		$nbspRegex = $treatNonBreakingSpaceAsEmpty ? '|(&nbsp;)' : '';
+		// The lookaheads end the tag name at whitespace, "/" or ">", so "b" can not match <body> or <blockquote>.
+		$pattern = sprintf('/<(%s)(?=[\s\/>])[^>]*>( %s)*<\/\\1(?=[\s>])[^>]*>/i', $tagRegEx, $nbspRegex);
+		do {
+			$stripped = preg_replace($pattern, '', $content, -1, $count);
+			$content = $stripped === NULL ? $content : $stripped; // NULL means PCRE error: keep the last good content
+		} while ($stripped !== NULL && $count > 0);
+		return $content;
+	}
+
+	/**
+	 * Applies stripEmptyTags() to the given HTML, but only if the "stripEmptyTags" option is set and truthy.
+	 *
+	 * Settings taken from the "stripEmptyTags." sub array:
+	 * - "tags": list of tags handed to stripEmptyTags(); NULL is handed over if it is unset or an empty string
+	 * - "treatNonBreakingSpaceAsEmpty": handed over as TRUE if set and truthy, as FALSE otherwise
+	 *
+	 * @param string $value The HTML code to process
+	 * @param array $configuration Configuration that may contain "stripEmptyTags" and "stripEmptyTags."
+	 * @return string The processed HTML, or $value unchanged if stripping is not enabled
+	 */
+	protected function stripEmptyTagsIfConfigured($value, $configuration) {
+		// Nothing to do unless the feature has been switched on.
+		if (empty($configuration['stripEmptyTags'])) {
+			return $value;
+		}
+
+		// Fall back to an empty option set so the lookups below need no further existence checks.
+		$options = array();
+		if (isset($configuration['stripEmptyTags.'])) {
+			$options = $configuration['stripEmptyTags.'];
+		}
+
+		// An empty string counts as "no tag list given".
+		$tagList = NULL;
+		if (isset($options['tags']) && $options['tags'] !== '') {
+			$tagList = $options['tags'];
+		}
+
+		$nbspIsEmpty = !empty($options['treatNonBreakingSpaceAsEmpty']);
+
+		return $this->stripEmptyTags($value, $tagList, $nbspIsEmpty);
+	}
 }
diff --git a/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst b/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst
new file mode 100644
index 000000000000..5c6e15168a60
--- /dev/null
+++ b/typo3/sysext/core/Documentation/Changelog/master/Feature-20555-StripEmptyHtmlTags.rst
@@ -0,0 +1,60 @@
+=====================================================
+Feature: #20555 - Strip empty HTML tags in HtmlParser
+=====================================================
+
+Description
+===========
+
+A new functionality is introduced in the HtmlParser that allows the stripping of empty HTML tags.
+
+It can be used in the Frontend by using the :ref:`HTMLparser<t3tsref:htmlparser>` TypoScript
+configuration of :ref:`stdWrap<t3tsref:stdwrap-htmlparser>`:
+
+.. code-block:: typoscript
+
+	stdWrap {
+
+		// If this is set all empty tags are stripped, unless a list of tags is provided below.
+		HTMLparser.stripEmptyTags = 1
+
+		// This setting can be used to filter the tags that should be stripped if they are empty.
+		HTMLparser.stripEmptyTags.tags = h2, h3
+	}
+
+It is also possible to use it in the
+:ref:`HTMLparser_rte or HTMLparser_db<transformations-tsconfig-processing-htmlparser>`
+in Page TSconfig:
+
+.. code-block:: typoscript
+
+	// For rtehtmlarea we need to use the entry parser because otherwise the p tags will
+	// be converted to linebreaks during the RTE transformation.
+	RTE.default.proc.entryHTMLparser_db {
+		stripEmptyTags = 1
+		stripEmptyTags.tags = p
+
+		// Since rtehtmlarea adds non breaking spaces in empty <p> tags we need to
+		// tell the parser that &nbsp; should be treated as an empty string:
+		stripEmptyTags.treatNonBreakingSpaceAsEmpty = 1
+	}
+
+**Hint!** Please note that the HTMLparser will strip all unknown tags by default. If you **only** want
+to strip empty tags, you need to set ``keepNonMatchedTags`` to TRUE or configure the allowed tags:
+
+.. code-block:: typoscript
+
+	stdWrap {
+		HTMLparser.keepNonMatchedTags = 1
+		HTMLparser.stripEmptyTags = 1
+		HTMLparser.stripEmptyTags.tags = h2, h3
+	}
+
+
+Impact
+======
+
+If the configuration is not set, the HtmlParser behaves as before, so there is no
+impact on existing systems (unless they have already used the stripEmptyTags setting
+for some other purpose).
+
+
diff --git a/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php b/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php
index 70249b4f1ce7..49fba8fff315 100644
--- a/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php
+++ b/typo3/sysext/core/Tests/Unit/Html/HtmlParserTest.php
@@ -267,4 +267,51 @@ Value 2.2
 		$this->assertSame($expected, $result);
 	}
 
+	/**
+	 * @return array Data sets of: stripEmptyTags flag, tag list, treatNonBreakingSpaceAsEmpty flag, input HTML, expected HTML
+	 */
+	public function emptyTagsDataProvider() {
+		return array(
+			array(0, NULL, FALSE, '<h1></h1>', '<h1></h1>'),
+			array(1, NULL, FALSE, '<h1></h1>', ''),
+			array(1, NULL, FALSE, '<h1>hallo</h1>', '<h1>hallo</h1>'),
+			array(1, NULL, FALSE, '<h1 class="something"></h1>', ''),
+			array(1, NULL, FALSE, '<h1 class="something"></h1><h2></h2>', ''),
+			array(1, 'h2', FALSE, '<h1 class="something"></h1><h2></h2>', '<h1 class="something"></h1>'),
+			array(1, 'h2, h1', FALSE, '<h1 class="something"></h1><h2></h2>', ''),
+			array(1, NULL, FALSE, '<div><p></p></div>', ''),
+			array(1, NULL, FALSE, '<div><p>&nbsp;</p></div>', '<div><p>&nbsp;</p></div>'),
+			array(1, NULL, TRUE, '<div><p>&nbsp;&nbsp;</p></div>', ''),
+			array(1, NULL, TRUE, '<div>&nbsp;&nbsp;<p></p></div>', ''),
+			array(1, NULL, FALSE, '<div>Some content<p></p></div>', '<div>Some content</div>'),
+			array(1, NULL, TRUE, '<div>Some content<p></p></div>', '<div>Some content</div>'),
+			array(1, NULL, FALSE, '<div>Some content</div>', '<div>Some content</div>'),
+			array(1, NULL, TRUE, '<div>Some content</div>', '<div>Some content</div>'),
+			array(1, NULL, FALSE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'),
+			array(1, NULL, TRUE, '<a href="#skiplinks">Skiplinks </a><b></b>', '<a href="#skiplinks">Skiplinks </a>'),
+		);
+	}
+
+	/**
+	 * Checks that HTMLcleaner() strips empty tags according to the stripEmptyTags configuration.
+	 *
+	 * @test
+	 * @dataProvider emptyTagsDataProvider
+	 * @param bool $stripOn TRUE if stripping should be activated.
+	 * @param string $tagList Comma separated list of tags that should be stripped.
+	 * @param bool $treatNonBreakingSpaceAsEmpty If TRUE &nbsp; will be considered empty.
+	 * @param string $content The HTML code that should be modified.
+	 * @param string $expectedResult The expected HTML code result.
+	 */
+	public function stripEmptyTags($stripOn, $tagList, $treatNonBreakingSpaceAsEmpty, $content, $expectedResult) {
+		$tsConfig = array(
+			'keepNonMatchedTags' => 1,
+			'stripEmptyTags' => $stripOn,
+			'stripEmptyTags.' => array('tags' => $tagList, 'treatNonBreakingSpaceAsEmpty' => $treatNonBreakingSpaceAsEmpty),
+		);
+		$config = $this->subject->HTMLparserConfig($tsConfig);
+		$result = $this->subject->HTMLcleaner($content, $config[0], $config[1], $config[2], $config[3]);
+		// assertSame: the cleaner must return exactly the expected string, not merely something loosely equal to it.
+		$this->assertSame($expectedResult, $result);
+	}
 }
-- 
GitLab