From e1de3ea2c0661bc35d8c6a3344111d95e0b0d267 Mon Sep 17 00:00:00 2001 From: Martijn van der Ven Date: Mon, 26 Mar 2018 11:35:32 +0200 Subject: [PATCH 1/2] Introduce new way of getting plain text values from HTML elements --- Mf2/Parser.php | 134 +++++++++++------------------------- tests/Mf2/PlainTextTest.php | 76 ++++++++++++++++++++ 2 files changed, 117 insertions(+), 93 deletions(-) create mode 100644 tests/Mf2/PlainTextTest.php diff --git a/Mf2/Parser.php b/Mf2/Parser.php index 7048794..9bddf5d 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -438,96 +438,44 @@ private function resolveChildUrls(DOMElement $el) { } } - public function textContent(DOMElement $el) { - $excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset'); - - if (isset($el->tagName) and in_array(strtolower($el->tagName), $excludeTags)) { - return ''; - } - - $this->resolveChildUrls($el); - - $clonedEl = $el->cloneNode(true); - - foreach ($this->xpath->query('.//img', $clonedEl) as $imgEl) { - $newNode = $this->doc->createTextNode($imgEl->getAttribute($imgEl->hasAttribute('alt') ? 'alt' : 'src')); - $imgEl->parentNode->replaceChild($newNode, $imgEl); - } - - foreach ($excludeTags as $tagName) { - foreach ($this->xpath->query(".//{$tagName}", $clonedEl) as $elToRemove) { - $elToRemove->parentNode->removeChild($elToRemove); - } - } - - return $this->innerText($clonedEl); + /** + * The following two methods implements plain text parsing. + * @see https://wiki.zegnat.net/media/textparsing.html + **/ + public function textContent(DOMElement $element) + { + return preg_replace( + '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/', + '', + $this->elementToString($element) + ); } - - /** - * This method attempts to return a better 'innerText' representation than DOMNode::textContent - * - * @param DOMElement|DOMText $el - * @param bool $implied when parsing for implied name for h-*, rules may be slightly different - * @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js - */ - public function innerText($el, $implied=false) { - $out = ''; - - $blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table', - 'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div', - 'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup', 'hr', - 'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'testarea', - 'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details'); - - $excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset'); - - // PHP DOMDocument doesn’t correctly handle whitespace around elements it doesn’t recognise. - $unsupportedTags = array('data'); - - if (isset($el->tagName)) { - if (in_array(strtolower($el->tagName), $excludeTags)) { - return $out; - } else if ($el->tagName == 'img') { - if ($el->hasAttribute('alt')) { - return $el->getAttribute('alt'); - } else if (!$implied && $el->hasAttribute('src')) { - return $this->resolveUrl($el->getAttribute('src')); - } - } else if ($el->tagName == 'area' and $el->hasAttribute('alt')) { - return $el->getAttribute('alt'); - } else if ($el->tagName == 'abbr' and $el->hasAttribute('title')) { - return $el->getAttribute('title'); - } - } - - // if node is a text node get its text - if (isset($el->nodeType) && $el->nodeType === 3) { - $out .= $el->textContent; - } - - // get the text of the child nodes - if ($el->childNodes && $el->childNodes->length > 0) { - for ($j = 0; $j < $el->childNodes->length; $j++) { - $text = $this->innerText($el->childNodes->item($j), $implied); - if (!is_null($text)) { - $out .= $text; - } - } - } - - if (isset($el->tagName)) { - // if its a block level tag add an additional space at the end - if (in_array(strtolower($el->tagName), $blockLevelTags)) { - $out .= ' '; - } elseif ($implied and in_array(strtolower($el->tagName), $unsupportedTags)) { - $out .= ' '; - } else if (strtolower($el->tagName) == 'br') { - // else if its a br, replace with newline - $out .= "\n"; - } - } - - return ($out === '') ? NULL : $out; + private function elementToString(DOMElement $input) + { + $output = ''; + foreach ($input->childNodes as $child) { + if ($child->nodeType === XML_TEXT_NODE) { + $output .= str_replace(array("\t", "\n", "\r") , ' ', $child->textContent); + } else if ($child->nodeType === XML_ELEMENT_NODE) { + $tagName = strtoupper($child->tagName); + if (in_array($tagName, array('SCRIPT', 'STYLE'))) { + continue; + } else if ($tagName === 'IMG') { + if ($child->hasAttribute('alt')) { + $output .= ' ' . trim($child->getAttribute('alt'), "\t\n\f\r ") . ' '; + } else if ($child->hasAttribute('src')) { + $output .= ' ' . $this->resolveUrl(trim($child->getAttribute('src'), "\t\n\f\r ")) . ' '; + } + } else if ($tagName === 'BR') { + $output .= "\n"; + } else if ($tagName === 'P') { + $output .= "\n" . $this->elementToString($child); + } else { + $output .= $this->elementToString($child); + } + } + } + return $output; } /** @@ -643,7 +591,7 @@ public function parseP(\DOMElement $p) { } elseif (in_array($p->tagName, array('data', 'input')) and $p->hasAttribute('value')) { $pValue = $p->getAttribute('value'); } else { - $pValue = unicodeTrim($this->innerText($p)); + $pValue = $this->textContent($p); } return $pValue; @@ -680,7 +628,7 @@ public function parseU(\DOMElement $u) { } elseif (in_array($u->tagName, array('data', 'input')) and $u->hasAttribute('value')) { return $u->getAttribute('value'); } else { - return unicodeTrim($this->textContent($u)); + return $this->textContent($u); } } @@ -911,7 +859,7 @@ public function parseE(\DOMElement $e) { $return = array( 'html' => unicodeTrim($html), - 'value' => unicodeTrim($this->innerText($e)), + 'value' => $this->textContent($e), ); if($this->lang) { @@ -1118,7 +1066,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf = } } - throw new Exception($this->innerText($e, true)); + throw new Exception($this->textContent($e, true)); } catch (Exception $exc) { $return['name'][] = unicodeTrim($exc->getMessage()); } diff --git a/tests/Mf2/PlainTextTest.php b/tests/Mf2/PlainTextTest.php new file mode 100644 index 0000000..df9722f --- /dev/null +++ b/tests/Mf2/PlainTextTest.php @@ -0,0 +1,76 @@ +parse(); + $entryProperties = $output['items'][0]['properties']; + $this->assertEquals($pName, $entryProperties['name'][0]); + $this->assertEquals($eValue, $entryProperties['content'][0]['value']); + $this->assertEquals($eHtml, $entryProperties['content'][0]['html']); + } + + public function aaronpkExpectations() { + return array( + 1 => array( + "
\n

Hello World

\n
", + "Hello World", + "Hello World", + "

Hello World

" + ), + 2 => array( + "
\n

Hello
World

\n
", + "Hello\nWorld", + "Hello\nWorld", + "

Hello
World

" + ), + 3 => array( + "
\n

Hello
\nWorld

\n
", + "Hello\nWorld", + "Hello\nWorld", + "

Hello
\nWorld

" + ), + 4 => array( + "
\n
\n

Hello World

\n
\n
", + "Hello World", + "Hello World", + "

Hello World

" + ), + 5 => array( + "
\n
Hello\nWorld
\n
", + "Hello World", + "Hello World", + "Hello\nWorld" + ), + 6 => array( + "
\n

Hello

World

\n
", + "Hello\nWorld", + "Hello\nWorld", + "

Hello

World

" + ), + 7 => array( + "
\n
Hello
\n World
\n
", + "Hello\nWorld", + "Hello\nWorld", + "Hello
\n World", + ), + 8 => array( + "
\n

Hello
World
\n
", + "Hello\nWorld", + "Hello\nWorld", + "
Hello
World
" + ), + 9 => array( + "
\n
\n

One

\n

Two

\n

Three

\n
\n
", + "One\nTwo\nThree", + "One\nTwo\nThree", + "

One

\n

Two

\n

Three

" + ) + ); + } +} From 7dbe03ddc97fa28bb2135c5dd0b5794394fa1195 Mon Sep 17 00:00:00 2001 From: Martijn van der Ven Date: Mon, 26 Mar 2018 11:52:59 +0200 Subject: [PATCH 2/2] Fix three tests that failed with the new algorithm --- tests/Mf2/ParseImpliedTest.php | 4 ++-- tests/Mf2/ParserTest.php | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/Mf2/ParseImpliedTest.php b/tests/Mf2/ParseImpliedTest.php index d58152b..26badbf 100644 --- a/tests/Mf2/ParseImpliedTest.php +++ b/tests/Mf2/ParseImpliedTest.php @@ -193,8 +193,8 @@ public function testParsesImpliedNameConsistentWithPName() { $inner = "Name \nand more"; $test = ' ' . $inner .' ' . $inner . ' '; $result = Mf2\parse($test); - $this->assertEquals($inner, $result['items'][0]['properties']['name'][0]); - $this->assertEquals($inner, $result['items'][1]['properties']['name'][0]); + $this->assertEquals('Name and more', $result['items'][0]['properties']['name'][0]); + $this->assertEquals('Name and more', $result['items'][1]['properties']['name'][0]); } diff --git a/tests/Mf2/ParserTest.php b/tests/Mf2/ParserTest.php index e5767ab..0026766 100644 --- a/tests/Mf2/ParserTest.php +++ b/tests/Mf2/ParserTest.php @@ -93,7 +93,7 @@ public function testParseEResolvesRelativeLinks() { $output = $parser->parse(); $this->assertEquals('Blah blah thing. ', $output['items'][0]['properties']['content'][0]['html']); - $this->assertEquals('Blah blah thing. http://example.com/img', $output['items'][0]['properties']['content'][0]['value']); + $this->assertEquals('Blah blah thing. http://example.com/img', $output['items'][0]['properties']['content'][0]['value']); } public function testParseEWithBR() { @@ -156,7 +156,7 @@ public function testHtmlEncodesNonEProperties() { public function testHtmlEncodesImpliedProperties() { - $input = '<name>'; + $input = '<name>'; $parser = new Parser($input); $output = $parser->parse();