@@ -443,96 +443,44 @@ private function resolveChildUrls(DOMElement $el) {
443
443
}
444
444
}
445
445
446
- public function textContent (DOMElement $ el ) {
447
- $ excludeTags = array ('noframe ' , 'noscript ' , 'script ' , 'style ' , 'frames ' , 'frameset ' );
448
-
449
- if (isset ($ el ->tagName ) and in_array (strtolower ($ el ->tagName ), $ excludeTags )) {
450
- return '' ;
451
- }
452
-
453
- $ this ->resolveChildUrls ($ el );
454
-
455
- $ clonedEl = $ el ->cloneNode (true );
456
-
457
- foreach ($ this ->xpath ->query ('.//img ' , $ clonedEl ) as $ imgEl ) {
458
- $ newNode = $ this ->doc ->createTextNode ($ imgEl ->getAttribute ($ imgEl ->hasAttribute ('alt ' ) ? 'alt ' : 'src ' ));
459
- $ imgEl ->parentNode ->replaceChild ($ newNode , $ imgEl );
460
- }
461
-
462
- foreach ($ excludeTags as $ tagName ) {
463
- foreach ($ this ->xpath ->query (".// {$ tagName }" , $ clonedEl ) as $ elToRemove ) {
464
- $ elToRemove ->parentNode ->removeChild ($ elToRemove );
465
- }
466
- }
467
-
468
- return $ this ->innerText ($ clonedEl );
446
/**
 * Returns the plain-text value of an element.
 *
 * The element is first serialised with elementToString() and the result is
 * then whitespace-normalised: leading and trailing whitespace is removed,
 * spaces adjacent to a newline are removed, and every run of spaces is
 * collapsed to a single space.
 *
 * @see https://wiki.zegnat.net/media/textparsing.html
 *
 * @param DOMElement $element the element whose text should be extracted
 * @param bool $implied true when the text is wanted for an implied name;
 *                      the flag is forwarded to elementToString()
 * @return string the normalised plain text
 */
public function textContent(DOMElement $element, $implied = false)
{
    $text = $this->elementToString($element, $implied);

    // Alternatives, in order: leading whitespace, spaces before a newline,
    // spaces after a newline, all but the last space of a run, trailing whitespace.
    $normalised = preg_replace(
        '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/',
        '',
        $text
    );

    // preg_replace() yields null on a PCRE failure; keep returning a string
    // by falling back to a plain trim of the serialised text.
    if ($normalised === null) {
        return trim($text, "\t\n\f\r ");
    }

    return $normalised;
}
470
-
471
- /**
472
- * This method attempts to return a better 'innerText' representation than DOMNode::textContent
473
- *
474
- * @param DOMElement|DOMText $el
475
- * @param bool $implied when parsing for implied name for h-*, rules may be slightly different
476
- * @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
477
- */
478
- public function innerText ($ el , $ implied =false ) {
479
- $ out = '' ;
480
-
481
- $ blockLevelTags = array ('h1 ' , 'h2 ' , 'h3 ' , 'h4 ' , 'h5 ' , 'h6 ' , 'p ' , 'hr ' , 'pre ' , 'table ' ,
482
- 'address ' , 'article ' , 'aside ' , 'blockquote ' , 'caption ' , 'col ' , 'colgroup ' , 'dd ' , 'div ' ,
483
- 'dt ' , 'dir ' , 'fieldset ' , 'figcaption ' , 'figure ' , 'footer ' , 'form ' , 'header ' , 'hgroup ' , 'hr ' ,
484
- 'li ' , 'map ' , 'menu ' , 'nav ' , 'optgroup ' , 'option ' , 'section ' , 'tbody ' , 'testarea ' ,
485
- 'tfoot ' , 'th ' , 'thead ' , 'tr ' , 'td ' , 'ul ' , 'ol ' , 'dl ' , 'details ' );
486
-
487
- $ excludeTags = array ('noframe ' , 'noscript ' , 'script ' , 'style ' , 'frames ' , 'frameset ' );
488
-
489
- // PHP DOMDocument doesn’t correctly handle whitespace around elements it doesn’t recognise.
490
- $ unsupportedTags = array ('data ' );
491
-
492
- if (isset ($ el ->tagName )) {
493
- if (in_array (strtolower ($ el ->tagName ), $ excludeTags )) {
494
- return $ out ;
495
- } else if ($ el ->tagName == 'img ' ) {
496
- if ($ el ->hasAttribute ('alt ' )) {
497
- return $ el ->getAttribute ('alt ' );
498
- } else if (!$ implied && $ el ->hasAttribute ('src ' )) {
499
- return $ this ->resolveUrl ($ el ->getAttribute ('src ' ));
500
- }
501
- } else if ($ el ->tagName == 'area ' and $ el ->hasAttribute ('alt ' )) {
502
- return $ el ->getAttribute ('alt ' );
503
- } else if ($ el ->tagName == 'abbr ' and $ el ->hasAttribute ('title ' )) {
504
- return $ el ->getAttribute ('title ' );
505
- }
506
- }
507
-
508
- // if node is a text node get its text
509
- if (isset ($ el ->nodeType ) && $ el ->nodeType === 3 ) {
510
- $ out .= $ el ->textContent ;
511
- }
512
-
513
- // get the text of the child nodes
514
- if ($ el ->childNodes && $ el ->childNodes ->length > 0 ) {
515
- for ($ j = 0 ; $ j < $ el ->childNodes ->length ; $ j ++) {
516
- $ text = $ this ->innerText ($ el ->childNodes ->item ($ j ), $ implied );
517
- if (!is_null ($ text )) {
518
- $ out .= $ text ;
519
- }
520
- }
521
- }
522
-
523
- if (isset ($ el ->tagName )) {
524
- // if its a block level tag add an additional space at the end
525
- if (in_array (strtolower ($ el ->tagName ), $ blockLevelTags )) {
526
- $ out .= ' ' ;
527
- } elseif ($ implied and in_array (strtolower ($ el ->tagName ), $ unsupportedTags )) {
528
- $ out .= ' ' ;
529
- } else if (strtolower ($ el ->tagName ) == 'br ' ) {
530
- // else if its a br, replace with newline
531
- $ out .= "\n" ;
532
- }
533
- }
534
-
535
- return ($ out === '' ) ? NULL : $ out ;
458
/**
 * Serialises the child nodes of an element into a single string.
 *
 * Handling per child node:
 *  - text node: its text, with tabs, newlines and carriage returns replaced by spaces
 *  - SCRIPT / STYLE element: skipped together with its contents
 *  - IMG element: the trimmed alt attribute surrounded by spaces; without an alt
 *    attribute, and only when $implied is false, the resolved src surrounded by spaces
 *  - BR element: a newline
 *  - P element: a newline followed by the serialised contents of the element
 *  - any other element: the serialised contents of the element
 * Nodes of any other type are ignored.
 *
 * @see https://wiki.zegnat.net/media/textparsing.html
 *
 * @param DOMElement $input the element whose children are serialised
 * @param bool $implied true when serialising for an implied name, in which
 *                      case images without alt text contribute nothing
 * @return string the serialised, not yet whitespace-normalised, text
 */
private function elementToString(DOMElement $input, $implied = false)
{
    $output = '';
    foreach ($input->childNodes as $child) {
        if ($child->nodeType === XML_TEXT_NODE) {
            $output .= str_replace(array("\t", "\n", "\r"), ' ', $child->textContent);
        } else if ($child->nodeType === XML_ELEMENT_NODE) {
            $tagName = strtoupper($child->tagName);
            if (in_array($tagName, array('SCRIPT', 'STYLE'), true)) {
                continue;
            } else if ($tagName === 'IMG') {
                if ($child->hasAttribute('alt')) {
                    $output .= ' ' . trim($child->getAttribute('alt'), "\t\n\f\r ") . ' ';
                } else if (!$implied && $child->hasAttribute('src')) {
                    // An implied name must not degrade to an image URL, so the
                    // src fallback is used for explicitly parsed properties only.
                    $output .= ' ' . $this->resolveUrl(trim($child->getAttribute('src'), "\t\n\f\r ")) . ' ';
                }
            } else if ($tagName === 'BR') {
                $output .= "\n";
            } else if ($tagName === 'P') {
                $output .= "\n" . $this->elementToString($child, $implied);
            } else {
                $output .= $this->elementToString($child, $implied);
            }
        }
    }
    return $output;
}
537
485
538
486
/**
@@ -648,7 +596,7 @@ public function parseP(\DOMElement $p) {
648
596
} elseif (in_array ($ p ->tagName , array ('data ' , 'input ' )) and $ p ->hasAttribute ('value ' )) {
649
597
$ pValue = $ p ->getAttribute ('value ' );
650
598
} else {
651
- $ pValue = unicodeTrim ( $ this ->innerText ($ p) );
599
+ $ pValue = $ this ->textContent ($ p );
652
600
}
653
601
654
602
return $ pValue ;
@@ -685,7 +633,7 @@ public function parseU(\DOMElement $u) {
685
633
} elseif (in_array ($ u ->tagName , array ('data ' , 'input ' )) and $ u ->hasAttribute ('value ' )) {
686
634
return $ u ->getAttribute ('value ' );
687
635
} else {
688
- return unicodeTrim ( $ this ->textContent ($ u) );
636
+ return $ this ->textContent ($ u );
689
637
}
690
638
}
691
639
@@ -916,7 +864,7 @@ public function parseE(\DOMElement $e) {
916
864
917
865
$ return = array (
918
866
'html ' => unicodeTrim ($ html ),
919
- 'value ' => unicodeTrim ( $ this ->innerText ($ e) ),
867
+ 'value ' => $ this ->textContent ($ e ),
920
868
);
921
869
922
870
if ($ this ->lang ) {
@@ -1123,7 +1071,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
1123
1071
}
1124
1072
}
1125
1073
1126
- throw new Exception ($ this ->innerText ($ e , true ));
1074
+ throw new Exception ($ this ->textContent ($ e , true ));
1127
1075
} catch (Exception $ exc ) {
1128
1076
$ return ['name ' ][] = unicodeTrim ($ exc ->getMessage ());
1129
1077
}
0 commit comments