This repository was archived by the owner on Oct 30, 2018. It is now read-only.

Commit 928e898

Issue #56 - Content not extracted from "article" tag: Adding 'article' as a possible root element for article content
1 parent: 739e1b4

12 files changed: +93 additions, -892 deletions
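For context, here is a minimal usage sketch of what this change enables: extracting text from a page whose content lives under an <article> root rather than a <div> or <span>. This is illustrative only and not part of the commit; it assumes Goose's extractContent(url, rawHtml) overload and the enableImageFetching switch on Configuration.

    import com.gravity.goose.{Configuration, Goose}

    object ArticleTagExtraction extends App {
      // page whose only content root is an <article> element (the Issue #56 case)
      val html =
        "<html><body><article>" +
          "<p>First paragraph of the story, long enough to be scored as real content.</p>" +
          "<p>Second paragraph with more meaningful text for the extractor to keep.</p>" +
        "</article></body></html>"

      val config = new Configuration
      config.enableImageFetching = false // assumed flag; image extraction is not the point here

      // assumed overload: pass pre-fetched HTML so no network fetch is needed
      val article = new Goose(config).extractContent("http://example.com/story", html)
      println(article.cleanedArticleText)
    }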

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
   <groupId>com.gravity</groupId>
   <artifactId>goose</artifactId>
-  <version>2.1.17</version>
+  <version>2.1.18</version>
   <packaging>jar</packaging>
   <name>goose</name>
   <url>http://maven.apache.org</url>

src/main/scala/com/gravity/goose/cleaners/DocumentCleaner.scala

Lines changed: 43 additions & 22 deletions
@@ -18,13 +18,13 @@
 package com.gravity.goose.cleaners
 
 import com.gravity.goose.utils.Logging
-import org.jsoup.select.Elements
 import java.util.regex.{Matcher, Pattern}
 import org.jsoup.nodes.{TextNode, Node, Element, Document}
 import com.gravity.goose.text.ReplaceSequence
 import scala.collection.JavaConversions._
 import com.gravity.goose.Article
 import collection.mutable.ListBuffer
+import org.jsoup.select.{TagsEvaluator, Collector, Elements}
 
 trait DocumentCleaner {
 
@@ -55,8 +55,9 @@ trait DocumentCleaner {
     docToClean = removeNodesViaRegEx(docToClean, facebookPattern)
     docToClean = removeNodesViaRegEx(docToClean, twitterPattern)
     docToClean = cleanUpSpanTagsInParagraphs(docToClean)
-    docToClean = convertDivsToParagraphs(docToClean, "div")
-    docToClean = convertDivsToParagraphs(docToClean, "span")
+    docToClean = convertWantedTagsToParagraphs(docToClean, articleRootTags)
+//    docToClean = convertDivsToParagraphs(docToClean, "div")
+//    docToClean = convertDivsToParagraphs(docToClean, "span")
 
 //    docToClean = convertDivsToParagraphs(docToClean, "span")
     docToClean
@@ -213,6 +214,29 @@ trait DocumentCleaner {
     div.replaceWith(newNode)
   }
 
+  private def convertWantedTagsToParagraphs(doc: Document, wantedTags: TagsEvaluator): Document = {
+
+    val selected = Collector.collect(wantedTags, doc)
+
+    for (elem <- selected) {
+      if (Collector.collect(blockElemementTags, elem).isEmpty) {
+        replaceElementsWithPara(doc, elem)
+      } else {
+        val replacements = getReplacementNodes(doc, elem)
+        elem.children().foreach(_.remove())
+        replacements.foreach(n => {
+          try {
+            elem.appendChild(n)
+          } catch {
+            case ex: Exception => info(ex, "Failed to append cleaned child!")
+          }
+        })
+      }
+    }
+
+    doc
+  }
+
 
   private def convertDivsToParagraphs(doc: Document, domType: String): Document = {
     trace("Starting to replace bad divs...")
@@ -302,7 +326,7 @@ trait DocumentCleaner {
 
     val kidTextNode = kid.asInstanceOf[TextNode]
     val kidText = kidTextNode.attr("text")
-    val replaceText = tabsAndNewLinesReplcesments.replaceAll(kidText)
+    val replaceText = tabsAndNewLinesReplacements.replaceAll(kidText)
     if (replaceText.trim().length > 1) {
 
       var prevSibNode = kidTextNode.previousSibling()
@@ -352,40 +376,37 @@ trait DocumentCleaner {
 
 
 object DocumentCleaner extends Logging {
+  var sb: StringBuilder = new StringBuilder
+
+  // create negative elements
+  sb.append("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor")
+  sb.append("|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|remember-tool-tip")
+  sb.append("|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text")
+
   /**
   * this regex is used to remove undesirable nodes from our doc
   * indicate that something maybe isn't content but more of a comment, footer or some other undesirable node
   */
-  var regExRemoveNodes: String = null
-  var queryNaughtyIDs: String = null
-  var queryNaughtyClasses: String = null
-  var queryNaughtyNames: String = null
-  var tabsAndNewLinesReplcesments: ReplaceSequence = null
+  val regExRemoveNodes = sb.toString()
+  val queryNaughtyIDs = "[id~=(" + regExRemoveNodes + ")]"
+  val queryNaughtyClasses = "[class~=(" + regExRemoveNodes + ")]"
+  val queryNaughtyNames = "[name~=(" + regExRemoveNodes + ")]"
+  val tabsAndNewLinesReplacements = ReplaceSequence.create("\n", "\n\n").append("\t").append("^\\s+$")
   /**
   * regex to detect if there are block level elements inside of a div element
   */
   val divToPElementsPattern: Pattern = Pattern.compile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)")
 
+  val blockElemementTags = TagsEvaluator("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul")
+  val articleRootTags = TagsEvaluator("div", "span", "article")
+
   val captionPattern: Pattern = Pattern.compile("^caption$")
   val googlePattern: Pattern = Pattern.compile(" google ")
   val entriesPattern: Pattern = Pattern.compile("^[^entry-]more.*$")
   val facebookPattern: Pattern = Pattern.compile("[^-]facebook")
   val twitterPattern: Pattern = Pattern.compile("[^-]twitter")
 
   val logPrefix = "Cleaner: "
-  var sb: StringBuilder = new StringBuilder
-
-  // create negative elements
-  sb.append("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor")
-  sb.append("|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|remember-tool-tip")
-  sb.append("|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text")
-
-  regExRemoveNodes = sb.toString()
-  queryNaughtyIDs = "[id~=(" + regExRemoveNodes + ")]"
-  queryNaughtyClasses = "[class~=(" + regExRemoveNodes + ")]"
-  queryNaughtyNames = "[name~=(" + regExRemoveNodes + ")]"
-
-  tabsAndNewLinesReplcesments = ReplaceSequence.create("\n", "\n\n").append("\t").append("^\\s+$")
 
 }
 
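The heart of the change is convertWantedTagsToParagraphs above: articleRootTags now includes "article", and each matched element is either replaced with a <p> outright or has its children rebuilt, depending on whether it contains block-level descendants. The sketch below is my own illustration of that decision, not code from the commit; it only assumes jsoup's Collector.collect plus the TagsEvaluator added in this commit.

    import org.jsoup.Jsoup
    import org.jsoup.select.{Collector, TagsEvaluator}
    import scala.collection.JavaConversions._

    object CleanerDecisionSketch extends App {
      // the same tag sets the cleaner defines
      val blockTags = TagsEvaluator("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul")
      val rootTags  = TagsEvaluator("div", "span", "article")

      val doc = Jsoup.parse(
        "<article>plain text only</article>" +
        "<article><p>has a block-level child</p></article>")

      for (elem <- Collector.collect(rootTags, doc)) {
        // the cleaner replaces the element with a <p> when this collect is empty,
        // otherwise it rebuilds the element's children node by node
        val hasBlockChildren = !Collector.collect(blockTags, elem).isEmpty
        println("<" + elem.tagName() + "> block children: " + hasBlockChildren)
      }
    }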

src/main/scala/com/gravity/goose/extractors/ContentExtractor.scala

Lines changed: 19 additions & 31 deletions
@@ -22,10 +22,10 @@ import com.gravity.goose.text._
 import com.gravity.goose.utils.Logging
 import java.net.URL
 import java.util.ArrayList
-import collection.JavaConversions._
+import scala.collection._
+import scala.collection.JavaConversions._
 import org.jsoup.nodes.{Attributes, Element, Document}
-import org.jsoup.select.{Selector, Elements}
-import collection.mutable.{Buffer, ListBuffer, HashSet}
+import org.jsoup.select._
 
 /**
 * Created by Jim Plush
@@ -53,6 +53,7 @@ trait ContentExtractor {
   val SPACE_SPLITTER: StringSplitter = new StringSplitter(" ")
   val NO_STRINGS = Set.empty[String]
   val A_REL_TAG_SELECTOR: String = "a[rel=tag], a[href*=/tag/]"
+  val TOP_NODE_TAGS = new TagsEvaluator(Set("p", "td", "pre"))
 
   def getTitle(article: Article): String = {
     var title: String = string.empty
@@ -91,7 +92,7 @@
     }
     catch {
       case e: NullPointerException => {
-        warn(e.toString);
+        warn(e.toString)
         string.empty
       }
     }
@@ -169,7 +170,7 @@
     if (node.children.size == 0) return NO_STRINGS
     val elements: Elements = Selector.select(A_REL_TAG_SELECTOR, node)
     if (elements.size == 0) return NO_STRINGS
-    val tags = new HashSet[String]
+    val tags = mutable.HashSet[String]()
 
     for (el <- elements) {
       var tag: String = el.text
@@ -191,12 +192,12 @@
     trace(logPrefix + "Starting to calculate TopNode")
     val doc = article.doc
     var topNode: Element = null
-    val nodesToCheck: ArrayList[Element] = getNodesToCheck(doc)
+    val nodesToCheck = Collector.collect(TOP_NODE_TAGS, doc)
     var startingBoost: Double = 1.0
     var cnt: Int = 0
     var i: Int = 0
-    val parentNodes = new HashSet[Element]
-    val nodesWithText: ArrayList[Element] = new ArrayList[Element]
+    val parentNodes = mutable.HashSet[Element]()
+    val nodesWithText = mutable.Buffer[Element]()
     for (node <- nodesToCheck) {
       val nodeText: String = node.text
       val wordStats: WordStats = StopWords.getStopWordCount(nodeText)
@@ -207,7 +208,7 @@
     }
     val numberOfNodes: Int = nodesWithText.size
     val negativeScoring: Int = 0
-    val bottomNodesForNegativeScore: Double = numberOfNodes.asInstanceOf[Float] * 0.25
+    val bottomNodesForNegativeScore: Double = numberOfNodes * 0.25
 
     trace(logPrefix + "About to inspect num of nodes with text: " + numberOfNodes)
 
@@ -216,14 +217,14 @@
       if (isOkToBoost(node)) {
         if (cnt >= 0) {
           boostScore = ((1.0 / startingBoost) * 50).asInstanceOf[Float]
-          startingBoost += 1;
+          startingBoost += 1
         }
       }
       if (numberOfNodes > 15) {
         if ((numberOfNodes - i) <= bottomNodesForNegativeScore) {
           val booster: Float = bottomNodesForNegativeScore.asInstanceOf[Float] - (numberOfNodes - i).asInstanceOf[Float]
-          boostScore = -Math.pow(booster, 2.asInstanceOf[Float]).asInstanceOf[Float]
-          val negscore: Float = Math.abs(boostScore) + negativeScoring
+          boostScore = -math.pow(booster, 2.asInstanceOf[Float]).asInstanceOf[Float]
+          val negscore: Float = math.abs(boostScore) + negativeScoring
           if (negscore > 40) {
             boostScore = 5
           }
@@ -246,8 +247,8 @@
         parentNodes.add(node.parent.parent)
       }
 
-      cnt += 1;
-      i += 1;
+      cnt += 1
+      i += 1
     }
     var topNodeScore: Int = 0
     for (e <- parentNodes) {
@@ -375,19 +376,6 @@
     }
   }
 
-  /**
-  * returns a list of nodes we want to search on like paragraphs and tables
-  *
-  * @return
-  */
-  private def getNodesToCheck(doc: Document): ArrayList[Element] = {
-    val nodesToCheck: ArrayList[Element] = new ArrayList[Element]
-    nodesToCheck.addAll(doc.getElementsByTag("p"))
-    nodesToCheck.addAll(doc.getElementsByTag("pre"))
-    nodesToCheck.addAll(doc.getElementsByTag("td"))
-    nodesToCheck
-  }
-
   /**
   * adds a score to the gravityScore Attribute we put on divs
   * we'll get the current score then add the score we're passing in to the current
@@ -438,7 +426,7 @@
   */
   def extractVideos(node: Element): List[Element] = {
     val candidates: ArrayList[Element] = new ArrayList[Element]
-    val goodMovies = new ListBuffer[Element]
+    val goodMovies = mutable.Buffer[Element]()
     val youtubeStr = "youtube"
     val vimdeoStr = "vimeo"
     try {
@@ -497,7 +485,7 @@
   /**
   * remove any divs that looks like non-content, clusters of links, or paras with no gusto
   *
-  * @param node
+  * @param targetNode
   * @return
   */
   def postExtractionCleanup(targetNode: Element): Element = {
@@ -540,7 +528,7 @@
   /**
   * adds any siblings that may have a decent score to this node
   *
-  * @param node
+  * @param currentSibling
   * @return
   */
   def getSiblingContent(currentSibling: Element, baselineScoreForSiblingParagraphs: Int): Option[String] = {
@@ -575,7 +563,7 @@
 
   def walkSiblings[T](node: Element)(work: (Element) => T): Seq[T] = {
     var currentSibling: Element = node.previousElementSibling
-    val b = Buffer[T]()
+    val b = mutable.Buffer[T]()
 
     while (currentSibling != null) {
 
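In ContentExtractor the hand-rolled getNodesToCheck (three getElementsByTag calls) is gone; candidate nodes now come from a single Collector.collect pass over TOP_NODE_TAGS. The snippet below is an illustrative sketch of what that call returns, not code from the commit; it assumes only jsoup's Collector and the TagsEvaluator added here.

    import org.jsoup.Jsoup
    import org.jsoup.select.{Collector, TagsEvaluator}
    import scala.collection.JavaConversions._

    object TopNodeTagsSketch extends App {
      val TOP_NODE_TAGS = new TagsEvaluator(Set("p", "td", "pre"))

      val doc = Jsoup.parse(
        "<body><p>intro</p><table><tr><td>cell</td></tr></table><pre>code</pre></body>")

      // one traversal; matching elements come back in document order
      val nodesToCheck = Collector.collect(TOP_NODE_TAGS, doc)
      nodesToCheck.foreach(n => println(n.tagName())) // p, td, pre
    }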

src/main/scala/org/jsoup/select/TagsEvaluator.scala

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+package org.jsoup.select
+
+import org.jsoup.nodes.Element
+
+/**
+ * Created by IntelliJ IDEA.
+ * Author: Robbie Coleman
+ * Date: 6/12/12
+ * Time: 12:04 PM
+ */
+
+class TagsEvaluator(tags: scala.collection.Set[String]) extends Evaluator {
+  def matches(root: Element, element: Element) = tags.contains(element.tagName())
+}
+
+object TagsEvaluator {
+  def apply(tags: String*): TagsEvaluator = new TagsEvaluator(tags.toSet)
+}
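TagsEvaluator is deliberately tiny: it plugs into jsoup's Evaluator/Collector machinery and matches purely on tag name, with the companion apply turning a varargs list into a Set. A small usage sketch of the predicate itself (mine, not from the commit):

    import org.jsoup.Jsoup
    import org.jsoup.select.TagsEvaluator

    object TagsEvaluatorSketch extends App {
      val eval = TagsEvaluator("p", "td", "pre")

      val doc = Jsoup.parse("<p>x</p><div>y</div>")
      val p   = doc.select("p").first()
      val div = doc.select("div").first()

      // matches is the predicate Collector.collect applies to every element it visits
      println(eval.matches(doc, p))   // true
      println(eval.matches(doc, div)) // false
    }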

src/test/java/com/jimplush/goose/ConfigurationTestIT.java

Lines changed: 0 additions & 38 deletions
This file was deleted.
