@@ -22,10 +22,10 @@ import com.gravity.goose.text._
22
22
import com .gravity .goose .utils .Logging
23
23
import java .net .URL
24
24
import java .util .ArrayList
25
- import collection .JavaConversions ._
25
+ import scala .collection ._
26
+ import scala .collection .JavaConversions ._
26
27
import org .jsoup .nodes .{Attributes , Element , Document }
27
- import org .jsoup .select .{Selector , Elements }
28
- import collection .mutable .{Buffer , ListBuffer , HashSet }
28
+ import org .jsoup .select ._
29
29
30
30
/**
31
31
* Created by Jim Plush
@@ -53,6 +53,7 @@ trait ContentExtractor {
53
53
val SPACE_SPLITTER : StringSplitter = new StringSplitter (" " )
54
54
val NO_STRINGS = Set .empty[String ]
55
55
val A_REL_TAG_SELECTOR : String = " a[rel=tag], a[href*=/tag/]"
56
+ val TOP_NODE_TAGS = new TagsEvaluator (Set (" p" , " td" , " pre" ))
56
57
57
58
def getTitle (article : Article ): String = {
58
59
var title : String = string.empty
@@ -91,7 +92,7 @@ trait ContentExtractor {
91
92
}
92
93
catch {
93
94
case e : NullPointerException => {
94
- warn(e.toString);
95
+ warn(e.toString)
95
96
string.empty
96
97
}
97
98
}
@@ -169,7 +170,7 @@ trait ContentExtractor {
169
170
if (node.children.size == 0 ) return NO_STRINGS
170
171
val elements : Elements = Selector .select(A_REL_TAG_SELECTOR , node)
171
172
if (elements.size == 0 ) return NO_STRINGS
172
- val tags = new HashSet [String ]
173
+ val tags = mutable. HashSet [String ]()
173
174
174
175
for (el <- elements) {
175
176
var tag : String = el.text
@@ -191,12 +192,12 @@ trait ContentExtractor {
191
192
trace(logPrefix + " Starting to calculate TopNode" )
192
193
val doc = article.doc
193
194
var topNode : Element = null
194
- val nodesToCheck : ArrayList [ Element ] = getNodesToCheck( doc)
195
+ val nodesToCheck = Collector .collect( TOP_NODE_TAGS , doc)
195
196
var startingBoost : Double = 1.0
196
197
var cnt : Int = 0
197
198
var i : Int = 0
198
- val parentNodes = new HashSet [Element ]
199
- val nodesWithText : ArrayList [ Element ] = new ArrayList [Element ]
199
+ val parentNodes = mutable. HashSet [Element ]()
200
+ val nodesWithText = mutable. Buffer [Element ]()
200
201
for (node <- nodesToCheck) {
201
202
val nodeText : String = node.text
202
203
val wordStats : WordStats = StopWords .getStopWordCount(nodeText)
@@ -207,7 +208,7 @@ trait ContentExtractor {
207
208
}
208
209
val numberOfNodes : Int = nodesWithText.size
209
210
val negativeScoring : Int = 0
210
- val bottomNodesForNegativeScore : Double = numberOfNodes. asInstanceOf [ Float ] * 0.25
211
+ val bottomNodesForNegativeScore : Double = numberOfNodes * 0.25
211
212
212
213
trace(logPrefix + " About to inspect num of nodes with text: " + numberOfNodes)
213
214
@@ -216,14 +217,14 @@ trait ContentExtractor {
216
217
if (isOkToBoost(node)) {
217
218
if (cnt >= 0 ) {
218
219
boostScore = ((1.0 / startingBoost) * 50 ).asInstanceOf [Float ]
219
- startingBoost += 1 ;
220
+ startingBoost += 1
220
221
}
221
222
}
222
223
if (numberOfNodes > 15 ) {
223
224
if ((numberOfNodes - i) <= bottomNodesForNegativeScore) {
224
225
val booster : Float = bottomNodesForNegativeScore.asInstanceOf [Float ] - (numberOfNodes - i).asInstanceOf [Float ]
225
- boostScore = - Math .pow(booster, 2 .asInstanceOf [Float ]).asInstanceOf [Float ]
226
- val negscore : Float = Math .abs(boostScore) + negativeScoring
226
+ boostScore = - math .pow(booster, 2 .asInstanceOf [Float ]).asInstanceOf [Float ]
227
+ val negscore : Float = math .abs(boostScore) + negativeScoring
227
228
if (negscore > 40 ) {
228
229
boostScore = 5
229
230
}
@@ -246,8 +247,8 @@ trait ContentExtractor {
246
247
parentNodes.add(node.parent.parent)
247
248
}
248
249
249
- cnt += 1 ;
250
- i += 1 ;
250
+ cnt += 1
251
+ i += 1
251
252
}
252
253
var topNodeScore : Int = 0
253
254
for (e <- parentNodes) {
@@ -375,19 +376,6 @@ trait ContentExtractor {
375
376
}
376
377
}
377
378
378
- /**
379
- * returns a list of nodes we want to search on like paragraphs and tables
380
- *
381
- * @return
382
- */
383
- private def getNodesToCheck (doc : Document ): ArrayList [Element ] = {
384
- val nodesToCheck : ArrayList [Element ] = new ArrayList [Element ]
385
- nodesToCheck.addAll(doc.getElementsByTag(" p" ))
386
- nodesToCheck.addAll(doc.getElementsByTag(" pre" ))
387
- nodesToCheck.addAll(doc.getElementsByTag(" td" ))
388
- nodesToCheck
389
- }
390
-
391
379
/**
392
380
* adds a score to the gravityScore Attribute we put on divs
393
381
* we'll get the current score then add the score we're passing in to the current
@@ -438,7 +426,7 @@ trait ContentExtractor {
438
426
*/
439
427
def extractVideos (node : Element ): List [Element ] = {
440
428
val candidates : ArrayList [Element ] = new ArrayList [Element ]
441
- val goodMovies = new ListBuffer [Element ]
429
+ val goodMovies = mutable. Buffer [Element ]()
442
430
val youtubeStr = " youtube"
443
431
val vimdeoStr = " vimeo"
444
432
try {
@@ -497,7 +485,7 @@ trait ContentExtractor {
497
485
/**
498
486
* remove any divs that look like non-content, clusters of links, or paras with no gusto
499
487
*
500
- * @param node
488
+ * @param targetNode
501
489
* @return
502
490
*/
503
491
def postExtractionCleanup (targetNode : Element ): Element = {
@@ -540,7 +528,7 @@ trait ContentExtractor {
540
528
/**
541
529
* adds any siblings that may have a decent score to this node
542
530
*
543
- * @param node
531
+ * @param currentSibling
544
532
* @return
545
533
*/
546
534
def getSiblingContent (currentSibling : Element , baselineScoreForSiblingParagraphs : Int ): Option [String ] = {
@@ -575,7 +563,7 @@ trait ContentExtractor {
575
563
576
564
def walkSiblings [T ](node : Element )(work : (Element ) => T ): Seq [T ] = {
577
565
var currentSibling : Element = node.previousElementSibling
578
- val b = Buffer [T ]()
566
+ val b = mutable. Buffer [T ]()
579
567
580
568
while (currentSibling != null ) {
581
569
0 commit comments