Skip to content

Commit 4b81432

Browse files
committed
Add implementation of HITS algorithm.
Contributed by Jonathon Belotti (thundergolfer) via pull requests aimacode#236 and aimacode#237 (minor modifications and tidy up made before committing).
1 parent 53f05f9 commit 4b81432

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+87419
-1
lines changed
Lines changed: 397 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,397 @@
1+
package aima.core.nlp.ranking;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.Comparator;
6+
import java.util.HashSet;
7+
import java.util.Iterator;
8+
import java.util.List;
9+
import java.util.Map;
10+
import java.util.Set;
11+
12+
/**
13+
* Artificial Intelligence A Modern Approach (3rd Edition): page 871.<br>
14+
* <br>
15+
*
16+
* <pre>
17+
* function HITS(query) returns pages (with hub and authority numbers)
18+
* pages &larr; EXPAND-PAGES(RELEVANT-PAGES(query))
19+
* for each p in pages do
20+
* p.AUTHORITY &larr; 1
21+
* p.HUB &larr; 1
22+
* repeat until convergence do
23+
* for each p in pages do
24+
* p.AUTHORITY &larr; &Sigma;<sub>i</sub> INLINK<sub>i</sub>(p).HUB
25+
* p.HUB &larr; &Sigma;<sub>i</sub> OUTLINK<sub>i</sub>(p).AUTHORITY
26+
* NORMALIZE(pages)
27+
* return pages
28+
* </pre>
29+
*
30+
* Figure 22.1 The HITS algorithm for computing hubs and authorities with
31+
* respect to a query. RELEVANT-PAGES fetches the pages that match the query,
32+
* and EXPAND-PAGES add in every page that links to or is linked from one of the
33+
* relevant pages. NORMALIZE divides each page's score by the sum of the squares
34+
* of all pages' scores (separately for both the authority and hubs scores.<br>
35+
* <br>
36+
*
37+
* @author Jonathon Belotti (thundergolfer)
38+
*
39+
*/
40+
public class HITS {
41+
42+
final int RANK_HISTORY_DEPTH;
43+
final double DELTA_TOLERANCE; // somewhat arbitrary
44+
Map<String, Page> pTable;
45+
// DETECT CONVERGENCE VARS
46+
double[] prevAuthVals;
47+
double[] prevHubVals;
48+
double prevAveHubDelta = 0;
49+
double prevAveAuthDelta = 0;
50+
////////////////////////////
51+
52+
// TODO: Improve the convergence detection functionality
53+
public HITS(Map<String, Page> pTable, int rank_hist_depth, double delta_tolerance) {
54+
this.pTable = pTable;
55+
this.RANK_HISTORY_DEPTH = rank_hist_depth;
56+
this.DELTA_TOLERANCE = delta_tolerance;
57+
58+
}
59+
60+
public HITS(Map<String, Page> pTable) {
61+
this(pTable, 3, 0.05);
62+
}
63+
64+
// function HITS(query) returns pages with hub and authority number
65+
public List<Page> hits(String query) {
66+
// pages <- EXPAND-PAGES(RELEVANT-PAGES(query))
67+
List<Page> pages = expandPages(relevantPages(query));
68+
// for each p in pages
69+
for (Page p : pages) {
70+
// p.AUTHORITY <- 1
71+
p.authority = 1;
72+
// p.HUB <- 1
73+
p.hub = 1;
74+
}
75+
// repeat until convergence do
76+
while (!convergence(pages)) {
77+
// for each p in pages do
78+
for (Page p : pages) {
79+
// p.AUTHORITY <- &Sigma<sub>i</sub> INLINK<sub>i</sub>(p).HUB
80+
p.authority = SumInlinkHubScore(p);
81+
// p.HUB <- &Sigma;<sub>i</sub> OUTLINK<sub>i</sub>(p).AUTHORITY
82+
p.hub = SumOutlinkAuthorityScore(p);
83+
}
84+
// NORMALIZE(pages)
85+
normalize(pages);
86+
}
87+
return pages;
88+
89+
}
90+
91+
/**
92+
* Fetches and returns all pages that match the query
93+
*
94+
* @param query
95+
* @return
96+
* @throws UnsupportedEncodingException
97+
*/
98+
public List<Page> relevantPages(String query) {
99+
List<Page> relevantPages = new ArrayList<Page>();
100+
for (Page p : pTable.values()) {
101+
if (matches(query, p.getContent())) {
102+
relevantPages.add(p);
103+
}
104+
}
105+
return relevantPages;
106+
}
107+
108+
/**
109+
* Simple check if query string is a substring of a block of text.
110+
*
111+
* @param query
112+
* @param text
113+
* @return
114+
*/
115+
public boolean matches(String query, String text) {
116+
return text.contains(query);
117+
}
118+
119+
/**
120+
* Adds pages that are linked to or is linked from one of the pages passed
121+
* as argument.
122+
*
123+
* @param pages
124+
* @return
125+
*/
126+
public List<Page> expandPages(List<Page> pages) {
127+
128+
List<Page> expandedPages = new ArrayList<Page>();
129+
Set<String> inAndOutLinks = new HashSet<String>();
130+
// Go through all pages an build a list of String links
131+
for (int i = 0; i < pages.size(); i++) {
132+
Page currP = pages.get(i);
133+
if (!expandedPages.contains(currP)) {
134+
expandedPages.add(currP);
135+
}
136+
List<String> currInlinks = currP.getInlinks();
137+
for (int j = 0; j < currInlinks.size(); j++) {
138+
inAndOutLinks.add(currInlinks.get(i));
139+
}
140+
List<String> currOutlinks = currP.getOutlinks();
141+
for (int j = 0; j < currOutlinks.size(); j++) {
142+
inAndOutLinks.add(currOutlinks.get(i));
143+
}
144+
}
145+
// go through String links and add their respective pages to our return
146+
// list
147+
Iterator<String> it = inAndOutLinks.iterator();
148+
while (it.hasNext()) {
149+
String addr = it.next();
150+
Page p = pTable.get(addr);
151+
if (p != null && !expandedPages.contains(p)) { // a valid link may
152+
// not have an
153+
// associated page
154+
// in our table
155+
expandedPages.add(p);
156+
}
157+
}
158+
return expandedPages;
159+
} // end expandPages();
160+
161+
/**
162+
* Divides each page's score by the sum of the squares of all pages' scores
163+
* (separately for both the authority and hubs scores
164+
*
165+
* @param pages
166+
* @return
167+
*/
168+
public List<Page> normalize(List<Page> pages) {
169+
double hubTotal = 0;
170+
double authTotal = 0;
171+
for (Page p : pages) {
172+
// Sum Hub scores over all pages
173+
hubTotal += Math.pow(p.hub, 2);
174+
// Sum Authority scores over all pages
175+
authTotal += Math.pow(p.authority, 2);
176+
}
177+
// divide all hub and authority scores for all pages
178+
for (Page p : pages) {
179+
if (hubTotal > 0) {
180+
p.hub /= hubTotal;
181+
} else {
182+
p.hub = 0;
183+
}
184+
if (authTotal > 0) {
185+
p.authority /= authTotal;
186+
} else {
187+
p.authority = 0;
188+
}
189+
}
190+
return pages; // with normalised scores now
191+
} // end normalize()
192+
193+
/**
194+
* Calculate the Authority score of a page by summing the Hub scores of that
195+
* page's inlinks.
196+
*
197+
* @param page
198+
* @param pagesTable
199+
* @return
200+
*/
201+
public double SumInlinkHubScore(Page page) {
202+
List<String> inLinks = page.getInlinks();
203+
double hubScore = 0;
204+
for (int i = 0; i < inLinks.size(); i++) {
205+
Page inLink = pTable.get(inLinks.get(i));
206+
if (inLink != null) {
207+
hubScore += inLink.hub;
208+
} else {
209+
// page is linked to by a Page not in our table
210+
continue;
211+
}
212+
}
213+
return hubScore;
214+
} // end SumInlinkHubScore()
215+
216+
/**
217+
* Calculate the Hub score of a page by summing the Authority scores of that
218+
* page's outlinks.
219+
*
220+
* @param page
221+
* @param pagesTable
222+
* @return
223+
*/
224+
public double SumOutlinkAuthorityScore(Page page) {
225+
List<String> outLinks = page.getOutlinks();
226+
double authScore = 0;
227+
for (int i = 0; i < outLinks.size(); i++) {
228+
Page outLink = pTable.get(outLinks.get(i));
229+
if (outLink != null) {
230+
authScore += outLink.authority;
231+
}
232+
}
233+
return authScore;
234+
}
235+
236+
/**
237+
* pg. 872 : "If we then normalize the scores and repeat k times the process
238+
* will converge"
239+
*
240+
* @return
241+
*/
242+
private boolean convergence(List<Page> pages) {
243+
double aveHubDelta = 100;
244+
double aveAuthDelta = 100;
245+
if (pages == null) {
246+
return true;
247+
}
248+
249+
// get current values from pages
250+
double[] currHubVals = new double[pages.size()];
251+
double[] currAuthVals = new double[pages.size()];
252+
for (int i = 0; i < pages.size(); i++) {
253+
Page currPage = pages.get(i);
254+
currHubVals[i] = currPage.hub;
255+
currHubVals[i] = currPage.authority;
256+
}
257+
if (prevHubVals == null || prevAuthVals == null) {
258+
prevHubVals = currHubVals;
259+
prevAuthVals = currAuthVals;
260+
return false;
261+
}
262+
// compare to past values
263+
aveHubDelta = getAveDelta(currHubVals, prevHubVals);
264+
aveAuthDelta = getAveDelta(currAuthVals, prevAuthVals);
265+
if (aveHubDelta + aveAuthDelta < DELTA_TOLERANCE || (Math.abs(prevAveHubDelta - aveHubDelta) < 0.01
266+
&& Math.abs(prevAveAuthDelta - aveAuthDelta) < 0.01)) {
267+
return true;
268+
} else {
269+
prevHubVals = currHubVals;
270+
prevAuthVals = currAuthVals;
271+
prevAveHubDelta = aveHubDelta;
272+
prevAveAuthDelta = aveAuthDelta;
273+
return false;
274+
}
275+
}
276+
277+
/**
278+
* Determine how much values in a list are changing. Useful for detecting
279+
* convergence of data values.
280+
*
281+
* @param r
282+
* @return
283+
*/
284+
public double getAveDelta(double[] curr, double[] prev) {
285+
double aveDelta = 0;
286+
assert (curr.length == prev.length);
287+
for (int j = 0; j < curr.length; j++) {
288+
aveDelta += Math.abs(curr[j] - prev[j]);
289+
}
290+
aveDelta /= curr.length;
291+
return aveDelta;
292+
}
293+
294+
/**
295+
* Return from a set of Pages the Page with the greatest Hub value
296+
*
297+
* @param pageTable
298+
* @return
299+
*/
300+
public Page getMaxHub(List<Page> result) {
301+
Page maxHub = result.get(0);
302+
for (int i = 1; i < result.size(); i++) {
303+
Page currPage = result.get(i);
304+
if (currPage.hub > maxHub.hub) {
305+
maxHub = currPage;
306+
}
307+
}
308+
return maxHub;
309+
}
310+
311+
/**
312+
* Return from a set of Pages the Page with the greatest Authority value
313+
*
314+
* @param pageTable
315+
* @return
316+
*/
317+
public Page getMaxAuthority(List<Page> result) {
318+
Page maxAuthority = result.get(0);
319+
for (int i = 1; i < result.size(); i++) {
320+
Page currPage = result.get(i);
321+
if (currPage.authority > maxAuthority.authority) {
322+
maxAuthority = currPage;
323+
}
324+
}
325+
return maxAuthority;
326+
}
327+
328+
/**
329+
* Organize the list of pages according to their descending Hub scores.
330+
*
331+
* @param result
332+
*/
333+
public void sortHub(List<Page> result) {
334+
Collections.sort(result, new Comparator<Page>() {
335+
public int compare(Page p1, Page p2) {
336+
// Sorts by 'TimeStarted' property
337+
return p1.hub < p2.hub ? -1 : p1.hub > p2.hub ? 1 : secondaryOrderSort(p1, p2);
338+
}
339+
340+
// If 'TimeStarted' property is equal sorts by 'TimeEnded' property
341+
public int secondaryOrderSort(Page p1, Page p2) {
342+
return p1.getLocation().compareToIgnoreCase(p2.getLocation()) < 1 ? -1
343+
: p1.getLocation().compareToIgnoreCase(p2.getLocation()) > 1 ? 1 : 0;
344+
}
345+
});
346+
}
347+
348+
/**
349+
* Organize the list of pages according to their descending Authority Scores
350+
*
351+
* @param result
352+
*/
353+
public void sortAuthority(List<Page> result) {
354+
Collections.sort(result, new Comparator<Page>() {
355+
public int compare(Page p1, Page p2) {
356+
// Sorts by 'TimeStarted' property
357+
return p1.hub < p2.hub ? -1 : p1.hub > p2.hub ? 1 : secondaryOrderSort(p1, p2);
358+
}
359+
360+
// If 'TimeStarted' property is equal sorts by 'TimeEnded' property
361+
public int secondaryOrderSort(Page p1, Page p2) {
362+
return p1.getLocation().compareToIgnoreCase(p2.getLocation()) < 1 ? -1
363+
: p1.getLocation().compareToIgnoreCase(p2.getLocation()) > 1 ? 1 : 0;
364+
}
365+
});
366+
}
367+
368+
/**
369+
* Simple console display of HITS Algorithm results.
370+
*
371+
* @param result
372+
*/
373+
public void report(List<Page> result) {
374+
375+
// Print Pages out ranked by highest authority
376+
sortAuthority(result);
377+
System.out.println("AUTHORITY RANKINGS : ");
378+
for (int i = 0; i < result.size(); i++) {
379+
Page currP = result.get(i);
380+
System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.authority);
381+
}
382+
System.out.println();
383+
// Print Pages out ranked by highest hub
384+
sortHub(result);
385+
System.out.println("HUB RANKINGS : ");
386+
for (int i = 0; i < result.size(); i++) {
387+
Page currP = result.get(i);
388+
System.out.printf(currP.getLocation() + ": " + "%.5f" + '\n', currP.hub);
389+
}
390+
System.out.println();
391+
// Print Max Authority
392+
System.out.println("Page with highest Authority score: " + getMaxAuthority(result).getLocation());
393+
// Print Max Authority
394+
System.out.println("Page with highest Hub score: " + getMaxAuthority(result).getLocation());
395+
}
396+
397+
}

0 commit comments

Comments
 (0)