Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 7ab2f2a

Browse files
author
Steve Yegge
committed
team/codeintel - implemented symbol-pagerank POC
This CL implements symbol-pagerank, mostly as a talking point for how search results might get ranking boosts from it. The intuition is that if you treat symbols as web pages, and calls and references to symbols as hyperlinks, then PageRank would provide a good estimate of the "importance" of each symbol. We do not store much symbol scope information, so I have squashed all the symbols to be file-scoped. Source files inherit the incoming links (and corresponding page rank) for all symbols they contain. It's implemented as a command-line tool in the lib/codeintel tree. It relies heavily on the code that handles lsif uploads, and constructs the symbol reference graph from the in-memory lsif representation before it's chunked out. I also provide an option for including the "implements" graph as file links--that is to say, edges between implementable things (such as interfaces, or abstract classes) and their implementation sites. You can think of these as being just another kind of symbol reference. Depending on the indexer, edges may represent implementations, method overrides, or other specific symbol relationships. This information was available in the index, so I threw it in. However, although it was clearly effective in raising prominence for implementable types, it still suffers from various issues and is not yet ready to be a default. The implementation overall suffers from various shortcomings and should not be considered production-ready. The PageRank Go implementation is MIT-licensed on GitHub and needs a security and legal review, and also a review for accuracy, performance, etc. The output of the tool is a list of all filenames in the lsif index, paired with their page rank. It writes text lines to stdout for now. You can run it on any lsif index to see how the files rank; note that the algorithm is nondeterministic because it involves random jumps, and each time it runs, the list will be different. 
However, page ranks do not usually vary widely from run to run.
1 parent bc662c4 commit 7ab2f2a

File tree

7 files changed

+196
-7
lines changed

7 files changed

+196
-7
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ require (
3434
github.com/crewjam/saml v0.4.6
3535
github.com/davecgh/go-spew v1.1.1
3636
github.com/daviddengcn/go-colortext v1.0.0
37+
github.com/dcadenas/pagerank v1.0.0
3738
github.com/derision-test/glock v1.0.0
3839
github.com/derision-test/go-mockgen v1.3.4
3940
github.com/dghubble/gologin v2.2.0+incompatible

lib/codeintel/lsif/conversion/correlate.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,20 @@ import (
1919
//
2020
// If getChildren == nil, no pruning of irrelevant data is performed.
2121
func Correlate(ctx context.Context, r io.Reader, root string, getChildren pathexistence.GetChildrenFunc) (*precise.GroupedBundleDataChans, error) {
22+
state, err := CorrelateInMemory(ctx, r, root, getChildren)
23+
24+
// Convert data to the format we send to the writer
25+
groupedBundleData, err := groupBundleData(ctx, state)
26+
if err != nil {
27+
return nil, err
28+
}
29+
30+
return groupedBundleData, nil
31+
}
32+
33+
// CorrelateInMemory reads, correlates, and canonicalizes an LSIF index from a stream
34+
// and converts it into a State representation.
35+
func CorrelateInMemory(ctx context.Context, r io.Reader, root string, getChildren pathexistence.GetChildrenFunc) (*State, error) {
2236
// Read raw upload stream and return a correlation state
2337
state, err := correlateFromReader(ctx, r, root)
2438
if err != nil {
@@ -35,13 +49,7 @@ func Correlate(ctx context.Context, r io.Reader, root string, getChildren pathex
3549
}
3650
}
3751

38-
// Convert data to the format we send to the writer
39-
groupedBundleData, err := groupBundleData(ctx, state)
40-
if err != nil {
41-
return nil, err
42-
}
43-
44-
return groupedBundleData, nil
52+
return state, nil
4553
}
4654

4755
func CorrelateLocalGitRelative(ctx context.Context, dumpPath, relativeRoot string) (*precise.GroupedBundleDataChans, error) {
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package main
2+
3+
import (
4+
"flag"
5+
)
6+
7+
// Command-line flags for the pagerank tool.
var (
	// indexFilePath is the path of the LSIF index to read and rank.
	indexFilePath = flag.String("index-file", "dump.lsif", "The LSIF index to rank.")
	// outputFilePath is parsed but not consumed by main, which prints the
	// rankings to stdout; the help text says so rather than promising a file.
	outputFilePath = flag.String("output-file", "", "Output file (not yet implemented; rankings are currently written to stdout).")
	// addImplEdges controls whether implementation edges are added to the graph.
	addImplEdges = flag.Bool("include-impls", false, "True to include implementation edges")
)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package main
2+
3+
import (
4+
"flag"
5+
"fmt"
6+
"os"
7+
"sort"
8+
)
9+
10+
// This tool reads the specified lsif file and prints pagerank-sorted
11+
// file paths in space-delimited output to stdout: filepath rank
12+
func main() {
13+
flag.Parse()
14+
15+
indexFile, err := os.OpenFile(*indexFilePath, os.O_RDONLY, 0)
16+
if err != nil {
17+
die("Unable to open index file %v: %v", *indexFilePath, err)
18+
}
19+
defer indexFile.Close()
20+
21+
rankings, err := PageRankLsif(indexFile)
22+
if err != nil {
23+
die("Error computing pagerank: %v", err)
24+
}
25+
26+
// For now, write file names and their rankings (sorted) to stdout.
27+
sorter := *rankings
28+
sort.Slice(sorter, func(i, j int) bool {
29+
return sorter[i].rank > sorter[j].rank
30+
})
31+
32+
for _, doc := range sorter {
33+
fmt.Printf("%v %v\n", doc.filePath, doc.rank)
34+
}
35+
}
36+
37+
// die writes a formatted error message to stderr and terminates the process
// with exit status 1. msg is a fmt-style format string and args are its
// operands.
func die(msg string, args ...any) {
	// Expand args with "..." so each operand is matched to its verb; passing
	// the slice itself formats it as a single value and leaves the remaining
	// verbs reported as missing. The message is also terminated with a newline
	// so the shell prompt does not end up on the same line.
	fmt.Fprintf(os.Stderr, "\nerror: "+msg+"\n", args...)
	os.Exit(1)
}
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"io"
6+
7+
"github.com/dcadenas/pagerank"
8+
"github.com/sourcegraph/sourcegraph/lib/codeintel/lsif/conversion"
9+
"github.com/sourcegraph/sourcegraph/lib/codeintel/lsif/conversion/datastructures"
10+
)
11+
12+
// Tuning parameters handed to the PageRank computation.
const (
13+
// FOLLOW_LINK_CHANCE is the chance of following a link rather than jumping randomly.
14+
FOLLOW_LINK_CHANCE = 0.85
15+
// TOLERANCE: a smaller number here yields a more exact result but requires more CPU cycles.
16+
TOLERANCE = 0.00001
17+
)
18+
19+
// Document pairs a document from the LSIF index with its computed pagerank.
type Document struct {
20+
docId int // document ID; also the node label used in the pagerank graph
21+
filePath string // value stored for this document ID in the index's DocumentData
22+
rank float64 // rank reported for this document by the pagerank computation
23+
}
24+
25+
// Read LSIF data from the given reader and return the files from the
26+
// index along with their pagerank. The results are not sorted.
27+
func PageRankLsif(indexFile io.Reader) (*[]Document, error) {
28+
// Build a conversion.State representing the input lsif index.
29+
state, err := conversion.CorrelateInMemory(context.TODO(), indexFile, "", nil)
30+
if err != nil {
31+
return nil, err
32+
}
33+
34+
edges := addReferenceEdges(state)
35+
36+
if *addImplEdges {
37+
addImplementationEdges(state, edges)
38+
}
39+
40+
return runPageRanker(state, edges), nil
41+
}
42+
43+
func addReferenceEdges(state *conversion.State) *map[int]int {
44+
// First, we need a map of range id -> doc id for the whole index.
45+
// Even for a very large index it's likely only to be a few million entries.
46+
// Since we're the only use case needing this lookup, we build it ephemerally.
47+
// TODO(stevey): Can we get this info from State without preprocessing?
48+
rangeToDoc := make(map[int]int)
49+
state.Contains.Each(func(docId int, rangeIDs *datastructures.IDSet) {
50+
rangeIDs.Each(func(rangeId int) {
51+
rangeToDoc[rangeId] = docId
52+
})
53+
})
54+
55+
// Walk the references and find each one's file, and also the file(s)
56+
// containing the definition being referenced. These two files make an edge.
57+
// We omit links from files referencing themselves because PageRank ignores them.
58+
edges := make(map[int]int)
59+
for _, documentMap := range state.ReferenceData {
60+
documentMap.Each(func(docId int, ranges *datastructures.IDSet) {
61+
ranges.Each(func(rangeId int) {
62+
// Pagerank source node is doc ID for this reference range.
63+
refDocId := rangeToDoc[rangeId]
64+
65+
// Pagerank dest nodes are doc IDs for the associated definition range(s).
66+
// I.e., if a definition is split across files, they each get an edge for now.
67+
if data, ok := state.DefinitionData[state.RangeData[rangeId].DefinitionResultID]; ok {
68+
for _, defDocId := range data.UnorderedKeys() {
69+
// Insert a link for the PageRank calculator.
70+
if refDocId != defDocId {
71+
edges[refDocId] = defDocId
72+
}
73+
}
74+
}
75+
})
76+
})
77+
}
78+
return &edges
79+
}
80+
81+
// Treat implementations as references to the type being implemented.
82+
//
83+
// Note: These edges seem to have a dramatic impact on the results. This adds in millions
84+
// of edges and tends to push up interfaces and classes with lots of implementations. This
85+
// leads to their effect being a bit overwhelming, compared to when only call/use references
86+
// are included as pagerank edges. Moreover, some very strange results seem to be bubbling
87+
// up for Java indexes (about 20% don't look like they should be in the top results).
88+
//
89+
// So for now this is disabled by default.
90+
// TODO(stevey): pagerank.go shouldn't read the flag; it should be passed as a config option.
91+
func addImplementationEdges(state *conversion.State, edges *map[int]int) {
92+
graph := *edges
93+
for _, docMap := range state.ImplementationData {
94+
docMap.Each(func(docId int, ranges *datastructures.IDSet) {
95+
// We interpret the destination vertex as the thing being implemented
96+
// (e.g., the definition of an interface or abstract class).
97+
destNode := docId
98+
ranges.Each(func(rangeId int) {
99+
if data, ok := state.ImplementationData[state.RangeData[rangeId].ImplementationResultID]; ok {
100+
for _, implDocId := range data.UnorderedKeys() {
101+
if destNode != implDocId { // skip self-references for pagerank
102+
graph[implDocId] = destNode
103+
}
104+
}
105+
}
106+
})
107+
})
108+
}
109+
}
110+
111+
// The API to this PageRank package is that you get one shot at seeing the results.
112+
// Rank the graph, and toss each file/rank pair into the result set.
113+
func runPageRanker(state *conversion.State, edges *map[int]int) *[]Document {
114+
graph := pagerank.New()
115+
116+
for srcDocId, targetDocId := range *edges {
117+
graph.Link(srcDocId, targetDocId)
118+
}
119+
120+
rankings := make([]Document, 0, len(state.DocumentData))
121+
122+
graph.Rank(FOLLOW_LINK_CHANCE, TOLERANCE, func(docId int, rank float64) {
123+
rankings = append(rankings, Document{docId: docId, filePath: state.DocumentData[docId], rank: rank})
124+
})
125+
return &rankings
126+
}

lib/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ require (
4343
github.com/cockroachdb/redact v1.1.3 // indirect
4444
github.com/dave/jennifer v1.4.1 // indirect
4545
github.com/davecgh/go-spew v1.1.1 // indirect
46+
github.com/dcadenas/pagerank v0.0.0-20171013173705-af922e3ceea8 // indirect
4647
github.com/dlclark/regexp2 v1.4.0 // indirect
4748
github.com/dustin/go-humanize v1.0.0 // indirect
4849
github.com/getsentry/sentry-go v0.13.0 // indirect

lib/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ github.com/dave/jennifer v1.4.1/go.mod h1:7jEdnm+qBcxl8PC0zyp7vxcpSRnzXSt9r39tpT
5757
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5858
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
5959
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
60+
github.com/dcadenas/pagerank v0.0.0-20171013173705-af922e3ceea8 h1:YG9TGUT3wOhOPxE99qpcd99TZE5zuxBaAvqjuuLMBBI=
61+
github.com/dcadenas/pagerank v0.0.0-20171013173705-af922e3ceea8/go.mod h1:s8JxxWxUtKGji2M1VIqcZEBQj3FPXSTt7aBsAjfYeh8=
6062
github.com/derision-test/go-mockgen v1.1.2 h1:bMNCerr4I3dz2/UlguwgMuMuJDQXqRnBw17ezXkGvyI=
6163
github.com/derision-test/go-mockgen v1.1.2/go.mod h1:9H3VGTWYnL1VJoHHCuPKDpPFmNQ1uVyNlpX6P63l5Sk=
6264
github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4=

0 commit comments

Comments
 (0)