|
//| mvnDeps:
//| - info.picocli:picocli:4.7.6
//| - com.squareup.okhttp3:okhttp:4.12.0
//| - com.fasterxml.jackson.core:jackson-databind:2.17.2
| 5 | + |
| 6 | +import com.fasterxml.jackson.databind.*; |
| 7 | +import com.fasterxml.jackson.core.util.DefaultIndenter; |
| 8 | +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; |
| 9 | +import okhttp3.*; |
| 10 | +import picocli.CommandLine; |
| 11 | +import java.io.*; |
| 12 | +import java.nio.file.*; |
| 13 | +import java.util.*; |
| 14 | +import java.util.concurrent.Callable; |
| 15 | + |
| 16 | +@CommandLine.Command(name = "Crawler", mixinStandardHelpOptions = true) |
| 17 | +public class Crawler implements Callable<Integer> { |
| 18 | + |
| 19 | + @CommandLine.Option( |
| 20 | + names = {"--start-article"}, |
| 21 | + required = true, |
| 22 | + description = "Starting article title" |
| 23 | + ) |
| 24 | + private String startArticle; |
| 25 | + |
| 26 | + @CommandLine.Option( |
| 27 | + names = {"--depth"}, |
| 28 | + required = true, |
| 29 | + description = "Depth of crawl" |
| 30 | + ) |
| 31 | + private int depth; |
| 32 | + |
| 33 | + private static final OkHttpClient client = new OkHttpClient(); |
| 34 | + private static final ObjectMapper mapper = new ObjectMapper(); |
| 35 | + |
| 36 | + public static List<String> fetchLinks(String title) throws IOException { |
| 37 | + var url = new HttpUrl.Builder() |
| 38 | + .scheme("https") |
| 39 | + .host("en.wikipedia.org") |
| 40 | + .addPathSegments("w/api.php") |
| 41 | + .addQueryParameter("action", "query") |
| 42 | + .addQueryParameter("titles", title) |
| 43 | + .addQueryParameter("prop", "links") |
| 44 | + .addQueryParameter("format", "json") |
| 45 | + .build(); |
| 46 | + |
| 47 | + var request = new Request.Builder() |
| 48 | + .url(url) |
| 49 | + . header( "User-Agent", "WikiFetcherBot/1.0 (https://example.com; [email protected])") |
| 50 | + .build(); |
| 51 | + try (Response response = client.newCall(request).execute()) { |
| 52 | + if (!response.isSuccessful()) |
| 53 | + throw new IOException("Unexpected code " + response); |
| 54 | + |
| 55 | + JsonNode root = mapper.readTree(response.body().byteStream()); |
| 56 | + JsonNode pages = root.path("query").path("pages"); |
| 57 | + List<String> links = new ArrayList<>(); |
| 58 | + |
| 59 | + for (Iterator<JsonNode> it = pages.elements(); it.hasNext();) { |
| 60 | + JsonNode linkArr = it.next().get("links"); |
| 61 | + if (linkArr != null && linkArr.isArray()) { |
| 62 | + for (JsonNode link : linkArr) { |
| 63 | + JsonNode titleNode = link.get("title"); |
| 64 | + if (titleNode != null) links.add(titleNode.asText()); |
| 65 | + } |
| 66 | + } |
| 67 | + } |
| 68 | + return links; |
| 69 | + } |
| 70 | + } |
| 71 | + |
| 72 | + @Override |
| 73 | + public Integer call() throws Exception { |
| 74 | + Set<String> seen = new HashSet<>(); |
| 75 | + Set<String> current = new HashSet<>(); |
| 76 | + seen.add(startArticle); |
| 77 | + current.add(startArticle); |
| 78 | + |
| 79 | + for (int i = 0; i < depth; i++) { |
| 80 | + Set<String> next = new HashSet<>(); |
| 81 | + for (String article : current) { |
| 82 | + for (String link : fetchLinks(article)) { |
| 83 | + if (!seen.contains(link)) next.add(link); |
| 84 | + } |
| 85 | + } |
| 86 | + seen.addAll(next); |
| 87 | + current = next; |
| 88 | + } |
| 89 | + |
| 90 | + Path output = Paths.get("fetched.json"); |
| 91 | + try (Writer w = Files.newBufferedWriter(output)) { |
| 92 | + DefaultPrettyPrinter printer = new DefaultPrettyPrinter(); |
| 93 | + printer.indentArraysWith(new DefaultIndenter(" ", "\n")); |
| 94 | + printer.indentObjectsWith(new DefaultIndenter(" ", "\n")); |
| 95 | + mapper.writer(printer).writeValue(w, seen); |
| 96 | + } |
| 97 | + return 0; |
| 98 | + } |
| 99 | + |
| 100 | + public static void main(String[] args) { |
| 101 | + int exitCode = new CommandLine(new Crawler()).execute(args); |
| 102 | + System.exit(exitCode); |
| 103 | + } |
| 104 | +} |
0 commit comments