Skip to content

Commit dc86914

Browse files
committed
Retrofit web crawler example.
1 parent cf2cdc4 commit dc86914

File tree

3 files changed

+154
-0
lines changed

3 files changed

+154
-0
lines changed

pom.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@
6363
<simplexml.version>2.7.1</simplexml.version>
6464
<moshi.version>1.1.0</moshi.version>
6565

66+
<!-- Sample Dependencies -->
67+
<jsoup.version>1.7.3</jsoup.version>
68+
6669
<!-- Test Dependencies -->
6770
<junit.version>4.12</junit.version>
6871
<assertj.version>1.7.0</assertj.version>

samples/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
<groupId>com.google.guava</groupId>
4343
<artifactId>guava</artifactId>
4444
</dependency>
45+
<dependency>
46+
<groupId>org.jsoup</groupId>
47+
<artifactId>jsoup</artifactId>
48+
<version>${jsoup.version}</version>
49+
</dependency>
4550
</dependencies>
4651

4752
<build>
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
/*
2+
* Copyright (C) 2016 Square, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.example.retrofit;
17+
18+
import java.io.IOException;
19+
import java.lang.annotation.Annotation;
20+
import java.lang.reflect.Type;
21+
import java.util.ArrayList;
22+
import java.util.Collections;
23+
import java.util.LinkedHashSet;
24+
import java.util.List;
25+
import java.util.Set;
26+
import java.util.concurrent.ConcurrentHashMap;
27+
import java.util.concurrent.Executors;
28+
import java.util.concurrent.TimeUnit;
29+
import java.util.concurrent.atomic.AtomicInteger;
30+
import okhttp3.ConnectionPool;
31+
import okhttp3.Dispatcher;
32+
import okhttp3.HttpUrl;
33+
import okhttp3.OkHttpClient;
34+
import okhttp3.ResponseBody;
35+
import org.jsoup.Jsoup;
36+
import org.jsoup.nodes.Document;
37+
import org.jsoup.nodes.Element;
38+
import retrofit2.Call;
39+
import retrofit2.Callback;
40+
import retrofit2.Converter;
41+
import retrofit2.Response;
42+
import retrofit2.Retrofit;
43+
import retrofit2.http.GET;
44+
import retrofit2.http.Url;
45+
46+
/** A simple web crawler that uses a Retrofit service to turn URLs into webpages. */
47+
public final class Crawler {
48+
private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
49+
new LinkedHashSet<HttpUrl>());
50+
private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
51+
private final PageService pageService;
52+
53+
public Crawler(PageService pageService) {
54+
this.pageService = pageService;
55+
}
56+
57+
public void crawlPage(HttpUrl url) {
58+
// Skip hosts that we've visited many times.
59+
AtomicInteger hostnameCount = new AtomicInteger();
60+
AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
61+
if (previous != null) hostnameCount = previous;
62+
if (hostnameCount.incrementAndGet() > 100) return;
63+
64+
// Asynchronously visit URL.
65+
pageService.get(url).enqueue(new Callback<Page>() {
66+
@Override public void onResponse(Call<Page> call, Response<Page> response) {
67+
if (!response.isSuccessful()) {
68+
System.out.println(call.request().url() + ": failed: " + response.code());
69+
return;
70+
}
71+
72+
// Print this page's URL and title.
73+
Page page = response.body();
74+
HttpUrl base = response.raw().request().url();
75+
System.out.println(base + ": " + page.title);
76+
77+
// Enqueue its links for visiting.
78+
for (String link : page.links) {
79+
HttpUrl linkUrl = base.resolve(link);
80+
if (linkUrl != null && !fetchedUrls.add(linkUrl)) {
81+
crawlPage(linkUrl);
82+
}
83+
}
84+
}
85+
86+
@Override public void onFailure(Call<Page> call, Throwable t) {
87+
System.out.println(call.request().url() + ": failed: " + t);
88+
}
89+
});
90+
}
91+
92+
public static void main(String... args) throws Exception {
93+
Dispatcher dispatcher = new Dispatcher(Executors.newFixedThreadPool(20));
94+
dispatcher.setMaxRequests(20);
95+
dispatcher.setMaxRequestsPerHost(1);
96+
97+
OkHttpClient okHttpClient = new OkHttpClient.Builder()
98+
.dispatcher(dispatcher)
99+
.connectionPool(new ConnectionPool(100, 30, TimeUnit.SECONDS))
100+
.build();
101+
102+
Retrofit retrofit = new Retrofit.Builder()
103+
.baseUrl(HttpUrl.parse("https://example.com/"))
104+
.addConverterFactory(PageAdapter.FACTORY)
105+
.client(okHttpClient)
106+
.build();
107+
108+
PageService pageService = retrofit.create(PageService.class);
109+
110+
Crawler crawler = new Crawler(pageService);
111+
crawler.crawlPage(HttpUrl.parse(args[0]));
112+
}
113+
114+
interface PageService {
115+
@GET Call<Page> get(@Url HttpUrl url);
116+
}
117+
118+
static class Page {
119+
public final String title;
120+
public final List<String> links;
121+
122+
public Page(String title, List<String> links) {
123+
this.title = title;
124+
this.links = links;
125+
}
126+
}
127+
128+
static final class PageAdapter implements Converter<ResponseBody, Page> {
129+
static final Converter.Factory FACTORY = new Converter.Factory() {
130+
@Override public Converter<ResponseBody, ?> responseBodyConverter(
131+
Type type, Annotation[] annotations, Retrofit retrofit) {
132+
if (type == Page.class) return new PageAdapter();
133+
return null;
134+
}
135+
};
136+
137+
@Override public Page convert(ResponseBody responseBody) throws IOException {
138+
Document document = Jsoup.parse(responseBody.string());
139+
List<String> links = new ArrayList<>();
140+
for (Element element : document.select("a[href]")) {
141+
links.add(element.attr("href"));
142+
}
143+
return new Page(document.title(), Collections.unmodifiableList(links));
144+
}
145+
}
146+
}

0 commit comments

Comments
 (0)