Skip to content

Commit 33a7080

Browse files
committed
add crawler
1 parent 7925279 commit 33a7080

12 files changed

+5505
-0
lines changed

src/crawler.erl

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
%%% crawler: front door of the crawl.  start/0 boots this globally
%%% registered gen_server together with the shared URL store; run/1
%%% synchronously kicks off a crawl rooted at the given URL.
-module(crawler).
-behaviour(gen_server).

%% Public API
-export([start/0, run/1]).
%% gen_server callbacks
-export([init/1, handle_call/3, terminate/2, handle_cast/2, handle_info/2, code_change/3]).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------

%% Boot the crawler server, then the URL store.
%% NOTE(review): the start_link result is discarded and the return value is
%% whatever crawler_store:start() yields — confirm callers don't rely on it.
start() ->
    gen_server:start_link({global, ?MODULE}, ?MODULE, [], []),
    crawler_store:start().

%% Synchronously ask the crawler to begin crawling URL.
run(URL) ->
    gen_server:call({global, ?MODULE}, {run, URL}).

%%----------------------------------------------------------------------------
%% gen_server callbacks
%%----------------------------------------------------------------------------

init([]) ->
    inets:start(),
    {ok, []}.

%% One crawl request: remember the seed URL, spin up a worker, hand it the URL.
handle_call({run, URL}, _From, State) ->
    crawler_store:store(URL),
    WorkerId = crawler_worker:start(),
    crawler_worker:get_map(WorkerId, URL),
    {reply, ok, State};
handle_call(_Message, _From, State) ->
    {reply, ok, State}.

handle_cast(_Message, State) ->
    {noreply, State}.

handle_info(_Message, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVersion, State, _Extra) ->
    {ok, State}.

src/crawler_downloder.erl

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
%%% crawler_downloder: one-shot image downloader.  Each start/0 spins up a
%%% fresh, globally registered gen_server; download/2 asynchronously fetches
%%% an image URL and writes the body under /Images.
-module(crawler_downloder).
-behaviour(gen_server).

%% Public API
-export([start/0, download/2]).
%% gen_server callbacks
-export([init/1, handle_call/3, terminate/2, handle_cast/2, handle_info/2, code_change/3]).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------

%% Start a fresh downloader and return its (opaque) global registration ID.
start() ->
    ID = get_id(),
    gen_server:start_link({global, ID}, ?MODULE, [], []),
    ID.

%% Fire-and-forget: ask downloader ID to fetch and save Img.
download(ID, Img) ->
    gen_server:cast({global, ID}, {download, Img}).

%%----------------------------------------------------------------------------
%% gen_server callbacks
%%----------------------------------------------------------------------------

init([]) ->
    inets:start(),
    {ok, []}.

handle_call(_Message, _From, State) ->
    {reply, ok, State}.

%% Best-effort download: a failed fetch must not kill the server, so any
%% exception from request/1 is deliberately swallowed.
handle_cast({download, Img}, State) ->
    try
        request(Img)
    catch
        _:_ ->
            ok
    end,
    {noreply, State};
handle_cast(_Message, State) ->
    {noreply, State}.

handle_info(_Message, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVersion, State, _Extra) ->
    {ok, State}.

%%--Support--------------------------------------------------------------------

%% Fetch Img and save the body on HTTP 200; anything else is skipped.
%% (The original used an `if` without a `true` branch, so every non-200
%% response raised an if_clause error — fixed by matching explicitly.)
request(Img) ->
    case httpc:request(Img) of
        {ok, {{_Version, 200, _ReasonPhrase}, _Headers, Body}} ->
            file:write_file("/Images" ++ name(Img), Body);
        _NotFetched ->
            %% transport error or non-200 status: best-effort, ignore
            ok
    end.

%% Last path segment of URL (including the leading "/"), used as the
%% on-disk file name.  Scheme is matched as a wildcard so https URLs
%% no longer crash the match (the original required `http`).
name(URL) ->
    {ok, {_Scheme, _UserInfo, _Host, _Port, Path, _Query}} = http_uri:parse(URL),
    string:sub_string(Path, string:rstr(Path, "/"), string:len(Path)).

%% Build a unique global registration name.  The original converted
%% erlang:system_time() to an atom, which both leaks atoms (never GC'd)
%% and collides when two servers start in the same time unit.  `global`
%% names may be arbitrary terms, so use a unique-integer tuple instead;
%% visible callers treat the ID opaquely.
%% NOTE(review): http_uri is deprecated (removed in OTP 25) — consider
%% uri_string for newer OTP targets.
get_id() ->
    {?MODULE, erlang:unique_integer([positive, monotonic])}.

src/crawler_store.erl

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
%%% crawler_store: globally registered store of every URL seen during a
%%% crawl.  State is the two-element list [Count, Urls]; get_base/0 returns
%%% it verbatim, so that shape is part of the public contract.
-module(crawler_store).
-behaviour(gen_server).

-export([start/0, store/1, print/0, save/0, get_base/0]).
-export([init/1,
         handle_call/3,
         terminate/2,
         handle_cast/2,
         handle_info/2,
         code_change/3]).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------

start() ->
    gen_server:start_link({global, ?MODULE}, ?MODULE, [], []).

%% Record one URL (async): bump the counter, prepend to the list.
store(URL) ->
    gen_server:cast({global, ?MODULE}, {store, URL}).

%% Dump the collected URL list to stdout (async).
print() ->
    gen_server:cast({global, ?MODULE}, {print}).

%% Persist the deduplicated URL list to disk (async).
save() ->
    gen_server:cast({global, ?MODULE}, {save}).

%% Fetch the raw state: [Count, Urls].
get_base() ->
    gen_server:call({global, ?MODULE}, {get_base}).

%%----------------------------------------------------------------------------
%% gen_server callbacks
%%----------------------------------------------------------------------------

init([]) ->
    inets:start(),
    {ok, [0, []]}.

handle_call({get_base}, _From, State) ->
    {reply, State, State};
handle_call(_Message, _From, State) ->
    {reply, ok, State}.

handle_cast({store, URL}, [Count, Urls]) ->
    io:format("~p~n", [Count]),
    {noreply, [Count + 1, [URL | Urls]]};
handle_cast({print}, [Count, Urls]) ->
    io:format("~p~n", [Urls]),
    {noreply, [Count, Urls]};
handle_cast({save}, [Count, Urls]) ->
    save(remove_duplicates(Urls)),
    {noreply, [Count, Urls]};
handle_cast(_Message, State) ->
    {noreply, State}.

handle_info(_Message, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVersion, State, _Extra) ->
    {ok, State}.

%%--Support--------------------------------------------------------------------

%% Write Data as a single Erlang term to the map file.
%% NOTE(review): "/test/map.txt" is an absolute path — presumably meant to
%% be relative to the project; confirm before deploying.
save(Data) ->
    file:write_file("/test/map.txt", io_lib:fwrite("~p.\n", [Data])).

%% Deduplicate (order is not preserved — sets round-trip).
remove_duplicates(L) ->
    sets:to_list(sets:from_list(L)).

src/crawler_worker.erl

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
%%% crawler_worker: fetches one page, extracts links and images, queues
%%% image downloads and spawns further workers for same-site links.
-module(crawler_worker).
-behaviour(gen_server).

%% Public API
-export([start/0, get_map/2]).
%% gen_server callbacks
-export([init/1, handle_call/3, terminate/2, handle_cast/2, handle_info/2, code_change/3]).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------

%% Start a fresh worker and return its (opaque) global registration ID.
start() ->
    ID = get_id(),
    gen_server:start_link({global, ID}, ?MODULE, [], []),
    ID.

%% Fire-and-forget: ask worker ID to crawl URL.
get_map(ID, URL) ->
    gen_server:cast({global, ID}, {get_map, URL}).

%%----------------------------------------------------------------------------
%% gen_server callbacks
%%----------------------------------------------------------------------------

init([]) ->
    inets:start(),
    {ok, []}.

handle_call(_Message, _From, State) ->
    {reply, ok, State}.

%% Crawl one page.  The crawl is best-effort: any fetch or parse failure
%% is deliberately swallowed so a broken page cannot kill the worker.
handle_cast({get_map, URL}, State) ->
    try
        case httpc:request(URL) of
            {ok, {{_Version, 200, _ReasonPhrase}, _Headers, Body}} ->
                tree_worker(URL, Body);
            _NotFetched ->
                ok
        end
    catch
        _:_ ->
            ok
    end,
    {noreply, State};
handle_cast(_Message, State) ->
    {noreply, State}.

handle_info(_Message, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVersion, State, _Extra) ->
    {ok, State}.

%%--Support--------------------------------------------------------------------

%% Parse Body and fan out: download every <img src>, crawl every <a href>.
tree_worker(URL, Body) ->
    Tree = mochiweb_html:parse(Body),
    Hrefs = remove_duplicates(mochiweb_xpath:execute("//a/@href", Tree)),
    Imgs = remove_duplicates(mochiweb_xpath:execute("//img/@src", Tree)),
    lists:foreach(fun(Img) -> download_image(url(URL), Img) end, Imgs),
    lists:foreach(fun(Href) -> spawn_workers(url(URL), Href) end, Hrefs).

%% Follow a site-relative link ("/...") unless the crawl limit is reached
%% or the URL was already visited.
%% NOTE(review): FullURL is stored as a binary while crawler:run/1 stores
%% the seed URL as a string, so need_stop's lists:member check can never
%% match across the two representations — confirm and unify the stored type.
spawn_workers(URL, Href) ->
    case re:run(Href, "^/.*", [{capture, none}]) of
        match ->
            FullURL = bjoin([list_to_binary(URL), Href]),
            case need_stop(FullURL) of
                false ->
                    crawler_store:store(FullURL),
                    Cid = crawler_worker:start(),
                    crawler_worker:get_map(Cid, binary_to_list(FullURL));
                true ->
                    nomatch
            end;
        nomatch ->
            nomatch
    end.

%% Queue a download for a page-relative image source.  Absolute http(s)
%% URLs and "../" paths are skipped.  (The original regex matched only
%% "http://", so https image URLs were wrongly treated as relative and
%% concatenated onto the page base — extended to https?.)
download_image(URL, Image) ->
    case re:run(Image, "^(https?://|\\.\\.)", [{capture, none}]) of
        match ->
            nothing;
        nomatch ->
            FullURL = bjoin([list_to_binary(URL), Image]),
            ID = crawler_downloder:start(),
            crawler_downloder:download(ID, binary_to_list(FullURL))
    end.

%% Stop expanding once more than 5000 URLs are stored or URL is already
%% known.  orelse short-circuits, skipping the O(n) membership scan once
%% the cap is reached (the original's `or` always evaluated both sides).
need_stop(URL) ->
    [Count, Known] = crawler_store:get_base(),
    (Count > 5000) orelse lists:member(URL, Known).

%% Base of URL: "http://" ++ host ++ directory part of the path.
%% NOTE(review): http_uri is deprecated (removed in OTP 25) — consider
%% uri_string for newer OTP targets.
url(URL) ->
    {ok, {http, _UserInfo, Root, _Port, Path, _Query}} = http_uri:parse(URL),
    Ctx = string:sub_string(Path, 1, string:rstr(Path, "/")),
    "http://" ++ Root ++ Ctx.

%% Deduplicate (order is not preserved — sets round-trip).
remove_duplicates(L) ->
    sets:to_list(sets:from_list(L)).

%% Concatenate a list of binaries.
bjoin(List) ->
    F = fun(A, B) -> <<A/binary, B/binary>> end,
    lists:foldr(F, <<>>, List).

%% Build a unique global registration name.  The original converted
%% erlang:system_time() to an atom, which leaks atoms (never GC'd) and
%% collides when two workers start in the same time unit.  `global` names
%% may be arbitrary terms, so use a unique-integer tuple instead; visible
%% callers treat the ID opaquely.
get_id() ->
    {?MODULE, erlang:unique_integer([positive, monotonic])}.

0 commit comments

Comments
 (0)