-module(crawler_worker).
-behaviour(gen_server).

-export([start/0, get_map/2]).
-export([init/1, handle_call/3, terminate/2, handle_cast/2, handle_info/2, code_change/3]).

%%----------------------------------------------------------------------------
start() ->
    ID = get_id(),
    gen_server:start_link({global, ID}, ?MODULE, [], []),
    ID.

get_map(ID, URL) ->
    gen_server:cast({global, ID}, {get_map, URL}).
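
%% Minimal usage sketch (the URL is illustrative; crawler_store and the
%% downloader workers are assumed to be running alongside):
%%   ID = crawler_worker:start(),
%%   crawler_worker:get_map(ID, "http://example.com/").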

%%----------------------------------------------------------------------------
init([]) ->
    inets:start(),
    {ok, []}.

terminate(_Reason, _State) -> ok.
handle_info(_Message, State) -> {noreply, State}.
code_change(_OldVersion, State, _Extra) -> {ok, State}.
%%----------------------------------------------------------------------------

handle_call(_Message, _From, State) -> {reply, ok, State}.

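%% Crawl request: fetch the page with httpc and, on a 200 response, hand the
%% body to tree_worker/2. Request failures, non-200 responses and parse
%% errors are all swallowed so the worker simply stays idle.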
handle_cast({get_map, URL}, State) ->
    %% Anything that goes wrong here (request failure, non-200 status,
    %% parser crash) must not take the worker down, hence the try.
    try
        case httpc:request(URL) of
            {ok, {{_Version, 200, _ReasonPhrase}, _Headers, Body}} ->
                tree_worker(URL, Body);
            _Other ->
                ok
        end
    catch
        _:_ ->
            ok
    end,
    {noreply, State};

handle_cast(_Message, State) -> {noreply, State}.

%%--Support---------------------------------------------------------------------
%rexp() -> ["^https?://.*", "^/.*/$"].

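%% Parse the fetched HTML, collect the unique <a href> and <img src> values,
%% then download every image and start a fresh worker for every link.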
tree_worker(URL, Body) ->
    Tree = mochiweb_html:parse(Body),
    Hrefs = remove_duplicates(mochiweb_xpath:execute("//a/@href", Tree)),
    Imgs = remove_duplicates(mochiweb_xpath:execute("//img/@src", Tree)),
    lists:foreach(fun(Img) -> download_image(url(URL), Img) end, Imgs),
    lists:foreach(fun(Href) -> spawn_workers(url(URL), Href) end, Hrefs).

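%% Only site-relative links (starting with "/") are followed. The link is
%% resolved against the page's base URL and skipped when need_stop/1 says the
%% crawl limit was reached or the URL has already been visited.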
spawn_workers(URL, Href) ->
    case re:run(Href, "^/.*", [{capture, none}]) of
        match ->
            FullURL = bjoin([list_to_binary(URL), Href]),
            case need_stop(FullURL) of
                false ->
                    crawler_store:store(FullURL),
                    Cid = crawler_worker:start(),
                    crawler_worker:get_map(Cid, binary_to_list(FullURL));
                true ->
                    nomatch
            end;
        nomatch ->
            nomatch
    end.

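%% Absolute ("http://...") and parent-relative ("..") image sources are
%% skipped; everything else is resolved against the base URL and handed to a
%% crawler_downloder worker.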
download_image(URL, Image) ->
    case re:run(Image, "^(http:\\/\\/|\\.\\.).*", [{capture, none}]) of
        match ->
            nothing;
        nomatch ->
            FullURL = bjoin([list_to_binary(URL), Image]),
            ID = crawler_downloder:start(),
            crawler_downloder:download(ID, binary_to_list(FullURL))
    end.

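%% The crawl stops growing once more than 5000 URLs have been stored or this
%% URL was already seen. crawler_store:get_base/0 is assumed to return
%% [Count, VisitedUrls].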
need_stop(URL) ->
    [C, L] = crawler_store:get_base(),
    (C > 5000) orelse lists:member(URL, L).

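%% Base URL of a page: scheme, host and the path up to and including the last
%% "/", e.g. "http://host/a/b.html" -> "http://host/a/".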
url(URL) ->
    {ok, {http, _, Root, _Port, Path, _Query}} = http_uri:parse(URL),
    Ctx = string:sub_string(Path, 1, string:rstr(Path, "/")),
    "http://" ++ Root ++ Ctx.

remove_duplicates(L) ->
    sets:to_list(sets:from_list(L)).

bjoin(List) ->
    F = fun(A, B) -> <<A/binary, B/binary>> end,
    lists:foldr(F, <<>>, List).

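%% One fresh, globally registered atom per worker. Atoms are never garbage
%% collected, so a very large crawl keeps growing the atom table; a reference
%% or pid based id would avoid that.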
get_id() ->
    list_to_atom(integer_to_list(erlang:system_time())).