
Commit 6ff722a

committed
- Revised code for PR
- Updated readme on GlassDoor's new dynamic functionality
- TODO: Add Docker image to JobFunnel's packages
1 parent 2698738 commit 6ff722a

File tree

7 files changed (+50, -47 lines)


jobfunnel/indeed.py

Lines changed: 10 additions & 10 deletions

All ten changed lines re-indent the `Args:` descriptions in the scraper docstrings; the wording is identical on both sides, so only the indentation differs (leading whitespace was not preserved on this page, so the indentation shown is approximate).

```diff
@@ -126,9 +126,9 @@ def get_number_of_pages(self, soup_base, max=0):
         """
         Calculates the number of pages to be scraped.
         Args:
-        soup_base: a BeautifulSoup object with the html data.
-        At the moment this method assumes that the soup_base was prepared statically.
-        max: the maximum number of pages to be scraped.
+            soup_base: a BeautifulSoup object with the html data.
+            At the moment this method assumes that the soup_base was prepared statically.
+            max: the maximum number of pages to be scraped.
         Returns:
         The number of pages to be scraped.
         If the number of pages that soup_base yields is higher than max, then max is returned.
@@ -147,7 +147,7 @@ def get_title(self, soup):
         """
         Fetches the title from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the title from.
+            soup: BeautifulSoup base to scrape the title from.
         Returns:
         The job title scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the title.
@@ -160,7 +160,7 @@ def get_company(self, soup):
         """
         Fetches the company from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the company from.
+            soup: BeautifulSoup base to scrape the company from.
         Returns:
         The company scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the company.
@@ -173,7 +173,7 @@ def get_location(self, soup):
         """
         Fetches the job location from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the location from.
+            soup: BeautifulSoup base to scrape the location from.
         Returns:
         The job location scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the location.
@@ -186,7 +186,7 @@ def get_tags(self, soup):
         """
         Fetches the job location from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the location from.
+            soup: BeautifulSoup base to scrape the location from.
         Returns:
         The job location scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the location.
@@ -201,7 +201,7 @@ def get_date(self, soup):
         """
         Fetches the job date from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the date from.
+            soup: BeautifulSoup base to scrape the date from.
         Returns:
         The job date scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the date.
@@ -214,7 +214,7 @@ def get_id(self, soup):
         """
         Fetches the job id from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the id from.
+            soup: BeautifulSoup base to scrape the id from.
         Returns:
         The job id scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the id.
@@ -229,7 +229,7 @@ def get_link(self, job_id):
         """
         Constructs the link with the given job_id.
         Args:
-        job_id: The id to be used to construct the link for this job.
+            job_id: The id to be used to construct the link for this job.
         Returns:
         The constructed job link.
         Note that this function does not check the correctness of this link.
```
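The repeated AttributeError caveat in these docstrings comes from chaining attribute access on `find()` results; a minimal standalone sketch of that pattern (the markup and selector below are made up for illustration, not JobFunnel's actual ones):

```python
from bs4 import BeautifulSoup

job_html = '<div><a class="jobtitle">Data Engineer</a></div>'  # hypothetical markup
soup = BeautifulSoup(job_html, 'html.parser')

try:
    # .find() returns None when nothing matches, so .text raises AttributeError
    title = soup.find('a', attrs={'class': 'jobtitle'}).text.strip()
    print(title)
except AttributeError:
    # mirrors the "may throw an AttributeError" caveat in the docstrings
    print('title not found')
```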

jobfunnel/monster.py

Lines changed: 7 additions & 7 deletions

The first hunk is a whitespace-only re-alignment of the `format()` arguments (indentation shown is approximate); the second touches only a blank line.

```diff
@@ -81,12 +81,12 @@ def get_search_url(self, method='get'):
         if method == 'get':
             search = ('https://www.monster.{0}/jobs/search/?'
                       'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
-                self.search_terms['region']['domain'],
-                self.query,
-                self.search_terms['region']['city'],
-                self.search_terms['region']['province'],
-                'skr_navigation_nhpso_searchMain',
-                self.convert_radius(self.search_terms['region']['radius'])))
+                          self.search_terms['region']['domain'],
+                          self.query,
+                          self.search_terms['region']['city'],
+                          self.search_terms['region']['province'],
+                          'skr_navigation_nhpso_searchMain',
+                          self.convert_radius(self.search_terms['region']['radius'])))

             return search
         elif method == 'post':
@@ -212,7 +212,7 @@ def scrape(self):
             self.scrape_data[str(job['id'])] = job

             # Do not change the order of the next three statements if you want date_filter to work
-
+
             # stores references to jobs in list to be used in blurb retrieval
             scrape_list = [i for i in self.scrape_data.values()]
             # converts job date formats into a standard date format
```
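For reference, the GET URL assembled in the first hunk can be reproduced standalone; the region values below are made up, and the `convert_radius` call is skipped:

```python
# Standalone rendering of the search URL built above (illustrative values).
search_terms = {'region': {'domain': 'ca', 'city': 'Toronto',
                           'province': 'ON', 'radius': 25}}
query = 'python developer'

search = ('https://www.monster.{0}/jobs/search/?'
          'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
              search_terms['region']['domain'],
              query,
              search_terms['region']['city'],
              search_terms['region']['province'],
              'skr_navigation_nhpso_searchMain',
              search_terms['region']['radius']))  # convert_radius omitted here
print(search)
```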

readme.md

Lines changed: 9 additions & 5 deletions

````diff
@@ -76,13 +76,17 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
 For more information see the [crontab document][cron_doc].

-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
+* **Glassdoor Notes** <br />
+The `GlassDoor` scraper has two versions: `GlassDoorStatic` and `GlassDoorDynamic`. Both of these give you the same end result: they scrape GlassDoor and dump your job listings onto your `master_list.csv`. We recommend to *always* run `GlassDoorStatic` (this is the default preset we have on our demo `settings.yaml` file) because it is *a lot* faster than `GlassDoorDynamic`. However, given the event that `GlassDoorStatic` fails, you may use `GlassDoorDynamic`. It is very slow, but you'll still be able to scrape GlassDoor.

-You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
+When using `GlassDoorDynamic` Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
+
+You may also of course disable the Glassdoor scraper when using `GlassDoorDynamic` in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
-#- 'GlassDoor'
-- 'Indeed'
-- 'Monster'
+- 'Indeed'
+- 'Monster'
+#- 'GlassDoorStatic'
+# - 'GlassDoorDynamic'
 ```

 * **Reviewing Jobs in Terminal** <br />
````
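A hedged sketch of how the providers list could be inspected before an unattended cron run; the `providers` key and the `settings.yaml` file name follow the demo config mentioned above, so adjust them if yours differ:

```python
import yaml  # PyYAML

# Warn before an unattended run if the CAPTCHA-prone scraper is enabled.
# The 'providers' key mirrors the demo settings.yaml; an assumption here.
with open('settings.yaml') as f:
    settings = yaml.safe_load(f)

for provider in settings.get('providers', []):
    if provider == 'GlassDoorDynamic':
        print('GlassDoorDynamic is enabled: a CAPTCHA may need a human.')
```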

tests/conftest.py

Lines changed: 11 additions & 1 deletion

```diff
@@ -71,6 +71,16 @@ def setup(attr_list: list, first_job_id: int = 0):
         return per_id_job_list
     return setup

+@pytest.fixture()
+def get_number_of_pages_mock():
+    def setup(mock_soup, mock_max):
+        """
+        mock get_number_of_pages to ensure we only scrape no more than 1 page.
+        """
+        return 1
+    return setup
+
+

 @ pytest.fixture()
 def init_scraper(configure_options):
@@ -90,7 +100,7 @@ def setup(provider: str, options: list = ['']):
     return setup


-@pytest.fixture
+@pytest.fixture()
 def setup_scraper(init_scraper):
     def setup(scraper: str):
         """
```

tests/test_indeed.py

Lines changed: 11 additions & 17 deletions

```diff
@@ -6,14 +6,6 @@
 from .conftest import search_term_configs


-# I think I should move this fixture to conftest
-def get_number_of_pages_mock(mock_soup, mock_max):
-    """
-    mock get_number_of_pages to ensure we only scrape no more than 1 page.
-    """
-    return 1
-
-
 #test the correctness of search_tems since our tests depend on it

 def test_search_terms(init_scraper):
@@ -24,8 +16,6 @@ def test_search_terms(init_scraper):
 @pytest.mark.parametrize('search_terms_config', search_term_configs)
 class TestClass():

-
-
     def test_convert_radius(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -38,6 +28,7 @@ def test_convert_radius(self, init_scraper, search_terms_config):
         assert 50 == provider.convert_radius(75)
         assert 100 == provider.convert_radius(300)

+
     def test_get_search_url(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -49,6 +40,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
         with pytest.raises(NotImplementedError) as e:
             provider.get_search_url('post')

+
     def test_get_number_of_pages(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -62,6 +54,7 @@ def test_get_number_of_pages(self, init_scraper, search_terms_config):
         soup_base = BeautifulSoup(request_html.text, provider.bs4_parser)
         assert provider.get_number_of_pages(soup_base, max=3) <= 3

+
     def test_search_page_for_job_soups(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -77,7 +70,6 @@ def test_search_page_for_job_soups(self, init_scraper, search_terms_config):

     # test the process of fetching title data from a job

-
     def test_get_title(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -97,7 +89,6 @@ def test_get_title(self, setup_scraper, search_terms_config):

     # test the process of fetching company data from a job

-
     def test_get_company(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -117,7 +108,6 @@ def test_get_company(self, setup_scraper, search_terms_config):

     # test the process of fetching location data from a job

-
     def test_get_location(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -137,7 +127,6 @@ def test_get_location(self, setup_scraper, search_terms_config):

     # test the process of fetching date data from a job

-
     def test_get_date(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -154,8 +143,8 @@ def test_get_date(self, setup_scraper, search_terms_config):
             return
         assert False

-    #TODO: Have more strict tests for job id and link
-
+    # Test the id with a strict assertion because without a job id we have
+    # no job link, and without job link, we have no job to apply to
     def test_get_id(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -169,6 +158,7 @@ def test_get_id(self, setup_scraper, search_terms_config):
             assert False
         assert True

+
     # test the process of fetching the link to a job

     def test_get_link(self, setup_scraper, search_terms_config):
@@ -189,6 +179,7 @@ def test_get_link(self, setup_scraper, search_terms_config):

         assert False

+
     # test the process of fetching the blurb from a job

     def test_get_blurb_with_delay(self, setup_scraper, search_terms_config):
@@ -215,6 +206,7 @@ def test_get_blurb_with_delay(self, setup_scraper, search_terms_config):

         assert False

+

     def test_search_joblink_for_blurb(self, setup_scraper, search_terms_config):
         """
@@ -240,9 +232,11 @@ def test_search_joblink_for_blurb(self, setup_scraper, search_terms_config):

         assert False

+
     # Test the entire integration

-    def test_scrape(self, init_scraper, monkeypatch, search_terms_config):
+    def test_scrape(self, init_scraper, monkeypatch,
+                    search_terms_config, get_number_of_pages_mock):
         # ensure that we don't scrape more than one page
         monkeypatch.setattr(
             Indeed, 'get_number_of_pages', get_number_of_pages_mock)
```
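The new comment above `test_get_id` replaces the old TODO about stricter id tests. As a rough illustration of what a strict shape check could look like, the 16-character lowercase-hex pattern below is an assumed Indeed id format, not something this commit specifies:

```python
import re

# Hypothetical helper: a strict shape check for an Indeed-style job id.
# The 16-hex-digit pattern is an assumption for illustration only.
def is_strict_job_id(job_id: str) -> bool:
    return re.fullmatch(r'[0-9a-f]{16}', job_id) is not None

assert is_strict_job_id('3f9e2b1a7c4d5e6f')
assert not is_strict_job_id('')  # no id means no link, hence no job to apply to
```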

tests/test_tools.py

Lines changed: 2 additions & 6 deletions

```diff
@@ -70,7 +70,6 @@

 # test clean/dirty characters that may be on title and blurb fields

-
 def test_filter_non_printables_clean_title(job_listings):
     job_list = job_listings(attr_list[0:1])
     filter_non_printables(job_list[0])
@@ -96,7 +95,6 @@ def test_filter_non_printables_diryt_blurb(job_listings):

 # test job_listing dates with all possible formats

-
 def test_post_date_from_relative_post_age_just_posted_pass(job_listings):
     job_list = job_listings(attr_list[4:5])
     post_date_from_relative_post_age(job_list)
@@ -114,17 +112,15 @@ def test_post_date_from_relative_post_age_1_hour_ago_pass(job_listings):
     post_date_from_relative_post_age(job_list)
     now = datetime.now()
     assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
-        (now - timedelta(days=int(1))
-         ).strftime('%Y-%m-%d') == job_list[0]['date']
+        (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']


 def test_post_date_from_relative_post_age_2_hours_ago_pass(job_listings):
     job_list = job_listings(attr_list[7:8])
     post_date_from_relative_post_age(job_list)
     now = datetime.now()
     assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
-        (now - timedelta(days=int(1))
-         ).strftime('%Y-%m-%d') == job_list[0]['date']
+        (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']


 def test_post_date_from_relative_ago_post_age_yesterday_ago_pass(job_listings):
```
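The two-sided assertion that the reflowed lines above preserve exists because a relative age like "2 hours ago" can cross midnight between posting and scraping; a minimal sketch of that boundary case:

```python
from datetime import datetime, timedelta

# A job posted "2 hours ago" but checked shortly after midnight resolves to
# yesterday's calendar date, which is why the test accepts either day.
now = datetime(2020, 1, 15, 1, 0)    # hypothetical clock: 01:00
posted = now - timedelta(hours=2)    # 23:00 on the previous day
assert posted.strftime('%Y-%m-%d') == \
    (now - timedelta(days=1)).strftime('%Y-%m-%d')
```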

tests/test_validate.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -28,7 +28,6 @@

 # test all paths with invalid values

-
 def test_filter_list_path_fail(configure_options):
     path_configs = config_factory(
         configure_options(['']), attr_list[12: 13])[0]
```
