Commit b30b284

Merge pull request PaulMcInnis#75 from thebigG/testing
Dynamic web scraping and static web scraping
2 parents f7e10d3 + df8b8a8 commit b30b284

6 files changed: +20 -27 lines changed
jobfunnel/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = '2.1.7'
+__version__ = '2.1.8'
jobfunnel/glassdoor_dynamic.py

Lines changed: 9 additions & 17 deletions

@@ -121,21 +121,15 @@ def scrape(self):

         # get the html data, initialize bs4 with lxml
         self.driver.get(search)
-        print(
-            "It's very likely that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA: "
-            '\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n'
-            ' 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue'
-        )
-        # wait for user to complete CAPTCHA
-        input()

         # create the soup base
         soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
-        num_res = soup_base.find('p', attrs={'class', 'jobsCount'})
-        while num_res is None:
-            print(
-                'Looks like something went wrong. \nMake sure you complete the CAPTCHA in the new browser window that just popped up. Try refreshing the page and attempt to complete the CAPTCHA again. '
-            )
+        num_res = soup_base.find('p', attrs={
+            'class', 'jobsCount'})
+        while(num_res is None):
+            print("It looks like that Glassdoor might require you to fill out a CAPTCHA form. Follow these steps if it does ask you to complete a CAPTCHA:"
+                  "\n 1.Refresh the glassdoor site in the new browser window that just popped up.\n" " 2.Then complete the CAPTCHA in the browser.\n 3.Press Enter to continue")
+            # wait for user to complete CAPTCHA
             input()
             soup_base = BeautifulSoup(self.driver.page_source, self.bs4_parser)
             num_res = soup_base.find('p', attrs={'class', 'jobsCount'})

@@ -221,11 +215,9 @@ def scrape(self):
                 job['tags'] = ''

                 try:
-                    job['date'] = (
-                        s.find('div', attrs={'class', 'jobLabels'})
-                        .find('span', attrs={'class', 'jobLabel nowrap'})
-                        .text.strip()
-                    )
+                    # dynamic way of fetching date
+                    job['date'] = s.find('div', attrs={
+                        'class', 'd-flex align-items-end pl-std minor css-65p68w'}).text.strip()
                 except AttributeError:
                     job['date'] = ''
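
For readers skimming the diff: the reworked flow loads the search page, parses it, and only blocks on user input while the jobsCount element is missing, i.e. while Glassdoor is presumably serving a CAPTCHA page. A minimal standalone sketch of that pattern (assumptions: it runs outside the scraper class, so get_webdriver() and a literal 'lxml' parser stand in for self.driver and self.bs4_parser, and the URL is a placeholder rather than the scraper's real query):

```python
# Sketch of the CAPTCHA-retry pattern from the hunk above (placeholder URL,
# literal 'lxml' parser instead of self.bs4_parser).
from bs4 import BeautifulSoup
from jobfunnel.tools.tools import get_webdriver

driver = get_webdriver()
driver.get('https://www.glassdoor.com/Job/jobs.htm')  # placeholder search URL

soup_base = BeautifulSoup(driver.page_source, 'lxml')
num_res = soup_base.find('p', attrs={'class': 'jobsCount'})

# Prompt for a manual CAPTCHA only while the job-count element is missing;
# once it appears, Glassdoor has served the real results page.
while num_res is None:
    print("Glassdoor may be asking for a CAPTCHA:\n"
          " 1. Refresh the Glassdoor page in the browser window that opened.\n"
          " 2. Complete the CAPTCHA in that browser.\n"
          " 3. Press Enter here to continue.")
    input()
    soup_base = BeautifulSoup(driver.page_source, 'lxml')
    num_res = soup_base.find('p', attrs={'class': 'jobsCount'})
```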

jobfunnel/tools/filters.py

Lines changed: 7 additions & 7 deletions

@@ -14,19 +14,19 @@ def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
         cur_dict: today's job scrape dict
         number_of_days: how many days old a job can be
     """
-    if number_of_days<0 or cur_dict is None:
+    if number_of_days < 0 or cur_dict is None:
         return
     print("date_filter running")
     cur_job_ids = [job['id'] for job in cur_dict.values()]
-    #calculate the oldest date a job can be
-    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    # calculate the oldest date a job can be
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
     for job_id in cur_job_ids:
-        #get the date from job with job_id
+        # get the date from job with job_id
         job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
-        #if this job is older than threshold_date, delete it from current scrape
-        if job_date<threshold_date:
+        # if this job is older than threshold_date, delete it from current scrape
+        if job_date < threshold_date:
             logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter because"
-                f" it is older than {number_of_days} days")
+                         f" it is older than {number_of_days} days")
             del cur_dict[job_id]
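
The date_filter changes above are cosmetic (spacing and comment style), but the function's contract is worth spelling out: it mutates the scrape dict in place and deletes every job whose 'date' (in %Y-%m-%d form) is older than number_of_days. A hedged usage sketch with fabricated jobs; the dict shape mirrors only the fields the diff touches (keyed by id, with 'id', 'date', and 'link'):

```python
# Fabricated example jobs; only the fields date_filter reads are filled in.
from datetime import datetime, timedelta

from jobfunnel.tools.filters import date_filter

today = datetime.now().strftime('%Y-%m-%d')
last_month = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')

scrape = {
    'fresh1': {'id': 'fresh1', 'date': today, 'link': 'https://example.com/fresh1'},
    'stale1': {'id': 'stale1', 'date': last_month, 'link': 'https://example.com/stale1'},
}

date_filter(scrape, 14)  # drops anything older than 14 days, in place
print(list(scrape))      # -> ['fresh1']
```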

jobfunnel/tools/tools.py

Lines changed: 2 additions & 1 deletion

@@ -79,6 +79,7 @@ def post_date_from_relative_post_age(job_list):
             logging.error(f"unknown date for job {job['id']}")
         # format date in standard format e.g. 2020-01-01
         job['date'] = post_date.strftime('%Y-%m-%d')
+        # print('job['date']'')


 def split_url(url):

@@ -155,7 +156,7 @@ def get_webdriver():
             executable_path=GeckoDriverManager().install())
     except Exception:
         try:
-            webdriver.Chrome(ChromeDriverManager().install())
+            driver = webdriver.Chrome(ChromeDriverManager().install())
         except Exception:
             try:
                 driver = webdriver.Ie(IEDriverManager().install())
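
The small-looking fix in get_webdriver matters: the Chrome branch used to build a driver and then discard it, so on a Chrome-only machine the function fell through without a usable driver. The fallback pattern, sketched on its own (only the Firefox and Chrome legs are shown, get_any_webdriver is a made-up name, and the real get_webdriver in tools.py, which also tries IE, is what callers should use):

```python
# Standalone sketch of the try-each-browser fallback; not the project's
# get_webdriver(), just the same idea under a hypothetical name.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager


def get_any_webdriver():
    """Return the first webdriver that can be created on this machine."""
    try:
        driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install())
    except Exception:
        # keep the assignment: dropping it (as the old Chrome branch did)
        # silently discards the driver that was just created
        driver = webdriver.Chrome(ChromeDriverManager().install())
    return driver
```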

readme.md

Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
 For more information see the [crontab document][cron_doc].

-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor requires a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you need to be **physically present** to complete the Glassdoor CAPTCHA.
+**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.

 You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
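
The settings.yaml example the readme points to is cut off by the hunk boundary above. For unattended cron runs where nobody can answer a CAPTCHA, one hedged way to drop Glassdoor programmatically before a run is sketched below; the 'providers' key name is an assumption about the settings schema, so verify it against your own settings.yaml:

```python
# Hedged sketch: remove Glassdoor from a JobFunnel settings file before an
# unattended run. The 'providers' key is an assumed schema detail; check it
# against your own settings.yaml before relying on this.
import yaml  # PyYAML

with open('settings.yaml') as f:
    settings = yaml.safe_load(f)

settings['providers'] = [
    p for p in settings.get('providers', []) if p.lower() != 'glassdoor'
]

with open('settings.yaml', 'w') as f:
    yaml.safe_dump(settings, f)
```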

tests/test_glassdoor.py

Whitespace-only changes.
