Commit 72faea2
Feature/updating indeed scraper (PaulMcInnis#166) (PaulMcInnis#170)
* Feature/updating indeed scraper (PaulMcInnis#166)
* - Updated to mobile endpoints and user agents to prevent CAPTCHA
  - Updated parsing of the Indeed scraper
  - Fixed tags not being parsed correctly
  - Fixed remoteness not being parsed correctly
  - Changed to scrape only the first page of each search by default, for speed
* - Updated method of loading user-agent files
  - Updated the user-agent file of the Indeed scraper
* - Updated versions in requirements.txt
  - Added a Black configuration file for formatting
  - Added a pre-commit hook so all contributors will have consistent formatting on upload
  - Updated all Python files to conform to the Black formatter
* Updated Python version
* More Black formatting updates
* - Added .prettierrc and .prettierignore
  - Formatted all files other than Python
* Updated .prettierignore so Prettier can search through subdirectories
* Reset formatting to a longer line width
* Reverted to the previous commit
* Updated again to the longer line width after accounting for missing files
* Updated the .prettierrc and .prettierignore files and reran formatting
* Updated version
* - Reverted Markdown changes
  - Reverted settings_USA changes
  - Updated the readme
  - Removed an extra user agent from the phone user-agents list
  - Removed extra comments
* Changed the readme to refer to Python 3.11 instead of 3.8, and added the mobile user-agent list to MANIFEST.in
1 parent edf149b commit 72faea2

30 files changed: +286 additions, -74 deletions
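The headline change is switching the Indeed scraper to mobile endpoints with matching mobile user agents, which the commit message credits with avoiding CAPTCHA challenges. A minimal sketch of the idea, assuming the user-agent file added to MANIFEST.in below and a hypothetical mobile search URL (the real scraper builds its own requests):

import random

import requests

# One mobile user agent per line; this path matches the file added to
# MANIFEST.in in this commit.
with open("jobfunnel/resources/user_agent_list_mobile.txt") as f:
    mobile_user_agents = [line.strip() for line in f if line.strip()]

# Hypothetical mobile endpoint and query; presenting a phone user agent
# steers Indeed to the mobile site instead of the desktop CAPTCHA wall.
response = requests.get(
    "https://www.indeed.com/m/jobs?q=python",
    headers={"User-Agent": random.choice(mobile_user_agents)},
)
print(response.status_code)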

.pre-commit-config.yaml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
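With this configuration committed, contributors enable the hooks locally by running `pip install pre-commit` and then `pre-commit install` in the repository root; from then on Black and Prettier run automatically on each `git commit`, and `pre-commit run --all-files` reformats the whole tree on demand (standard pre-commit usage, not shown in this commit).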

MANIFEST.in

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 include jobfunnel/demo/settings.yaml
 include jobfunnel/demo/demo.png
 include jobfunnel/resources/user_agent_list.txt
+include jobfunnel/resources/user_agent_list_mobile.txt
 include readme.md
 include LICENSE

demo/settings_USA.yaml

Lines changed: 0 additions & 1 deletion
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER

   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.

jobfunnel/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 """JobFunnel base package init, we keep module version here.
 """
-__version__ = "3.0.2"
+
+__version__ = "4.0.0"

jobfunnel/backend/job.py

Lines changed: 35 additions & 5 deletions
@@ -1,6 +1,7 @@
 """Base Job class to be populated by Scrapers, manipulated by Filters and saved
 to csv / etc by Exporter
 """
+
 from copy import deepcopy
 from datetime import date, datetime
 from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+
             return True
         else:
             return False
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
             "title": self.title,
             "company": self.company,
             "post_date": self.post_date.strftime("%Y-%m-%d"),
-            "description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
-            if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
-            else (self.description),
+            "description": (
+                (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
+                if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
+                else (self.description)
+            ),
             "status": self.status.name,
         }
@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
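One behavioural note on update_if_newer: relaxing the comparison from > to >= means a posting re-scraped with the same post date now refreshes the stored job instead of being discarded. A minimal sketch of just the comparison, with made-up dates:

from datetime import datetime

stored_post_date = datetime(2024, 9, 1)    # what the master list already holds
incoming_post_date = datetime(2024, 9, 1)  # same-day re-scrape

# Old check: strictly newer only, so same-day updates were skipped.
print(incoming_post_date > stored_post_date)   # False -> stale fields kept

# New check: same-day posts also trigger the update.
print(incoming_post_date >= stored_post_date)  # True -> fields refreshed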

jobfunnel/backend/jobfunnel.py

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 """Scrapes jobs, applies search filters and writes pickles to master list
 Paul McInnis 2020
 """
+
 import csv
 import json
 import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )

             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
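The logging change above is small but worth noting: the handler now reports the exception text rather than only the scraper class name. A self-contained sketch of the same pattern, with an invented scraper and failure:

import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("jobfunnel.demo")  # hypothetical logger name

class FailingScraper:  # stand-in for a real scraper class
    def scrape(self):
        raise RuntimeError("HTTP 403: blocked by CAPTCHA")

scraper = FailingScraper()
try:
    incoming_jobs_dict = scraper.scrape()
except Exception as e:
    # Appending ': {e}' surfaces the cause, e.g.
    # "Failed to scrape jobs for FailingScraper: HTTP 403: blocked by CAPTCHA"
    logger.error(f"Failed to scrape jobs for {type(scraper).__name__}: {e}")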

jobfunnel/backend/scrapers/base.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 """The base scraper class to be used for all web-scraping emitting Job objects
 Paul McInnis 2020
 """
+
 import random
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed

jobfunnel/backend/scrapers/glassdoor.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 """Scraper for www.glassdoor.X
 FIXME: this is currently unable to get past page 1 of job results.
 """
+
 import re
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor, wait
