Skip to content

Commit a36fbe0

Browse files
marchbnrPaulMcInnis
authored and committed
Add a scraper for german indeed
1 parent 05e7712 commit a36fbe0

File tree

6 files changed

+152
-4
lines changed

6 files changed

+152
-4
lines changed

demo/settings_DE.yaml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# This is an example of a feature-complete JobFunnel configuration YAML.
2+
# Try this out by simply running: "funnel load -s demo/settings_DE.yaml"
3+
4+
# Path where your master CSV, block-lists, and cache data will be stored
5+
# NOTE: we create any missing directories in these filepaths
6+
master_csv_file: demo_job_search_results/demo_search.csv
7+
cache_folder: demo_job_search_results/cache
8+
block_list_file: demo_job_search_results/demo_block_list.json
9+
duplicates_list_file: demo_job_search_results/demo_duplicates_list.json
10+
log_file: demo_job_search_results/log.log
11+
12+
# Job search configuration
13+
search:
14+
15+
# Locale settings, one of USA_ENGLISH, CANADA_ENGLISH, CANADA_FRENCH, UK_ENGLISH, FRANCE_FRENCH, GERMANY_GERMAN:
16+
# This tells JobFunnel where the website we are scraping is located, and
17+
# what language the contents are in.
18+
locale: GERMANY_GERMAN
19+
20+
# Job providers which we will search, one of INDEED, MONSTER, GLASSDOOR:
21+
# NOTE: we choose domain via locale (e.g. CANADA_ENGLISH -> www.indeed.ca)
22+
# FIXME: we need to add back GLASSDOOR when that's working again.
23+
providers:
24+
- INDEED
25+
26+
# Region that we are searching for jobs within:
27+
province_or_state: "108" # NOTE: this is generally 2 characters long.
28+
city: "Berlin" # NOTE: this is the full city / town name.
29+
radius: 0 # km (NOTE: if we were in locale: USA_ENGLISH it's in miles)
30+
31+
# These are the terms you would be typing into the website's search field:
32+
keywords:
33+
- community-manager
34+
35+
# Don't return any listings older than this:
36+
max_listing_days: 10
37+
38+
# Blocked company names that will never appear in any results:
39+
company_block_list:
40+
- "DFKI"
41+
42+
# The desired level of work-remoteness (i.e. IN_PERSON, FULLY_REMOTE, ANY,
43+
# TEMPORARILY_REMOTE, PARTIALLY_REMOTE)
44+
remoteness: ANY
45+
46+
# Logging level options are: critical, error, warning, info, debug, notset
47+
#log_level: INFO
48+
log_level: DEBUG
49+
50+
# Delaying algorithm configuration
51+
delay:
52+
# Functions used for delaying algorithm: CONSTANT, LINEAR, SIGMOID
53+
algorithm: LINEAR
54+
# Maximum delay/upper bound for converging random delay
55+
max_duration: 5.0
56+
# Minimum delay/lower bound for random delay
57+
min_duration: 1.0
58+
# Random delay
59+
random: True
60+
# Converging random delay, only used if 'random' is set to True
61+
converging: False
62+
63+
# # Proxy settings
64+
# proxy:
65+
# protocol: https # NOTE: you can also set to 'http'
66+
# ip: "1.1.1.1"
67+
# port: 200

jobfunnel/backend/scrapers/base.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,4 +459,12 @@ class BaseFRFreScraper(BaseScraper):
459459
"""
460460
@property
461461
def locale(self) -> Locale:
462-
return Locale.FRANCE_FRENCH
462+
return Locale.FRANCE_FRENCH
463+
464+
465+
class BaseDEGerScraper(BaseScraper):
    """Base scraper bound to the Germany / German locale."""

    @property
    def locale(self) -> Locale:
        """Return the locale served by this scraper: Locale.GERMANY_GERMAN."""
        return Locale.GERMANY_GERMAN

jobfunnel/backend/scrapers/indeed.py

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
1414
BaseUSAEngScraper,
1515
BaseUKEngScraper,
16-
BaseFRFreScraper)
16+
BaseFRFreScraper,
17+
BaseDEGerScraper)
1718
from jobfunnel.backend.tools.filters import JobFilter
1819
from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
1920
from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Remoteness
@@ -419,4 +420,72 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
419420
elif number_of_pages < max_pages:
420421
return number_of_pages
421422
else:
422-
return max_pages
423+
return max_pages
424+
425+
426+
class IndeedScraperDEGer(BaseIndeedScraper, BaseDEGerScraper):
    """Scrapes jobs from de.indeed.com
    """

    # The German locale groups thousands with "." (e.g. "1.234"), so this
    # separator is stripped before the result count is parsed as an integer.
    THOUSEP = "."

    def _get_search_url(self, method: Optional[str] = 'get') -> str:
        """Get the indeed search url from SearchTerms

        Args:
            method: the html method the URL is built for. Only 'get' is
                implemented.
        Returns:
            The search URL built from the configured domain, query, city,
            radius, results-per-page, similar-results flag and remoteness.
        Raises:
            NotImplementedError: if method is 'post'.
            ValueError: if method is neither 'get' nor 'post'.
        """
        if method == 'get':
            search_config = self.config.search_config
            # The URL is different to the base scraper because indeed.de is
            # redirecting to de.indeed.com. If the redirect is handled the
            # same URLs can be used.
            return (
                "https://{}.indeed.com/jobs?q={}&l={}&radius={}&"
                "limit={}&filter={}{}".format(
                    search_config.domain,
                    self.query,
                    search_config.city.replace(' ', '+'),
                    self._quantize_radius(search_config.radius),
                    self.max_results_per_page,
                    int(search_config.return_similar_results),
                    REMOTENESS_TO_QUERY[search_config.remoteness],
                )
            )
        elif method == 'post':
            raise NotImplementedError()
        else:
            raise ValueError(f'No html method {method} exists')

    def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
        """Calculates the number of pages of job listings to be scraped.

        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs

        Args:
            search_url: the URL of the first search results page.
            max_pages: the maximum number of pages to be scraped, 0 means
                no limit.
        Returns:
            The number of pages to be scraped.
        Raises:
            ValueError: if the result-count element is missing from the page
                or the total number of results cannot be parsed from it.
        """
        # Get the html data, initialize bs4 with the configured parser
        request_html = self.session.get(search_url)
        self.logger.debug(
            "Got Base search results page: %s", search_url
        )
        query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser)
        num_res = query_resp.find(id='searchCountPages')
        if not num_res:
            raise ValueError(
                "Unable to identify number of pages of results for query: {}"
                " Please ensure linked page contains results, you may have"
                " provided a city for which there are no results within this"
                " province or state.".format(search_url)
            )

        count_text = num_res.contents[0].strip()
        # Drop the thousands separator first so "1.234" is read as one number.
        numbers = re.findall(r'(\d+)', count_text.replace(self.THOUSEP, ''))
        # The second number in the text is taken as the total result count
        # (presumably the first is the current page index -- TODO confirm
        # against the live page wording). Fail with a clear message instead
        # of an opaque IndexError when the text has an unexpected shape.
        if len(numbers) < 2:
            raise ValueError(
                "Unable to parse number of results from '{}' for query: {}"
                .format(count_text, search_url)
            )
        total_results = int(numbers[1])
        number_of_pages = int(ceil(total_results / self.max_results_per_page))
        if max_pages == 0:
            return number_of_pages
        # Never report more pages than the caller allows.
        return min(number_of_pages, max_pages)

jobfunnel/backend/scrapers/registry.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88
from jobfunnel.backend.scrapers.indeed import (
99
IndeedScraperCANEng, IndeedScraperUSAEng,
10-
IndeedScraperUKEng, IndeedScraperFRFre
10+
IndeedScraperUKEng, IndeedScraperFRFre,
11+
IndeedScraperDEGer
1112
)
1213
from jobfunnel.backend.scrapers.monster import (
1314
MonsterScraperCANEng, MonsterScraperUSAEng,
@@ -25,6 +26,7 @@
2526
Locale.USA_ENGLISH: IndeedScraperUSAEng,
2627
Locale.UK_ENGLISH: IndeedScraperUKEng,
2728
Locale.FRANCE_FRENCH: IndeedScraperFRFre,
29+
Locale.GERMANY_GERMAN: IndeedScraperDEGer,
2830
},
2931
Provider.GLASSDOOR: {
3032
Locale.CANADA_ENGLISH: GlassDoorScraperCANEng,

jobfunnel/resources/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,5 @@
3232
Locale.USA_ENGLISH: 'com',
3333
Locale.UK_ENGLISH: 'co.uk',
3434
Locale.FRANCE_FRENCH: 'fr',
35+
Locale.GERMANY_GERMAN: 'de',
3536
}

jobfunnel/resources/enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class Locale(Enum):
1313
USA_ENGLISH = 3
1414
UK_ENGLISH = 4
1515
FRANCE_FRENCH = 5
16+
GERMANY_GERMAN = 6
1617

1718

1819
class JobStatus(Enum):

0 commit comments

Comments
 (0)