1313from jobfunnel .backend .scrapers .base import (BaseCANEngScraper , BaseScraper ,
1414 BaseUSAEngScraper ,
1515 BaseUKEngScraper ,
16- BaseFRFreScraper )
16+ BaseFRFreScraper ,
17+ BaseDEGerScraper )
1718from jobfunnel .backend .tools .filters import JobFilter
1819from jobfunnel .backend .tools .tools import calc_post_date_from_relative_str
1920from jobfunnel .resources import MAX_CPU_WORKERS , JobField , Remoteness
@@ -419,4 +420,72 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
419420 elif number_of_pages < max_pages :
420421 return number_of_pages
421422 else :
422- return max_pages
423+ return max_pages
424+
425+
class IndeedScraperDEGer(BaseIndeedScraper, BaseDEGerScraper):
    """Scrapes jobs from de.indeed.com
    """

    # German-formatted numbers use '.' as the thousands separator, so this
    # character is stripped from the result-count text before parsing digits.
    THOUSEP = "."

    def _get_search_url(self, method: Optional[str] = 'get') -> str:
        """Get the indeed search url from SearchTerms

        Args:
            method: the HTTP method the URL is built for. Only 'get' is
                implemented.
        Returns:
            The search URL assembled from the search configuration.
        Raises:
            NotImplementedError: if method is 'post'.
            ValueError: if method is neither 'get' nor 'post'.
        """
        if method == 'get':
            search_config = self.config.search_config
            return (
                # The URL is different to the base scraper because indeed.de
                # is redirecting to de.indeed.com. If the redirect is handled
                # the same URLs can be used.
                "https://{}.indeed.com/jobs?q={}&l={}&radius={}&"
                "limit={}&filter={}{}".format(
                    search_config.domain,
                    self.query,
                    search_config.city.replace(' ', '+'),
                    self._quantize_radius(search_config.radius),
                    self.max_results_per_page,
                    int(search_config.return_similar_results),
                    REMOTENESS_TO_QUERY[search_config.remoteness],
                )
            )
        elif method == 'post':
            raise NotImplementedError()
        else:
            raise ValueError(f'No html method {method} exists')

    def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
        """Calculates the number of pages of job listings to be scraped.

        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs

        Args:
            search_url: the URL of the search results page to inspect.
            max_pages: the maximum number of pages to be scraped. A value of
                0 means no limit is applied.
        Returns:
            The number of pages to be scraped.
        Raises:
            ValueError: if the result-count element is missing from the page,
                or if its text does not contain the expected numbers.
        """
        # Get the html data, initialize bs4 with the configured parser
        request_html = self.session.get(search_url)
        self.logger.debug(
            "Got Base search results page: %s", search_url
        )
        query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser)
        count_element = query_resp.find(id='searchCountPages')
        if not count_element:
            raise ValueError(
                "Unable to identify number of pages of results for query: {}"
                " Please ensure linked page contains results, you may have"
                " provided a city for which there are no results within this"
                " province or state.".format(search_url)
            )

        count_text = count_element.contents[0].strip()
        # Drop the thousands separator so a grouped number is read as one
        # integer rather than as several digit groups.
        numbers = re.findall(r'(\d+)', count_text.replace(self.THOUSEP, ''))
        # The second number in the text is used as the total result count
        # (presumably text like "Seite 1 von 230 Jobs" -- TODO confirm).
        # Fail with a descriptive error rather than a bare IndexError when
        # the text does not have that shape.
        if len(numbers) < 2:
            raise ValueError(
                "Unable to parse the number of results from {!r} for query: "
                "{}".format(count_text, search_url)
            )
        total_results = int(numbers[1])

        number_of_pages = int(ceil(total_results / self.max_results_per_page))
        if max_pages == 0:
            return number_of_pages
        return min(number_of_pages, max_pages)
0 commit comments