Add a scraper for german indeed

marchbnr · PaulMcInnis · commit a36fbe0bdf94 · 2021-09-21T11:21:31.000-04:00
diff --git a/demo/settings_DE.yaml b/demo/settings_DE.yaml
@@ -0,0 +1,67 @@
+# This is an example of a feature-complete JobFunnel configuration YAML.
+# Try this out by simply running: "funnel load -s demo/settings_DE.yaml"
+
+# Path where your master CSV, block-lists, and cache data will be stored
+# NOTE: we create any missing directories in these filepaths
+master_csv_file: demo_job_search_results/demo_search.csv
+cache_folder: demo_job_search_results/cache
+block_list_file: demo_job_search_results/demo_block_list.json
+duplicates_list_file: demo_job_search_results/demo_duplicates_list.json
+log_file: demo_job_search_results/log.log
+
+# Job search configuration
+search:
+
+  # Locale settings, e.g. USA_ENGLISH, CANADA_ENGLISH, CANADA_FRENCH, GERMANY_GERMAN:
+  # This tells JobFunnel where the website we are scraping is located, and
+  # what language the contents are in.
+  locale: GERMANY_GERMAN
+
+  # Job providers which we will search, one of INDEED, MONSTER, GLASSDOOR:
+  # NOTE: we choose domain via locale (e.g. GERMANY_GERMAN -> de.indeed.com)
+  # FIXME: we need to add back GLASSDOOR when that's working again.
+  providers:
+    - INDEED
+
+  # Region that we are searching for jobs within:
+  province_or_state: "108"  # NOTE: generally 2 characters elsewhere; the German Indeed search URL only uses the city.
+  city: "Berlin"  # NOTE: this is the full city / town name.
+  radius: 0  # km (NOTE: if we were in locale: USA_ENGLISH it's in miles)
+
+  # These are the terms you would be typing into the website's search field:
+  keywords:
+    - community-manager
+
+  # Don't return any listings older than this:
+  max_listing_days: 10
+
+  # Blocked company names that will never appear in any results:
+  company_block_list:
+    - "DFKI"
+
+  # The desired level of work-remoteness (i.e. IN_PERSON, FULLY_REMOTE, ANY,
+  # TEMPORARILY_REMOTE, PARTIALLY_REMOTE)
+  remoteness: ANY
+
+# Logging level options are: critical, error, warning, info, debug, notset
+#log_level: INFO
+log_level: DEBUG
+
+# Delaying algorithm configuration
+delay:
+    # Functions used for delaying algorithm: CONSTANT, LINEAR, SIGMOID
+    algorithm: LINEAR
+    # Maximum delay/upper bound for converging random delay
+    max_duration: 5.0
+    # Minimum delay/lower bound for random delay
+    min_duration: 1.0
+    # Random delay
+    random: True
+    # Converging random delay, only used if 'random' is set to True
+    converging: False
+
+# # Proxy settings
+# proxy:
+#   protocol: https  # NOTE: you can also set to 'http'
+#   ip: "1.1.1.1"
+#   port: 200
diff --git a/jobfunnel/backend/scrapers/base.py b/jobfunnel/backend/scrapers/base.py
@@ -459,4 +459,12 @@ class BaseFRFreScraper(BaseScraper):
     """
     @property
     def locale(self) -> Locale:
-        return Locale.FRANCE_FRENCH
+        return Locale.FRANCE_FRENCH
+
+
+class BaseDEGerScraper(BaseScraper):
+    """Localized scraper base whose locale is Germany / German
+    """
+    @property
+    def locale(self) -> Locale:
+        return Locale.GERMANY_GERMAN
diff --git a/jobfunnel/backend/scrapers/indeed.py b/jobfunnel/backend/scrapers/indeed.py
@@ -13,7 +13,8 @@
 from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
                                              BaseUSAEngScraper,
                                              BaseUKEngScraper,
-                                             BaseFRFreScraper)
+                                             BaseFRFreScraper,
+                                             BaseDEGerScraper)
 from jobfunnel.backend.tools.filters import JobFilter
 from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
 from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Remoteness
@@ -419,4 +420,72 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
         elif number_of_pages < max_pages:
             return number_of_pages
         else:
-            return max_pages
+            return max_pages
+
+
+class IndeedScraperDEGer(BaseIndeedScraper, BaseDEGerScraper):
+    """Scrapes jobs from de.indeed.com
+    """
+
+    # The german locale has a different number separators.
+    THOUSEP = "."
+
+    def _get_search_url(self, method: Optional[str] = 'get') -> str:
+        """Get the indeed search url from SearchTerms
+        """
+        if method == 'get':
+            return (
+                # The URL is different to the base scraper because indeed.de is
+                # redirecting to de.indeed.com. If the redirect is handled the
+                # same URLs can be used.
+                "https://{}.indeed.com/jobs?q={}&l={}&radius={}&"
+                "limit={}&filter={}{}".format(
+                    self.config.search_config.domain,
+                    self.query,
+                    self.config.search_config.city.replace(' ', '+',),
+                    self._quantize_radius(self.config.search_config.radius),
+                    self.max_results_per_page,
+                    int(self.config.search_config.return_similar_results),
+                    REMOTENESS_TO_QUERY[self.config.search_config.remoteness],
+                )
+            )
+        elif method == 'post':
+            raise NotImplementedError()
+        else:
+            raise ValueError(f'No html method {method} exists')
+
+
+    def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
+        """Calculates the number of pages of job listings to be scraped.
+
+        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs
+
+        Args:
+            max_pages: the maximum number of pages to be scraped.
+        Returns:
+            The number of pages to be scraped.
+        """
+        # Get the html data, initialize bs4 with lxml
+        request_html = self.session.get(search_url)
+        self.logger.debug(
+            "Got Base search results page: %s", search_url
+        )
+        query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser)
+        num_res = query_resp.find(id='searchCountPages')
+        if not num_res:
+            raise ValueError(
+                "Unable to identify number of pages of results for query: {}"
+                " Please ensure linked page contains results, you may have"
+                " provided a city for which there are no results within this"
+                " province or state.".format(search_url)
+            )
+
+        num_res = num_res.contents[0].strip()
+        num_res = int(re.findall(r'(\d+)', num_res.replace(self.THOUSEP, ''))[1])
+        number_of_pages = int(ceil(num_res / self.max_results_per_page))
+        if max_pages == 0:
+            return number_of_pages
+        elif number_of_pages < max_pages:
+            return number_of_pages
+        else:
+            return max_pages
diff --git a/jobfunnel/backend/scrapers/registry.py b/jobfunnel/backend/scrapers/registry.py
@@ -7,7 +7,8 @@
 
 from jobfunnel.backend.scrapers.indeed import (
     IndeedScraperCANEng, IndeedScraperUSAEng,
-    IndeedScraperUKEng, IndeedScraperFRFre
+    IndeedScraperUKEng, IndeedScraperFRFre,
+    IndeedScraperDEGer
 )
 from jobfunnel.backend.scrapers.monster import (
     MonsterScraperCANEng, MonsterScraperUSAEng,
@@ -25,6 +26,7 @@
         Locale.USA_ENGLISH: IndeedScraperUSAEng,
         Locale.UK_ENGLISH: IndeedScraperUKEng,
         Locale.FRANCE_FRENCH: IndeedScraperFRFre,
+        Locale.GERMANY_GERMAN: IndeedScraperDEGer,
     },
     Provider.GLASSDOOR: {
         Locale.CANADA_ENGLISH: GlassDoorScraperCANEng,
diff --git a/jobfunnel/resources/defaults.py b/jobfunnel/resources/defaults.py
@@ -32,4 +32,5 @@
     Locale.USA_ENGLISH: 'com',
     Locale.UK_ENGLISH: 'co.uk',
     Locale.FRANCE_FRENCH: 'fr',
+    Locale.GERMANY_GERMAN: 'de',
 }
diff --git a/jobfunnel/resources/enums.py b/jobfunnel/resources/enums.py
@@ -13,6 +13,7 @@ class Locale(Enum):
     USA_ENGLISH = 3
     UK_ENGLISH = 4
     FRANCE_FRENCH = 5
+    GERMANY_GERMAN = 6
 
 
 class JobStatus(Enum):

Original file line number	Diff line number	Diff line change
`@@ -32,4 +32,5 @@`
`32`	`32`	`Locale.USA_ENGLISH: 'com',`
`33`	`33`	`Locale.UK_ENGLISH: 'co.uk',`
`34`	`34`	`Locale.FRANCE_FRENCH: 'fr',`
	`35`	`+ Locale.GERMANY_GERMAN: 'de',`
`35`	`36`	`}`