
Commit 6ff722a

committed
- Revised code for PR
- Updated readme on GlassDoor's new dynamic functionality
- TODO: Add Docker image to JobFunnel's packages
1 parent 2698738 commit 6ff722a

File tree

7 files changed (+50, -47 lines)


jobfunnel/indeed.py

Lines changed: 10 additions & 10 deletions

All ten changed lines re-indent the `Args:` descriptions in the scraper docstrings; the wording is identical on both sides, so only the indentation differs (leading whitespace was not preserved on this page, so the indentation shown is approximate).

```diff
@@ -126,9 +126,9 @@ def get_number_of_pages(self, soup_base, max=0):
         """
         Calculates the number of pages to be scraped.
         Args:
-        soup_base: a BeautifulSoup object with the html data.
-        At the moment this method assumes that the soup_base was prepared statically.
-        max: the maximum number of pages to be scraped.
+            soup_base: a BeautifulSoup object with the html data.
+            At the moment this method assumes that the soup_base was prepared statically.
+            max: the maximum number of pages to be scraped.
         Returns:
         The number of pages to be scraped.
         If the number of pages that soup_base yields is higher than max, then max is returned.
@@ -147,7 +147,7 @@ def get_title(self, soup):
         """
         Fetches the title from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the title from.
+            soup: BeautifulSoup base to scrape the title from.
         Returns:
         The job title scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the title.
@@ -160,7 +160,7 @@ def get_company(self, soup):
         """
         Fetches the company from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the company from.
+            soup: BeautifulSoup base to scrape the company from.
         Returns:
         The company scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the company.
@@ -173,7 +173,7 @@ def get_location(self, soup):
         """
         Fetches the job location from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the location from.
+            soup: BeautifulSoup base to scrape the location from.
         Returns:
         The job location scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the location.
@@ -186,7 +186,7 @@ def get_tags(self, soup):
         """
         Fetches the job location from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the location from.
+            soup: BeautifulSoup base to scrape the location from.
         Returns:
         The job location scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the location.
@@ -201,7 +201,7 @@ def get_date(self, soup):
         """
         Fetches the job date from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the date from.
+            soup: BeautifulSoup base to scrape the date from.
         Returns:
         The job date scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the date.
@@ -214,7 +214,7 @@ def get_id(self, soup):
         """
         Fetches the job id from a BeautifulSoup base.
         Args:
-        soup: BeautifulSoup base to scrape the id from.
+            soup: BeautifulSoup base to scrape the id from.
         Returns:
         The job id scraped from soup.
         Note that this function may throw an AttributeError if it cannot find the id.
@@ -229,7 +229,7 @@ def get_link(self, job_id):
         """
         Constructs the link with the given job_id.
         Args:
-        job_id: The id to be used to construct the link for this job.
+            job_id: The id to be used to construct the link for this job.
         Returns:
         The constructed job link.
         Note that this function does not check the correctness of this link.
```
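The repeated AttributeError caveat in these docstrings comes from chaining attribute access on `find()` results; a minimal standalone sketch of that pattern (the markup and selector below are made up for illustration, not JobFunnel's actual ones):

```python
from bs4 import BeautifulSoup

job_html = '<div><a class="jobtitle">Data Engineer</a></div>'  # hypothetical markup
soup = BeautifulSoup(job_html, 'html.parser')

try:
    # .find() returns None when nothing matches, so .text raises AttributeError
    title = soup.find('a', attrs={'class': 'jobtitle'}).text.strip()
    print(title)
except AttributeError:
    # mirrors the "may throw an AttributeError" caveat in the docstrings
    print('title not found')
```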

jobfunnel/monster.py

Lines changed: 7 additions & 7 deletions

The first hunk is a whitespace-only re-alignment of the `format()` arguments (indentation shown is approximate); the second touches only a blank line.

```diff
@@ -81,12 +81,12 @@ def get_search_url(self, method='get'):
         if method == 'get':
             search = ('https://www.monster.{0}/jobs/search/?'
                       'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
-                self.search_terms['region']['domain'],
-                self.query,
-                self.search_terms['region']['city'],
-                self.search_terms['region']['province'],
-                'skr_navigation_nhpso_searchMain',
-                self.convert_radius(self.search_terms['region']['radius'])))
+                          self.search_terms['region']['domain'],
+                          self.query,
+                          self.search_terms['region']['city'],
+                          self.search_terms['region']['province'],
+                          'skr_navigation_nhpso_searchMain',
+                          self.convert_radius(self.search_terms['region']['radius'])))

             return search
         elif method == 'post':
@@ -212,7 +212,7 @@ def scrape(self):
             self.scrape_data[str(job['id'])] = job

             # Do not change the order of the next three statements if you want date_filter to work
-
+
             # stores references to jobs in list to be used in blurb retrieval
             scrape_list = [i for i in self.scrape_data.values()]
             # converts job date formats into a standard date format
```
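For reference, the GET URL assembled in the first hunk can be reproduced standalone; the region values below are made up, and the `convert_radius` call is skipped:

```python
# Standalone rendering of the search URL built above (illustrative values).
search_terms = {'region': {'domain': 'ca', 'city': 'Toronto',
                           'province': 'ON', 'radius': 25}}
query = 'python developer'

search = ('https://www.monster.{0}/jobs/search/?'
          'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
              search_terms['region']['domain'],
              query,
              search_terms['region']['city'],
              search_terms['region']['province'],
              'skr_navigation_nhpso_searchMain',
              search_terms['region']['radius']))  # convert_radius omitted here
print(search)
```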

readme.md

Lines changed: 9 additions & 5 deletions

````diff
@@ -76,13 +76,17 @@ Filter undesired companies by providing your own `yaml` configuration and adding
 JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
 For more information see the [crontab document][cron_doc].

-**NOTE ABOUT AUTOMATING:** As of right now, Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
+* **Glassdoor Notes** <br />
+The `GlassDoor` scraper has two versions: `GlassDoorStatic` and `GlassDoorDynamic`. Both of these give you the same end result: they scrape GlassDoor and dump your job listings onto your `master_list.csv`. We recommend to *always* run `GlassDoorStatic` (this is the default preset we have on our demo `settings.yaml` file) because it is *a lot* faster than `GlassDoorDynamic`. However, given the event that `GlassDoorStatic` fails, you may use `GlassDoorDynamic`. It is very slow, but you'll still be able to scrape GlassDoor.

-You may also of course disable the Glassdoor scraper in your `settings.yaml` to not have to complete any CAPTCHA at all:
+When using `GlassDoorDynamic` Glassdoor **might** require a human to complete a CAPTCHA. Therefore, in the case of automating with something like cron, you **might** need to be **physically present** to complete the Glassdoor CAPTCHA.
+
+You may also of course disable the Glassdoor scraper when using `GlassDoorDynamic` in your `settings.yaml` to not have to complete any CAPTCHA at all:
 ```
-#- 'GlassDoor'
-- 'Indeed'
-- 'Monster'
+- 'Indeed'
+- 'Monster'
+#- 'GlassDoorStatic'
+# - 'GlassDoorDynamic'
 ```

 * **Reviewing Jobs in Terminal** <br />
````
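A hedged sketch of how the providers list could be inspected before an unattended cron run; the `providers` key and the `settings.yaml` file name follow the demo config mentioned above, so adjust them if yours differ:

```python
import yaml  # PyYAML

# Warn before an unattended run if the CAPTCHA-prone scraper is enabled.
# The 'providers' key mirrors the demo settings.yaml; an assumption here.
with open('settings.yaml') as f:
    settings = yaml.safe_load(f)

for provider in settings.get('providers', []):
    if provider == 'GlassDoorDynamic':
        print('GlassDoorDynamic is enabled: a CAPTCHA may need a human.')
```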

tests/conftest.py

Lines changed: 11 additions & 1 deletion

```diff
@@ -71,6 +71,16 @@ def setup(attr_list: list, first_job_id: int = 0):
         return per_id_job_list
     return setup

+@pytest.fixture()
+def get_number_of_pages_mock():
+    def setup(mock_soup, mock_max):
+        """
+        mock get_number_of_pages to ensure we only scrape no more than 1 page.
+        """
+        return 1
+    return setup
+
+

 @ pytest.fixture()
 def init_scraper(configure_options):
@@ -90,7 +100,7 @@ def setup(provider: str, options: list = ['']):
     return setup


-@pytest.fixture
+@pytest.fixture()
 def setup_scraper(init_scraper):
     def setup(scraper: str):
         """
```

tests/test_indeed.py

Lines changed: 11 additions & 17 deletions

```diff
@@ -6,14 +6,6 @@
 from .conftest import search_term_configs


-# I think I should move this fixture to conftest
-def get_number_of_pages_mock(mock_soup, mock_max):
-    """
-    mock get_number_of_pages to ensure we only scrape no more than 1 page.
-    """
-    return 1
-
-
 #test the correctness of search_tems since our tests depend on it

 def test_search_terms(init_scraper):
@@ -24,8 +16,6 @@ def test_search_terms(init_scraper):
 @pytest.mark.parametrize('search_terms_config', search_term_configs)
 class TestClass():

-
-
     def test_convert_radius(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -38,6 +28,7 @@ def test_convert_radius(self, init_scraper, search_terms_config):
         assert 50 == provider.convert_radius(75)
         assert 100 == provider.convert_radius(300)

+
     def test_get_search_url(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -49,6 +40,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
         with pytest.raises(NotImplementedError) as e:
             provider.get_search_url('post')

+
     def test_get_number_of_pages(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -62,6 +54,7 @@ def test_get_number_of_pages(self, init_scraper, search_terms_config):
         soup_base = BeautifulSoup(request_html.text, provider.bs4_parser)
         assert provider.get_number_of_pages(soup_base, max=3) <= 3

+
     def test_search_page_for_job_soups(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
@@ -77,7 +70,6 @@ def test_search_page_for_job_soups(self, init_scraper, search_terms_config):

     # test the process of fetching title data from a job

-
     def test_get_title(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -97,7 +89,6 @@ def test_get_title(self, setup_scraper, search_terms_config):

     # test the process of fetching company data from a job

-
     def test_get_company(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -117,7 +108,6 @@ def test_get_company(self, setup_scraper, search_terms_config):

     # test the process of fetching location data from a job

-
     def test_get_location(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -137,7 +127,6 @@ def test_get_location(self, setup_scraper, search_terms_config):

     # test the process of fetching date data from a job

-
     def test_get_date(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -154,8 +143,8 @@ def test_get_date(self, setup_scraper, search_terms_config):
             return
         assert False

-    #TODO: Have more strict tests for job id and link
-
+    # Test the id with a strict assertion because without a job id we have
+    # no job link, and without job link, we have no job to apply to
     def test_get_id(self, setup_scraper, search_terms_config):
         scraper = setup_scraper('indeed')
         job_soup_list = scraper['job_list']
@@ -169,6 +158,7 @@ def test_get_id(self, setup_scraper, search_terms_config):
             assert False
         assert True

+
     # test the process of fetching the link to a job

     def test_get_link(self, setup_scraper, search_terms_config):
@@ -189,6 +179,7 @@ def test_get_link(self, setup_scraper, search_terms_config):

         assert False

+
     # test the process of fetching the blurb from a job

     def test_get_blurb_with_delay(self, setup_scraper, search_terms_config):
@@ -215,6 +206,7 @@ def test_get_blurb_with_delay(self, setup_scraper, search_terms_config):

         assert False

+

     def test_search_joblink_for_blurb(self, setup_scraper, search_terms_config):
         """
@@ -240,9 +232,11 @@ def test_search_joblink_for_blurb(self, setup_scraper, search_terms_config):

         assert False

+
     # Test the entire integration

-    def test_scrape(self, init_scraper, monkeypatch, search_terms_config):
+    def test_scrape(self, init_scraper, monkeypatch,
+                    search_terms_config, get_number_of_pages_mock):
         # ensure that we don't scrape more than one page
         monkeypatch.setattr(
             Indeed, 'get_number_of_pages', get_number_of_pages_mock)
```
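The new comment above `test_get_id` replaces the old TODO about stricter id tests. As a rough illustration of what a strict shape check could look like, the 16-character lowercase-hex pattern below is an assumed Indeed id format, not something this commit specifies:

```python
import re

# Hypothetical helper: a strict shape check for an Indeed-style job id.
# The 16-hex-digit pattern is an assumption for illustration only.
def is_strict_job_id(job_id: str) -> bool:
    return re.fullmatch(r'[0-9a-f]{16}', job_id) is not None

assert is_strict_job_id('3f9e2b1a7c4d5e6f')
assert not is_strict_job_id('')  # no id means no link, hence no job to apply to
```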

tests/test_tools.py

Lines changed: 2 additions & 6 deletions

```diff
@@ -70,7 +70,6 @@

 # test clean/dirty characters that may be on title and blurb fields

-
 def test_filter_non_printables_clean_title(job_listings):
     job_list = job_listings(attr_list[0:1])
     filter_non_printables(job_list[0])
@@ -96,7 +95,6 @@ def test_filter_non_printables_diryt_blurb(job_listings):

 # test job_listing dates with all possible formats

-
 def test_post_date_from_relative_post_age_just_posted_pass(job_listings):
     job_list = job_listings(attr_list[4:5])
     post_date_from_relative_post_age(job_list)
@@ -114,17 +112,15 @@ def test_post_date_from_relative_post_age_1_hour_ago_pass(job_listings):
     post_date_from_relative_post_age(job_list)
     now = datetime.now()
     assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
-        (now - timedelta(days=int(1))
-         ).strftime('%Y-%m-%d') == job_list[0]['date']
+        (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']


 def test_post_date_from_relative_post_age_2_hours_ago_pass(job_listings):
     job_list = job_listings(attr_list[7:8])
     post_date_from_relative_post_age(job_list)
     now = datetime.now()
     assert now.strftime('%Y-%m-%d') == job_list[0]['date'] or \
-        (now - timedelta(days=int(1))
-         ).strftime('%Y-%m-%d') == job_list[0]['date']
+        (now - timedelta(days=int(1))).strftime('%Y-%m-%d') == job_list[0]['date']


 def test_post_date_from_relative_ago_post_age_yesterday_ago_pass(job_listings):
```
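The two-sided assertion that the reflowed lines above preserve exists because a relative age like "2 hours ago" can cross midnight between posting and scraping; a minimal sketch of that boundary case:

```python
from datetime import datetime, timedelta

# A job posted "2 hours ago" but checked shortly after midnight resolves to
# yesterday's calendar date, which is why the test accepts either day.
now = datetime(2020, 1, 15, 1, 0)    # hypothetical clock: 01:00
posted = now - timedelta(hours=2)    # 23:00 on the previous day
assert posted.strftime('%Y-%m-%d') == \
    (now - timedelta(days=1)).strftime('%Y-%m-%d')
```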

tests/test_validate.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -28,7 +28,6 @@

 # test all paths with invalid values

-
 def test_filter_list_path_fail(configure_options):
     path_configs = config_factory(
         configure_options(['']), attr_list[12: 13])[0]
```
