diff --git a/common_task.py b/common_task.py
index cbb783b..7bb0596 100644
--- a/common_task.py
+++ b/common_task.py
@@ -94,7 +94,7 @@ def run_india_scraper(today_date):
     search_pattern = "search_result_india_*.csv"
     last_file = find_second_latest_file(folder_path, search_pattern)
     fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
-    expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archieve_{today_date}.csv"
+    expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
     common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(india_search_output_file, last_file, 'jdURL', fresh_output, expired_output, common_output)
 
@@ -110,7 +110,7 @@ def run_india_scraper(today_date):
         stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
     current_date = datetime.now()
     today_date = current_date.strftime('%d-%m-%Y')
-    upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" , today_date)
+    upload_file_to_bucket(expired_output, f"Compete_1_India_Archive_{today_date}.csv" , today_date)
     upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date)
 
 def run_gulf_scraper(today_date):
@@ -121,10 +121,10 @@ def run_gulf_scraper(today_date):
     # current_date = datetime.now()
     # today_date = current_date.strftime('%d-%m-%Y')
     fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
-    expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
+    expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
     common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
-    upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archieve_{today_date}.csv" ,today_date)
+    upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv" ,today_date)
     start_time = time.time()
     gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
     gulf_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
diff --git a/naukri/search_india.py b/naukri/search_india.py
index 4de9246..113eb6b 100644
--- a/naukri/search_india.py
+++ b/naukri/search_india.py
@@ -93,12 +93,15 @@ class NaukriJobScraper:
         total_pages = 1000
         # self.stopcrawl = False
         start_page = 1
-
+        error_count = 0
         print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
         while total_pages > 0:
             # if self.stopcrawl:
             #     total_pages = 0
             url = self.base_url.format(industry_name, start_page, industry_q)
+            if error_count > 3:
+                error_count, total_pages, start_page = 0, total_pages - 1, start_page + 1
+                continue
             try:
                 # print(url)
                 # response = requests.get(url, headers=self.headers, timeout=self.timeout,
@@ -132,6 +135,7 @@ class NaukriJobScraper:
                 print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
                 time.sleep(1)
             except Exception as e1:
+                error_count += 1
                 logging.error(url + '\n'+ str(e1) + '\n')