jyg
parent
3949b0ef0c
commit
81664f692a
|
@ -94,7 +94,7 @@ def run_india_scraper(today_date):
|
||||||
search_pattern = "search_result_india_*.csv"
|
search_pattern = "search_result_india_*.csv"
|
||||||
last_file = find_second_latest_file(folder_path, search_pattern)
|
last_file = find_second_latest_file(folder_path, search_pattern)
|
||||||
fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||||
expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archieve_{today_date}.csv"
|
expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
|
||||||
common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
|
common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
|
||||||
do_the_difference(india_search_output_file, last_file, 'jdURL',
|
do_the_difference(india_search_output_file, last_file, 'jdURL',
|
||||||
fresh_output, expired_output, common_output)
|
fresh_output, expired_output, common_output)
|
||||||
|
@ -110,7 +110,7 @@ def run_india_scraper(today_date):
|
||||||
stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
|
stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
|
||||||
current_date = datetime.now()
|
current_date = datetime.now()
|
||||||
today_date = current_date.strftime('%d-%m-%Y')
|
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" , today_date)
|
upload_file_to_bucket(expired_output, f"Compete_1_India_Archive_{today_date}.csv" , today_date)
|
||||||
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date)
|
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date)
|
||||||
|
|
||||||
def run_gulf_scraper(today_date):
|
def run_gulf_scraper(today_date):
|
||||||
|
@ -121,10 +121,10 @@ def run_gulf_scraper(today_date):
|
||||||
# current_date = datetime.now()
|
# current_date = datetime.now()
|
||||||
# today_date = current_date.strftime('%d-%m-%Y')
|
# today_date = current_date.strftime('%d-%m-%Y')
|
||||||
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||||
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
|
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
|
||||||
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
|
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
|
||||||
do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
|
do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
|
||||||
upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archieve_{today_date}.csv" ,today_date)
|
upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv" ,today_date)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
|
gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
|
||||||
gulf_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
|
gulf_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
|
||||||
|
|
|
@ -93,12 +93,15 @@ class NaukriJobScraper:
|
||||||
total_pages = 1000
|
total_pages = 1000
|
||||||
# self.stopcrawl = False
|
# self.stopcrawl = False
|
||||||
start_page = 1
|
start_page = 1
|
||||||
|
error_count = 0
|
||||||
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
||||||
while total_pages > 0:
|
while total_pages > 0:
|
||||||
# if self.stopcrawl:
|
# if self.stopcrawl:
|
||||||
# total_pages = 0
|
# total_pages = 0
|
||||||
url = self.base_url.format(industry_name, start_page, industry_q)
|
url = self.base_url.format(industry_name, start_page, industry_q)
|
||||||
|
if error_count >3:
|
||||||
|
total_pages -= 1
|
||||||
|
start_page += 1
|
||||||
try:
|
try:
|
||||||
# print(url)
|
# print(url)
|
||||||
# response = requests.get(url, headers=self.headers, timeout=self.timeout,
|
# response = requests.get(url, headers=self.headers, timeout=self.timeout,
|
||||||
|
@ -132,6 +135,7 @@ class NaukriJobScraper:
|
||||||
print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
|
print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
except Exception as e1:
|
except Exception as e1:
|
||||||
|
error_count +=1
|
||||||
logging.error(url + '\n'+ str(e1) + '\n')
|
logging.error(url + '\n'+ str(e1) + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue