Compare commits: c769a3b232 ... ec654ea0be

2 Commits

Author | SHA1 | Date
---|---|---
prahul11 | ec654ea0be |
prahul11 | 5712d4cf8b |
@@ -94,11 +94,11 @@ def run_india_scraper():
     search_pattern = "search_result_india_*.csv"
     last_file = find_second_latest_file(folder_path, search_pattern)
     fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
-    expired_output = f"india_data/daily_upload_folder/expired_Compete_1_India_{today_date}.csv"
+    expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archieve_{today_date}.csv"
     common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(india_search_output_file, last_file, 'jdURL',
                       fresh_output, expired_output, common_output)
-    india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_{today_date}.csv"
+    india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
     india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
     start_time = time.time()
     scraper = NaukriJobDetailScraper(fresh_output, india_detail_file, india_detail_error_file)
@@ -110,8 +110,8 @@ def run_india_scraper():
         stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
     current_date = datetime.now()
     today_date = current_date.strftime('%d-%m-%Y')
-    upload_file_to_bucket(expired_output, f"expired_Compete_1_India_{today_date}.csv" )
-    upload_file_to_bucket(india_detail_file, f"Compete_1_India_{today_date}.csv" )
+    upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" )
+    upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" )

 def run_gulf_scraper():
     gulfSearch()
@@ -121,12 +121,12 @@ def run_gulf_scraper():
     current_date = datetime.now()
     today_date = current_date.strftime('%d-%m-%Y')
     fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
-    expired_output = f"gulf_data/daily_upload_folder/expired_Compete_1_gulf_{today_date}.csv"
+    expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
     common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
-    upload_file_to_bucket(expired_output, f"expired_Compete_1_Gulf_{today_date}.csv" )
+    upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archieve_{today_date}.csv" )
     start_time = time.time()
-    gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_gulf_{today_date}.csv"
+    gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
     gulf_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
     scraper = NaukriGulfJobDetailScraper(fresh_output, gulf_detail_file, gulf_detail_error_file)
     scraper.scrape()
@@ -135,7 +135,7 @@ def run_gulf_scraper():
     print(f"Jobdata program took {duration_hours:.2f} hours to run.")
     with open(f'gulf_data/daily_stats_folder/stats_file_of_{today_date}.txt', "a") as stat:
         stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
-    upload_file_to_bucket(gulf_detail_file, f"Compete_1_Gulf_{today_date}.csv" )
+    upload_file_to_bucket(gulf_detail_file, f"Compete_1_Gulf_Active_{today_date}.csv" )


 if __name__ == "__main__":
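Review note: both scrapers funnel their outputs through a two-argument helper, `upload_file_to_bucket(local_path, object_name)`, whose body is outside this diff. A minimal sketch of what such a helper could look like, assuming an S3-compatible backend via boto3 (the bucket name and the backend choice are assumptions, not taken from this change):

```python
import boto3

# Assumed bucket name; the real value is configured outside this diff.
BUCKET_NAME = "example-job-data-bucket"

def upload_file_to_bucket(local_path: str, object_name: str) -> None:
    """Upload a local file to object storage under the given key.

    Hypothetical sketch: the repository's actual helper may use a
    different backend or configuration.
    """
    s3 = boto3.client("s3")
    s3.upload_file(local_path, BUCKET_NAME, object_name)
```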
@@ -42,7 +42,7 @@ class NaukriJobScraper:
         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
         "content-encoding": "gzip",
     }
-
+    stopcrawl = False
     # headers = {
     # "appid": "109",
     # "systemid": "109"
@@ -62,6 +62,17 @@ class NaukriJobScraper:
         for job in json_data["jobDetails"]:
             parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
             parsed_data.append(parsed_item)
+        # with open('r,txt', 'w+', encoding='utf-8', newline='') as dr:
+        #     import json
+        #     dr.write(json.dumps(parsed_data))
+        # print(parsed_data)
+        days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
+        target = "7 Days Ago"
+        count = days_ago_list.count(target)
+        percentage = (count / len(days_ago_list)) * 100
+        if percentage > 60:
+            self.stopcrawl = True
+
         with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
             csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
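Review note: the added block is an early-stop heuristic. Each parsed job carries a freshness label in `footerPlaceholderLabel`; when more than 60% of a page's results read "7 Days Ago", the page is assumed to be stale and `stopcrawl` is flipped. A standalone sketch of the same check, with an empty-page guard the committed code omits (a `days_ago_list` of length zero would raise `ZeroDivisionError`):

```python
def should_stop_crawl(parsed_data: list[dict],
                      target: str = "7 Days Ago",
                      threshold: float = 60.0) -> bool:
    """Hypothetical helper mirroring the heuristic inlined in the hunk above."""
    labels = [job.get("footerPlaceholderLabel") for job in parsed_data]
    if not labels:
        # Guard the division; the committed code would raise here on an empty page.
        return False
    percentage = labels.count(target) / len(labels) * 100
    return percentage > threshold
```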
@@ -79,10 +90,13 @@ class NaukriJobScraper:
            industry_name=industry[1]
            industry_q=industry[2]
            total_pages = 1000
+           self.stopcrawl = False
            start_page = 1

            print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
            while total_pages > 0:
+               if self.stopcrawl:
+                   total_pages = 0
                url = self.base_url.format(industry_name, start_page, industry_q)
                try:
                    # print(url)
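Review note: the guard sets `total_pages = 0` instead of `break`, so the flag only takes effect at the next `while` check and the current iteration still issues one more request. A minimal control-flow sketch (hypothetical names) showing that behaviour:

```python
# After stop is raised, one more "fetch" still happens before the loop exits.
total_pages, stop = 3, True
while total_pages > 0:
    if stop:
        total_pages = 0  # as in the diff: the fetch below still runs once
    print("fetching page")  # stands in for the real HTTP request
    total_pages -= 1
# A `break` (or a `continue` after zeroing the counter) would skip that last request.
```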