prahul11 2023-10-26 13:21:39 +05:30
parent 1bbd067aea
commit 075130d7ce
3 changed files with 25 additions and 6 deletions

View File

@@ -42,7 +42,7 @@ def read_s3_file(filenameInS3):
 # # Print or process the file contents
 # print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
-def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
+def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
     today_df = pd.read_csv(today_file)
     last_file_df = pd.read_csv(last_file)
     print(today_df.shape, last_file_df.shape)
@@ -54,6 +54,14 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
     new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
     new_df.to_csv(fresh_output, index=False)
     expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    child_df_copy = expired_df.copy()
+    if gi == "g":
+        child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+    else:  # jobId
+        child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
     expired_df.to_csv(expired_output, index=False)
     print(new_df.shape, expired_df.shape)
     common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
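For reference, a minimal standalone sketch of the anti-join plus the new gi branch (the sample frames are invented; the column names Job Key/jobId and the 'jdURL' key come from the diff):

import pandas as pd

# Two invented snapshots of the same feed, keyed on 'jdURL'.
today_df = pd.DataFrame({'jdURL': ['a', 'b'], 'jobId': ['1', '2']})
last_file_df = pd.DataFrame({'jdURL': ['b', 'c'], 'jobId': ['2', '3']})

# Anti-join: rows present in the last snapshot but missing today are "expired".
expired_df = (pd.merge(last_file_df, today_df, on='jdURL', how='left',
                       indicator=True, suffixes=('', '_ignored'))
              .query('_merge == "left_only"').drop(['_merge'], axis=1))

# gi selects which key column gets a prefixed duplicate: "g" prefixes
# 'Job Key' with 'g_'; any other value prefixes 'jobId' with 'i_'.
child_df_copy = expired_df.copy()
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
print(expired_df)  # each expired row appears twice, once with the 'i_' prefix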
@@ -97,7 +105,7 @@ def run_india_scraper(today_date):
     expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
     common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(india_search_output_file, last_file, 'jdURL',
-                      fresh_output, expired_output, common_output)
+                      fresh_output, expired_output, common_output, "i")
     india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
     india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
     start_time = time.time()
@@ -123,7 +131,7 @@ def run_gulf_scraper(today_date):
     fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
     expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
     common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
-    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
+    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output, "g")
     upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv", today_date)
     start_time = time.time()
     gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"

View File

@@ -106,13 +106,26 @@ class NaukriGulfJobDetailScraper:
     def transform_data(self, job_id, jd_url, json_response):
         source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
         source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
+        jd = json_response.get('description', '')
+        desired_profile = json_response.get('desiredCandidate')
+        valid_pairs = None
+        if desired_profile:
+            valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
+        if valid_pairs:
+            html_output = '<br><h3 class="heading">Desired Candidate Profile</h3><br>'
+            for key, value in valid_pairs:
+                html_output += f"<strong>{key.title()}:</strong> <br>{value}<br>"
+            jd += html_output
         json_data = {
             "Url" : jd_url,
             "Job Key" : "g_" + str(job_id),
             # "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
             #                json_response.get('contact', {'website': ''}).get('website',''),
             "Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
-            "Job Description" : json_response.get('description',''),
+            # "Job Description" : json_response.get('description',''),
+            "Job Description" : jd,
             "Role Category" : "",
             "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
             "Job Title" : json_response.get('designation'),
@@ -125,7 +138,6 @@ class NaukriGulfJobDetailScraper:
             "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
             "Salary Detail" : json_response.get('compensation'),
             "Country" : json_response.get('compensation',{'country':''}).get('country')
         }
         return json_data

View File

@@ -132,7 +132,6 @@ class NaukriJobDetailScraper:
         response = requests.get(url, headers=self.headers, timeout=self.timeout)
         print(f"{response.status_code} for {url}")
         if response.status_code == 200:
             json_response = response.json()
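The fetch step above parses JSON only on HTTP 200; a compact sketch of that pattern with the failure branch made explicit (the URL and headers are placeholders, not the scraper's real values):

import requests

url = 'https://example.com/job-detail/123'  # placeholder endpoint
headers = {'User-Agent': 'Mozilla/5.0'}     # placeholder; the class supplies real headers

response = requests.get(url, headers=headers, timeout=30)
print(f"{response.status_code} for {url}")
if response.status_code == 200:
    json_response = response.json()  # parse the body only on success
else:
    json_response = None             # let the caller log or retry this URL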