diff --git a/common_task.py b/common_task.py
index 7bb0596..8cc21fb 100644
--- a/common_task.py
+++ b/common_task.py
@@ -42,7 +42,7 @@ def read_s3_file(filenameInS3):
 #     # Print or process the file contents
 #     print(file_content.decode('utf-8'))  # Assumes the file is text; adjust accordingly
 
-def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
+def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
     today_df = pd.read_csv(today_file)
     last_file_df = pd.read_csv(last_file)
     print(today_df.shape, last_file_df.shape)
@@ -54,6 +54,14 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
     new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
     new_df.to_csv(fresh_output, index=False)
     expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    child_df_copy = expired_df.copy()
+    if gi =="g":
+        child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+    else: # jobId
+        child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+
     expired_df.to_csv(expired_output, index=False)
     print(new_df.shape, expired_df.shape)
     common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
@@ -97,7 +105,7 @@ def run_india_scraper(today_date):
     expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
     common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(india_search_output_file, last_file, 'jdURL',
-                      fresh_output, expired_output, common_output)
+                      fresh_output, expired_output, common_output, "i")
     india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
     india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
     start_time = time.time()
@@ -123,7 +131,7 @@ def run_gulf_scraper(today_date):
     fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
     expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
     common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
-    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
+    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output, "g")
     upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv" ,today_date)
     start_time = time.time()
     gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
diff --git a/naukri/jobdata_gulf_r.py b/naukri/jobdata_gulf_r.py
index 77a57d1..9702a88 100644
--- a/naukri/jobdata_gulf_r.py
+++ b/naukri/jobdata_gulf_r.py
@@ -106,13 +106,26 @@ class NaukriGulfJobDetailScraper:
     def transform_data(self, job_id, jd_url, json_response):
         source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
         source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
+        jd = json_response.get('description','')
+        desired_profile = json_response.get('desiredCandidate')
+        valid_pairs = None
+        if desired_profile:
+            valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
+
+        if valid_pairs:
+            html_output = '<br><br>Desired Candidate Profile<br><br>'
+            for key, value in valid_pairs:
+                html_output += f"{key.title()}:<br>{value}<br>"
+            jd += html_output
+
         json_data = {
             "Url" : jd_url,
             "Job Key" : "g_" + str(job_id),
             # "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
             #     json_response.get('contact', {'website': ''}).get('website',''),
             "Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
-            "Job Description" : json_response.get('description',''),
+            # "Job Description" : json_response.get('description',''),
+            "Job Description" : jd,
             "Role Category" :"",
             "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
             "Job Title" : json_response.get('designation'),
@@ -125,7 +138,6 @@ class NaukriGulfJobDetailScraper:
             "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
             "Salary Detail" : json_response.get('compensation'),
             "Country" : json_response.get('compensation',{'country':''}).get('country')
-
         }
         return json_data
 
diff --git a/naukri/jobdata_india.py b/naukri/jobdata_india.py
index 0ec8b0d..e6c40b0 100644
--- a/naukri/jobdata_india.py
+++ b/naukri/jobdata_india.py
@@ -132,7 +132,6 @@ class NaukriJobDetailScraper:
         response = requests.get(url, headers=self.headers, timeout=self.timeout)
         print(f"{response.status_code} for {url}")
-
         if response.status_code == 200:
             json_response = response.json()