diff --git a/common_task.py b/common_task.py
index 7bb0596..8cc21fb 100644
--- a/common_task.py
+++ b/common_task.py
@@ -42,7 +42,7 @@ def read_s3_file(filenameInS3):
# # Print or process the file contents
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
-def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
+def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
today_df = pd.read_csv(today_file)
last_file_df = pd.read_csv(last_file)
print(today_df.shape, last_file_df.shape)
@@ -54,6 +54,14 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
new_df.to_csv(fresh_output, index=False)
expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    child_df_copy = expired_df.copy()
+    if gi =="g":
+        # Gulf feed: duplicate each expired row with a 'g_'-prefixed Job Key
+        child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+    else: # jobId
+        # India feed ("i"): duplicate each expired row with an 'i_'-prefixed jobId
+        child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+
expired_df.to_csv(expired_output, index=False)
print(new_df.shape, expired_df.shape)
common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
@@ -97,7 +105,7 @@ def run_india_scraper(today_date):
expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
do_the_difference(india_search_output_file, last_file, 'jdURL',
- fresh_output, expired_output, common_output)
+ fresh_output, expired_output, common_output, "i")
india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
start_time = time.time()
@@ -123,7 +131,7 @@ def run_gulf_scraper(today_date):
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
- do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
+ do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output, "g")
upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv" ,today_date)
start_time = time.time()
gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
diff --git a/naukri/jobdata_gulf_r.py b/naukri/jobdata_gulf_r.py
index 77a57d1..9702a88 100644
--- a/naukri/jobdata_gulf_r.py
+++ b/naukri/jobdata_gulf_r.py
@@ -106,13 +106,26 @@ class NaukriGulfJobDetailScraper:
def transform_data(self, job_id, jd_url, json_response):
source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
+ jd = json_response.get('description','')
+ desired_profile = json_response.get('desiredCandidate')
+ valid_pairs = None
+ if desired_profile:
+ valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
+
+ if valid_pairs:
+ html_output = '