prahul11 2023-10-26 13:21:39 +05:30
parent 1bbd067aea
commit 075130d7ce
3 changed files with 25 additions and 6 deletions

View File

@@ -42,7 +42,7 @@ def read_s3_file(filenameInS3):
 # # Print or process the file contents
 # print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
-def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
+def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
     today_df = pd.read_csv(today_file)
     last_file_df = pd.read_csv(last_file)
     print(today_df.shape, last_file_df.shape)
@@ -54,6 +54,14 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
     new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
     new_df.to_csv(fresh_output, index=False)
     expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    child_df_copy = expired_df.copy()
+    if gi == "g":
+        child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
+    else:  # jobId
+        child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
+        expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
     expired_df.to_csv(expired_output, index=False)
     print(new_df.shape, expired_df.shape)
     common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
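For reference, a minimal standalone sketch of the anti-join plus the new gi branch (the sample frames are invented; the column names Job Key/jobId and the 'jdURL' key come from the diff):

import pandas as pd

# Two invented snapshots of the same feed, keyed on 'jdURL'.
today_df = pd.DataFrame({'jdURL': ['a', 'b'], 'jobId': ['1', '2']})
last_file_df = pd.DataFrame({'jdURL': ['b', 'c'], 'jobId': ['2', '3']})

# Anti-join: rows present in the last snapshot but missing today are "expired".
expired_df = (pd.merge(last_file_df, today_df, on='jdURL', how='left',
                       indicator=True, suffixes=('', '_ignored'))
              .query('_merge == "left_only"').drop(['_merge'], axis=1))

# gi selects which key column gets a prefixed duplicate: "g" prefixes
# 'Job Key' with 'g_'; any other value prefixes 'jobId' with 'i_'.
child_df_copy = expired_df.copy()
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
print(expired_df)  # each expired row appears twice, once with the 'i_' prefix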
@@ -97,7 +105,7 @@ def run_india_scraper(today_date):
     expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
     common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
     do_the_difference(india_search_output_file, last_file, 'jdURL',
-                      fresh_output, expired_output, common_output)
+                      fresh_output, expired_output, common_output, "i")
     india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
     india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
     start_time = time.time()
@@ -123,7 +131,7 @@ def run_gulf_scraper(today_date):
     fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
     expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
     common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
-    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
+    do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output, "g")
     upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv", today_date)
     start_time = time.time()
     gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"

View File

@@ -106,13 +106,26 @@ class NaukriGulfJobDetailScraper:
     def transform_data(self, job_id, jd_url, json_response):
         source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
         source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
+        jd = json_response.get('description', '')
+        desired_profile = json_response.get('desiredCandidate')
+        valid_pairs = None
+        if desired_profile:
+            valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
+        if valid_pairs:
+            html_output = '<br><h3 class="heading">Desired Candidate Profile</h3><br>'
+            for key, value in valid_pairs:
+                html_output += f"<strong>{key.title()}:</strong> <br>{value}<br>"
+            jd += html_output
         json_data = {
             "Url" : jd_url,
             "Job Key" : "g_" + str(job_id),
             # "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
             #                json_response.get('contact', {'website': ''}).get('website',''),
             "Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
-            "Job Description" : json_response.get('description',''),
+            # "Job Description" : json_response.get('description',''),
+            "Job Description" : jd,
             "Role Category" : "",
             "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
             "Job Title" : json_response.get('designation'),
@@ -125,7 +138,6 @@ class NaukriGulfJobDetailScraper:
             "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
             "Salary Detail" : json_response.get('compensation'),
             "Country" : json_response.get('compensation',{'country':''}).get('country')
         }
         return json_data

View File

@@ -132,7 +132,6 @@ class NaukriJobDetailScraper:
         response = requests.get(url, headers=self.headers, timeout=self.timeout)
         print(f"{response.status_code} for {url}")
         if response.status_code == 200:
             json_response = response.json()
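The fetch step above parses JSON only on HTTP 200; a compact sketch of that pattern with the failure branch made explicit (the URL and headers are placeholders, not the scraper's real values):

import requests

url = 'https://example.com/job-detail/123'  # placeholder endpoint
headers = {'User-Agent': 'Mozilla/5.0'}     # placeholder; the class supplies real headers

response = requests.get(url, headers=headers, timeout=30)
print(f"{response.status_code} for {url}")
if response.status_code == 200:
    json_response = response.json()  # parse the body only on success
else:
    json_response = None             # let the caller log or retry this URL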