jgu
parent
1bbd067aea
commit
075130d7ce
|
@ -42,7 +42,7 @@ def read_s3_file(filenameInS3):
|
|||
# # Print or process the file contents
|
||||
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
|
||||
|
||||
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
|
||||
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
|
||||
today_df = pd.read_csv(today_file)
|
||||
last_file_df = pd.read_csv(last_file)
|
||||
print(today_df.shape, last_file_df.shape)
|
||||
|
@ -54,6 +54,14 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
|
|||
new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
|
||||
new_df.to_csv(fresh_output, index=False)
|
||||
expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True, suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
|
||||
child_df_copy = expired_df.copy()
|
||||
if gi =="g":
|
||||
child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key']
|
||||
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||
else: # jobId
|
||||
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId']
|
||||
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||
|
||||
expired_df.to_csv(expired_output, index=False)
|
||||
print(new_df.shape, expired_df.shape)
|
||||
common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
|
||||
|
@ -97,7 +105,7 @@ def run_india_scraper(today_date):
|
|||
expired_output = f"india_data/daily_upload_folder/Compete_1_India_Archive_{today_date}.csv"
|
||||
common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
|
||||
do_the_difference(india_search_output_file, last_file, 'jdURL',
|
||||
fresh_output, expired_output, common_output)
|
||||
fresh_output, expired_output, common_output, "i")
|
||||
india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_Active_{today_date}.csv"
|
||||
india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
|
||||
start_time = time.time()
|
||||
|
@ -123,7 +131,7 @@ def run_gulf_scraper(today_date):
|
|||
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archive_{today_date}.csv"
|
||||
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
|
||||
do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output)
|
||||
do_the_difference(gulf_search_file, last_file, "jdURL", fresh_output, expired_output, common_output, "g")
|
||||
upload_file_to_bucket(expired_output, f"Compete_1_Gulf_Archive_{today_date}.csv" ,today_date)
|
||||
start_time = time.time()
|
||||
gulf_detail_file = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Active_{today_date}.csv"
|
||||
|
|
|
@ -106,13 +106,26 @@ class NaukriGulfJobDetailScraper:
|
|||
def transform_data(self, job_id, jd_url, json_response):
|
||||
source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
|
||||
source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
|
||||
jd = json_response.get('description','')
|
||||
desired_profile = json_response.get('desiredCandidate')
|
||||
valid_pairs = None
|
||||
if desired_profile:
|
||||
valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
|
||||
|
||||
if valid_pairs:
|
||||
html_output = '<br><h3 class="heading">Desired Candidate Profile</h3><br>'
|
||||
for key, value in valid_pairs:
|
||||
html_output += f"<strong>{key.title()}:</strong> <br>{value}<br>"
|
||||
jd += html_output
|
||||
|
||||
json_data = {
|
||||
"Url" : jd_url,
|
||||
"Job Key" : "g_" + str(job_id),
|
||||
# "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
|
||||
# json_response.get('contact', {'website': ''}).get('website',''),
|
||||
"Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
|
||||
"Job Description" : json_response.get('description',''),
|
||||
# "Job Description" : json_response.get('description',''),
|
||||
"Job Description" : jd,
|
||||
"Role Category" :"",
|
||||
"Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
|
||||
"Job Title" : json_response.get('designation'),
|
||||
|
@ -125,7 +138,6 @@ class NaukriGulfJobDetailScraper:
|
|||
"Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
|
||||
"Salary Detail" : json_response.get('compensation'),
|
||||
"Country" : json_response.get('compensation',{'country':''}).get('country')
|
||||
|
||||
}
|
||||
return json_data
|
||||
|
||||
|
|
|
@ -132,7 +132,6 @@ class NaukriJobDetailScraper:
|
|||
response = requests.get(url, headers=self.headers, timeout=self.timeout)
|
||||
|
||||
print(f"{response.status_code} for {url}")
|
||||
|
||||
if response.status_code == 200:
|
||||
json_response = response.json()
|
||||
|
||||
|
|
Loading…
Reference in New Issue