diff --git a/common_task.py b/common_task.py index c429e47..e65c93a 100644 --- a/common_task.py +++ b/common_task.py @@ -7,6 +7,7 @@ from naukri.search_india import NaukriJobScraper from naukri.jobdata_india import NaukriJobDetailScraper from naukri.search_gulf_r import main as gulfSearch, output_filename_csv as gulf_search_file from naukri.jobdata_gulf_r import NaukriGulfJobDetailScraper +from jobstreet.jst_id_search import search_jst_id import time import os import sys @@ -42,7 +43,7 @@ def read_s3_file(filenameInS3): # # Print or process the file contents # print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly -def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi): +def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output=None, gi=None): today_df = pd.read_csv(today_file) last_file_df = pd.read_csv(last_file) print(today_df.shape, last_file_df.shape) @@ -59,6 +60,11 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key'].astype(str) expired_df['Job Key'] = expired_df['Job Key'].astype(str) expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True) + elif gi == "id": + # child_df_copy['id'] = 'id_' + child_df_copy['Job Key'].astype(str) + # expired_df['Job Key'] = expired_df['Job Key'].astype(str) + # expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True) + pass else: # jobId child_df_copy['jobId'] = 'i_' + child_df_copy['jobId'].astype(str) expired_df['jobId'] = expired_df['jobId'].astype(str) @@ -68,9 +74,9 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi expired_df.to_csv(expired_output, index=False) print(new_df.shape, expired_df.shape) - common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner') - print(common_df.shape) - common_df.to_csv(common_output, 
index=False) + # common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner') + # print(common_df.shape) + # common_df.to_csv(common_output, index=False) def extract_date_from_filename(filename): date_str = filename.split("_")[-1].replace(".csv", "") @@ -156,21 +162,37 @@ if __name__ == "__main__": current_date = datetime.now() today_date = current_date.strftime('%d-%m-%Y') bucket_name = 'compete-syndication' - folder_name = f'naukri/{today_date}' # Replace with your desired folder name - s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - folder_key = folder_name + '/' # Note the trailing slash - s3.put_object(Bucket=bucket_name, Key=folder_key) + if len(sys.argv) != 2: print("Usage: python common_task.py [gulf|india]") sys.exit(1) option = sys.argv[1].lower() - + if option == 'gulf' or option == 'india': + folder_name = f'naukri/{today_date}' # Replace with your desired folder name + s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) + folder_key = folder_name + '/' # Note the trailing slash + s3.put_object(Bucket=bucket_name, Key=folder_key) + elif option in ['id', 'vi', 'ph', 'si', 'my']: + folder_name = f'jobstreet/{today_date}' # Replace with your desired folder name + s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) + folder_key = folder_name + '/' # Note the trailing slash + s3.put_object(Bucket=bucket_name, Key=folder_key) if option == 'gulf': run_gulf_scraper(today_date) elif option == 'india': run_india_scraper(today_date) + elif option =="id": + search_file = f"indonesia_data/daily_search_results/id_search_{today_date}.csv" + search_jst_id(search_file) + folder_path = "indonesia_data/daily_search_results/" + search_pattern = "id_search_*.csv" + last_file = find_second_latest_file(folder_path, search_pattern) + fresh_output = 
f"indonesia_data/daily_process_folder/new_jobs_on_{today_date}.csv" + expired_output = f"indonesia_data/daily_upload_folder/Compete_2_Indonesia_Archive_{today_date}.csv" + do_the_difference(search_file, last_file, 'id', fresh_output, expired_output, None, "id") + else: print("Invalid argument. Please use 'gulf' or 'india' as the argument.")