From d37e1fdd4667325c7c067c5162d60f203a2cc47a Mon Sep 17 00:00:00 2001 From: prahul11 Date: Tue, 17 Oct 2023 17:16:23 +0530 Subject: [PATCH] ky --- common_task.py | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/common_task.py b/common_task.py index b7dd65f..2f9775b 100644 --- a/common_task.py +++ b/common_task.py @@ -11,11 +11,11 @@ import time import os import sys -def upload_file_to_bucket(localFilePath, localFileName): +def upload_file_to_bucket(localFilePath, localFileName, today_date): s3 = boto3.client('s3') bucket_name = 'compete-syndication' file_path = localFilePath - s3_key = f'naukri/{localFileName}' + s3_key = f'naukri/{today_date}/{localFileName}' s3.upload_file(file_path, bucket_name, s3_key) print(f'File "{file_path}" uploaded to S3 bucket "{bucket_name}" as "{s3_key}"') @@ -75,9 +75,9 @@ def find_second_latest_file(folder_path, search_pattern): print("There are not enough files in the folder to find the second latest file.") return None -def run_india_scraper(): - current_date = datetime.now() - today_date = current_date.strftime('%d-%m-%Y') +def run_india_scraper(today_date): + # current_date = datetime.now() + # today_date = current_date.strftime('%d-%m-%Y') india_search_input_file = "naukri/_industry_urls.csv" india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv" india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv" @@ -110,16 +110,16 @@ def run_india_scraper(): stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n") current_date = datetime.now() today_date = current_date.strftime('%d-%m-%Y') - upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" ) - upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" ) + upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" , today_date) + 
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date) -def run_gulf_scraper(): +def run_gulf_scraper(today_date): gulfSearch() folder_path = "gulf_data/daily_search_results/" search_pattern = "search_result_gulf_*.csv" last_file = find_second_latest_file(folder_path, search_pattern) - current_date = datetime.now() - today_date = current_date.strftime('%d-%m-%Y') + # current_date = datetime.now() + # today_date = current_date.strftime('%d-%m-%Y') fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv" expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv" common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv" @@ -139,16 +139,15 @@ def run_gulf_scraper(): if __name__ == "__main__": - # print("Choose which function to run:") - # print("1 for India Scraper") - # print("2 for Gulf scraper") - # choice = input("Enter your choice (1 or 2): ") - # if choice == "1": - # run_india_scraper() - # elif choice == "2": - # run_gulf_scraper() - # else: - # print("Invalid choice. Please enter 1 or 2.") + aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] + aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] + current_date = datetime.now() + today_date = current_date.strftime('%d-%m-%Y') + bucket_name = 'compete-syndication' + folder_name = f'naukri/{today_date}' # Replace with your desired folder name + s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) + folder_key = folder_name + '/' # Note the trailing slash + s3.put_object(Bucket=bucket_name, Key=folder_key) if len(sys.argv) != 2: print("Usage: python common_task.py [gulf|india]") @@ -157,9 +156,9 @@ if __name__ == "__main__": option = sys.argv[1].lower() if option == 'gulf': - run_gulf_scraper(today_date) elif option == 'india': - run_india_scraper() + run_india_scraper(today_date) else: print("Invalid argument. 
Please use 'gulf' or 'india' as the argument.")