prahul11 2023-10-17 17:16:23 +05:30
parent b45dde14ee
commit d37e1fdd46
1 changed files with 21 additions and 22 deletions

View File

@ -11,11 +11,11 @@ import time
import os import os
import sys import sys
def upload_file_to_bucket(localFilePath, localFileName, today_date):
    """Upload a local file into the day-folder of the 'compete-syndication' S3 bucket.

    The object key is ``naukri/<today_date>/<localFileName>``, so each day's
    uploads land in their own S3 "folder".

    NOTE(review): this source is a side-by-side diff capture (old text + new
    text on each line); this body is the reconstructed post-commit version.

    Args:
        localFilePath: path on disk of the file to upload.
        localFileName: file name to use in the S3 object key.
        today_date: date string used as the folder segment of the key
            (elsewhere in this file it is formatted as '%d-%m-%Y').

    Side effects:
        Uploads to S3 via boto3 and prints a confirmation line.
        Relies on ambient AWS credentials (default boto3 credential chain).
    """
    s3 = boto3.client('s3')
    bucket_name = 'compete-syndication'
    s3_key = f'naukri/{today_date}/{localFileName}'
    s3.upload_file(localFilePath, bucket_name, s3_key)
    # Keep the original message format, but reuse the parameter directly
    # instead of the redundant `file_path` alias the old code carried.
    file_path = localFilePath
    print(f'File "{file_path}" uploaded to S3 bucket "{bucket_name}" as "{s3_key}"')
@ -75,9 +75,9 @@ def find_second_latest_file(folder_path, search_pattern):
print("There are not enough files in the folder to find the second latest file.") print("There are not enough files in the folder to find the second latest file.")
return None return None
# NOTE(review): side-by-side diff capture — each line below shows the
# pre-commit text immediately followed by the post-commit text. The commit
# changes run_india_scraper() to take `today_date` from the caller instead of
# computing it locally (the two datetime lines are commented out on the new side).
def run_india_scraper(): def run_india_scraper(today_date):
current_date = datetime.now() # current_date = datetime.now()
today_date = current_date.strftime('%d-%m-%Y') # today_date = current_date.strftime('%d-%m-%Y')
india_search_input_file = "naukri/_industry_urls.csv" india_search_input_file = "naukri/_industry_urls.csv"
india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv" india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv"
india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv" india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv"
# Hunk gap: lines 14-110 of the real file are not shown here.
@ -110,16 +110,16 @@ def run_india_scraper():
stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n") stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
# NOTE(review): today_date is recomputed here on BOTH sides of the diff,
# shadowing the new `today_date` parameter — presumably intentional for
# long-running jobs that cross midnight, but worth confirming.
current_date = datetime.now() current_date = datetime.now()
today_date = current_date.strftime('%d-%m-%Y') today_date = current_date.strftime('%d-%m-%Y')
# New side threads today_date through to upload_file_to_bucket so uploads
# land in the per-day S3 folder.
upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" ) upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" , today_date)
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" ) upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date)
# NOTE(review): side-by-side diff capture — pre-commit text followed by
# post-commit text on each line. Same refactor as run_india_scraper: the new
# side takes `today_date` as a parameter and comments out the local
# datetime computation. Function body continues past this hunk (truncated).
def run_gulf_scraper(): def run_gulf_scraper(today_date):
gulfSearch() gulfSearch()
folder_path = "gulf_data/daily_search_results/" folder_path = "gulf_data/daily_search_results/"
search_pattern = "search_result_gulf_*.csv" search_pattern = "search_result_gulf_*.csv"
last_file = find_second_latest_file(folder_path, search_pattern) last_file = find_second_latest_file(folder_path, search_pattern)
current_date = datetime.now() # current_date = datetime.now()
today_date = current_date.strftime('%d-%m-%Y') # today_date = current_date.strftime('%d-%m-%Y')
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv" fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv" expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv" common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
@ -139,16 +139,15 @@ def run_gulf_scraper():
# NOTE(review): side-by-side diff capture. The commit replaces the old
# commented-out interactive menu (left column) with new startup code (right
# column): compute today_date once, pre-create the per-day S3 "folder", then
# dispatch on sys.argv[1].
if __name__ == "__main__": if __name__ == "__main__":
# SECURITY(review): the new code hardcodes a live-looking AWS access key ID
# and secret access key in source. These credentials are now in the commit
# history and must be treated as compromised: revoke/rotate them in IAM and
# load credentials from the environment, an AWS profile, or an instance
# role instead of embedding them here.
# print("Choose which function to run:") aws_access_key_id = 'AKIAWWHGITBE7XFXWA7U'
# print("1 for India Scraper") aws_secret_access_key = 'jGoGwiwRClje6fXcwOI9wHTcbSAWBt41DUjc8RBX'
# choice = input("Enter your choice (1 or 2): ") today_date = current_date.strftime('%d-%m-%Y')
# if choice == "1": bucket_name = 'compete-syndication'
# run_india_scraper() folder_name = f'naukri/{today_date}' # Replace with your desired folder name
# elif choice == "2": s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
# NOTE(review): put_object with a trailing-slash key creates an empty
# zero-byte object that the S3 console renders as a folder.
# run_gulf_scraper() folder_key = folder_name + '/' # Note the trailing slash
# else: s3.put_object(Bucket=bucket_name, Key=folder_key)
# print("Invalid choice. Please enter 1 or 2.")
if len(sys.argv) != 2: if len(sys.argv) != 2:
print("Usage: python common_task.py [gulf|india]") print("Usage: python common_task.py [gulf|india]")
# Hunk gap: a few lines are not shown here (presumably the early exit
# after the usage message — confirm in the full file).
@ -157,9 +156,9 @@ if __name__ == "__main__":
option = sys.argv[1].lower() option = sys.argv[1].lower()
if option == 'gulf': if option == 'gulf':
# New side passes today_date into both scrapers (matches their new signatures).
run_gulf_scraper() run_gulf_scraper(today_date)
elif option == 'india': elif option == 'india':
run_india_scraper() run_india_scraper(today_date)
else: else:
print("Invalid argument. Please use 'gulf' or 'india' as the argument.") print("Invalid argument. Please use 'gulf' or 'india' as the argument.")