parent
b45dde14ee
commit
d37e1fdd46
|
@ -11,11 +11,11 @@ import time
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
def upload_file_to_bucket(localFilePath, localFileName, today_date, bucket_name='compete-syndication'):
    """Upload a local file to S3 under a date-partitioned ``naukri/`` prefix.

    Parameters:
        localFilePath: Path of the file on local disk to upload.
        localFileName: File name to use for the S3 object.
        today_date: Date string (dd-mm-YYYY elsewhere in this script) used as
            the folder segment of the S3 key, i.e. ``naukri/<today_date>/...``.
        bucket_name: Destination bucket; defaults to the production
            'compete-syndication' bucket so existing callers are unchanged.

    Raises:
        botocore.exceptions.ClientError: propagated from boto3 if the upload
            fails (no credentials, missing bucket, network error, ...).
    """
    # Client picks up credentials from the environment / IAM role / config.
    s3 = boto3.client('s3')
    s3_key = f'naukri/{today_date}/{localFileName}'
    s3.upload_file(localFilePath, bucket_name, s3_key)
    print(f'File "{localFilePath}" uploaded to S3 bucket "{bucket_name}" as "{s3_key}"')
||||||
|
|
||||||
|
@ -75,9 +75,9 @@ def find_second_latest_file(folder_path, search_pattern):
|
||||||
print("There are not enough files in the folder to find the second latest file.")
|
print("There are not enough files in the folder to find the second latest file.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def run_india_scraper():
|
def run_india_scraper(today_date):
|
||||||
current_date = datetime.now()
|
# current_date = datetime.now()
|
||||||
today_date = current_date.strftime('%d-%m-%Y')
|
# today_date = current_date.strftime('%d-%m-%Y')
|
||||||
india_search_input_file = "naukri/_industry_urls.csv"
|
india_search_input_file = "naukri/_industry_urls.csv"
|
||||||
india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv"
|
india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv"
|
||||||
india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv"
|
india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv"
|
||||||
|
@ -110,16 +110,16 @@ def run_india_scraper():
|
||||||
stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
|
stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
|
||||||
current_date = datetime.now()
|
current_date = datetime.now()
|
||||||
today_date = current_date.strftime('%d-%m-%Y')
|
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" )
|
upload_file_to_bucket(expired_output, f"Compete_1_India_Archieve_{today_date}.csv" , today_date)
|
||||||
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" )
|
upload_file_to_bucket(india_detail_file, f"Compete_1_India_Active_{today_date}.csv" , today_date)
|
||||||
|
|
||||||
def run_gulf_scraper():
|
def run_gulf_scraper(today_date):
|
||||||
gulfSearch()
|
gulfSearch()
|
||||||
folder_path = "gulf_data/daily_search_results/"
|
folder_path = "gulf_data/daily_search_results/"
|
||||||
search_pattern = "search_result_gulf_*.csv"
|
search_pattern = "search_result_gulf_*.csv"
|
||||||
last_file = find_second_latest_file(folder_path, search_pattern)
|
last_file = find_second_latest_file(folder_path, search_pattern)
|
||||||
current_date = datetime.now()
|
# current_date = datetime.now()
|
||||||
today_date = current_date.strftime('%d-%m-%Y')
|
# today_date = current_date.strftime('%d-%m-%Y')
|
||||||
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
fresh_output = f"gulf_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||||
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
|
expired_output = f"gulf_data/daily_upload_folder/Compete_1_Gulf_Archieve_{today_date}.csv"
|
||||||
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
|
common_output = f"gulf_data/daily_common_folder/common_data_on_{today_date}.csv"
|
||||||
|
@ -139,16 +139,15 @@ def run_gulf_scraper():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# print("Choose which function to run:")
|
# SECURITY(review): AWS access key and secret key are hardcoded below and are
# now committed to version control — revoke/rotate these credentials
# immediately and load them from the environment, a credentials file, or an
# IAM role instead (boto3 resolves all of those automatically).
aws_access_key_id = 'AKIAWWHGITBE7XFXWA7U'
|
||||||
# print("1 for India Scraper")
|
aws_secret_access_key = 'jGoGwiwRClje6fXcwOI9wHTcbSAWBt41DUjc8RBX'
|
||||||
# print("2 for Gulf scraper")
|
current_date = datetime.now()
|
||||||
# choice = input("Enter your choice (1 or 2): ")
|
# Date string shared by the scrapers and the S3 folder key below.
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
# if choice == "1":
|
bucket_name = 'compete-syndication'
|
||||||
# run_india_scraper()
|
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
|
||||||
# elif choice == "2":
|
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||||
# run_gulf_scraper()
|
folder_key = folder_name + '/' # Note the trailing slash
|
||||||
# else:
|
# Creates an empty "folder" marker object in S3 before any files are uploaded.
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||||
# print("Invalid choice. Please enter 1 or 2.")
|
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
print("Usage: python common_task.py [gulf|india]")
|
print("Usage: python common_task.py [gulf|india]")
|
||||||
|
@ -157,9 +156,9 @@ if __name__ == "__main__":
|
||||||
option = sys.argv[1].lower()
|
option = sys.argv[1].lower()
|
||||||
|
|
||||||
if option == 'gulf':
|
if option == 'gulf':
|
||||||
run_gulf_scraper()
|
run_gulf_scraper(today_date)
|
||||||
elif option == 'india':
|
elif option == 'india':
|
||||||
run_india_scraper()
|
run_india_scraper(today_date)
|
||||||
else:
|
else:
|
||||||
print("Invalid argument. Please use 'gulf' or 'india' as the argument.")
|
print("Invalid argument. Please use 'gulf' or 'india' as the argument.")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue