Update common_task.py
parent
56ff505c67
commit
f22010053c
|
@ -7,6 +7,7 @@ from naukri.search_india import NaukriJobScraper
|
|||
from naukri.jobdata_india import NaukriJobDetailScraper
|
||||
from naukri.search_gulf_r import main as gulfSearch, output_filename_csv as gulf_search_file
|
||||
from naukri.jobdata_gulf_r import NaukriGulfJobDetailScraper
|
||||
from jobstreet.jst_id_search import search_jst_id
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
|
@ -42,7 +43,7 @@ def read_s3_file(filenameInS3):
|
|||
# # Print or process the file contents
|
||||
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
|
||||
|
||||
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
|
||||
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output=None, gi="jobId"):
|
||||
today_df = pd.read_csv(today_file)
|
||||
last_file_df = pd.read_csv(last_file)
|
||||
print(today_df.shape, last_file_df.shape)
|
||||
|
@ -59,6 +60,11 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
|
|||
child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key'].astype(str)
|
||||
expired_df['Job Key'] = expired_df['Job Key'].astype(str)
|
||||
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||
elif gi == "id":
|
||||
# child_df_copy['id'] = 'id_' + child_df_copy['Job Key'].astype(str)
|
||||
# expired_df['Job Key'] = expired_df['Job Key'].astype(str)
|
||||
# expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||
pass
|
||||
else: # jobId
|
||||
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId'].astype(str)
|
||||
expired_df['jobId'] = expired_df['jobId'].astype(str)
|
||||
|
@ -68,9 +74,9 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
|
|||
|
||||
expired_df.to_csv(expired_output, index=False)
|
||||
print(new_df.shape, expired_df.shape)
|
||||
common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
|
||||
print(common_df.shape)
|
||||
common_df.to_csv(common_output, index=False)
|
||||
# common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
|
||||
# print(common_df.shape)
|
||||
# common_df.to_csv(common_output, index=False)
|
||||
|
||||
def extract_date_from_filename(filename):
|
||||
date_str = filename.split("_")[-1].replace(".csv", "")
|
||||
|
@ -156,21 +162,37 @@ if __name__ == "__main__":
|
|||
current_date = datetime.now()
|
||||
today_date = current_date.strftime('%d-%m-%Y')
|
||||
bucket_name = 'compete-syndication'
|
||||
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
|
||||
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||
folder_key = folder_name + '/' # Note the trailing slash
|
||||
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: python common_task.py [gulf|india]")
|
||||
sys.exit(1)
|
||||
|
||||
option = sys.argv[1].lower()
|
||||
|
||||
if option == 'gulf' or option == 'india':
|
||||
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
|
||||
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||
folder_key = folder_name + '/' # Note the trailing slash
|
||||
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||
elif option in ['id', 'vi', 'ph', 'si', 'my']:
|
||||
folder_name = f'jobstreet/{today_date}' # Replace with your desired folder name
|
||||
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||
folder_key = folder_name + '/' # Note the trailing slash
|
||||
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||
if option == 'gulf':
|
||||
run_gulf_scraper(today_date)
|
||||
elif option == 'india':
|
||||
run_india_scraper(today_date)
|
||||
elif option =="id":
|
||||
search_file = f"indonesia_data/daily_search_results/id_search_{today_date}.csv"
|
||||
search_jst_id(search_file)
|
||||
folder_path = "indonesia_data/daily_search_results/"
|
||||
search_pattern = "id_search_*.csv"
|
||||
last_file = find_second_latest_file(folder_path, search_pattern)
|
||||
fresh_output = f"indonesia_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||
expired_output = f"indonesia_data/daily_upload_folder/Compete_2_Indonesia_Archive_{today_date}.csv"
|
||||
do_the_difference(search_file, last_file, 'id', fresh_output, expired_output, None, "id")
|
||||
|
||||
else:
|
||||
print("Invalid argument. Please use 'gulf' or 'india' as the argument.")
|
||||
|
||||
|
|
Loading…
Reference in New Issue