Update common_task.py

prahul11 2023-11-06 15:53:15 +05:30
parent 56ff505c67
commit f22010053c
1 changed file with 31 additions and 9 deletions

View File

@@ -7,6 +7,7 @@ from naukri.search_india import NaukriJobScraper
from naukri.jobdata_india import NaukriJobDetailScraper
from naukri.search_gulf_r import main as gulfSearch, output_filename_csv as gulf_search_file
from naukri.jobdata_gulf_r import NaukriGulfJobDetailScraper
from jobstreet.jst_id_search import search_jst_id
import time
import os
import sys
@@ -42,7 +43,7 @@ def read_s3_file(filenameInS3):
# # Print or process the file contents
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output=None, gi):
[review note: the new signature places the non-default parameter `gi` after the defaulted `common_output=None`, which is a SyntaxError in Python — `gi` must also be given a default (e.g. `gi="i"`), or `common_output` must stay without one.]
today_df = pd.read_csv(today_file)
last_file_df = pd.read_csv(last_file)
print(today_df.shape, last_file_df.shape)
@@ -59,6 +60,11 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key'].astype(str)
expired_df['Job Key'] = expired_df['Job Key'].astype(str)
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
elif gi == "id":
# child_df_copy['id'] = 'id_' + child_df_copy['Job Key'].astype(str)
# expired_df['Job Key'] = expired_df['Job Key'].astype(str)
# expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
pass
else: # jobId
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId'].astype(str)
expired_df['jobId'] = expired_df['jobId'].astype(str)
@@ -68,9 +74,9 @@ expired_df.to_csv(expired_output, index=False)
expired_df.to_csv(expired_output, index=False)
print(new_df.shape, expired_df.shape)
# common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
# print(common_df.shape)
# common_df.to_csv(common_output, index=False)
def extract_date_from_filename(filename):
date_str = filename.split("_")[-1].replace(".csv", "")
@@ -156,21 +162,37 @@ if __name__ == "__main__":
current_date = datetime.now()
today_date = current_date.strftime('%d-%m-%Y')
bucket_name = 'compete-syndication'
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
folder_key = folder_name + '/' # Note the trailing slash
s3.put_object(Bucket=bucket_name, Key=folder_key)
if len(sys.argv) != 2:
print("Usage: python common_task.py [gulf|india]")
sys.exit(1)
option = sys.argv[1].lower()
if option == 'gulf' or option == 'india':
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
folder_key = folder_name + '/' # Note the trailing slash
s3.put_object(Bucket=bucket_name, Key=folder_key)
elif option in ['id', 'vi', 'ph', 'si', 'my']:
folder_name = f'jobstreet/{today_date}' # Replace with your desired folder name
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
folder_key = folder_name + '/' # Note the trailing slash
s3.put_object(Bucket=bucket_name, Key=folder_key)
if option == 'gulf':
run_gulf_scraper(today_date)
elif option == 'india':
run_india_scraper(today_date)
elif option =="id":
search_file = f"indonesia_data/daily_search_results/id_search_{today_date}.csv"
search_jst_id(search_file)
folder_path = "indonesia_data/daily_search_results/"
search_pattern = "id_search_*.csv"
last_file = find_second_latest_file(folder_path, search_pattern)
fresh_output = f"indonesia_data/daily_process_folder/new_jobs_on_{today_date}.csv"
expired_output = f"indonesia_data/daily_upload_folder/Compete_2_Indonesia_Archive_{today_date}.csv"
do_the_difference(search_file, last_file, 'id', fresh_output, expired_output, None, "id")
else:
print("Invalid argument. Please use 'gulf' or 'india' as the argument.")
[review note: the usage and invalid-argument messages still mention only 'gulf'/'india', although this commit adds an 'id' branch (and S3 folder handling for 'vi', 'ph', 'si', 'my') — the messages should be updated to match.]