Update common_task.py
parent
56ff505c67
commit
f22010053c
|
@ -7,6 +7,7 @@ from naukri.search_india import NaukriJobScraper
|
||||||
from naukri.jobdata_india import NaukriJobDetailScraper
|
from naukri.jobdata_india import NaukriJobDetailScraper
|
||||||
from naukri.search_gulf_r import main as gulfSearch, output_filename_csv as gulf_search_file
|
from naukri.search_gulf_r import main as gulfSearch, output_filename_csv as gulf_search_file
|
||||||
from naukri.jobdata_gulf_r import NaukriGulfJobDetailScraper
|
from naukri.jobdata_gulf_r import NaukriGulfJobDetailScraper
|
||||||
|
from jobstreet.jst_id_search import search_jst_id
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
@ -42,7 +43,7 @@ def read_s3_file(filenameInS3):
|
||||||
# # Print or process the file contents
|
# # Print or process the file contents
|
||||||
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
|
# print(file_content.decode('utf-8')) # Assumes the file is text; adjust accordingly
|
||||||
|
|
||||||
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output, gi):
|
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output=None, gi=None):
|
||||||
today_df = pd.read_csv(today_file)
|
today_df = pd.read_csv(today_file)
|
||||||
last_file_df = pd.read_csv(last_file)
|
last_file_df = pd.read_csv(last_file)
|
||||||
print(today_df.shape, last_file_df.shape)
|
print(today_df.shape, last_file_df.shape)
|
||||||
|
@ -59,6 +60,11 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
|
||||||
child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key'].astype(str)
|
child_df_copy['Job Key'] = 'g_' + child_df_copy['Job Key'].astype(str)
|
||||||
expired_df['Job Key'] = expired_df['Job Key'].astype(str)
|
expired_df['Job Key'] = expired_df['Job Key'].astype(str)
|
||||||
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||||
|
elif gi == "id":
|
||||||
|
# child_df_copy['id'] = 'id_' + child_df_copy['Job Key'].astype(str)
|
||||||
|
# expired_df['Job Key'] = expired_df['Job Key'].astype(str)
|
||||||
|
# expired_df = pd.concat([expired_df, child_df_copy], ignore_index=True)
|
||||||
|
pass
|
||||||
else: # jobId
|
else: # jobId
|
||||||
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId'].astype(str)
|
child_df_copy['jobId'] = 'i_' + child_df_copy['jobId'].astype(str)
|
||||||
expired_df['jobId'] = expired_df['jobId'].astype(str)
|
expired_df['jobId'] = expired_df['jobId'].astype(str)
|
||||||
|
@ -68,9 +74,9 @@ def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expi
|
||||||
|
|
||||||
expired_df.to_csv(expired_output, index=False)
|
expired_df.to_csv(expired_output, index=False)
|
||||||
print(new_df.shape, expired_df.shape)
|
print(new_df.shape, expired_df.shape)
|
||||||
common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
|
# common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
|
||||||
print(common_df.shape)
|
# print(common_df.shape)
|
||||||
common_df.to_csv(common_output, index=False)
|
# common_df.to_csv(common_output, index=False)
|
||||||
|
|
||||||
def extract_date_from_filename(filename):
|
def extract_date_from_filename(filename):
|
||||||
date_str = filename.split("_")[-1].replace(".csv", "")
|
date_str = filename.split("_")[-1].replace(".csv", "")
|
||||||
|
@ -156,21 +162,37 @@ if __name__ == "__main__":
|
||||||
current_date = datetime.now()
|
current_date = datetime.now()
|
||||||
today_date = current_date.strftime('%d-%m-%Y')
|
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
bucket_name = 'compete-syndication'
|
bucket_name = 'compete-syndication'
|
||||||
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
|
|
||||||
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
|
||||||
folder_key = folder_name + '/' # Note the trailing slash
|
|
||||||
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
print("Usage: python common_task.py [gulf|india]")
|
print("Usage: python common_task.py [gulf|india|id]")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
option = sys.argv[1].lower()
|
option = sys.argv[1].lower()
|
||||||
|
if option == 'gulf' or option == 'india':
|
||||||
|
folder_name = f'naukri/{today_date}' # Replace with your desired folder name
|
||||||
|
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||||
|
folder_key = folder_name + '/' # Note the trailing slash
|
||||||
|
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||||
|
elif option in ['id', 'vi', 'ph', 'si', 'my']:
|
||||||
|
folder_name = f'jobstreet/{today_date}' # Replace with your desired folder name
|
||||||
|
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
|
||||||
|
folder_key = folder_name + '/' # Note the trailing slash
|
||||||
|
s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||||
if option == 'gulf':
|
if option == 'gulf':
|
||||||
run_gulf_scraper(today_date)
|
run_gulf_scraper(today_date)
|
||||||
elif option == 'india':
|
elif option == 'india':
|
||||||
run_india_scraper(today_date)
|
run_india_scraper(today_date)
|
||||||
|
elif option =="id":
|
||||||
|
search_file = f"indonesia_data/daily_search_results/id_search_{today_date}.csv"
|
||||||
|
search_jst_id(search_file)
|
||||||
|
folder_path = "indonesia_data/daily_search_results/"
|
||||||
|
search_pattern = "id_search_*.csv"
|
||||||
|
last_file = find_second_latest_file(folder_path, search_pattern)
|
||||||
|
fresh_output = f"indonesia_data/daily_process_folder/new_jobs_on_{today_date}.csv"
|
||||||
|
expired_output = f"indonesia_data/daily_upload_folder/Compete_2_Indonesia_Archive_{today_date}.csv"
|
||||||
|
do_the_difference(search_file, last_file, 'id', fresh_output, expired_output, None, "id")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print("Invalid argument. Please use 'gulf' or 'india' as the argument.")
|
print("Invalid argument. Please use 'gulf', 'india' or 'id' as the argument.")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue