From a39288eda05003d242455763476e1eeccd159820 Mon Sep 17 00:00:00 2001
From: prahul11
Date: Fri, 13 Oct 2023 01:33:40 +0530
Subject: [PATCH] Automate the daily India scrape

Add common_task.py to orchestrate the daily India pipeline: run the
search scraper, diff today's snapshot against the previous one to split
fresh and expired jobs, scrape details for the fresh jobs only, and
upload the results to S3. Date-stamp the per-run output files in
naukri/search_india.py and naukri/jobdata_india.py, and tag each job
detail row with its country.
---
 .gitignore              |   8 ++-
 common_task.py          | 139 ++++++++++++++++++++++++++++++++++++++++
 naukri/jobdata_india.py |  14 +++--
 naukri/search_india.py  |  14 ++---
 4 files changed, 161 insertions(+), 14 deletions(-)
 create mode 100644 common_task.py

diff --git a/.gitignore b/.gitignore
index f41222f..6f657a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 .vscode
-data_naukri
+data_naukri/
 scrib
 data_naukri/*.csv
 gulf_data/*.csv
@@ -7,4 +7,8 @@ not required/
 data_naukri/*.txt
 gulf_data/*.txt
 *.sh
-server scraper/
\ No newline at end of file
+server scraper/
+india_data/*.txt
+india_data/*.csv
+__pycache__/
+test data/
\ No newline at end of file
diff --git a/common_task.py b/common_task.py
new file mode 100644
index 0000000..f990ddb
--- /dev/null
+++ b/common_task.py
@@ -0,0 +1,139 @@
+import glob
+import os
+import time
+from datetime import datetime
+
+import boto3
+import pandas as pd
+
+from naukri.jobdata_india import NaukriJobDetailScraper
+from naukri.search_india import NaukriJobScraper
+
+
+def upload_file_to_bucket(localFilePath, localFileName):
+    """Upload a local file to the naukri/ prefix of the S3 bucket."""
+    s3 = boto3.client('s3')
+    bucket_name = 'compete-syndication'
+    s3_key = f'naukri/{localFileName}'
+    s3.upload_file(localFilePath, bucket_name, s3_key)
+    print(f'File "{localFilePath}" uploaded to S3 bucket "{bucket_name}" as "{s3_key}"')
+
+
+def read_s3_file(filenameInS3):
+    """Read a CSV from the naukri/ prefix of the S3 bucket into a DataFrame."""
+    # Credentials come from the environment (or the default AWS credential
+    # chain); access keys must never be hard-coded in source control.
+    session = boto3.Session(
+        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
+        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
+    )
+    s3_client = session.client('s3')
+    s3_object = s3_client.get_object(Bucket='compete-syndication', Key=f'naukri/{filenameInS3}')
+    df = pd.read_csv(s3_object['Body'])
+    print(df)
+    return df
+
+
+def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
+    """Diff two snapshots on column_for_diff: rows only in today's file go to
+    fresh_output, rows only in the previous file go to expired_output, and
+    rows present in both go to common_output."""
+    today_df = pd.read_csv(today_file)
+    last_file_df = pd.read_csv(last_file)
+    print(today_df.shape, last_file_df.shape)
+    today_df.drop_duplicates(subset=[column_for_diff], keep='first', inplace=True)
+    last_file_df.drop_duplicates(subset=[column_for_diff], keep='first', inplace=True)
+    print(today_df.shape, last_file_df.shape)
+    new_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='left', indicator=True,
+                      suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    new_df.to_csv(fresh_output, index=False)
+    expired_df = pd.merge(last_file_df, today_df, on=column_for_diff, how='left', indicator=True,
+                          suffixes=('', '_ignored')).query('_merge == "left_only"').drop(['_merge'], axis=1)
+    expired_df.to_csv(expired_output, index=False)
+    print(new_df.shape, expired_df.shape)
+    common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
+    print(common_df.shape)
+    common_df.to_csv(common_output, index=False)
+
+
+def extract_date_from_filename(filename):
+    """Parse the dd-mm-YYYY date embedded in a snapshot filename."""
+    date_str = filename.split("_")[-1].replace(".csv", "")
+    return datetime.strptime(date_str, "%d-%m-%Y")
+
+
+def find_second_latest_file(folder_path, search_pattern):
+    """Return the second-newest snapshot file, or None if fewer than two exist."""
+    files = glob.glob(os.path.join(folder_path, search_pattern))
+    files.sort(key=extract_date_from_filename, reverse=True)
+    if len(files) >= 2:
+        second_latest_file = files[1]
+        print("Second latest file:", second_latest_file)
+        return second_latest_file
+    print("There are not enough files in the folder to find the second latest file.")
+    return None
+
+
+def run_india_scraper():
+    current_date = datetime.now()
+    today_date = current_date.strftime('%d-%m-%Y')
+    india_search_input_file = "naukri/_industry_urls.csv"
+    india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv"
+    india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv"
+    india_search_stats_file = f"india_data/stats_india_{today_date}.txt"
+
+    start_time = time.time()
+    scraper = NaukriJobScraper(india_search_input_file, india_search_output_file, india_search_error_file)
+    scraper.scrape()
+    end_time = time.time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Search program took {duration_hours:.2f} hours to run.")
+    with open(india_search_stats_file, "a") as stat:
+        stat.write(f"Search program took {duration_hours:.2f} hours to run.\n")
+
+    folder_path = "india_data/daily_search_results/"
+    search_pattern = "search_result_india_*.csv"
+    last_file = find_second_latest_file(folder_path, search_pattern)
+    fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
+    expired_output = f"india_data/daily_upload_folder/expired_Compete_1_India_{today_date}.csv"
+    common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
+    if last_file is None:
+        # First run: there is no earlier snapshot to diff against, so every
+        # job found today is fresh and nothing has expired yet.
+        today_only = pd.read_csv(india_search_output_file)
+        today_only.to_csv(fresh_output, index=False)
+        today_only.head(0).to_csv(expired_output, index=False)
+    else:
+        do_the_difference(india_search_output_file, last_file, 'jdURL',
+                          fresh_output, expired_output, common_output)
+
+    india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_{today_date}.csv"
+    india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
+    start_time = time.time()
+    scraper = NaukriJobDetailScraper(fresh_output, india_detail_file, india_detail_error_file)
+    scraper.scrape()
+    end_time = time.time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
+    with open(f'india_data/daily_stats_folder/stats_file_of_{today_date}.txt', "a") as stat:
+        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
+
+    upload_file_to_bucket(expired_output, f"expired_Compete_1_India_{today_date}.csv")
+    upload_file_to_bucket(india_detail_file, f"Compete_1_India_{today_date}.csv")
+
+
+def run_gulf_scraper():
+    pass
+
+
+if __name__ == "__main__":
+    print("Choose which function to run:")
+    print("1 for India Scraper")
+    print("2 for Gulf scraper")
+    choice = input("Enter your choice (1 or 2): ")
+    if choice == "1":
+        run_india_scraper()
+    elif choice == "2":
+        run_gulf_scraper()
+    else:
+        print("Invalid choice. Please enter 1 or 2.")
diff --git a/naukri/jobdata_india.py b/naukri/jobdata_india.py
index 4b8dc38..e6070a5 100644
--- a/naukri/jobdata_india.py
+++ b/naukri/jobdata_india.py
@@ -1,20 +1,23 @@
 import requests
 import csv
 import time
-import json
 import os
 import logging
+from datetime import datetime
 
+current_date = datetime.now()
+today_date = current_date.strftime('%d-%m-%Y')
 # Configure the logging settings
 logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger()
 
 # Global variables
-input_file = "data_naukri/search_result_india.csv"
-output_file = "data_naukri/jobdata_india.csv"
-error_file = "data_naukri/jobdata_error_india.csv"
-stats_file = "data_naukri/stats.txt"
+input_file = f"india_data/search_result_india_{today_date}.csv"
+output_file = f"india_data/jobdata_india_{today_date}.csv"
+error_file = f"india_data/jobdata_error_india_{today_date}.csv"
+stats_file = f"india_data/stats_{today_date}.txt"
 skip=0
 
+
 class NaukriJobDetailScraper:
     base_url = "https://www.naukri.com/jobapi/v4/job/{}"
@@ -76,6 +79,7 @@ class NaukriJobDetailScraper:
             "Minimum Experience": job_details.get("minimumExperience"),
             "Maximum Experience": job_details.get("maximumExperience"),
             "Salary Detail": job_details.get("salaryDetail"),
+            "Country": "India",
         }
 
         return json_data
diff --git a/naukri/search_india.py b/naukri/search_india.py
index 5bf722d..03486d5 100644
--- a/naukri/search_india.py
+++ b/naukri/search_india.py
@@ -1,20 +1,22 @@
 import requests
-import json
 import csv
 import os
 import time
 import math
 import logging
+from datetime import datetime
 
 # Configure the logging settings
 logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger()
-# Global variables
+current_date = datetime.now()
+today_date = current_date.strftime('%d-%m-%Y')
 input_file = "naukri/_industry_urls.csv"
-output_file = "data_naukri/search_result_india.csv"
-error_file = "data_naukri/search_error_india.csv"
-stats_file = "data_naukri/stats_india.txt"
+output_file = f"india_data/search_result_india_{today_date}.csv"
+error_file = f"india_data/search_error_india_{today_date}.csv"
+stats_file = f"india_data/stats_india_{today_date}.txt"
+
 
 class NaukriJobScraper:
     base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
     headers = {
@@ -59,13 +61,11 @@ class NaukriJobScraper:
             csv_writer.writerows(parsed_data)
 
     def scrape(self):
-
         with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
             csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
             csv_writer.writeheader()
 
         with open(self.input_file_path, 'r') as file:
-
             file_read = csv.reader(file)
             for industry in file_read:
                 industry_read_url = industry[0].replace("\n", "")
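
Note on the diffing step: do_the_difference relies on pandas' merge
indicator to split today's snapshot from the previous one. A minimal
sketch of the same idea on made-up frames (the jdURL column name matches
the scraper output; the values are purely illustrative):

    import pandas as pd

    today = pd.DataFrame({"jdURL": ["a", "b", "c"]})     # today's snapshot
    previous = pd.DataFrame({"jdURL": ["b", "c", "d"]})  # previous snapshot

    # indicator=True adds a _merge column valued left_only/right_only/both.
    merged = pd.merge(today, previous, on="jdURL", how="left", indicator=True)
    fresh = merged.query('_merge == "left_only"').drop(columns="_merge")
    print(fresh)  # only "a": seen today but absent from the previous snapshot

Swapping the two operands yields the expired rows the same way, and an
inner merge yields the rows common to both snapshots.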
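
Note on snapshot ordering: find_second_latest_file sorts with a
parsed-date key because the filenames embed dates as dd-mm-YYYY, which
do not sort chronologically as plain strings. A small illustration with
invented filenames:

    from datetime import datetime

    names = [
        "search_result_india_09-10-2023.csv",
        "search_result_india_29-09-2023.csv",
        "search_result_india_13-10-2023.csv",
    ]
    # Lexically "29-09-2023" sorts ahead of "13-10-2023"; parsing the
    # embedded date restores true chronological order.
    names.sort(key=lambda n: datetime.strptime(n.split("_")[-1].replace(".csv", ""),
                                               "%d-%m-%Y"),
               reverse=True)
    print(names[1])  # second latest: search_result_india_09-10-2023.csv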