automation 1
parent
c55aeda72a
commit
a39288eda0
|
@ -1,5 +1,5 @@
|
||||||
.vscode
|
.vscode
|
||||||
data_naukri
|
data_naukri/
|
||||||
scrib
|
scrib
|
||||||
data_naukri/*.csv
|
data_naukri/*.csv
|
||||||
gulf_data/*.csv
|
gulf_data/*.csv
|
||||||
|
@ -8,3 +8,7 @@ data_naukri/*.txt
|
||||||
gulf_data/*.txt
|
gulf_data/*.txt
|
||||||
*.sh
|
*.sh
|
||||||
server scraper/
|
server scraper/
|
||||||
|
india_data/*.txt
|
||||||
|
india_data/*.csv
|
||||||
|
__pycache__/
|
||||||
|
test data/
|
|
@ -0,0 +1,133 @@
|
||||||
|
import pandas as pd
|
||||||
|
import boto3
|
||||||
|
from datetime import datetime
|
||||||
|
import glob
|
||||||
|
|
||||||
|
from naukri.search_india import NaukriJobScraper
|
||||||
|
from naukri.jobdata_india import NaukriJobDetailScraper
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
def upload_file_to_bucket(localFilePath, localFileName):
    """Upload a local file to the 'compete-syndication' S3 bucket.

    The object is stored under the 'naukri/' prefix using the given name.

    Args:
        localFilePath: path of the file on disk to upload.
        localFileName: object name to use in the bucket (without prefix).
    """
    client = boto3.client('s3')
    bucket_name = 'compete-syndication'
    file_path = localFilePath
    s3_key = f'naukri/{localFileName}'
    client.upload_file(file_path, bucket_name, s3_key)
    print(f'File "{file_path}" uploaded to S3 bucket "{bucket_name}" as "{s3_key}"')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def read_s3_file(filenameInS3):
    """Read a CSV object from the 'compete-syndication' S3 bucket.

    Args:
        filenameInS3: object name under the 'naukri/' prefix.

    Returns:
        pandas.DataFrame parsed from the CSV object (also printed).
    """
    # SECURITY: an AWS access key id and secret were previously hard-coded
    # here in source control. Those credentials are compromised and must be
    # rotated. We now rely on boto3's default credential chain (environment
    # variables, ~/.aws/credentials, or an instance/role profile) instead.
    s3_bucket = 'compete-syndication'
    s3_file_key = f'naukri/{filenameInS3}'
    session = boto3.Session()
    s3_client = session.client('s3')
    s3_object = s3_client.get_object(Bucket=s3_bucket, Key=s3_file_key)
    df = pd.read_csv(s3_object['Body'])
    print(df)
    # Return the frame so callers can actually use the data
    # (previously it was only printed and discarded).
    return df
|
||||||
|
|
||||||
|
def do_the_difference(today_file, last_file, column_for_diff, fresh_output, expired_output, common_output):
    """Diff two daily CSV snapshots on a key column and write three CSVs.

    fresh_output receives rows whose key appears only in today's file,
    expired_output rows whose key appears only in the previous file, and
    common_output rows whose key appears in both. Duplicate keys within
    each file are dropped first (first occurrence kept).
    """
    today_df = pd.read_csv(today_file)
    last_file_df = pd.read_csv(last_file)
    print(today_df.shape, last_file_df.shape)

    # De-duplicate on the key column so the merges are one-to-one.
    today_df.drop_duplicates(subset=[column_for_diff], keep='first', inplace=True)
    last_file_df.drop_duplicates(subset=[column_for_diff], keep='first', inplace=True)
    print(today_df.shape, last_file_df.shape)

    def _only_in_left(left, right):
        # Left anti-join: rows of `left` whose key never appears in `right`.
        merged = pd.merge(left, right, on=column_for_diff, how='left',
                          indicator=True, suffixes=('', '_ignored'))
        return merged.query('_merge == "left_only"').drop(['_merge'], axis=1)

    new_df = _only_in_left(today_df, last_file_df)
    new_df.to_csv(fresh_output, index=False)

    expired_df = _only_in_left(last_file_df, today_df)
    expired_df.to_csv(expired_output, index=False)
    print(new_df.shape, expired_df.shape)

    common_df = pd.merge(today_df, last_file_df, on=column_for_diff, how='inner')
    print(common_df.shape)
    common_df.to_csv(common_output, index=False)
|
||||||
|
|
||||||
|
def extract_date_from_filename(filename):
    """Parse the trailing DD-MM-YYYY date out of a '..._DD-MM-YYYY.csv' name."""
    stamp = filename.split("_")[-1]
    return datetime.strptime(stamp.replace(".csv", ""), "%d-%m-%Y")
|
||||||
|
|
||||||
|
def find_second_latest_file(folder_path, search_pattern):
    """Return the path of the second most recent matching file, or None.

    Recency is determined by the DD-MM-YYYY date embedded in each filename
    (via extract_date_from_filename), not by filesystem mtime. Prints a
    message and returns None when fewer than two files match.
    """
    matches = glob.glob(os.path.join(folder_path, search_pattern))
    matches.sort(key=extract_date_from_filename, reverse=True)
    if len(matches) < 2:
        print("There are not enough files in the folder to find the second latest file.")
        return None
    second_latest_file = matches[1]
    print("Second latest file:", second_latest_file)
    return second_latest_file
|
||||||
|
|
||||||
|
def run_india_scraper():
    """Daily India pipeline: scrape search results, diff them against the
    previous run, scrape details for the fresh jobs only, and upload the
    results (fresh details + expired listings) to S3.
    """
    # Date stamp (DD-MM-YYYY) embedded in every output filename below.
    current_date = datetime.now()
    today_date = current_date.strftime('%d-%m-%Y')

    # Stage 1: search scrape -- industry URL list in, one listings CSV out.
    india_search_input_file = "naukri/_industry_urls.csv"
    india_search_output_file = f"india_data/daily_search_results/search_result_india_{today_date}.csv"
    india_search_error_file = f"india_data/daily_error_folder/search_error_india_{today_date}.csv"
    india_search_stats_file = f"india_data/stats_india_{today_date}.txt"
    start_time = time.time()
    scraper = NaukriJobScraper(india_search_input_file, india_search_output_file, india_search_error_file)
    scraper.scrape()
    end_time = time.time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Search program took {duration_hours:.2f} hours to run.")
    # Append (not overwrite) so reruns on the same day keep all timings.
    with open(india_search_stats_file, "a") as stat:
        stat.write(f"Search program took {duration_hours:.2f} hours to run. \n")

    # Stage 2: diff today's search results against the previous run.
    # Today's file is already the newest match, so the previous run is the
    # *second* latest file in the folder.
    folder_path = "india_data/daily_search_results/"
    search_pattern = "search_result_india_*.csv"
    last_file = find_second_latest_file(folder_path, search_pattern)
    fresh_output = f"india_data/daily_process_folder/new_jobs_on_{today_date}.csv"
    expired_output = f"india_data/daily_upload_folder/expired_Compete_1_India_{today_date}.csv"
    common_output = f"india_data/daily_common_folder/common_data_on_{today_date}.csv"
    # NOTE(review): last_file is None on the very first run, which would make
    # do_the_difference fail at pd.read_csv(None) -- confirm a prior results
    # file always exists before this runs.
    do_the_difference(india_search_output_file, last_file, 'jdURL',
                      fresh_output, expired_output, common_output)

    # Stage 3: detail scrape, fed only the jobs that are new today.
    india_detail_file = f"india_data/daily_upload_folder/Compete_1_India_{today_date}.csv"
    india_detail_error_file = f"india_data/daily_error_folder/error_on_India_detail_{today_date}.txt"
    start_time = time.time()
    scraper = NaukriJobDetailScraper(fresh_output, india_detail_file, india_detail_error_file)
    scraper.scrape()
    end_time = time.time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(f'india_data/daily_stats_folder/stats_file_of_{today_date}.txt', "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")

    # Stage 4: publish expired listings and fresh job details to S3.
    upload_file_to_bucket(expired_output, f"expired_Compete_1_India_{today_date}.csv" )
    upload_file_to_bucket(india_detail_file, f"Compete_1_India_{today_date}.csv" )
|
||||||
|
|
||||||
|
def run_gulf_scraper():
    """Placeholder for the Gulf scraping pipeline; not implemented yet."""
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Interactive entry point: the operator picks which pipeline to run.
    print("Choose which function to run:")
    print("1 for India Scraper")
    print("2 for Gulf scraper")
    choice = input("Enter your choice (1 or 2): ")
    dispatch = {"1": run_india_scraper, "2": run_gulf_scraper}
    action = dispatch.get(choice)
    if action is not None:
        action()
    else:
        print("Invalid choice. Please enter 1 or 2.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,20 +1,23 @@
|
||||||
import requests
|
import requests
|
||||||
import csv
|
import csv
|
||||||
import time
|
import time
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
current_date = datetime.now()
|
||||||
|
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
|
|
||||||
# Configure the logging settings
|
# Configure the logging settings
|
||||||
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
# Global variables
|
# Global variables
|
||||||
input_file = "data_naukri/search_result_india.csv"
|
input_file = f"india_data/search_result_india_{today_date}.csv"
|
||||||
output_file = "data_naukri/jobdata_india.csv"
|
output_file = f"india_data/jobdata_india_{today_date}.csv"
|
||||||
error_file = "data_naukri/jobdata_error_india.csv"
|
error_file = f"india_data/jobdata_error_india_{today_date}.csv"
|
||||||
stats_file = "data_naukri/stats.txt"
|
stats_file = f"india_data/stats_{today_date}.txt"
|
||||||
skip=0
|
skip=0
|
||||||
|
|
||||||
class NaukriJobDetailScraper:
|
class NaukriJobDetailScraper:
|
||||||
|
|
||||||
base_url = "https://www.naukri.com/jobapi/v4/job/{}"
|
base_url = "https://www.naukri.com/jobapi/v4/job/{}"
|
||||||
|
@ -76,6 +79,7 @@ class NaukriJobDetailScraper:
|
||||||
"Minimum Experience": job_details.get("minimumExperience"),
|
"Minimum Experience": job_details.get("minimumExperience"),
|
||||||
"Maximum Experience": job_details.get("maximumExperience"),
|
"Maximum Experience": job_details.get("maximumExperience"),
|
||||||
"Salary Detail": job_details.get("salaryDetail"),
|
"Salary Detail": job_details.get("salaryDetail"),
|
||||||
|
"Country" : "India"
|
||||||
}
|
}
|
||||||
return json_data
|
return json_data
|
||||||
|
|
||||||
|
|
|
@ -1,20 +1,22 @@
|
||||||
import requests
|
import requests
|
||||||
import json
|
|
||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
# Configure the logging settings
|
# Configure the logging settings
|
||||||
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
# Global variables
|
current_date = datetime.now()
|
||||||
|
today_date = current_date.strftime('%d-%m-%Y')
|
||||||
input_file = "naukri/_industry_urls.csv"
|
input_file = "naukri/_industry_urls.csv"
|
||||||
output_file = "data_naukri/search_result_india.csv"
|
output_file = f"india_data/search_result_india_{today_date}.csv"
|
||||||
error_file = "data_naukri/search_error_india.csv"
|
error_file = f"india_data/search_error_india_{today_date}.csv"
|
||||||
stats_file = "data_naukri/stats_india.txt"
|
stats_file = f"india_data/stats_india_{today_date}.txt"
|
||||||
|
|
||||||
class NaukriJobScraper:
|
class NaukriJobScraper:
|
||||||
base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
|
base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
|
||||||
headers = {
|
headers = {
|
||||||
|
@ -59,13 +61,11 @@ class NaukriJobScraper:
|
||||||
csv_writer.writerows(parsed_data)
|
csv_writer.writerows(parsed_data)
|
||||||
|
|
||||||
def scrape(self):
|
def scrape(self):
|
||||||
|
|
||||||
with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
|
with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
|
||||||
csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
|
csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
|
||||||
csv_writer.writeheader()
|
csv_writer.writeheader()
|
||||||
|
|
||||||
with open(self.input_file_path, 'r') as file:
|
with open(self.input_file_path, 'r') as file:
|
||||||
|
|
||||||
file_read = csv.reader(file)
|
file_read = csv.reader(file)
|
||||||
for industry in file_read:
|
for industry in file_read:
|
||||||
industry_read_url = industry[0].replace("\n", "")
|
industry_read_url = industry[0].replace("\n", "")
|
||||||
|
@ -122,3 +122,4 @@ def main():
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Reference in New Issue