# compete_jobs/naukri/jobdata_gulf_r.py

import requests
import csv
from time import sleep, time
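
# NOTE: the triple-quoted block below is an earlier one-off version of this scraper,
# disabled by wrapping it in a string literal; the active implementation is the
# NaukriGulfJobDetailScraper class further down.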
"""
# List of URLs to query
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
headers = {
'authority': 'www.naukrigulf.com',
'accept': 'application/json',
'accept-format': 'strict',
'accept-language': 'ENGLISH',
'appid': '205',
'cache-control': 'no-cache',
'client-type': 'desktop',
'clientid': 'desktop',
'device-type': 'desktop',
'puppeteer': 'false',
'referer': 'https://www.naukrigulf.com/jobs-in-uae',
'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'systemid': '2323',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
'userdata': '|IN'
}
with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
outfile_writer = csv.writer(outfile)
with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
j_read = list(csv.DictReader(jobis))
for item in j_read:
print(base_url.format(item.get('jobId')))
jd_url = base_url.format(item.get('jobId'))
sleep(0.5)
response = requests.get(base_url.format(item.get('jobId')), headers=headers)
if response.status_code == 200:
job_data = {
"Url" : jd_url,
"Job Key" : item.get('jobId'),
"Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
"Job Description" : response.json().get('description',''),
"Role Category" :"",
"Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
"Job Title" : response.json().get('designation'),
"Formatted Location Full" : response.json().get('location'),
"Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
"Company" : response.json().get('company', {'name':''}).get('name'),
"Job Type" : response.json().get('employmentType'),
"Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
"Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
"Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
"Salary Detail" : response.json().get('compensation')
}
if outfile.tell() == 0:
header = job_data.keys()
outfile_writer.writerow(header)
outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
"""

# Global variables
input_file = "gulf_data/output_all_gulf.csv"     # job ids to scrape (see scrape() for the expected columns)
output_file = "gulf_data/jobdata_gulf.csv"       # scraped job details are appended here
error_file = "gulf_data/jobdata_error_gulf.csv"  # reserved for failed ids (currently unused)
stats_file = "gulf_data/stats_gulf.txt"
skip = 0  # number of already-processed ids to skip when resuming a run

class NaukriGulfJobDetailScraper:
    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }

    def __init__(self, input_file, output_file, error_file):
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file
        self.timeout = 30
        self.count = 1
        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
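
    # Fields consumed from the /spapi/jobs/{id} JSON response (as observed in this
    # script, not from published API docs): description, designation, location,
    # employmentType, compensation, desiredCandidate.experience, company.name,
    # other.tag, contact.website, and the *Interlinking lists of {'title': ...} dicts.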
    def transform_data(self, job_id, jd_url, json_response):
        source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
        source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
        jd = json_response.get('description', '')
        desired_profile = json_response.get('desiredCandidate')
        experience = (desired_profile or {}).get('experience') or {}
        valid_pairs = None
        if desired_profile:
            valid_pairs = [(key, value) for key, value in desired_profile.items()
                           if value is not None and value != '' and key != 'experience']
        if valid_pairs:
            # Append the remaining desiredCandidate fields to the job description as HTML.
            html_output = '<br><h3 class="heading">Desired Candidate Profile</h3><br>'
            for key, value in valid_pairs:
                html_output += f"<strong>{key.title()}:</strong> <br>{value}<br>"
            jd += html_output
        json_data = {
            "Url": jd_url,
            "Job Key": "g_" + str(job_id),
            # "Source Link": json_response.get('other', {'tag': ''}).get('tag', '') + \
            #                json_response.get('contact', {'website': ''}).get('website', ''),
            "Source Link": source_value1 or source_value2 or '',
            # "Job Description": json_response.get('description', ''),
            "Job Description": jd,
            "Role Category": "",
            "Job Industry": ', '.join([t['title'] for t in json_response.get('industryInterlinking', [])]),
            "Job Title": json_response.get('designation'),
            "Formatted Location Full": json_response.get('location'),
            "Job Functions": ', '.join([x['title'] for x in json_response.get('fAreaInterlinking', [])]),
            "Company": json_response.get('company', {'name': ''}).get('name'),
            "Job Type": json_response.get('employmentType'),
            "Key Skills": ', '.join([y['title'] for y in json_response.get('keywordInterlinking', [])]),
            "Minimum Experience": experience.get('min'),
            "Maximum Experience": experience.get('max'),
            "Salary Detail": json_response.get('compensation'),
            "Country": (json_response.get('compensation') or {}).get('country')
        }
        return json_data

    def scrape(self):
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode == "POSTED":
                    print("removed non tagged job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)
        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        sleep(1)
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                sleep(0.5)
                try:
                    sleep(1)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    print(f"{response.status_code} for {url}")
                    if response.status_code == 200:
                        json_response = response.json()
                        transformed_data = self.transform_data(job_id, url, json_response)
                        if outfile.tell() == 0:
                            header = transformed_data.keys()
                            writer.writerow(header)
                        writer.writerow(transformed_data.values())
                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                        all_job_ids.pop(0)  # Remove the processed job ID
                        self.count += 1
                    # elif response.status_code == 303:
                    #     json_response = response.json()
                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                    #         print(f"Expired job ID {job_id} with response 303")
                    #         all_job_ids.pop(0)  # Remove the processed job ID
                    elif response.status_code == 404:
                        all_job_ids.pop(0)  # Remove the expired job ID
                        print(f"Expired job ID {job_id} with response 404")
                    else:
                        print(f"Error for job ID {job_id}")
                except Exception as n1:
                    # Network/parse errors are logged and the id is retried on the next pass.
                    print(str(n1))


def main():
    start_time = time()
    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")


if __name__ == "__main__":
    main()
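
# Usage note (assumed invocation; not documented in the original script):
#   python jobdata_gulf_r.py
# The input CSV is read positionally: column index 1 is taken as the job id and
# column index 7 as the posting mode ("POSTED" rows are skipped). To resume an
# interrupted run, set `skip` above; note that ids pass through set(), so their
# order is not guaranteed to be stable between runs.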