import csv
from time import sleep, time

import requests

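# Legacy one-off version of this scraper, kept below inside a string literal
# for reference only; it is never executed.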
"""
|
2023-09-28 08:54:56 +00:00
|
|
|
# List of URLs to query
|
|
|
|
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
|
|
|
|
|
|
|
|
headers = {
|
|
|
|
'authority': 'www.naukrigulf.com',
|
|
|
|
'accept': 'application/json',
|
|
|
|
'accept-format': 'strict',
|
|
|
|
'accept-language': 'ENGLISH',
|
|
|
|
'appid': '205',
|
|
|
|
'cache-control': 'no-cache',
|
|
|
|
'client-type': 'desktop',
|
|
|
|
'clientid': 'desktop',
|
|
|
|
'device-type': 'desktop',
|
|
|
|
'puppeteer': 'false',
|
|
|
|
'referer': 'https://www.naukrigulf.com/jobs-in-uae',
|
|
|
|
'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
|
|
|
|
'sec-ch-ua-mobile': '?0',
|
|
|
|
'sec-ch-ua-platform': 'Windows',
|
|
|
|
'sec-fetch-dest': 'empty',
|
|
|
|
'sec-fetch-mode': 'cors',
|
|
|
|
'sec-fetch-site': 'same-origin',
|
|
|
|
'systemid': '2323',
|
|
|
|
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
|
|
|
|
'userdata': '|IN'
|
|
|
|
}
|
|
|
|
|
2023-10-09 08:15:53 +00:00
|
|
|
with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
|
|
|
|
outfile_writer = csv.writer(outfile)
|
|
|
|
with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
|
|
|
|
j_read = list(csv.DictReader(jobis))
|
|
|
|
for item in j_read:
|
|
|
|
print(base_url.format(item.get('jobId')))
|
|
|
|
jd_url = base_url.format(item.get('jobId'))
|
|
|
|
sleep(0.5)
|
|
|
|
response = requests.get(base_url.format(item.get('jobId')), headers=headers)
|
|
|
|
if response.status_code == 200:
|
|
|
|
job_data = {
|
|
|
|
"Url" : jd_url,
|
|
|
|
"Job Key" : item.get('jobId'),
|
|
|
|
"Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
|
|
|
|
"Job Description" : response.json().get('description',''),
|
|
|
|
"Role Category" :"",
|
|
|
|
"Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
|
|
|
|
"Job Title" : response.json().get('designation'),
|
|
|
|
"Formatted Location Full" : response.json().get('location'),
|
|
|
|
"Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
|
|
|
|
"Company" : response.json().get('company', {'name':''}).get('name'),
|
|
|
|
"Job Type" : response.json().get('employmentType'),
|
|
|
|
"Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
|
|
|
|
"Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
|
|
|
|
"Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
|
|
|
|
"Salary Detail" : response.json().get('compensation')
|
|
|
|
}
|
|
|
|
if outfile.tell() == 0:
|
|
|
|
header = job_data.keys()
|
|
|
|
outfile_writer.writerow(header)
|
|
|
|
outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
|
|
|
|
"""
# Global configuration
input_file = "gulf_data/output_all_gulf.csv"
output_file = "gulf_data/jobdata_gulf.csv"
error_file = "gulf_data/jobdata_error_gulf.csv"
stats_file = "gulf_data/stats_gulf.txt"
skip = 0  # number of job IDs to skip from the front of the queue
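# Note: the gulf_data/ directory and input_file must exist before running;
# the output and stats files are opened in append mode and created on demand.
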
class NaukriGulfJobDetailScraper:
    """Fetch job-detail JSON for each job ID and flatten it into CSV rows."""

    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

    # Browser-like headers captured from an Edge 117 session; the sec-ch-ua
    # and user-agent values are pinned to that version and may need refreshing.
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }
    def __init__(self, input_file, output_file, error_file):
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file  # currently unused; reserved for failure logging
        self.timeout = 30  # per-request timeout in seconds
        self.count = 1
        # Optional proxy support (requires `import os`), kept for reference:
        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
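    # A sketch of the response shape transform_data assumes (field names taken
    # from the lookups below; the values shown are illustrative, not from a
    # real response):
    #   {
    #     "designation": "...", "description": "...", "location": "...",
    #     "employmentType": "...", "compensation": "...",
    #     "other": {"tag": "..."}, "contact": {"website": "..."},
    #     "company": {"name": "..."},
    #     "desiredCandidate": {"experience": {"min": 2, "max": 5}},
    #     "industryInterlinking": [{"title": "..."}],
    #     "fAreaInterlinking": [{"title": "..."}],
    #     "keywordInterlinking": [{"title": "..."}]
    #   }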
    def transform_data(self, job_id, jd_url, json_response):
        """Flatten the job-detail JSON into a single CSV-ready dict."""
        json_data = {
            "Url": jd_url,
            "Job Key": job_id,
            "Source Link": json_response.get('other', {'tag': ''}).get('tag', '') +
                           json_response.get('contact', {'website': ''}).get('website', ''),
            "Job Description": json_response.get('description', ''),
            "Role Category": "",
            # The interlinking lists may be absent; default to an empty list.
            "Job Industry": ', '.join(t['title'] for t in json_response.get('industryInterlinking', [])),
            "Job Title": json_response.get('designation'),
            "Formatted Location Full": json_response.get('location'),
            "Job Functions": ', '.join(x['title'] for x in json_response.get('fAreaInterlinking', [])),
            "Company": json_response.get('company', {'name': ''}).get('name'),
            "Job Type": json_response.get('employmentType'),
            "Key Skills": ', '.join(y['title'] for y in json_response.get('keywordInterlinking', [])),
            # Guard each level: 'desiredCandidate' or 'experience' may be missing.
            "Minimum Experience": json_response.get('desiredCandidate', {}).get('experience', {}).get('min'),
            "Maximum Experience": json_response.get('desiredCandidate', {}).get('experience', {}).get('max'),
            "Salary Detail": json_response.get('compensation')
        }
        return json_data
    def scrape(self):
        # Pass 1: collect job IDs from the search-results CSV. The second
        # column (index 1) holds the job ID and the eighth (index 7) the
        # posting mode; rows marked "POSTED" (non-tagged jobs) are skipped.
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode == "POSTED":
                    print("removed non-tagged job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)

        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))  # de-duplicate
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape: {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        sleep(1)

        # Pass 2: fetch each job's detail JSON and append one CSV row per job.
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                try:
                    sleep(1.5)  # throttle requests (originally two sleeps of 0.5 s and 1 s)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    print(f"{response.status_code} for {url}")
                    if response.status_code == 200:
                        json_response = response.json()
                        transformed_data = self.transform_data(job_id, url, json_response)
                        if outfile.tell() == 0:  # empty file: write the header row first
                            writer.writerow(transformed_data.keys())
                        writer.writerow(transformed_data.values())
                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                        all_job_ids.pop(0)  # remove the processed job ID
                        self.count += 1
                    # elif response.status_code == 303:
                    #     json_response = response.json()
                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                    #         print(f"Expired job ID {job_id} with response 303")
                    #         all_job_ids.pop(0)  # remove the expired job ID
                    elif response.status_code == 404:
                        all_job_ids.pop(0)  # remove the expired job ID
                        print(f"Expired job ID {job_id} with response 404")
                    else:
                        # Any other status leaves the ID at the head of the
                        # queue, so it is retried on the next iteration.
                        print(f"Error for job ID {job_id}")
                except Exception as exc:
                    # Network and JSON-decode errors are logged; the ID stays
                    # queued and is retried on the next iteration.
                    print(str(exc))
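    # The error_file passed to __init__ is never written. A minimal sketch of
    # how failures could be recorded (hypothetical helper, not called anywhere
    # yet; the except/else branches above would need to invoke it):
    def log_error(self, job_id, reason):
        """Append a failed job ID and the failure reason to the error CSV."""
        with open(self.error_file, 'a', newline='', encoding='utf-8') as errfile:
            csv.writer(errfile).writerow([job_id, reason])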
def main():
    start_time = time()
    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
if __name__ == "__main__":
    main()