import requests
import csv
from time import sleep, time
"""
# List of URLs to query
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
headers = {
'authority': 'www.naukrigulf.com',
'accept': 'application/json',
'accept-format': 'strict',
'accept-language': 'ENGLISH',
'appid': '205',
'cache-control': 'no-cache',
'client-type': 'desktop',
'clientid': 'desktop',
'device-type': 'desktop',
'puppeteer': 'false',
'referer': 'https://www.naukrigulf.com/jobs-in-uae',
'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'systemid': '2323',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
'userdata': '|IN'
}
with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
outfile_writer = csv.writer(outfile)
with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
j_read = list(csv.DictReader(jobis))
for item in j_read:
print(base_url.format(item.get('jobId')))
jd_url = base_url.format(item.get('jobId'))
sleep(0.5)
response = requests.get(base_url.format(item.get('jobId')), headers=headers)
if response.status_code == 200:
job_data = {
"Url" : jd_url,
"Job Key" : item.get('jobId'),
"Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
"Job Description" : response.json().get('description',''),
"Role Category" :"",
"Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
"Job Title" : response.json().get('designation'),
"Formatted Location Full" : response.json().get('location'),
"Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
"Company" : response.json().get('company', {'name':''}).get('name'),
"Job Type" : response.json().get('employmentType'),
"Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
"Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
"Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
"Salary Detail" : response.json().get('compensation')
}
if outfile.tell() == 0:
header = job_data.keys()
outfile_writer.writerow(header)
outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
"""
# Global configuration: file paths for one scraping run.
input_file = "gulf_data/output_all_gulf.csv"  # source CSV of jobs to look up (expects a 'jobId' column — confirm against reader)
output_file = "gulf_data/jobdata_gulf.csv"  # destination CSV for successfully scraped job details
error_file = "gulf_data/jobdata_error_gulf.csv"  # destination CSV recording jobs whose lookup failed
stats_file = "gulf_data/stats_gulf.txt"  # run statistics / summary output
skip=0  # number of leading input rows to skip — presumably for resuming an interrupted run; TODO confirm against caller
class NaukriGulfJobDetailScraper:
    """Scrape per-job detail JSON from naukrigulf.com's ``spapi`` endpoint.

    ``base_url`` is formatted with a single job id to produce the detail
    URL; ``headers`` replays a real desktop-browser session so the API
    treats the request as coming from the site's own frontend.
    File paths for input/output/error CSVs are supplied to ``__init__``.
    """

    # Detail endpoint; ``.format(job_id)`` yields the URL for one job.
    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

    # Request headers captured from a Microsoft Edge 117 desktop session.
    # NOTE(review): several values (appid, systemid, sec-ch-ua, user-agent)
    # are site/browser specific and may need refreshing if the API starts
    # rejecting requests.
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }
def __init__(self, input_file, output_file, error_file):
self.input_file = input_file
self.output_file = output_file
self.error_file = error_file
self.timeout = 30
self.count = 1
# self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
def transform_data(self, job_id, jd_url, json_response):
source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
jd = json_response.get('description','')
desired_profile = json_response.get('desiredCandidate')
valid_pairs = None
if desired_profile:
valid_pairs = [(key, value) for key, value in desired_profile.items() if value is not None and value != '' and key != 'experience']
if valid_pairs:
html_output = '