"""Scrape job-detail JSON from naukrigulf.com for the job ids listed in an
input CSV and flatten each response into a row of jobdata_gulf.csv."""

import requests
import csv
from time import sleep, time

"""
# Legacy one-off version of this scraper, kept for reference (superseded by the class below).

# List of URLs to query
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}

with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
    outfile_writer = csv.writer(outfile)
    with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
        j_read = list(csv.DictReader(jobis))
        for item in j_read:
            print(base_url.format(item.get('jobId')))
            jd_url = base_url.format(item.get('jobId'))
            sleep(0.5)
            response = requests.get(base_url.format(item.get('jobId')), headers=headers)
            if response.status_code == 200:
                job_data = {
                    "Url": jd_url,
                    "Job Key": item.get('jobId'),
                    "Source Link": response.json().get('other', {'tag': ''}).get('tag', ''),
                    "Job Description": response.json().get('description', ''),
                    "Role Category": "",
                    "Job Industry": ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
                    "Job Title": response.json().get('designation'),
                    "Formatted Location Full": response.json().get('location'),
                    "Job Functions": ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
                    "Company": response.json().get('company', {'name': ''}).get('name'),
                    "Job Type": response.json().get('employmentType'),
                    "Key Skills": ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
                    "Minimum Experience": response.json().get('desiredCandidate').get('experience').get('min'),
                    "Maximum Experience": response.json().get('desiredCandidate').get('experience').get('max'),
                    "Salary Detail": response.json().get('compensation')
                }
                if outfile.tell() == 0:
                    header = job_data.keys()
                    outfile_writer.writerow(header)
                outfile_writer.writerow([str(z).replace('\n', '').strip() for z in job_data.values()])
"""

# Global variables
input_file = "gulf_data/output_all_gulf.csv"
output_file = "gulf_data/jobdata_gulf.csv"
error_file = "gulf_data/jobdata_error_gulf.csv"
stats_file = "gulf_data/stats_gulf.txt"
skip = 0


class NaukriGulfJobDetailScraper:

    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }

    def __init__(self, input_file, output_file, error_file):
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file
        self.timeout = 30
        self.count = 1
        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} \
        #     if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def transform_data(self, job_id, jd_url, json_response):
        source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
        source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
        jd = json_response.get('description', '')

        # Append the non-empty "desired candidate" fields (except experience, which is
        # reported in its own columns) to the job description as simple HTML.
        desired_profile = json_response.get('desiredCandidate')
        valid_pairs = None
        if desired_profile:
            valid_pairs = [(key, value) for key, value in desired_profile.items()
                           if value is not None and value != '' and key != 'experience']
        if valid_pairs:
            html_output = '<br><br>Desired Candidate Profile<br><br>'
            for key, value in valid_pairs:
                html_output += f"{key.title()}:<br>{value}<br>"
            jd += html_output

        experience = (desired_profile or {}).get('experience') or {}
        json_data = {
            "Url": jd_url,
            "Job Key": "g_" + str(job_id),
            # "Source Link": json_response.get('other', {'tag': ''}).get('tag', '') + \
            #     json_response.get('contact', {'website': ''}).get('website', ''),
            "Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
            # "Job Description": json_response.get('description', ''),
            "Job Description": jd,
            "Role Category": "",
            "Job Industry": ', '.join([t['title'] for t in json_response['industryInterlinking']]),
            "Job Title": json_response.get('designation'),
            "Formatted Location Full": json_response.get('location'),
            "Job Functions": ', '.join([x['title'] for x in json_response['fAreaInterlinking']]),
            "Company": json_response.get('company', {'name': ''}).get('name'),
            "Job Type": json_response.get('employmentType'),
            "Key Skills": ', '.join([y['title'] for y in json_response['keywordInterlinking']]),
            "Minimum Experience": experience.get('min'),
            "Maximum Experience": experience.get('max'),
            "Salary Detail": json_response.get('compensation'),
            "Country": json_response.get('compensation', {'country': ''}).get('country')
        }
        return json_data

    def scrape(self):
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode == "POSTED":
                    print("removed non tagged job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)

        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        sleep(1)

        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                sleep(0.5)
                try:
                    sleep(1)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    print(f"{response.status_code} for {url}")
                    if response.status_code == 200:
                        json_response = response.json()
                        transformed_data = self.transform_data(job_id, url, json_response)
                        if outfile.tell() == 0:
                            header = transformed_data.keys()
                            writer.writerow(header)
                        writer.writerow(transformed_data.values())
                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                        all_job_ids.pop(0)  # Remove the processed job ID
                        self.count += 1
                    # elif response.status_code == 303:
                    #     json_response = response.json()
                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                    #         print(f"Expired job ID {job_id} with response 303")
                    #         all_job_ids.pop(0)  # Remove the processed job ID
                    elif response.status_code == 404:
                        all_job_ids.pop(0)  # Remove the expired job ID
                        print(f"Expired job ID {job_id} with response 404")
                    else:
                        # Any other status is left in the queue and retried on the next pass.
                        print(f"Error for job ID {job_id}")
                except Exception as n1:
                    print(str(n1))


def main():
    start_time = time()
    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")


if __name__ == "__main__":
    main()
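
# ---------------------------------------------------------------------------
# Minimal usage sketch, kept commented out so it never runs with the script.
# The input CSV layout is assumed from the indexing in scrape(): the job id in
# column index 1 and the posting mode in column index 7 ("POSTED" rows are
# skipped). The payload below is a hypothetical, heavily trimmed example of a
# /spapi/jobs/{id} response; its field values are illustrative only and simply
# show how transform_data() flattens one response into one output row.
#
# def _example_transform():
#     sample_payload = {
#         "description": "<p>Maintain ledgers and prepare monthly reports.</p>",
#         "designation": "Accountant",
#         "location": "Dubai - United Arab Emirates",
#         "employmentType": "Full Time",
#         "company": {"name": "Example Trading LLC"},
#         "other": {"tag": "https://careers.example.com/jobs/123"},
#         "industryInterlinking": [{"title": "Accounting"}],
#         "fAreaInterlinking": [{"title": "Accounts / Finance"}],
#         "keywordInterlinking": [{"title": "Tally"}, {"title": "VAT"}],
#         "desiredCandidate": {"experience": {"min": 2, "max": 5}},
#         "compensation": {"country": "AE"},
#     }
#     scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
#     return scraper.transform_data("123", scraper.base_url.format("123"), sample_payload)
# ---------------------------------------------------------------------------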