import csv
import math
from datetime import datetime

import requests

# Request headers mimicking a desktop browser session against the
# naukrigulf.com internal job-search API.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN',
}

error_pages = []

# Fields copied straight off each job object; the nested 'company' dict is
# flattened into Companyname/Companyid/Companyurl before writing.
keys_to_extract = ['designation', 'jobId', 'company', 'latestPostedDate',
                   'isEasyApply', 'jobSource', 'location', 'jdURL', 'vacancies']
fields_to_write = ['designation', 'Job Key', 'Companyname', 'Companyid',
                   'Companyurl', 'latestPostedDate', 'isEasyApply', 'jobSource',
                   'location', 'jdURL', 'vacancies', 'city']

today_date = datetime.now().strftime('%d-%m-%Y')
input_file = "naukri/_gulf_location.csv"
output_filename_csv = f"gulf_data/daily_search_results/search_result_gulf_{today_date}.csv"
jobs_per_page = 50
base_url = ("https://www.naukrigulf.com/spapi/jobapi/search"
            "?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}"
            "&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId="
            "&nationality=&nationalityLabel=&pageNo={}&srchId=")


def parse_and_save(json_data, csv_filename, city):
    """Flatten one page of API results and append the rows to the CSV."""
    parsed_data = []
    for job in json_data["jobs"]:
        parsed_item = {field: job.get(field) for field in keys_to_extract}
        parsed_item['city'] = city
        # Flatten the nested company dict; it may be missing or None.
        company = parsed_item.pop('company', None) or {}
        for key in ('name', 'id', 'url'):
            parsed_item["Company" + key] = company.get(key, '')
        # The CSV header names this column "Job Key" rather than "jobId".
        parsed_item = {k.replace("jobId", "Job Key"): v for k, v in parsed_item.items()}
        parsed_data.append(parsed_item)

    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=fields_to_write)
        if csvfile.tell() == 0:  # new file: write the header once
            csv_writer.writeheader()
        csv_writer.writerows(parsed_data)


def main():
    with open(input_file, 'r') as file:
        cities = list(csv.reader(file))

    for city in cities:
        location = city[0].strip()
        total_pages = None  # unknown until the first successful response
        remaining_pages = 1
        start_page = 1
        while remaining_pages > 0:
            url = base_url.format(location, jobs_per_page * (start_page - 1), start_page)
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                json_data = response.json()
                if total_pages is None:
                    # The first page reports how many jobs (and hence pages)
                    # this location has.
                    total_jobs = json_data["totalJobsCount"]
                    total_pages = math.ceil(total_jobs / jobs_per_page)
                    remaining_pages = total_pages
                parse_and_save(json_data, output_filename_csv, location)
                print(f"Processed {url} : {start_page}/{total_pages}")
            else:
                print("Error:", response.status_code, "at url", url)
                error_pages.append(url)
            remaining_pages -= 1
            start_page += 1

    print(error_pages)


if __name__ == "__main__":
    main()