# Scraper: paginated naukrigulf.com job-search API, one pass per city,
# flattened rows appended to a single shared CSV.
import requests
import json
import time
import re
import csv
import math

# Every city's results are appended into this one CSV file.
output_filename_csv = "gulf_data/output_all_gulf.csv"

# Headers mimicking a desktop Edge browser session; the spapi endpoint
# rejects requests that do not look like the site's own XHR calls.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN',
}

# URLs whose fetch failed; reported at the end of the run.
error_pages = []

# Keys copied from each job dict in the API response.  The nested
# 'company' dict is later flattened into the Company* columns.
keys_to_extract = [
    'designation', 'jobId', 'company', 'Companyname', 'Companyid',
    'Companyurl', 'latestPostedDate', 'isEasyApply', 'jobSource',
    'location', 'jdURL', 'vacancies',
]

# Column order of the output CSV.
fields_to_write = [
    'designation', 'jobId', 'Companyname', 'Companyid', 'Companyurl',
    'latestPostedDate', 'isEasyApply', 'jobSource', 'location', 'jdURL',
    'vacancies', 'city',
]

# One city slug per row; drives the outer loop in main().
input_file = "naukri/_gulf_location.csv"

# Page size requested from the API (matches Limit=50 in base_url).
jobs_per_pages = 50

# Filled with (location, offset, page-number) per request.
base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
def parse_and_save(json_data, csv_filename, city, keys=None, fields=None):
    """Flatten the jobs from one API response page and append them to a CSV.

    Parameters
    ----------
    json_data : dict
        Decoded API response; must contain a ``"jobs"`` list of job dicts.
    csv_filename : str
        Path of the CSV to append to.  A header row is written only when
        the file is currently empty.
    city : str
        City slug the page was fetched for; stored in the ``city`` column.
    keys : list[str], optional
        Job-dict keys to copy; defaults to module-level ``keys_to_extract``.
    fields : list[str], optional
        CSV column order; defaults to module-level ``fields_to_write``.
    """
    if keys is None:
        keys = keys_to_extract
    if fields is None:
        fields = fields_to_write

    parsed_data = []
    for job in json_data["jobs"]:
        parsed_item = {field: job.get(field) for field in keys}
        parsed_item['city'] = city
        # Flatten the nested company dict into Companyname/Companyid/
        # Companyurl columns.  `or {}` guards against an explicit
        # "company": null in the response, which previously raised
        # AttributeError (.get's default applies only when the key is
        # missing, not when its value is None).
        company = parsed_item.pop('company', None) or {}
        for key, value in company.items():
            parsed_item["Company" + key] = value
        parsed_data.append(parsed_item)

    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        # extrasaction='ignore' keeps an unexpected key inside the company
        # dict from raising ValueError in writerows().
        csv_writer = csv.DictWriter(csvfile, fieldnames=fields,
                                    extrasaction='ignore')
        if csvfile.tell() == 0:
            csv_writer.writeheader()
        csv_writer.writerows(parsed_data)
def main():
    """Scrape every city listed in ``input_file``.

    For each city the first successful response reveals the total job
    count; the loop then walks every page, appending rows to the shared
    CSV via parse_and_save().  Failed URLs are collected in the
    module-level ``error_pages`` list and printed at the end.
    """
    with open(input_file, 'r') as file:
        cities = list(csv.reader(file))

    for city in cities:
        total_pages = 1000          # sentinel: real count learned from 1st response
        total_page_num = total_pages  # fix: was unbound if the first request failed
        start_page = 1

        while total_pages > 0:
            url = base_url.format(city[0],
                                  jobs_per_pages * (start_page - 1),
                                  start_page)
            try:
                # timeout so a dead connection can't hang the whole run
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                # Network failure: record it like an HTTP error and move on.
                print("Error : ", exc, " at url ", url)
                error_pages.append(url)
                total_pages = total_pages - 1
                start_page = start_page + 1
                continue

            if response.status_code == 200:
                json_data = response.json()

                # First success for this city: derive the real page count.
                if total_pages == 1000:
                    total_jobs = json_data["totalJobsCount"]
                    total_pages = math.ceil(total_jobs / jobs_per_pages)
                    total_page_num = total_pages

                parse_and_save(json_data, output_filename_csv, city[0])
                print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
            else:
                print("Error : ", response.status_code, " at url ", url)
                error_pages.append(url)

            total_pages = total_pages - 1
            start_page = start_page + 1

    print(error_pages)
# Script entry point: run the scraper when executed directly.
if __name__ == "__main__":
    main()