directory updated
parent
d230e6280d
commit
a401cf37fc
|
@ -1,5 +1,10 @@
|
||||||
.vscode
|
.vscode
|
||||||
data_naukri
|
data_naukri
|
||||||
scrib
|
scrib
|
||||||
data_naukri/
|
data_naukri/*.csv
|
||||||
gulf_data/
|
gulf_data/*.csv
|
||||||
|
not required/
|
||||||
|
data_naukri/*.txt
|
||||||
|
gulf_data/*.txt
|
||||||
|
*.sh
|
||||||
|
server scraper/
|
|
@ -1,115 +0,0 @@
|
||||||
import requests
|
|
||||||
import csv
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Global variables
# File paths for one expiry-check run.
# NOTE(review): input/output live under "data_naukri/" but error/stats use
# "data_naukri_india/" — confirm the directory split is intentional and that
# both directories exist before running.
input_file = "data_naukri/old_jobdata_india.csv"    # jobs to re-check (CSV with id + source link)
output_file = "data_naukri/expired.csv"             # rows confirmed expired
error_file = "data_naukri_india/expiry_error.csv"   # rows that could not be checked
stats_file = "data_naukri_india/stats.txt"          # run-duration log appended by main()
|
|
||||||
class NaukriExpiryScraper:
    """Checks Naukri job ids from a CSV and records the expired ones.

    Reads rows from ``input_file`` (job id in column index 1, source link in
    column index 2), probes the Naukri job-detail API for each id, and appends
    rows whose job is expired (HTTP 404, or HTTP 303 whose payload has
    ``metaSearch.isExpiredJob == '1'``) to ``output_file``.

    Fixes over the original draft:
    * a network exception no longer aborts the whole run;
    * rows that keep failing are written to ``error_file`` (previously
      accepted but never used) after ``max_retries`` attempts instead of
      being retried forever — this also covers a 303 without the expiry
      flag, which used to spin on the same row indefinitely.
    """

    # Job-detail API endpoint; format with the numeric job id.
    base_url = "https://www.naukri.com/jobapi/v4/job/{}"

    # Browser-like headers captured from a real session.
    # NOTE(review): the cookie is a hardcoded session snapshot and will go
    # stale — confirm it is still required for the API to answer.
    headers = {
        'authority': 'www.naukri.com',
        'accept': 'application/json',
        'accept-language': 'en-US,en;q=0.9',
        'appid': '121',
        'cache-control': 'no-cache, no-store, must-revalidate',
        'content-type': 'application/json',
        'expires': '0',
        'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE',
        'pragma': 'no-cache',
        'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': 'Naukri',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43',
        'x-requested-with': 'XMLHttpRequest',
        'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0'
    }

    def __init__(self, input_file, output_file, error_file):
        """Store the file paths and request settings.

        Args:
            input_file: CSV of jobs to check (first line is a header).
            output_file: CSV that receives rows of expired jobs.
            error_file: CSV that receives rows that could not be checked.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file
        self.timeout = 30            # seconds per HTTP request
        self.expired_jobs_count = 0  # running total of expired rows written
        # Route through an HTTP(S) proxy when PROXY_SERVER is set in the environment.
        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def _record_error(self, row, reason):
        """Append a row that could not be checked (plus the reason) to the error CSV."""
        with open(self.error_file, 'a', newline='', encoding='utf-8') as errfile:
            csv.writer(errfile).writerow(row + [reason])

    def scrape(self):
        """Process every input row, appending expired jobs to the output CSV.

        Rows without a source link are skipped. A row is retried on failure
        (network error, unexpected status, or ambiguous 303) up to
        ``max_retries`` times, then logged to ``error_file`` and dropped.
        """
        all_input = []
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            header_line = infile.readline().strip()
            # Start the output file fresh with the same header as the input.
            with open(self.output_file, 'w') as file:
                file.write(header_line + "\n")
            reader = csv.reader(infile)
            for row in reader:
                all_input.append(row)

        max_retries = 5
        failures = 0  # consecutive failures for the row at the head of the queue

        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_input:
                current_row = all_input[0]
                source_link = current_row[2].strip()
                jobid = current_row[1].strip()
                url = self.base_url.format(jobid)

                if source_link == "":
                    print(f"Not checking job without source link, job ID {jobid}")
                    all_input.pop(0)  # Remove the processed job ID
                    failures = 0
                    continue

                print(f"Remaining to do: {len(all_input)}")
                time.sleep(0.5)  # be polite to the API
                try:
                    response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
                except requests.exceptions.RequestException as exc:
                    # Transient network problem: retry, then give up on this row.
                    print(f"Request failed for job ID {jobid}: {exc}")
                    failures += 1
                    if failures >= max_retries:
                        self._record_error(current_row, str(exc))
                        all_input.pop(0)
                        failures = 0
                    else:
                        time.sleep(10)
                    continue

                print(f"{response.status_code} for {url}")

                if response.status_code == 200:
                    print(f"Alive job ID {jobid}")
                    all_input.pop(0)  # Remove the processed job ID
                    failures = 0

                elif response.status_code == 303:
                    try:
                        json_response = response.json()
                    except ValueError:
                        json_response = {}  # non-JSON body: treat as "not confirmed expired"
                    # Naukri signals expiry via metaSearch.isExpiredJob == '1'.
                    if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                        print(f"Expired job ID {jobid} with response 303")
                        writer.writerow(current_row)
                        self.expired_jobs_count += 1
                        all_input.pop(0)  # Remove the processed job ID
                        failures = 0
                    else:
                        # 303 without the expiry flag is ambiguous: retry, then give up.
                        failures += 1
                        if failures >= max_retries:
                            self._record_error(current_row, "303 without isExpiredJob flag")
                            all_input.pop(0)
                            failures = 0
                        else:
                            time.sleep(10)

                elif response.status_code == 404:
                    print(f"Expired job ID {jobid} with response 404")
                    writer.writerow(current_row)
                    self.expired_jobs_count += 1
                    all_input.pop(0)  # Remove the processed job ID
                    failures = 0

                else:
                    print(f"Failed to fetch data for job ID {jobid}")
                    failures += 1
                    if failures >= max_retries:
                        self._record_error(current_row, f"status {response.status_code}")
                        all_input.pop(0)
                        failures = 0
                    else:
                        time.sleep(10)
|
||||||
def main():
    """Run the expiry scraper end-to-end and log how long the run took."""
    started = time.time()
    NaukriExpiryScraper(input_file, output_file, error_file).scrape()
    duration_hours = (time.time() - started) / 3600
    summary = f"Expiry program took {duration_hours:.2f} hours to run."
    print(summary)
    # Keep a persistent record of run durations across invocations.
    with open(stats_file, "a") as stat:
        stat.write(summary + "\n")


if __name__ == "__main__":
    main()
|
|
|
@ -1,132 +0,0 @@
|
||||||
import requests
|
|
||||||
import csv
|
|
||||||
import concurrent.futures
|
|
||||||
|
|
||||||
# List of URLs to query
# Job-detail API endpoint; format with a single job id.
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

# Browser-like request headers captured from a real desktop session.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}

# Top-level keys copied from each job-detail JSON payload into the CSV.
keys_to_extract = ['designation','description','company','compensation','industryType','functionalArea','jobSource','location','other','desiredCandidate','contact','isExpired','locationInterlinking']
# Sub-keys referenced only by the commented-out enrichment block in main().
company_keys = ['name','details']
salary_key = ['minimumSalary','maximumSalary','currency','label','hideSalary']
rfile = "ME_jobIds.csv"  # input file: one job id per line
# NOTE(review): module-level accumulators referenced only by the disabled
# enrichment code in main(); they are never reset between jobs — confirm
# intent before re-enabling that block.
loc_list = []
skill_other =[]
skill_pref = []
|
||||||
def fetch_url(url):
    """Fetch the detail payload for one job id from the Naukri Gulf API.

    Args:
        url: the job id; it is substituted into the module-level ``base_url``.

    Returns:
        A 3-tuple ``(payload, status, full_url)``: on success the decoded
        JSON object, the HTTP status code and the request URL; on failure an
        empty string, the error text and the URL. The shape is the same in
        both cases so callers can always index ``[0]``-``[2]``.
    """
    try:
        url = base_url.format(url)
        # Timeout added so one stuck request cannot hang the whole batch.
        response = requests.get(url, headers=headers, timeout=30)
        return response.json(), response.status_code, url
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a reply whose body is not valid JSON; previously
        # it escaped this handler and broke the tuple contract downstream.
        return "", str(e), url
|
|
||||||
|
|
||||||
def batch_process(urls):
    """Fetch a batch of job ids concurrently via ``fetch_url``.

    Args:
        urls: iterable of job ids.

    Returns:
        A list of 3-tuples in the shape ``fetch_url`` produces:
        ``(payload, status, url)`` on success, ``("", error_text, url)`` on
        failure. The original appended a 2-tuple on failure, which made the
        consumer's ``response[2]`` raise IndexError — fixed here.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as e:
                # Keep the 3-tuple contract so downstream indexing is safe.
                results.append(("", str(e), url))
    return results
|
|
||||||
|
|
||||||
def main():
    """Read job ids from ``rfile``, fetch their details in batches of 50,
    and append one CSV row per job to the output file.

    Successful fetches store the values of ``keys_to_extract``; failures are
    written as ``[url, error_text]`` rows so the run can be audited later.
    """
    batch_size = 50
    count = 1  # running counter across all batches, for progress output

    # Open a CSV file for writing (append mode so reruns accumulate).
    with open('output_jobs_0209_me.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the header only when the file is new/empty; the original
        # wrote it on every run, injecting duplicate header rows.
        if csvfile.tell() == 0:
            csvwriter.writerow(['URL'] + list(keys_to_extract))

        # Input file holds one job id per line.
        with open(rfile, 'r') as file:
            urls = [row.replace("\n", "") for row in file]

        for i in range(0, len(urls), batch_size):
            batch_results = batch_process(urls[i:i + batch_size])
            for response in batch_results:
                print(count)
                count = count + 1
                if response[1] == 200:
                    job_details = response[0]
                    # Extract specific key values from the JSON response.
                    values_to_store = [job_details.get(key, '') for key in keys_to_extract]
                    # Write the extracted values to the CSV file.
                    csvwriter.writerow([response[2]] + values_to_store)
                else:
                    print(f"Failed to fetch data for job ID: {response[2]} with {response[0]}")
                    csvwriter.writerow([response[2]] + [response[0]])

    print("Data extraction and CSV writing complete.")


if __name__ == "__main__":
    main()
|
|
||||||
|
|
||||||
|
|
|
@ -109,8 +109,8 @@ class NaukriGulfJobDetailScraper:
|
||||||
json_data = {
|
json_data = {
|
||||||
"Url" : jd_url,
|
"Url" : jd_url,
|
||||||
"Job Key" : job_id,
|
"Job Key" : job_id,
|
||||||
"Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
|
# "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
|
||||||
json_response.get('contact', {'website': ''}).get('website',''),
|
# json_response.get('contact', {'website': ''}).get('website',''),
|
||||||
"Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
|
"Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
|
||||||
"Job Description" : json_response.get('description',''),
|
"Job Description" : json_response.get('description',''),
|
||||||
"Role Category" :"",
|
"Role Category" :"",
|
||||||
|
|
|
@ -1,95 +0,0 @@
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
import csv
|
|
||||||
import math
|
|
||||||
|
|
||||||
# Browser-like request headers captured from a real desktop session.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}

# Search pages that returned a non-200 status; reported at the end of the run.
error_pages = []
# Keys copied from each job object in the search response.
keys_to_extract = ['designation', 'jobId', 'company','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
# Output CSV columns: the extracted keys plus the search term ("city") the row came from.
fields_to_write = ['designation', 'jobId', 'company','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']
input_file = "naukri/_gulf_location.csv"  # one search term per row, first column
jobs_per_pages = 50  # page size requested from the API (matches Limit=50 in the URL)
# Search API URL template, filled with (search term, offset, page number).
base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
|
|
||||||
|
|
||||||
def parse_and_save(json_data, csv_filename, city):
    """Append the jobs from one search-result page to the output CSV.

    Args:
        json_data: decoded search-API payload containing a ``"jobs"`` list.
        csv_filename: CSV file to append rows to.
        city: search term the page came from; stored in the ``city`` column.
    """
    parsed_data = []
    for job in json_data["jobs"]:
        parsed_item = {field: job.get(field, None) for field in keys_to_extract}
        parsed_item['city'] = city
        parsed_data.append(parsed_item)

    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=fields_to_write)
        # The original called writeheader() on every page, injecting a header
        # row between every batch of results; write it only for a new/empty file.
        if csvfile.tell() == 0:
            csv_writer.writeheader()
        csv_writer.writerows(parsed_data)
|
|
||||||
|
|
||||||
def main():
    """For every search term in ``input_file``, page through the Naukri Gulf
    search API and append each page's jobs to the output CSV.

    ``total_pages`` starts at the sentinel 1000 and is replaced by the real
    page count taken from the first successful response.

    Fixes over the original draft: ``total_page_num`` is initialized before
    the loop (it was unbound — a NameError — whenever the first page failed
    before the sentinel branch ran), a request timeout is set, and a network
    exception is recorded as an error page instead of crashing the crawl.
    """
    with open(input_file, 'r') as file:
        for city in csv.reader(file):
            total_pages = 1000            # sentinel: real page count unknown yet
            total_page_num = total_pages  # fixed: was unbound if page 1 failed
            output_filename_csv = "output_all_gulf.csv"
            start_page = 1

            # Resume point for a previously interrupted "pharma" run.
            if city[0] == "pharma":
                start_page = 173
                total_pages = 22
                total_page_num = 194

            while total_pages > 0:
                url = base_url.format(city[0], jobs_per_pages * (start_page - 1), start_page)
                try:
                    # Timeout so a single stuck request cannot hang the crawl.
                    response = requests.get(url, headers=headers, timeout=30)
                except requests.exceptions.RequestException as exc:
                    print("Error : ", exc, " at url ", url)
                    error_pages.append(url)
                    total_pages = total_pages - 1
                    start_page = start_page + 1
                    continue

                if response.status_code == 200:
                    json_data = response.json()
                    if total_pages == 1000:
                        # First successful page: learn the real page count.
                        total_jobs = json_data["totalJobsCount"]
                        total_pages = math.ceil(total_jobs / jobs_per_pages)
                        total_page_num = total_pages
                    parse_and_save(json_data, output_filename_csv, city[0])
                    print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                else:
                    print("Error : ", response.status_code, " at url ", url)
                    error_pages.append(url)
                total_pages = total_pages - 1
                start_page = start_page + 1

    print("Data saved to output_new.json")
    print(error_pages)


if __name__ == "__main__":
    main()
|
|
|
@ -4,6 +4,11 @@ import csv
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Configure the logging settings
|
||||||
|
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger()
|
||||||
|
|
||||||
# Global variables
|
# Global variables
|
||||||
input_file = "naukri/_industry_urls.csv"
|
input_file = "naukri/_industry_urls.csv"
|
||||||
|
@ -72,35 +77,38 @@ class NaukriJobScraper:
|
||||||
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
||||||
while total_pages > 0:
|
while total_pages > 0:
|
||||||
url = self.base_url.format(industry_name, start_page, industry_q)
|
url = self.base_url.format(industry_name, start_page, industry_q)
|
||||||
print(url)
|
try:
|
||||||
response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
|
# print(url)
|
||||||
|
response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
|
||||||
|
|
||||||
print(f"{response.status_code} for {url}")
|
# print(f"{response.status_code} for {url}")
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f"Error with page {start_page} for industry {industry_name}")
|
print(f"Error with page {start_page} for industry {industry_name}")
|
||||||
with open(self.error_file_path, "a") as file:
|
with open(self.error_file_path, "a") as file:
|
||||||
file.write(f"Error with page {start_page} for industry {industry_name}\n")
|
file.write(f"Error with page {start_page} for industry {industry_name}\n")
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# if 200 response
|
# if 200 response
|
||||||
data = response.json()
|
data = response.json()
|
||||||
if(total_pages == 1000):
|
if(total_pages == 1000):
|
||||||
total_jobs = data["noOfJobs"]
|
total_jobs = data["noOfJobs"]
|
||||||
total_pages = math.ceil(total_jobs/100)
|
total_pages = math.ceil(total_jobs/100)
|
||||||
|
|
||||||
self.parse_and_save(data)
|
self.parse_and_save(data)
|
||||||
|
|
||||||
|
# Assuming that you'll break the loop once all pages are scraped:
|
||||||
|
# (Add your logic to update 'total_pages' based on the response)
|
||||||
|
total_pages -= 1
|
||||||
|
start_page += 1
|
||||||
|
print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
|
||||||
|
time.sleep(1)
|
||||||
|
except Exception as e1:
|
||||||
|
logging.error(url + '\n'+ str(e1))
|
||||||
|
|
||||||
# Assuming that you'll break the loop once all pages are scraped:
|
|
||||||
# (Add your logic to update 'total_pages' based on the response)
|
|
||||||
total_pages -= 1
|
|
||||||
start_page += 1
|
|
||||||
print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
scraper = NaukriJobScraper(input_file, output_file, error_file)
|
scraper = NaukriJobScraper(input_file, output_file, error_file)
|
||||||
scraper.scrape()
|
scraper.scrape()
|
||||||
|
|
|
@ -72,15 +72,17 @@ class NaukriGulfDetailSpiderSpider(scrapy.Spider):
|
||||||
})
|
})
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
job_id = response.meta.get('jobid')
|
job_id = response.meta.get('jobid')
|
||||||
json_response = response.json()
|
json_response = response.json()
|
||||||
|
source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
|
||||||
|
source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
|
||||||
jd_url = response.url
|
jd_url = response.url
|
||||||
json_data = {
|
json_data = {
|
||||||
"Url" : jd_url,
|
"Url" : jd_url,
|
||||||
"Job Key" : job_id,
|
"Job Key" : job_id,
|
||||||
"Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
|
"Source Link": source_value1 if source_value1 else source_value2 if source_value2 else '',
|
||||||
json_response.get('contact', {'website': ''}).get('website',''),
|
|
||||||
"Job Description" : json_response.get('description',''),
|
"Job Description" : json_response.get('description',''),
|
||||||
"Role Category" :"",
|
"Role Category" :"",
|
||||||
"Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
|
"Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
|
||||||
|
|
Loading…
Reference in New Issue