added scrapy code
parent adfb3a70a1
commit 3b8b51ac06

@ -1,3 +1,5 @@
 .vscode
 data_naukri
 scrib
+data_naukri/
+gulf_data/
@ -1,7 +1,7 @@
 import requests
 import csv
-import concurrent.futures
+from time import sleep, time
+"""
 # List of URLs to query
 base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

@ -28,105 +28,168 @@ headers = {
     'userdata': '|IN'
 }

-keys_to_extract = ['designation','description','company','compensation','industryType','functionalArea','jobSource','location','other','desiredCandidate','contact','isExpired','locationInterlinking']
-company_keys = ['name','details']
-salary_key = ['minimumSalary','maximumSalary','currency','label','hideSalary']
-rfile = "output_all_gulf.csv"
-loc_list = []
-skill_other =[]
-skill_pref = []
-
-def fetch_url(url):
-    try:
-        url = base_url.format(url)
-        response = requests.get(url, headers=headers)
-        return response.json(), response.status_code, url
-    except requests.exceptions.RequestException as e:
-        return "", str(e), url
-
-def batch_process(urls):
-    results = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
-        for future in concurrent.futures.as_completed(future_to_url):
-            url = future_to_url[future]
-            try:
-                result = future.result()
-                results.append(result)
-            except Exception as e:
-                results.append((url, str(e)))
-    return results
+with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
+    outfile_writer = csv.writer(outfile)
+    with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
+        j_read = list(csv.DictReader(jobis))
+        for item in j_read:
+            print(base_url.format(item.get('jobId')))
+            jd_url = base_url.format(item.get('jobId'))
+            sleep(0.5)
+            response = requests.get(base_url.format(item.get('jobId')), headers=headers)
+            if response.status_code == 200:
+                job_data = {
+                    "Url" : jd_url,
+                    "Job Key" : item.get('jobId'),
+                    "Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
+                    "Job Description" : response.json().get('description',''),
+                    "Role Category" :"",
+                    "Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
+                    "Job Title" : response.json().get('designation'),
+                    "Formatted Location Full" : response.json().get('location'),
+                    "Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
+                    "Company" : response.json().get('company', {'name':''}).get('name'),
+                    "Job Type" : response.json().get('employmentType'),
+                    "Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
+                    "Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
+                    "Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
+                    "Salary Detail" : response.json().get('compensation')
+                }
+                if outfile.tell() == 0:
+                    header = job_data.keys()
+                    outfile_writer.writerow(header)
+                outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
+"""
+
+# Global variables
+input_file = "gulf_data/output_all_gulf.csv"
+output_file = "gulf_data/jobdata_gulf.csv"
+error_file = "gulf_data/jobdata_error_gulf.csv"
+stats_file = "gulf_data/stats_gulf.txt"
+skip=0
+
+class NaukriGulfJobDetailScraper:
+    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
+    headers = {
+        'authority': 'www.naukrigulf.com',
+        'accept': 'application/json',
+        'accept-format': 'strict',
+        'accept-language': 'ENGLISH',
+        'appid': '205',
+        'cache-control': 'no-cache',
+        'client-type': 'desktop',
+        'clientid': 'desktop',
+        'device-type': 'desktop',
+        'puppeteer': 'false',
+        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
+        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': 'Windows',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-origin',
+        'systemid': '2323',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
+        'userdata': '|IN'
+    }
+
+    def __init__(self, input_file, output_file, error_file):
+        self.input_file = input_file
+        self.output_file = output_file
+        self.error_file = error_file
+        self.timeout = 30
+        self.count = 1
+        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
+
+    def transform_data(self, job_id, jd_url, json_response):
+        json_data = {
+            "Url" : jd_url,
+            "Job Key" : job_id,
+            "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
+                           json_response.get('contact', {'website': ''}).get('website',''),
+            "Job Description" : json_response.get('description',''),
+            "Role Category" :"",
+            "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
+            "Job Title" : json_response.get('designation'),
+            "Formatted Location Full" : json_response.get('location'),
+            "Job Functions" : ', '.join([x['title'] for x in json_response['fAreaInterlinking']]),
+            "Company" : json_response.get('company', {'name':''}).get('name'),
+            "Job Type" : json_response.get('employmentType'),
+            "Key Skills" : ', '.join([y['title'] for y in json_response['keywordInterlinking']]),
+            "Minimum Experience" : json_response.get('desiredCandidate').get('experience').get('min'),
+            "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
+            "Salary Detail" : json_response.get('compensation')
+        }
+        return json_data
+
+    def scrape(self):
+        with open(self.input_file, 'r', encoding='utf-8') as infile:
+            reader = csv.reader(infile)
+            total_input_count=0
+            all_job_ids = []
+            for row in reader:
+                jobid = row[1].strip()
+                mode = row[7].strip()
+                total_input_count+=1
+                if mode == "POSTED":
+                    print("removed non tagged job with jobid %s" % jobid)
+                    continue
+
+                all_job_ids.append(jobid)
+
+        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
+        all_job_ids = list(set(all_job_ids))
+        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
+        all_job_ids = all_job_ids[skip:]
+        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
+        with open(stats_file, "a") as stat:
+            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
+        sleep(1)
+        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
+            writer = csv.writer(outfile)
+            while all_job_ids:
+                job_id = all_job_ids[0]
+                url = self.base_url.format(job_id)
+                sleep(0.5)
+                try:
+                    sleep(1)
+                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
+                    print(f"{response.status_code} for {url}")
+                    if response.status_code == 200:
+                        json_response = response.json()
+                        transformed_data = self.transform_data(job_id, url, json_response)
+                        if outfile.tell() == 0 :
+                            header = transformed_data.keys()
+                            writer.writerow(header)
+                        writer.writerow(transformed_data.values())
+                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
+                        all_job_ids.pop(0) # Remove the processed job ID
+                        self.count += 1
+                    # / elif response.status_code == 303:
+                    #     json_response = response.json()
+                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
+                    #         print(f"Expired job ID {jobid} with response 303")
+                    #         all_job_ids.pop(0) # Remove the processed job ID
+                    elif response.status_code == 404:
+                        all_job_ids.pop(0) # Remove the processed job ID
+                        print(f"Expired job ID {jobid} with response 404")
+                    else:
+                        print(f"Error for job ID {job_id}")
+                except Exception as n1:
+                    print(str(n1))
+                    pass

 def main():
-    batch_size = 50
-    results = []
-    count = 1
-    # Open a CSV file for writing
-    with open('output_jobs_0309_me.csv', 'a', newline='', encoding='utf-8') as csvfile:
-        csvwriter = csv.writer(csvfile)
-        # Write header to the CSV file
-        csvwriter.writerow(['URL'] + list(keys_to_extract))
-        with open(rfile,'r') as file:
-            csv_reader = csv.reader(file)
-            urls = [row.replace("\n","") for row in file]
-
-        for i in range(0, len(urls), batch_size):
-            batch = urls[i:i+batch_size]
-            batch_results = batch_process(batch)
-            # Make the HTTP GET request
-            #row = row.replace("\n","")
-            #`url = base_url.format(row)`
-            #try:
-            for response in batch_results:
-                print(count)
-                count = count + 1
-                if response[1]== 200:
-                    json_data = response[0]
-                    job_details = json_data
-                    # Extract specific key values from the JSON response
-                    values_to_store = [job_details.get(key, '') for key in keys_to_extract]
-                    """if values_to_store[0]!="":
-                        [values_to_store.append(job_details["companyDetail"].get(key,'')) for key in company_keys]
-                        [values_to_store.append(job_details["salaryDetail"].get(key,'')) for key in salary_key]
-                        for loc in job_details["locations"]:
-                            loc_list.append(loc.get('label',''))
-                        values_to_store.append(loc_list)
-                        for skill in job_details["keySkills"]["other"]:
-                            skill_other.append(skill.get('label',''))
-                        values_to_store.append(skill_other)
-                        for skill in job_details["keySkills"]["preferred"]:
-                            skill_pref.append(skill.get('label',''))
-                        values_to_store.append(skill_pref)
-                    else:
-                        values_to_store[1]=""
-                        values_to_store.append(job_details["companyDetail"])
-                        values_to_store.append(job_details["salaryDetail"])
-                        values_to_store.append(job_details["locations"])
-                        values_to_store.append(job_details["keySkills"])
-                    """
-                    # Write the extracted values to the CSV file
-                    csvwriter.writerow([response[2]] + values_to_store)
-                else:
-                    print(f"Failed to fetch data for job ID: {response[2]} with {response[0]}")
-                    csvwriter.writerow([response[2]] + [response[0]])
-            # except requests.exceptions.RequestException as e:
-            #     csvwriter.writerow([url] + [str(e)])
-    print("Data extraction and CSV writing complete.")
+    start_time = time()
+    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
+    scraper.scrape()
+    end_time = time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
+    with open(stats_file, "a") as stat:
+        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")

 if __name__ == "__main__":
     main()
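Note on resuming: skip is applied only after de-duplication (all_job_ids = all_job_ids[skip:]), so a crashed run can be picked up again by setting it to the number of IDs already written, e.g. (illustrative value):

    skip = 1500  # assumed count of job IDs already processed in the previous run

Since list(set(...)) does not preserve order, the slice is only an approximate resume point.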
@ -0,0 +1,113 @@
import requests
import json
import time
import re
import csv
import math

output_filename_csv = "gulf_data/output_all_gulf.csv"
input("remove lien 72 10000 limit wala")
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}

error_pages = []
keys_to_extract = ['designation', 'jobId', 'company','Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
fields_to_write = ['designation', 'jobId', 'Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']
input_file = "naukri/_gulf_location.csv"
jobs_per_pages = 50
base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"

def parse_and_save(json_data, csv_filename, city):
    parsed_data = []
    for job in json_data["jobs"]:
        parsed_item = {field: job.get(field, None) for field in keys_to_extract}
        parsed_item['city'] = city
        print("parsed_item ---", parsed_item)
        print(parsed_item.get('company', {'name':''}).get('name'))
        print(parsed_item.get('company', {'id':''}).get('id'))
        print(parsed_item.get('company', {'url':''}).get('url'))
        for key, value in parsed_item.get('company', {'name':'', 'id':'', 'url':''}).items():
            parsed_item["Company" + key] = value
        try:
            parsed_item.pop('company')
        except:
            pass
        # print("updated parsed_item--", parsed_item)
        parsed_data.append(parsed_item)
        #parsed_data.extend(city)

    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        print("csv_filename---", csv_filename)
        csv_writer = csv.DictWriter(csvfile, fieldnames= fields_to_write)
        if csvfile.tell() == 0:
            csv_writer.writeheader()
        csv_writer.writerows(parsed_data)

def main():
    #for page_number in range(1, 4700): # Adjust the range as needed
    with open(input_file, 'r') as file:
        file_read = csv.reader(file)
        file_read = list(file_read)
        for city in file_read:
            city_read_url = city[0].replace("\n","")
            output_data=[]
            total_pages = 1000
            output_filename_json = f"{city[0]}.json"
            output_filename_csv = "gulf_data/output_all_gulf.csv"
            start_page = 1

            # if(city[0] == "pharma"):
            #     start_page = 173
            #     total_pages = 22
            #     total_page_num = 194

            while total_pages>0:
                url = base_url.format(city[0],(jobs_per_pages*(start_page-1)),start_page)
                print("url", url)
                # input()
                response = requests.get(url, headers=headers)

                if response.status_code == 200:
                    json_data = response.json()

                    if(total_pages == 1000):
                        total_jobs = json_data["totalJobsCount"]
                        total_pages = math.ceil(total_jobs/jobs_per_pages)
                        total_page_num = total_pages

                    parse_and_save(json_data, output_filename_csv, city[0])
                    print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                    total_pages = total_pages-1
                    start_page = start_page+1

                else:
                    print("Error : ",response.status_code," at url ",url)
                    error_pages.append(url)
                    total_pages = total_pages-1
                    start_page = start_page+1

    print("Data saved to output_new.json")
    print(error_pages)

if __name__ == "__main__":
    main()
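For reference, the paging arithmetic used above, with assumed figures (city stands for the location string read from the input CSV): a first response reporting totalJobsCount = 4321 gives

    total_pages = math.ceil(4321 / jobs_per_pages)             # 87 pages at 50 jobs per page
    url = base_url.format(city, jobs_per_pages * (3 - 1), 3)   # page 3 -> Offset=100, pageNo=3

so each later request simply advances start_page by one and recomputes the Offset.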
@ -72,7 +72,7 @@ class NaukriJobScraper:
         print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
         while total_pages > 0:
             url = self.base_url.format(industry_name, start_page, industry_q)
+            print(url)
             response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
             print(f"{response.status_code} for {url}")

@ -102,7 +102,6 @@ class NaukriJobScraper:
 def main():
-
     start_time = time.time()
     scraper = NaukriJobScraper(input_file, output_file, error_file)
     scraper.scrape()
     end_time = time.time()
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriGulfDetailItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriGulfDetailSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriGulfDetailDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@ -0,0 +1,23 @@
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
output_file = f'naukri_gulf_detail_{formatted_date}.csv'


class NaukriGulfDetailPipeline:
    def open_spider(self, spider):
        self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')

    def process_item(self, item, spider):
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=item.keys())
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
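The pipeline decides whether to emit the header by checking csvfile.tell() == 0, so appending ('a') to an already-populated CSV on a later run skips the header row. The same pattern in isolation (file name and fields are illustrative):

    import csv
    with open("example.csv", "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["jobId", "designation"])
        if f.tell() == 0:  # true only for a new or empty file
            writer.writeheader()
        writer.writerow({"jobId": "123", "designation": "Engineer"})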
@ -0,0 +1,93 @@
# Scrapy settings for naukri_gulf_detail project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_gulf_detail"

SPIDER_MODULES = ["naukri_gulf_detail.spiders"]
NEWSPIDER_MODULE = "naukri_gulf_detail.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_gulf_detail (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_gulf_detail.middlewares.NaukriGulfDetailSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_gulf_detail.middlewares.NaukriGulfDetailDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_gulf_detail.pipelines.NaukriGulfDetailPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@ -0,0 +1,101 @@
import scrapy
import csv
import logging
from datetime import datetime, timedelta
import pandas as pd

class NaukriGulfDetailSpiderSpider(scrapy.Spider):
    name = "naukri_gulf_detail_spider"
    custom_settings = {
        'DOWNLOAD_DELAY' : 0.5,
        'CONCURRENT_REQUESTS' : 5,
        'ITEM_PIPELINES': {
            'naukri_gulf_detail.pipelines.NaukriGulfDetailPipeline': 300,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_gulf_detail_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    current_date = datetime.now()
    formatted_date = current_date.strftime('%d-%m-%Y')
    yesterday = current_date - timedelta(days=1)
    yesterday_str = yesterday.strftime('%d-%m-%Y')
    yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
    today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
    today_search_df = pd.read_csv(today_search_file)
    yesterday_search_df = pd.read_csv(yesterday_search_file)
    newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
    oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
    newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
    oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
    newresult_df = newresult_df.reset_index(drop=True)
    newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
    oldresult_df = oldresult_df.reset_index(drop=True)
    oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
    input_file = 'gulf_data/new_jobs_gulf.csv'
    print(newresult_df.shape, oldresult_df.shape)

    def start_requests(self):
        headers = {
            'authority': 'www.naukrigulf.com',
            'accept': 'application/json',
            'accept-format': 'strict',
            'accept-language': 'ENGLISH',
            'appid': '205',
            'cache-control': 'no-cache',
            'client-type': 'desktop',
            'clientid': 'desktop',
            'device-type': 'desktop',
            'puppeteer': 'false',
            'referer': 'https://www.naukrigulf.com/jobs-in-uae',
            'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': 'Windows',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': '2323',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
            'userdata': '|IN'
        }
        base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            for row in reader:
                jobid = row.get('jobId').strip()
                mode = row['jobSource'].strip()
                if mode != "POSTED":
                    print(jobid)
                    yield scrapy.Request(base_url.format(jobid), headers=headers,callback=self.parse, meta={
                        'jobid' : jobid
                    })

    def parse(self, response):
        try:
            job_id = response.meta.get('jobid')
            json_response = response.json()
            jd_url = response.url
            json_data = {
                "Url" : jd_url,
                "Job Key" : job_id,
                "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
                               json_response.get('contact', {'website': ''}).get('website',''),
                "Job Description" : json_response.get('description',''),
                "Role Category" :"",
                "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
                "Job Title" : json_response.get('designation'),
                "Formatted Location Full" : json_response.get('location'),
                "Job Functions" : ', '.join([x['title'] for x in json_response['fAreaInterlinking']]),
                "Company" : json_response.get('company', {'name':''}).get('name'),
                "Job Type" : json_response.get('employmentType'),
                "Key Skills" : ', '.join([y['title'] for y in json_response['keywordInterlinking']]),
                "Minimum Experience" : json_response.get('desiredCandidate').get('experience').get('min'),
                "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
                "Salary Detail" : json_response.get('compensation'),
                "Country" : json_response.get('compensation',{'country':''}).get('country')
            }
            yield json_data
        except Exception as naukriError:
            self.logger.error(f'An error occured : {str(naukriError)}')
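Assuming the standard layout produced by scrapy startproject, this spider is launched from the project root with scrapy crawl naukri_gulf_detail_spider. Note that the pandas comparison of today's and yesterday's search CSVs sits in the class body, so both files must already exist when the spider module is loaded.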
@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_gulf_detail.settings

[deploy]
#url = http://localhost:6800/
project = naukri_gulf_detail
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriGulfSearchItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriGulfSearchSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriGulfSearchDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@ -0,0 +1,24 @@
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
file_to_write = f'naukri_gulf_search_{formatted_date}.csv'
fields_to_write = ['designation', 'jobId', 'Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']


class NaukriGulfSearchPipeline:
    # print("pipelien ere")
    def open_spider(self, spider):
        self.csvfile = open(file_to_write, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=fields_to_write)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
@ -0,0 +1,93 @@
# Scrapy settings for naukri_gulf_search project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_gulf_search"

SPIDER_MODULES = ["naukri_gulf_search.spiders"]
NEWSPIDER_MODULE = "naukri_gulf_search.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_gulf_search (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_gulf_search.middlewares.NaukriGulfSearchSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_gulf_search.middlewares.NaukriGulfSearchDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_gulf_search.pipelines.NaukriGulfSearchPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@ -0,0 +1,115 @@
import scrapy
import csv
import logging
import json
import math

input_file_path = "static_data/_gulf_location.csv"
# output_filename_csv = "output_all_gulf.csv"

class NaukriGulfSearchSpiderSpider(scrapy.Spider):
    name = "naukri_gulf_search_spider"
    custom_settings = {
        'DOWNLOAD_DELAY' : 1,
        'CONCURRENT_REQUESTS' : 5,
        'ITEM_PIPELINES': {
            'naukri_gulf_search.pipelines.NaukriGulfSearchPipeline': 301,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_gulf_search_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    def __init__(self, *args, **kwargs):
        super(NaukriGulfSearchSpiderSpider, self).__init__(*args, **kwargs)
        self.csv_file = input_file_path
        self.keys_to_extract = ['designation', 'jobId', 'company','Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
        self.headers = {
            'authority': 'www.naukrigulf.com',
            'accept': 'application/json',
            'accept-format': 'strict',
            'accept-language': 'ENGLISH',
            'appid': '205',
            'cache-control': 'no-cache',
            'client-type': 'desktop',
            'clientid': 'desktop',
            'device-type': 'desktop',
            'puppeteer': 'false',
            'referer': 'https://www.naukrigulf.com/jobs-in-uae',
            'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': 'Windows',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': '2323',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
            'userdata': '|IN'
        }

    def start_requests(self):
        base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
        with open(input_file_path, 'r') as file:
            file_read = csv.reader(file)
            file_read = list(file_read)
            for city in file_read:
                total_pages = 1000
                start_page = 1
                jobs_per_pages = 50
                url = base_url.format(city[0],(jobs_per_pages*(start_page-1)),start_page)
                custom_args = {
                    'url' : url,
                    'total_pages':total_pages,
                    'start_page':start_page,
                    'base_url' : base_url,
                    'jobs_per_pages' : jobs_per_pages,
                    'city_name' : city[0]
                }
                yield scrapy.Request(url, headers=self.headers,callback=self.parse , meta=custom_args)

    def parse(self, response):
        # status_code = response.status
        total_pages = response.meta.get('total_pages')
        start_page = response.meta.get('start_page')
        base_url = response.meta.get('base_url')
        url = response.meta.get('url')
        city_name = response.meta.get('city_name')
        jobs_per_pages = response.meta.get('jobs_per_pages')
        while total_pages>0:
            if response.status == 200:
                json_data = response.json()
                if(total_pages == 1000):
                    total_jobs = json_data["totalJobsCount"]
                    total_pages = math.ceil(total_jobs/jobs_per_pages)
                    total_page_num = total_pages
                for job in json_data["jobs"]:
                    parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
                    parsed_item['city'] = city_name
                    # print("parsed_item ---", parsed_item)
                    # print(parsed_item.get('company', {'name':''}).get('name'))
                    # print(parsed_item.get('company', {'id':''}).get('id'))
                    # print(parsed_item.get('company', {'url':''}).get('url'))
                    for key, value in parsed_item.get('company', {'name':'', 'id':'', 'url':''}).items():
                        parsed_item["Company" + key] = value
                    try:
                        parsed_item.pop('company')
                    except:
                        pass
                    yield parsed_item
                print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                total_pages = total_pages-1
                start_page = start_page+1
            else:
                print("Error : ",response.status," at url ")
                total_pages = total_pages-1
                start_page = start_page+1
            custom_args = {
                'url' : url,
                'total_pages':total_pages,
                'start_page':start_page,
                'base_url' : base_url,
                'jobs_per_pages' : jobs_per_pages,
                'city_name' : city_name
            }
            yield scrapy.Request(base_url.format(city_name,(jobs_per_pages*(start_page-1)),start_page), headers= self.headers, callback=self.parse, meta=custom_args)
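The search spider would likewise be started with scrapy crawl naukri_gulf_search_spider. Each parse call re-queues later pages through meta; for example, with an assumed city value of "dubai", the follow-up request for page 2 is built as:

    next_url = base_url.format("dubai", jobs_per_pages * (2 - 1), 2)  # Offset=50, pageNo=2
    yield scrapy.Request(next_url, headers=self.headers, callback=self.parse, meta=custom_args)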
@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_gulf_search.settings

[deploy]
#url = http://localhost:6800/
project = naukri_gulf_search
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriIndiaDetailItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriIndiaDetailSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriIndiaDetailDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@@ -0,0 +1,35 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv
from datetime import datetime


current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
output_file = f'india_data/naukri_india_detail_{formatted_date}.csv'
json_data = [
    "Url", "Job Key", "Source Link", "Job Description", "Role Category", "Job Industry", "Job Title",
    "Formatted Location Full", "Job Functions", "Company", "Job Type", "Key Skills",
    "Minimum Experience", "Maximum Experience", "Salary Detail", "Country"]


class NaukriIndiaDetailPipeline:
    def open_spider(self, spider):
        self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        print("written to csv")
        return item

    def close_spider(self, spider):
        self.csvfile.close()
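Note on the pipeline above: process_item hands each yielded dict straight to csv.DictWriter, which by default raises a ValueError if an item carries a key that is not listed in json_data. If the spider's item keys ever drift from that list, a defensive option (an optional variant, not part of this commit) is to construct the writer with extrasaction='ignore' so unknown keys are dropped instead of aborting the crawl:

    # hypothetical hardening of open_spider(); unknown item keys are silently dropped
    self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data, extrasaction='ignore')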
@@ -0,0 +1,93 @@
# Scrapy settings for naukri_india_detail project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_india_detail"

SPIDER_MODULES = ["naukri_india_detail.spiders"]
NEWSPIDER_MODULE = "naukri_india_detail.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_india_detail (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_india_detail.middlewares.NaukriIndiaDetailSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_india_detail.middlewares.NaukriIndiaDetailDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_india_detail.pipelines.NaukriIndiaDetailPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,112 @@
import scrapy
import csv
import logging
from datetime import datetime, timedelta
import pandas as pd


class NaukriIndiaDetailSpiderSpider(scrapy.Spider):
    name = "naukri_india_detail_spider"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {
            'naukri_india_detail.pipelines.NaukriIndiaDetailPipeline': 300,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_india_detail_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    def start_requests(self):
        headers = {
            'authority': 'www.naukri.com',
            'accept': 'application/json',
            'accept-language': 'en-US,en;q=0.9',
            'appid': '121',
            'cache-control': 'no-cache, no-store, must-revalidate',
            'content-type': 'application/json',
            'expires': '0',
            'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE',
            'pragma': 'no-cache',
            'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070',
            'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': 'Naukri',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43',
            'x-requested-with': 'XMLHttpRequest',
'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; 
HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0'
        }

        current_date = datetime.now()
        formatted_date = current_date.strftime('%d-%m-%Y')
        yesterday = current_date - timedelta(days=1)
        yesterday_str = yesterday.strftime('%d-%m-%Y')
        yesterday_search_file = f'india_data/naukri_india_search_{yesterday_str}.csv'
        today_search_file = f'india_data/naukri_india_search_{formatted_date}.csv'
        today_search_df = pd.read_csv(today_search_file)
        yesterday_search_df = pd.read_csv(yesterday_search_file)
        # Jobs present only in today's dump are new; jobs present only in yesterday's dump have expired.
        newresult_df = pd.merge(today_search_df, yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
        oldresult_df = pd.merge(yesterday_search_df, today_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
        newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
        oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
        newresult_df = newresult_df.reset_index(drop=True)
        newresult_df.to_csv('india_data/new_jobs_india.csv', index=False)
        oldresult_df = oldresult_df.reset_index(drop=True)
        oldresult_df.to_csv('india_data/expired_jobs_india.csv', index=False)
        input_file = 'india_data/new_jobs_india.csv'
        print(newresult_df.shape, oldresult_df.shape)

        with open(input_file, 'r') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            for row in csv_reader:
                if row['mode'] == "crawled":
                    jobId = row['jobId']
                    url = "https://www.naukri.com/jobapi/v4/job/{}".format(jobId)
                    print(url)
                    yield scrapy.Request(url, headers=headers, callback=self.parse)
        # for url in self.start_urls:
        #     yield scrapy.Request(url, headers=headers, callback=self.parse)
        # url = "https://www.naukri.com/jobapi/v4/job/260923007828"
        # yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        try:
            url = response.url
            print(f'processing {url}')
            # print(response.text)
            # input("---------------")
            response = response.json()
            job_details = response.get("jobDetails", {})
            location_arr = [item['label'] for item in job_details["locations"]]
            location_str = ', '.join(location_arr)
            skills_arr = [skill["label"] for skill in job_details.get("keySkills")["other"] if skill["label"]]
            skills_str = ", ".join(skills_arr)
            json_data = {
                "Url": url,
                "Job Key": str(url.split('/')[-1]),
                "Source Link": job_details.get("applyRedirectUrl"),
                "Job Description": job_details.get("description"),
                "Role Category": job_details.get("roleCategory"),
                "Job Industry": job_details.get("industry"),
                "Job Title": job_details.get("title"),
                "Formatted Location Full": location_str,
                "Job Functions": job_details.get("functionalArea"),
                "Company": job_details.get("companyDetail", {}).get("name") if job_details.get("companyDetail") else None,
                "Job Type": job_details.get("employmentType").split(',')[0].strip(),

                ## Only available in naukri
                "Key Skills": skills_str,
                "Minimum Experience": job_details.get("minimumExperience"),
                "Maximum Experience": job_details.get("maximumExperience"),
                "Salary Detail": job_details.get("salaryDetail"),
                "Country": "India"
            }

            yield json_data
        except Exception as naukriError:
            self.logger.error(f'An error occurred: {str(naukriError)}')
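For context, start_requests above splits today's and yesterday's search dumps with a pandas left anti-join: rows only in today's file are treated as new jobs, rows only in yesterday's file as expired ones. A minimal sketch of that pattern on made-up jobIds (illustration only, not part of the commit):

    import pandas as pd

    today = pd.DataFrame({'jobId': [1, 2, 3]})
    yesterday = pd.DataFrame({'jobId': [2, 3, 4]})

    # present today, absent yesterday -> new (jobId 1)
    new = (pd.merge(today, yesterday, on='jobId', how='left', indicator=True)
             .query('_merge == "left_only"').drop(columns=['_merge']))
    # present yesterday, absent today -> expired (jobId 4)
    expired = (pd.merge(yesterday, today, on='jobId', how='left', indicator=True)
                 .query('_merge == "left_only"').drop(columns=['_merge']))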
@@ -0,0 +1,38 @@
2023-10-06 18:16:08 [scrapy.core.engine] ERROR: Scraper close failure
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 892, in _runCallbacks
    current.result = callback( # type: ignore[misc]
  File "C:\Rahul code\scrapy for naukri\server scraper\naukri_india_detail\naukri_india_detail\pipelines.py", line 35, in close_spider
    self.csvfile.close()
AttributeError: 'NaukriIndiaDetailPipeline' object has no attribute 'csvfile'
2023-10-06 18:16:08 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method CoreStats.spider_closed of <scrapy.extensions.corestats.CoreStats object at 0x000001E30744A7D0>>
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 348, in maybeDeferred_coro
    result = f(*args, **kw)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\extensions\corestats.py", line 30, in spider_closed
    elapsed_time = finish_time - self.start_time
TypeError: unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'
2023-10-06 18:16:08 [twisted] CRITICAL: Unhandled error in Deferred:
2023-10-06 18:16:08 [twisted] CRITICAL:
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'
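The traceback above has a single root cause: open_spider tried to append to india_data/naukri_india_detail_06-10-2023.csv before the india_data directory existed, so self.csvfile was never assigned; close_spider then raised the AttributeError, and CoreStats failed because the spider never finished opening. A hedged fix for the pipeline (an assumed hardening, not what this commit ships; it reuses the module-level output_file and json_data defined above) is to create the directory up front and guard close_spider:

    import csv
    import os

    class NaukriIndiaDetailPipeline:
        def open_spider(self, spider):
            # create the output directory if it is missing instead of crashing
            os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
            self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')
            self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
            if self.csvfile.tell() == 0:
                self.csv_writer.writeheader()

        def close_spider(self, spider):
            # only close the file if open_spider managed to create it
            if getattr(self, 'csvfile', None):
                self.csvfile.close()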
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_india_detail.settings

[deploy]
#url = http://localhost:6800/
project = naukri_india_detail
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriIndiaSearchItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriIndiaSearchSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriIndiaSearchDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@@ -0,0 +1,32 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
file_to_write = f'india_data/naukri_india_search_{formatted_date}.csv'
json_data = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
             'mode', 'placeholders']


class NaukriIndiaSearchPipeline:
    def open_spider(self, spider):
        self.csvfile = open(file_to_write, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
@@ -0,0 +1,93 @@
# Scrapy settings for naukri_india_search project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_india_search"

SPIDER_MODULES = ["naukri_india_search.spiders"]
NEWSPIDER_MODULE = "naukri_india_search.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_india_search (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_india_search.middlewares.NaukriIndiaSearchSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_india_search.middlewares.NaukriIndiaSearchDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_india_search.pipelines.NaukriIndiaSearchPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,125 @@
import scrapy
import csv
import logging
import json
import math

input_file_path = "static_data/_industry_urls.csv"
headers = {
    "authority": "www.naukri.com",
    "accept": "application/json",
    "accept-language": "en-US,en;q=0.9",
    "appid": "109",
    "cache-control": "no-cache",
    "clientid": "d3skt0p",
    "content-type": "application/json",
"cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; 
_ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
    "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
    "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
    "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "systemid": "109",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
    "content-encoding": "gzip",
}


class NaukriIndiaSearchSpiderSpider(scrapy.Spider):
    name = "naukri_india_search_spider"
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {
            'naukri_india_search.pipelines.NaukriIndiaSearchPipeline': 301,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_india_search_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    custom_urls = []

    def start_requests(self):
        headers = {
            "authority": "www.naukri.com",
            "accept": "application/json",
            "accept-language": "en-US,en;q=0.9",
            "appid": "109",
            "cache-control": "no-cache",
            "clientid": "d3skt0p",
            "content-type": "application/json",
"cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; 
_ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
            "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
            "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "systemid": "109",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
            "content-encoding": "gzip",
        }

        with open(input_file_path, 'r') as file:
            file_read = csv.reader(file)
            for industry in list(file_read):
                # industry_read_url = industry[0].replace("\n", "")
                industry_name = industry[1]
                industry_q = industry[2]
                base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
                total_pages = 1000
                start_page = 1
                custom_args = {
                    'industry_name': industry[1],
                    'industry_q': industry[2],
                    'base_url': base_url,
                    'total_pages': 1000,
                    'start_page': 1
                }
                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
                url = base_url.format(industry_name, start_page, industry_q)
                yield scrapy.Request(url, headers=headers, callback=self.parse, meta=custom_args)

    def parse(self, response):
        keys_to_extract = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
                           'mode', 'placeholders']
        total_pages = response.meta.get('total_pages')
        start_page = response.meta.get('start_page')
        base_url = response.meta.get('base_url')
        industry_name = response.meta.get('industry_name')
        industry_q = response.meta.get('industry_q')

        if total_pages == 1000:
            total_jobs = response.json()["noOfJobs"]
            total_pages = math.ceil(total_jobs / 100)

        try:
            # parsed_data = []
            for job in response.json()["jobDetails"]:
                parsed_item = {field: job.get(field, None) for field in keys_to_extract}
                # parsed_data.append(parsed_item)
                yield parsed_item

            total_pages -= 1
            start_page += 1
            custom_args = {
                'industry_name': industry_name,
                'industry_q': industry_q,
                'base_url': base_url,
                'total_pages': total_pages,
                'start_page': start_page
            }
            self.custom_urls.append(base_url.format(industry_name, start_page, industry_q))
            print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")

            for url in self.custom_urls:
                yield scrapy.Request(url=url, headers=headers, callback=self.parse, meta=custom_args)
            # next_page = base_url.format(industry_name, start_page, industry_q)
            # yield response.follow(next_page, callback=self.parse, meta={'my_arg': custom_args})

        except Exception as naukriError:
            self.logger.error(f'An error occurred: {str(naukriError)}')
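A note on the pagination in parse above: it appends the next page URL to the shared self.custom_urls list and then re-yields the whole list on every response, relying on Scrapy's duplicate-request filter to discard pages that were already scheduled. A leaner alternative (a sketch only, not what the commit does) is to yield just the next page for the current industry while pages remain:

    # hypothetical replacement for the custom_urls loop at the end of parse()
    if total_pages > 0:
        next_page = base_url.format(industry_name, start_page, industry_q)
        yield scrapy.Request(next_page, headers=headers, callback=self.parse,
                             meta={'industry_name': industry_name, 'industry_q': industry_q,
                                   'base_url': base_url, 'total_pages': total_pages,
                                   'start_page': start_page})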
@@ -0,0 +1,38 @@
2023-10-06 17:09:58 [scrapy.core.engine] ERROR: Scraper close failure
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 892, in _runCallbacks
    current.result = callback( # type: ignore[misc]
  File "C:\Rahul code\scrapy for naukri\server scraper\naukri_india_search\naukri_india_search\pipelines.py", line 32, in close_spider
    self.csvfile.close()
AttributeError: 'NaukriIndiaSearchPipeline' object has no attribute 'csvfile'
2023-10-06 17:09:58 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method CoreStats.spider_closed of <scrapy.extensions.corestats.CoreStats object at 0x000001DD214DB070>>
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 348, in maybeDeferred_coro
    result = f(*args, **kw)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\extensions\corestats.py", line 30, in spider_closed
    elapsed_time = finish_time - self.start_time
TypeError: unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'
2023-10-06 17:09:58 [twisted] CRITICAL: Unhandled error in Deferred:
2023-10-06 17:09:58 [twisted] CRITICAL:
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_india_search.settings

[deploy]
#url = http://localhost:6800/
project = naukri_india_search
@@ -0,0 +1,61 @@
https://www.naukri.com/accounting-jobs?xt=catsrch&qi[]=8,accounting,8
https://www.naukri.com/advertising-jobs?xt=catsrch&qi[]=32,advertising,32
https://www.naukri.com/agriculture-jobs?xt=catsrch&qi[]=33,agriculture,33
https://www.naukri.com/animation-jobs?xt=catsrch&qi[]=56,animation,56
https://www.naukri.com/architecture-jobs?xt=catsrch&qi[]=30,architecture,30
https://www.naukri.com/automobile-jobs?xt=catsrch&qi[]=4,automobile,4
https://www.naukri.com/aviation-jobs?xt=catsrch&qi[]=46,aviation,46
https://www.naukri.com/bpo-jobs?xt=catsrch&qi[]=7,bpo,7
https://www.naukri.com/bank-jobs?xt=catsrch&qi[]=14,bank,14
https://www.naukri.com/brewery-jobs?xt=catsrch&qi[]=50,brewery,50
https://www.naukri.com/sanitary-jobs?xt=catsrch&qi[]=60,sanitary,60
https://www.naukri.com/chemical-jobs?xt=catsrch&qi[]=6,chemical,6
https://www.naukri.com/engineering-jobs?xt=catsrch&qi[]=12,engineering,12
https://www.naukri.com/consumer-durables-jobs?xt=catsrch&qi[]=10,consumer-durables,10
https://www.naukri.com/courier-jobs?xt=catsrch&qi[]=18,courier,18
https://www.naukri.com/defence-jobs?xt=catsrch&qi[]=42,defence,42
https://www.naukri.com/teaching-jobs?xt=catsrch&qi[]=26,teaching,26
https://www.naukri.com/electrical-jobs?xt=catsrch&qi[]=55,electrical,55
https://www.naukri.com/export-import-jobs?xt=catsrch&qi[]=13,export-import,13
https://www.naukri.com/fmcg-jobs?xt=catsrch&qi[]=9,fmcg,9
https://www.naukri.com/facility-management-jobs?xt=catsrch&qi[]=47,facility-management,47
https://www.naukri.com/fertilizers-jobs?xt=catsrch&qi[]=41,fertilizers,41
https://www.naukri.com/food-processing-jobs?xt=catsrch&qi[]=57,food-processing,57
https://www.naukri.com/fresher-jobs?xt=catsrch&qi[]=31,fresher,31
https://www.naukri.com/gems-jewellery-jobs?xt=catsrch&qi[]=35,gems-jewellery,35
https://www.naukri.com/glass-jobs?xt=catsrch&qi[]=49,glass,49
https://www.naukri.com/air-conditioning-jobs?xt=catsrch&qi[]=61,air-conditioning,61
https://www.naukri.com/airline-jobs?xt=catsrch&qi[]=2,airline,2
https://www.naukri.com/networking-jobs?xt=catsrch&qi[]=15,networking,15
https://www.naukri.com/information-technology-jobs?xt=catsrch&qi[]=25,information-technology,25
https://www.naukri.com/industrial-jobs?xt=catsrch&qi[]=16,industrial,16
https://www.naukri.com/insurance-jobs?xt=catsrch&qi[]=17,insurance,17
https://www.naukri.com/kpo-jobs?xt=catsrch&qi[]=48,kpo,48
https://www.naukri.com/legal-jobs?xt=catsrch&qi[]=36,legal,36
https://www.naukri.com/media-jobs?xt=catsrch&qi[]=19,media,19
https://www.naukri.com/dotcom-jobs?xt=catsrch&qi[]=19,dotcom,19
https://www.naukri.com/entertainment-jobs?xt=catsrch&qi[]=19,entertainment,19
https://www.naukri.com/medical-jobs?xt=catsrch&qi[]=20,medical,20
https://www.naukri.com/mining-jobs?xt=catsrch&qi[]=54,mining,54
https://www.naukri.com/ngo-jobs?xt=catsrch&qi[]=37,ngo,37
https://www.naukri.com/automation-jobs?xt=catsrch&qi[]=21,automation,21
https://www.naukri.com/oil-and-gas-jobs?xt=catsrch&qi[]=23,oil-and-gas,23
https://www.naukri.com/paper-jobs?xt=catsrch&qi[]=43,paper,43
https://www.naukri.com/pharma-jobs?xt=catsrch&qi[]=22,pharma,22
https://www.naukri.com/printing-jobs?xt=catsrch&qi[]=38,printing,38
https://www.naukri.com/publishing-jobs?xt=catsrch&qi[]=58,publishing,58
https://www.naukri.com/real-estate-jobs?xt=catsrch&qi[]=39,real-estate,39
https://www.naukri.com/recruitment-jobs?xt=catsrch&qi[]=34,recruitment,34
https://www.naukri.com/retail-jobs?xt=catsrch&qi[]=24,retail,24
https://www.naukri.com/security-jobs?xt=catsrch&qi[]=40,security,40
https://www.naukri.com/electronics-jobs?xt=catsrch&qi[]=28,electronics,28
https://www.naukri.com/shipping-jobs?xt=catsrch&qi[]=44,shipping,44
https://www.naukri.com/steel-jobs?xt=catsrch&qi[]=53,steel,53
https://www.naukri.com/consultant-jobs?xt=catsrch&qi[]=52,consultant,52
https://www.naukri.com/telecom-jobs?xt=catsrch&qi[]=27,telecom,27
https://www.naukri.com/textiles-jobs?xt=catsrch&qi[]=3,textiles,3
https://www.naukri.com/tyres-jobs?xt=catsrch&qi[]=45,tyres,45
https://www.naukri.com/water-treatment-jobs?xt=catsrch&qi[]=51,water-treatment,51
https://www.naukri.com/fitness-trainer-jobs?xt=catsrch&qi[]=59,fitness-trainer,59
https://www.naukri.com/ecommerce-jobs?xt=catsrch&qi[]=63,ecommerce,63
https://www.naukri.com/internet-jobs?xt=catsrch&qi[]=63,internet,63