# Naukri (India) job-search scraper — crawls the search API per industry and dumps results to CSV.
import requests
|
|
import csv
|
|
import os
|
|
import time
|
|
import math
|
|
import logging
|
|
from datetime import datetime
|
|
|
|
# Error logging goes to a dedicated file with timestamped entries.
logging.basicConfig(
    filename='search_india_error.log',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger()

# Today's date stamps every output file name (dd-mm-yyyy).
current_date = datetime.now()
today_date = current_date.strftime('%d-%m-%Y')

# Input list of industries to crawl, and the per-day output/error/stats files.
input_file = "naukri/_industry_urls.csv"
output_file = f"india_data/search_result_india_{today_date}.csv"
error_file = f"india_data/search_error_india_{today_date}.csv"
stats_file = f"india_data/stats_india_{today_date}.txt"
|
|
|
|
class NaukriJobScraper:
    """Crawls Naukri's v3 search API one industry at a time and appends the
    parsed job rows to a CSV file.

    Crawling an industry stops early when :meth:`parse_and_save` observes
    that more than 60% of a page's jobs are labelled "3 Days Ago" (i.e. the
    crawl has moved past the fresh postings).
    """

    # {} slots, in order: keyword (industry name), page number, qi[] industry id.
    # Raw string: the original URL deliberately contains literal backslashes
    # before the brackets; r"" keeps the value identical without the invalid-
    # escape DeprecationWarning a plain string would emit.
    base_url = r"https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&sort=f&pageNo={}&xt=catsrch&qi\[\]={}"

    # Request headers captured from a real browser session.  The cookie is
    # session-specific and will expire; replace it when requests start
    # failing with 403s.
    headers = {
        "authority": "www.naukri.com",
        "accept": "application/json",
        "accept-language": "en-US,en;q=0.9",
        "appid": "109",
        "cache-control": "no-cache",
        "clientid": "d3skt0p",
        "content-type": "application/json",
        "cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; _ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
        "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
        "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
        "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "systemid": "109",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
        "content-encoding": "gzip",
    }

    # Set per-instance by parse_and_save() when a page looks stale; reset at
    # the start of each industry in scrape().
    stopcrawl = False

    # Fields copied from each job record into the output CSV, in column order.
    keys_to_extract = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
                       'mode', 'placeholders']

    def __init__(self, input_file_path, output_file_path, error_file_path):
        """Store the CSV paths and request settings.

        PROXY_SERVER (host:port), when set in the environment, routes both
        HTTP and HTTPS traffic through that proxy.
        """
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.error_file_path = error_file_path
        self.timeout = 120  # seconds per request
        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def parse_and_save(self, json_data):
        """Extract the tracked fields from one API response page and append
        the rows to the output CSV.

        Sets ``self.stopcrawl`` to True when more than 60% of the page's jobs
        carry the "3 Days Ago" label, signalling that the crawl has moved
        past the fresh postings for this industry.
        """
        parsed_data = []
        for job in json_data.get("jobDetails", []):
            parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
            parsed_data.append(parsed_item)

        # BUG FIX: an empty page previously caused a ZeroDivisionError in the
        # percentage computation below; there is nothing to write either.
        if not parsed_data:
            return

        days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
        target = "3 Days Ago"
        count = days_ago_list.count(target)
        percentage = (count / len(days_ago_list)) * 100
        if percentage > 60:
            self.stopcrawl = True

        with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writerows(parsed_data)

    def scrape(self):
        """Crawl every industry listed in the input CSV.

        Writes the CSV header once, then appends one batch of rows per API
        page via :meth:`parse_and_save`.  Failed pages are recorded in the
        error file and skipped so the crawl always makes progress.
        """
        with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writeheader()

        with open(self.input_file_path, 'r') as file:
            file_read = csv.reader(file)
            # Input row layout: [0] search URL (unused), [1] industry name,
            # [2] qi[] industry id.
            for industry in file_read:
                industry_name = industry[1]
                industry_q = industry[2]
                total_pages = 1000  # placeholder until the first response reveals the real count
                total_known = False
                self.stopcrawl = False
                start_page = 1

                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
                while total_pages > 0:
                    if self.stopcrawl:
                        # BUG FIX: the original set total_pages = 0 but still
                        # fetched one more page in the same iteration.
                        break
                    url = self.base_url.format(industry_name, start_page, industry_q)
                    try:
                        response = requests.get(url, headers=self.headers, timeout=self.timeout)

                        if response.status_code != 200:
                            print(response.status_code)
                            print(response.reason)
                            print(response.text)
                            print(f"Error with page {start_page} for industry {industry_name}")
                            with open(self.error_file_path, "a") as err:
                                err.write(f"Error with page {start_page} for industry {industry_name}\n")
                            # BUG FIX: the original `continue` retried the same
                            # URL forever; skip to the next page instead so a
                            # persistently failing page cannot hang the crawl.
                            total_pages -= 1
                            start_page += 1
                            continue

                        data = response.json()
                        if not total_known:
                            # First successful page: derive the real page count
                            # (100 results per page).
                            total_jobs = data["noOfJobs"]
                            total_pages = math.ceil(total_jobs / 100)
                            total_known = True

                        self.parse_and_save(data)

                        total_pages -= 1
                        start_page += 1
                        print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
                        time.sleep(1)  # throttle: be polite to the API
                    except Exception as e1:
                        logging.error(url + '\n' + str(e1) + '\n')
                        # Same progress guarantee on unexpected failures
                        # (network errors, malformed JSON, missing keys).
                        total_pages -= 1
                        start_page += 1
|
|
|
|
|
|
def main():
    """Run the scraper end-to-end and record the wall-clock duration in the
    stats file."""
    start_time = time.time()

    # BUG FIX: all output paths live under india_data/; create the directory
    # up front so the initial open(..., "w") in scrape() cannot fail.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    scraper = NaukriJobScraper(input_file, output_file, error_file)
    scraper.scrape()

    end_time = time.time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Search program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Search program took {duration_hours:.2f} hours to run. \n")
|
|
|
|
|
|
# Entry point when executed directly as a script (not on import).
if __name__ == "__main__":
    main()
|
|
|