import csv
import logging
import math
import os
import time
from datetime import datetime

import requests

# Configure the logging settings.
logging.basicConfig(
    filename="search_india_error.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger()

current_date = datetime.now()
today_date = current_date.strftime("%d-%m-%Y")

input_file = "naukri/_industry_urls.csv"
output_file = f"india_data/search_result_india_{today_date}.csv"
error_file = f"india_data/search_error_india_{today_date}.csv"
stats_file = f"india_data/stats_india_{today_date}.txt"

# Make sure the output directory exists before any file is written.
os.makedirs("india_data", exist_ok=True)


class NaukriJobScraper:
    # If requests start getting blocked, a full browser header set (accept,
    # user-agent, referer, sec-ch-ua, and a valid session cookie) can be
    # added to `headers` below.
    base_url = (
        "https://www.naukri.com/jobapi/v3/search?noOfResults=100"
        "&urlType=search_by_keyword&searchType=adv&keyword={}"
        "&sort=f&pageNo={}&xt=catsrch&qi[]={}"
    )
    headers = {
        "appid": "109",
        "systemid": "109",
    }
    keys_to_extract = [
        "title", "jobId", "footerPlaceholderLabel", "companyName",
        "companyId", "jdURL", "createdDate", "mode", "placeholders",
    ]

    def __init__(self, input_file_path, output_file_path, error_file_path):
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.error_file_path = error_file_path
        self.timeout = 120
        # Route traffic through a proxy only when PROXY_SERVER is set.
        proxy_server = os.environ.get("PROXY_SERVER")
        self.proxies = (
            {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"}
            if proxy_server
            else {}
        )

    def parse_and_save(self, json_data):
        # Keep only the fields of interest from each job record and append
        # them to the output CSV.
        parsed_data = []
        for job in json_data.get("jobDetails", []):
            parsed_item = {field: job.get(field) for field in self.keys_to_extract}
            parsed_data.append(parsed_item)
        with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writerows(parsed_data)

    def scrape(self):
        # Start the output file fresh with a header row.
        with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writeheader()

        with open(self.input_file_path, "r") as file:
            file_read = csv.reader(file)
            for industry in file_read:
                industry_read_url = industry[0].replace("\n", "")  # search URL column (not used below)
                industry_name = industry[1]
                industry_q = industry[2]
                # 1000 is a sentinel; the real page count is computed from the
                # first successful response.
                total_pages = 1000
                start_page = 1
                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
                while total_pages > 0:
                    url = self.base_url.format(industry_name, start_page, industry_q)
                    try:
                        response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
                        if response.status_code == 403:
                            # Retry once without the proxy.
                            response = requests.get(url, headers=self.headers, timeout=self.timeout)
                        if response.status_code != 200:
                            print(response.status_code)
                            print(response.reason)
                            print(f"Error with page {start_page} for industry {industry_name}")
                            with open(self.error_file_path, "a") as error_out:
                                error_out.write(f"Error with page {start_page} for industry {industry_name}\n")
                            if total_pages == 1000:
                                # No page count yet for this industry; give up on it.
                                break
                            # Otherwise skip this page instead of retrying it forever.
                            total_pages -= 1
                            start_page += 1
                            continue
                        # 200 response: on the first page, derive the real page
                        # count (100 results per page).
                        data = response.json()
                        if total_pages == 1000:
                            total_jobs = data["noOfJobs"]
                            total_pages = math.ceil(total_jobs / 100)
                        self.parse_and_save(data)
                        total_pages -= 1
                        start_page += 1
                        print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
                        time.sleep(1)
                    except Exception as e1:
                        logging.error(url + "\n" + str(e1) + "\n")


def main():
    start_time = time.time()
    scraper = NaukriJobScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time.time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Search program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Search program took {duration_hours:.2f} hours to run.\n")


if __name__ == "__main__":
    main()
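
# ---------------------------------------------------------------------------
# Assumed data shapes, for reference (inferred from how the code above indexes
# its inputs, not from any API documentation):
#
# naukri/_industry_urls.csv is read positionally -- column 0 is the industry
# search URL, column 1 the keyword/industry name, column 2 the qi[] category
# id. An illustrative row:
#
#   https://www.naukri.com/accounting-jobs,accounting,8
#
# The v3 search endpoint is assumed to return JSON shaped roughly like this
# (only the fields this script consumes are shown):
#
#   {
#     "noOfJobs": 1234,
#     "jobDetails": [
#       {"title": "...", "jobId": "...", "companyName": "...", "jdURL": "..."}
#     ]
#   }
# ---------------------------------------------------------------------------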