From 8cf0486a11e9feef51e8e22ac8aa2c5f1344dfae Mon Sep 17 00:00:00 2001 From: Rahul Pandey Date: Wed, 6 Dec 2023 08:15:39 +0000 Subject: [PATCH] Delete naukri/jobdata_india.py --- naukri/jobdata_india.py | 170 ---------------------------------------- 1 file changed, 170 deletions(-) delete mode 100644 naukri/jobdata_india.py diff --git a/naukri/jobdata_india.py b/naukri/jobdata_india.py deleted file mode 100644 index 79c8fb3..0000000 --- a/naukri/jobdata_india.py +++ /dev/null @@ -1,170 +0,0 @@ -import requests -import csv -import time -import json -import os - -# Global variables -input_file = "data_naukri/search_result_india.csv" -output_file = "data_naukri/jobdata_india.csv" -error_file = "data_naukri/jobdata_error_india.csv" -stats_file = "data_naukri/stats_india.txt" -skip=0 -class NaukriJobDetailScraper: - - base_url = "https://www.naukri.com/jobapi/v4/job/{}" - headers = { - 'authority': 'www.naukri.com', - 'accept': 'application/json', - 'accept-language': 'en-US,en;q=0.9', - 'appid': '121', - 'cache-control': 'no-cache, no-store, must-revalidate', - 'content-type': 'application/json', - 'expires': '0', - 'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE', - 'pragma': 'no-cache', - 'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070', - 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-origin', - 'systemid': 'Naukri', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43', - 'x-requested-with': 'XMLHttpRequest', - 'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0' - } - - def __init__(self, input_file, output_file, error_file): - self.input_file = input_file - self.output_file = output_file - self.error_file = error_file - self.timeout = 30 - self.count = 1 - self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {} - - def transform_data(self, job_id, url, json_response): - job_details = json_response.get("jobDetails",{}) - - location_arr = [item['label'] for item in job_details["locations"]] - location_str = ', '.join(location_arr) - - skills_arr = [skill["label"] for skill in job_details.get("keySkills")["other"] if skill["label"]] - skills_str = ", ".join(skills_arr) - - json_data = { - "Url": url, - "Job Key": str(url.split('/')[-1]), - "Source Link": job_details.get("applyRedirectUrl"), - "Job Description": job_details.get("description"), - "Role Category": job_details.get("roleCategory"), - "Job Industry": job_details.get("industry"), - "Job Title": job_details.get("title"), - "Formatted Location Full": location_str, - "Job Functions": job_details.get("functionalArea"), - "Company": job_details.get("companyDetail", {}).get("name") if job_details.get("companyDetail") else None, - "Job Type": job_details.get("employmentType").split(',')[0].strip(), - - ##Only available in naukri - "Key Skills": skills_str, - "Minimum Experience": job_details.get("minimumExperience"), - "Maximum Experience": job_details.get("maximumExperience"), - "Salary Detail": job_details.get("salaryDetail"), - } - return json_data - - def scrape(self): - with open(self.input_file, 'r', encoding='utf-8') as infile: - reader = csv.reader(infile) - total_input_count=0 - all_job_ids = [] - - for row in reader: - jobid = row[1].strip() - mode = row[7].strip() - total_input_count+=1 - - if mode != "crawled": - print("removed non crawled job with jobid %s" % jobid) - continue - - all_job_ids.append(jobid) - - - print(f"Size of raw all_job_ids: {len(all_job_ids)}") - all_job_ids = list(set(all_job_ids)) - - print(f"Size of unique all_job_ids: {len(all_job_ids)}") - - #adjust skip - all_job_ids = all_job_ids[skip:] - - print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}") - with open(stats_file, "a") as stat: - stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n") - - time.sleep(10) - - header_written=False - - with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile: - writer = csv.writer(outfile) - - while all_job_ids: - job_id = all_job_ids[0] - url = self.base_url.format(job_id) - - time.sleep(0.5) - response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies) - - print(f"{response.status_code} for {url}") - - if response.status_code == 200: - json_response = response.json() - - transformed_data = self.transform_data(job_id, url, json_response) - - # Write the header row if needed - if not header_written: - header = transformed_data.keys() - writer.writerow(header) - header_written = True - - writer.writerow(transformed_data.values()) - print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}") - all_job_ids.pop(0) # Remove the processed job ID - self.count += 1 - - elif response.status_code == 303: - json_response = response.json() - - if json_response.get('metaSearch', {}).get('isExpiredJob') == '1': - - print(f"Expired job ID {jobid} with response 303") - all_job_ids.pop(0) # Remove the processed job ID - - elif response.status_code == 404: - all_job_ids.pop(0) # Remove the processed job ID - print(f"Expired job ID {jobid} with response 404") - - else: - print(f"Error for job ID {job_id}") - time.sleep(10) - -def main(): - - start_time = time.time() - - scraper = NaukriJobDetailScraper(input_file, output_file, error_file) - scraper.scrape() - - end_time = time.time() - duration_hours = (end_time - start_time) / 3600 - print(f"Jobdata program took {duration_hours:.2f} hours to run.") - with open(stats_file, "a") as stat: - stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n") - - -if __name__ == "__main__": - main()