From 8101070a79c400a95891f52b75b373295cb0d15b Mon Sep 17 00:00:00 2001 From: sameer17cs Date: Mon, 25 Sep 2023 14:51:49 +0530 Subject: [PATCH] init --- .gitignore | 3 + naukri/_gulf_location.csv | 41 +++++++++ naukri/_industry_urls.csv | 61 ++++++++++++++ naukri/expiry.py | 115 ++++++++++++++++++++++++++ naukri/jobdata_gulf.py | 132 +++++++++++++++++++++++++++++ naukri/jobdata_india.py | 170 ++++++++++++++++++++++++++++++++++++++ naukri/search_gulf.py | 95 +++++++++++++++++++++ naukri/search_india.py | 116 ++++++++++++++++++++++++++ requirements.txt | 2 + 9 files changed, 735 insertions(+) create mode 100644 .gitignore create mode 100644 naukri/_gulf_location.csv create mode 100644 naukri/_industry_urls.csv create mode 100644 naukri/expiry.py create mode 100644 naukri/jobdata_gulf.py create mode 100644 naukri/jobdata_india.py create mode 100644 naukri/search_gulf.py create mode 100644 naukri/search_india.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7b28625 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.vscode +data_naukri +scrib \ No newline at end of file diff --git a/naukri/_gulf_location.csv b/naukri/_gulf_location.csv new file mode 100644 index 0000000..c2f951b --- /dev/null +++ b/naukri/_gulf_location.csv @@ -0,0 +1,41 @@ +Abu Dhabi +Dubai +Sharjah +Ras Al Khaimah +Ajman +Fujairah +Umm Al Qaiwain +Al Ain +Riyadh +Dammam +Jeddah +Makkah +Madinah +Yanbu +Eastern Province +Jubail +Muscat +Salalah +Sohar +Zufar +Doha +Ahmadi +Manama +Alexandria +Algeria +Amman +Baghdad +Beirut +Cairo +Dammam +Egypt +Iraq +Jordan +Lagos +Lebanon +Libya +Middle East +Morocco +Palestine +Somalia +Yemen diff --git a/naukri/_industry_urls.csv b/naukri/_industry_urls.csv new file mode 100644 index 0000000..0395c6a --- /dev/null +++ b/naukri/_industry_urls.csv @@ -0,0 +1,61 @@ +https://www.naukri.com/accounting-jobs?xt=catsrch&qi[]=8,accounting,8 +https://www.naukri.com/advertising-jobs?xt=catsrch&qi[]=32,advertising,32 +https://www.naukri.com/agriculture-jobs?xt=catsrch&qi[]=33,agriculture,33 +https://www.naukri.com/animation-jobs?xt=catsrch&qi[]=56,animation,56 +https://www.naukri.com/architecture-jobs?xt=catsrch&qi[]=30,architecture,30 +https://www.naukri.com/automobile-jobs?xt=catsrch&qi[]=4,automobile,4 +https://www.naukri.com/aviation-jobs?xt=catsrch&qi[]=46,aviation,46 +https://www.naukri.com/bpo-jobs?xt=catsrch&qi[]=7,bpo,7 +https://www.naukri.com/bank-jobs?xt=catsrch&qi[]=14,bank,14 +https://www.naukri.com/brewery-jobs?xt=catsrch&qi[]=50,brewery,50 +https://www.naukri.com/sanitary-jobs?xt=catsrch&qi[]=60,sanitary,60 +https://www.naukri.com/chemical-jobs?xt=catsrch&qi[]=6,chemical,6 +https://www.naukri.com/engineering-jobs?xt=catsrch&qi[]=12,engineering,12 +https://www.naukri.com/consumer-durables-jobs?xt=catsrch&qi[]=10,consumer-durables,10 +https://www.naukri.com/courier-jobs?xt=catsrch&qi[]=18,courier,18 +https://www.naukri.com/defence-jobs?xt=catsrch&qi[]=42,defence,42 +https://www.naukri.com/teaching-jobs?xt=catsrch&qi[]=26,teaching,26 +https://www.naukri.com/electrical-jobs?xt=catsrch&qi[]=55,electrical,55 +https://www.naukri.com/export-import-jobs?xt=catsrch&qi[]=13,export-import,13 +https://www.naukri.com/fmcg-jobs?xt=catsrch&qi[]=9,fmcg,9 +https://www.naukri.com/facility-management-jobs?xt=catsrch&qi[]=47,facility-management,47 +https://www.naukri.com/fertilizers-jobs?xt=catsrch&qi[]=41,fertilizers,41 +https://www.naukri.com/food-processing-jobs?xt=catsrch&qi[]=57,food-processing,57 
+https://www.naukri.com/fresher-jobs?xt=catsrch&qi[]=31,fresher,31 +https://www.naukri.com/gems-jewellery-jobs?xt=catsrch&qi[]=35,gems-jewellery,35 +https://www.naukri.com/glass-jobs?xt=catsrch&qi[]=49,glass,49 +https://www.naukri.com/air-conditioning-jobs?xt=catsrch&qi[]=61,air-conditioning,61 +https://www.naukri.com/airline-jobs?xt=catsrch&qi[]=2,airline,2 +https://www.naukri.com/networking-jobs?xt=catsrch&qi[]=15,networking,15 +https://www.naukri.com/information-technology-jobs?xt=catsrch&qi[]=25,information-technology,25 +https://www.naukri.com/industrial-jobs?xt=catsrch&qi[]=16,industrial,16 +https://www.naukri.com/insurance-jobs?xt=catsrch&qi[]=17,insurance,17 +https://www.naukri.com/kpo-jobs?xt=catsrch&qi[]=48,kpo,48 +https://www.naukri.com/legal-jobs?xt=catsrch&qi[]=36,legal,36 +https://www.naukri.com/media-jobs?xt=catsrch&qi[]=19,media,19 +https://www.naukri.com/dotcom-jobs?xt=catsrch&qi[]=19,dotcom,19 +https://www.naukri.com/entertainment-jobs?xt=catsrch&qi[]=19,entertainment,19 +https://www.naukri.com/medical-jobs?xt=catsrch&qi[]=20,medical,20 +https://www.naukri.com/mining-jobs?xt=catsrch&qi[]=54,mining,54 +https://www.naukri.com/ngo-jobs?xt=catsrch&qi[]=37,ngo,37 +https://www.naukri.com/automation-jobs?xt=catsrch&qi[]=21,automation,21 +https://www.naukri.com/oil-and-gas-jobs?xt=catsrch&qi[]=23,oil-and-gas,23 +https://www.naukri.com/paper-jobs?xt=catsrch&qi[]=43,paper,43 +https://www.naukri.com/pharma-jobs?xt=catsrch&qi[]=22,pharma,22 +https://www.naukri.com/printing-jobs?xt=catsrch&qi[]=38,printing,38 +https://www.naukri.com/publishing-jobs?xt=catsrch&qi[]=58,publishing,58 +https://www.naukri.com/real-estate-jobs?xt=catsrch&qi[]=39,real-estate,39 +https://www.naukri.com/recruitment-jobs?xt=catsrch&qi[]=34,recruitment,34 +https://www.naukri.com/retail-jobs?xt=catsrch&qi[]=24,retail,24 +https://www.naukri.com/security-jobs?xt=catsrch&qi[]=40,security,40 +https://www.naukri.com/electronics-jobs?xt=catsrch&qi[]=28,electronics,28 +https://www.naukri.com/shipping-jobs?xt=catsrch&qi[]=44,shipping,44 +https://www.naukri.com/steel-jobs?xt=catsrch&qi[]=53,steel,53 +https://www.naukri.com/consultant-jobs?xt=catsrch&qi[]=52,consultant,52 +https://www.naukri.com/telecom-jobs?xt=catsrch&qi[]=27,telecom,27 +https://www.naukri.com/textiles-jobs?xt=catsrch&qi[]=3,textiles,3 +https://www.naukri.com/tyres-jobs?xt=catsrch&qi[]=45,tyres,45 +https://www.naukri.com/water-treatment-jobs?xt=catsrch&qi[]=51,water-treatment,51 +https://www.naukri.com/fitness-trainer-jobs?xt=catsrch&qi[]=59,fitness-trainer,59 +https://www.naukri.com/ecommerce-jobs?xt=catsrch&qi[]=63,ecommerce,63 +https://www.naukri.com/internet-jobs?xt=catsrch&qi[]=63,internet,63 \ No newline at end of file diff --git a/naukri/expiry.py b/naukri/expiry.py new file mode 100644 index 0000000..db8efc1 --- /dev/null +++ b/naukri/expiry.py @@ -0,0 +1,115 @@ +import requests +import csv +import time +import json +import os + +# Global variables +input_file = "data_naukri/old_jobdata.csv" +output_file = "data_naukri/expired.csv" +error_file = "data_naukri_india/expiry_error.csv" +stats_file = "data_naukri_india/stats.txt" +class NaukriExpiryScraper: + base_url="https://www.naukri.com/jobapi/v4/job/{}" + headers = { + 'authority': 'www.naukri.com', + 'accept': 'application/json', + 'accept-language': 'en-US,en;q=0.9', + 'appid': '121', + 'cache-control': 'no-cache, no-store, must-revalidate', + 'content-type': 'application/json', + 'expires': '0', + 'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE', + 'pragma': 'no-cache', + 'referer': 
'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070', + 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'systemid': 'Naukri', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43', + 'x-requested-with': 'XMLHttpRequest', + 'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; 
cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0' + } + + def __init__(self, input_file, output_file, error_file): + self.input_file = input_file + self.output_file = output_file + self.error_file = error_file + self.timeout = 30 + self.expired_jobs_count=0 + self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {} + + def scrape(self): + + all_input = [] + with open(self.input_file, 'r', encoding='utf-8') as infile: + header_line = infile.readline().strip() + #write header line + with open(self.output_file, 'w') as file: + file.write(header_line + "\n") + + reader = csv.reader(infile) + + for row in reader: + all_input.append(row) + + with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile: + writer = csv.writer(outfile) + + while all_input: + current_row=all_input[0] + source_link=current_row[2].strip() + jobid = current_row[1].strip() + url = self.base_url.format(jobid) + + if source_link == "": + print(f"Not checking job without source link, job ID {jobid}") + all_input.pop(0) # Remove the processed job ID + continue + + print(f"Remaining to do: {len(all_input)}") + time.sleep(0.5) + response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies) + + print(f"{response.status_code} for {url}") + + if response.status_code == 200: + print(f"Alive job ID {jobid}") + all_input.pop(0) # Remove the processed job ID + + + elif response.status_code == 303: + json_response = response.json() + if json_response.get('metaSearch', {}).get('isExpiredJob') == '1': + print(f"Expired job ID {jobid} with response 303") + writer.writerow(current_row) + self.expired_jobs_count+=1 + all_input.pop(0) # Remove the processed job ID + + elif response.status_code == 404: + print(f"Expired job ID {jobid} with response 404") + writer.writerow(current_row) + self.expired_jobs_count+=1 + all_input.pop(0) # Remove the processed job ID + + else: + print(f"Failed to fetch data for job ID {jobid}") + time.sleep(10) + +def main(): + start_time = time.time() + + scraper = NaukriExpiryScraper(input_file, output_file, error_file) + scraper.scrape() + + end_time = time.time() + duration_hours = (end_time - 
start_time) / 3600
+    print(f"Expiry program took {duration_hours:.2f} hours to run.")
+    with open(stats_file, "a") as stat:
+        stat.write(f"Expiry program took {duration_hours:.2f} hours to run.\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/naukri/jobdata_gulf.py b/naukri/jobdata_gulf.py
new file mode 100644
index 0000000..92debf4
--- /dev/null
+++ b/naukri/jobdata_gulf.py
@@ -0,0 +1,132 @@
+import requests
+import csv
+import concurrent.futures
+
+# URL template for per-job detail lookups
+base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
+
+headers = {
+    'authority': 'www.naukrigulf.com',
+    'accept': 'application/json',
+    'accept-format': 'strict',
+    'accept-language': 'ENGLISH',
+    'appid': '205',
+    'cache-control': 'no-cache',
+    'client-type': 'desktop',
+    'clientid': 'desktop',
+    'device-type': 'desktop',
+    'puppeteer': 'false',
+    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
+    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': 'Windows',
+    'sec-fetch-dest': 'empty',
+    'sec-fetch-mode': 'cors',
+    'sec-fetch-site': 'same-origin',
+    'systemid': '2323',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
+    'userdata': '|IN'
+}
+
+keys_to_extract = ['designation', 'description', 'company', 'compensation', 'industryType', 'functionalArea', 'jobSource', 'location', 'other', 'desiredCandidate', 'contact', 'isExpired', 'locationInterlinking']
+company_keys = ['name', 'details']
+salary_key = ['minimumSalary', 'maximumSalary', 'currency', 'label', 'hideSalary']
+rfile = "ME_jobIds.csv"
+loc_list = []
+skill_other = []
+skill_pref = []
+
+
+def fetch_url(url):
+    try:
+        url = base_url.format(url)
+        response = requests.get(url, headers=headers)
+        return response.json(), response.status_code, url
+    except requests.exceptions.RequestException as e:
+        return "", str(e), url
+
+def batch_process(urls):
+    results = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
+
+        for future in concurrent.futures.as_completed(future_to_url):
+            url = future_to_url[future]
+            try:
+                result = future.result()
+                results.append(result)
+            except Exception as e:
+                # Keep the (body, status, url) shape returned by fetch_url; the old
+                # code appended a (url, error) pair here, which broke the consumer below.
+                results.append(("", str(e), url))
+    return results
+
+def main():
+    batch_size = 50
+    results = []
+    count = 1
+    # Open a CSV file for writing
+    with open('output_jobs_0209_me.csv', 'a', newline='', encoding='utf-8') as csvfile:
+        csvwriter = csv.writer(csvfile)
+
+        # Write header to the CSV file
+        csvwriter.writerow(['URL'] + list(keys_to_extract))
+
+        with open(rfile, 'r') as file:
+            urls = [row.strip() for row in file]
+
+        for i in range(0, len(urls), batch_size):
+            batch = urls[i:i+batch_size]
+            batch_results = batch_process(batch)
+            for response in batch_results:
+                print(count)
+                count = count + 1
+                if response[1] == 200:
+                    json_data = response[0]
+
+                    job_details = json_data
+                    # Extract specific key values from the JSON response
+                    values_to_store = [job_details.get(key, '') for key in keys_to_extract]
+                    """if values_to_store[0]!="":
+
+                    [values_to_store.append(job_details["companyDetail"].get(key,'')) for key in company_keys]
+                    [values_to_store.append(job_details["salaryDetail"].get(key,'')) for key in salary_key]
+
+                    for loc in job_details["locations"]:
loc_list.append(loc.get('label','')) + values_to_store.append(loc_list) + + for skill in job_details["keySkills"]["other"]: + skill_other.append(skill.get('label','')) + values_to_store.append(skill_other) + + for skill in job_details["keySkills"]["preferred"]: + skill_pref.append(skill.get('label','')) + values_to_store.append(skill_pref) + + else: + values_to_store[1]="" + values_to_store.append(job_details["companyDetail"]) + values_to_store.append(job_details["salaryDetail"]) + values_to_store.append(job_details["locations"]) + values_to_store.append(job_details["keySkills"]) + """ + # Write the extracted values to the CSV file + csvwriter.writerow([response[2]] + values_to_store) + else: + print(f"Failed to fetch data for job ID: {response[2]} with {response[0]}") + csvwriter.writerow([response[2]] + [response[0]]) + + # except requests.exceptions.RequestException as e: + # csvwriter.writerow([url] + [str(e)]) + + print("Data extraction and CSV writing complete.") + +if __name__ == "__main__": + main() + + diff --git a/naukri/jobdata_india.py b/naukri/jobdata_india.py new file mode 100644 index 0000000..8d34674 --- /dev/null +++ b/naukri/jobdata_india.py @@ -0,0 +1,170 @@ +import requests +import csv +import time +import json +import os + +# Global variables +input_file = "data_naukri/search_result_india.csv" +output_file = "data_naukri/jobdata_india.csv" +error_file = "data_naukri/jobdata_error_india.csv" +stats_file = "data_naukri/stats.txt" +skip=0 +class NaukriJobDetailScraper: + + base_url = "https://www.naukri.com/jobapi/v4/job/{}" + headers = { + 'authority': 'www.naukri.com', + 'accept': 'application/json', + 'accept-language': 'en-US,en;q=0.9', + 'appid': '121', + 'cache-control': 'no-cache, no-store, must-revalidate', + 'content-type': 'application/json', + 'expires': '0', + 'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE', + 'pragma': 'no-cache', + 'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070', + 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'systemid': 'Naukri', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43', + 'x-requested-with': 'XMLHttpRequest', + 'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; 
_abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0' + } + + def __init__(self, input_file, output_file, error_file): + 
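+        # Input CSV of crawled search results, output CSV for job details, and an error log path.
+        # Requests go through an optional proxy read from the PROXY_SERVER env var (assumed "host:port").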
self.input_file = input_file
+        self.output_file = output_file
+        self.error_file = error_file
+        self.timeout = 30
+        self.count = 1
+        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
+
+    def transform_data(self, job_id, url, json_response):
+        job_details = json_response.get("jobDetails", {})
+
+        # Guard the optional fields: partial or expired postings can omit
+        # locations, keySkills, or employmentType entirely.
+        location_arr = [item['label'] for item in job_details.get("locations", [])]
+        location_str = ', '.join(location_arr)
+
+        skills_arr = [skill["label"] for skill in (job_details.get("keySkills") or {}).get("other", []) if skill["label"]]
+        skills_str = ", ".join(skills_arr)
+
+        json_data = {
+            "Url": url,
+            "Job Key": str(url.split('/')[-1]),
+            "Source Link": job_details.get("applyRedirectUrl"),
+            "Job Description": job_details.get("description"),
+            "Role Category": job_details.get("roleCategory"),
+            "Job Industry": job_details.get("industry"),
+            "Job Title": job_details.get("title"),
+            "Formatted Location Full": location_str,
+            "Job Functions": job_details.get("functionalArea"),
+            "Company": job_details.get("companyDetail", {}).get("name") if job_details.get("companyDetail") else None,
+            "Job Type": (job_details.get("employmentType") or "").split(',')[0].strip(),
+
+            # Only available in naukri
+            "Key Skills": skills_str,
+            "Minimum Experience": job_details.get("minimumExperience"),
+            "Maximum Experience": job_details.get("maximumExperience"),
+            "Salary Detail": job_details.get("salaryDetail"),
+        }
+        return json_data
+
+    def scrape(self):
+        with open(self.input_file, 'r', encoding='utf-8') as infile:
+            reader = csv.reader(infile)
+            total_input_count = 0
+            all_job_ids = []
+
+            for row in reader:
+                jobid = row[1].strip()
+                mode = row[7].strip()
+                total_input_count += 1
+
+                if mode != "crawled":
+                    print("removed non-crawled job with jobid %s" % jobid)
+                    continue
+
+                all_job_ids.append(jobid)
+
+        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
+        all_job_ids = list(set(all_job_ids))
+
+        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
+
+        # adjust skip
+        all_job_ids = all_job_ids[skip:]
+
+        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
+        with open(stats_file, "a") as stat:
+            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
+
+        time.sleep(10)
+
+        header_written = False
+
+        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
+            writer = csv.writer(outfile)
+
+            while all_job_ids:
+                job_id = all_job_ids[0]
+                url = self.base_url.format(job_id)
+
+                time.sleep(0.5)
+                response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
+
+                print(f"{response.status_code} for {url}")
+
+                if response.status_code == 200:
+                    json_response = response.json()
+
+                    transformed_data = self.transform_data(job_id, url, json_response)
+
+                    # Write the header row if needed
+                    if not header_written:
+                        header = transformed_data.keys()
+                        writer.writerow(header)
+                        header_written = True
+
+                    writer.writerow(transformed_data.values())
+                    print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
+                    all_job_ids.pop(0)  # Remove the processed job ID
+                    self.count += 1
+
+                elif response.status_code == 303:
+                    json_response = response.json()
+
+                    if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
+                        print(f"Expired job ID {job_id} with response 303")
+                    # Pop on 303 either way so a non-expired redirect cannot loop forever.
+                    all_job_ids.pop(0)  # Remove the processed job ID
+
+                elif response.status_code == 404:
+                    all_job_ids.pop(0)  # Remove the processed job ID
+                    print(f"Expired job ID {job_id} with response 404")
+
+                else:
+                    print(f"Error for job ID {job_id}")
+                    time.sleep(10)
+
+def main():
+
+    start_time = time.time()
+
+    scraper = NaukriJobDetailScraper(input_file, output_file, error_file)
+    scraper.scrape()
+
+    end_time = time.time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
+    with open(stats_file, "a") as stat:
+        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/naukri/search_gulf.py b/naukri/search_gulf.py
new file mode 100644
index 0000000..22db5ae
--- /dev/null
+++ b/naukri/search_gulf.py
@@ -0,0 +1,95 @@
+import requests
+import json
+import time
+import csv
+import math
+
+headers = {
+    'authority': 'www.naukrigulf.com',
+    'accept': 'application/json',
+    'accept-format': 'strict',
+    'accept-language': 'ENGLISH',
+    'appid': '205',
+    'cache-control': 'no-cache',
+    'client-type': 'desktop',
+    'clientid': 'desktop',
+    'device-type': 'desktop',
+    'puppeteer': 'false',
+    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
+    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': 'Windows',
+    'sec-fetch-dest': 'empty',
+    'sec-fetch-mode': 'cors',
+    'sec-fetch-site': 'same-origin',
+    'systemid': '2323',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
+    'userdata': '|IN'
+}
+
+error_pages = []
+keys_to_extract = ['designation', 'jobId', 'company', 'latestPostedDate', 'isEasyApply', 'jobSource', 'location', 'jdURL', 'vacancies']
+fields_to_write = ['designation', 'jobId', 'company', 'latestPostedDate', 'isEasyApply', 'jobSource', 'location', 'jdURL', 'vacancies', 'city']
+input_file = "naukri/_gulf_location.csv"
+jobs_per_pages = 50
+base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
+
+def parse_and_save(json_data, csv_filename, city):
+    parsed_data = []
+    for job in json_data["jobs"]:
+        parsed_item = {field: job.get(field, None) for field in keys_to_extract}
+        parsed_item['city'] = city
+        parsed_data.append(parsed_item)
+
+    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
+        csv_writer = csv.DictWriter(csvfile, fieldnames=fields_to_write)
+        # Write the header only while the file is empty; the old code repeated
+        # it before every page of results.
+        if csvfile.tell() == 0:
+            csv_writer.writeheader()
+        csv_writer.writerows(parsed_data)
+
+def main():
+    with open(input_file, 'r') as file:
+        file_read = csv.reader(file)
+        for city in file_read:
+            city_read_url = city[0].replace("\n", "")
+            output_data = []
+            total_pages = 1000
+            output_filename_json = f"{city[0]}.json"
+            output_filename_csv = "output_all_gulf.csv"
+            start_page = 1
+
+            if city[0] == "pharma":
+                start_page = 173
+                total_pages = 22
+                total_page_num = 194
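+            # Pagination: Offset = jobs_per_pages * (page - 1); total_pages starts at a
+            # 1000-page sentinel and is replaced by ceil(totalJobsCount / jobs_per_pages)
+            # after the first successful response.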
+ while total_pages>0: + url = base_url.format(city[0],(jobs_per_pages*(start_page-1)),start_page) + response = requests.get(url, headers=headers) + + if response.status_code == 200: + json_data = response.json() + + if(total_pages == 1000): + total_jobs = json_data["totalJobsCount"] + total_pages = math.ceil(total_jobs/jobs_per_pages) + total_page_num = total_pages + + parse_and_save(json_data, output_filename_csv, city[0]) + print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}") + total_pages = total_pages-1 + start_page = start_page+1 + + else: + print("Error : ",response.status_code," at url ",url) + error_pages.append(url) + total_pages = total_pages-1 + start_page = start_page+1 + + print("Data saved to output_new.json") + print(error_pages) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/naukri/search_india.py b/naukri/search_india.py new file mode 100644 index 0000000..229a287 --- /dev/null +++ b/naukri/search_india.py @@ -0,0 +1,116 @@ +import requests +import json +import csv +import os +import time +import math + +# Global variables +input_file = "naukri/_industry_urls.csv" +output_file = "data_naukri/search_result_india.csv" +error_file = "data_naukri/search_error_india.csv" +stats_file = "data_naukri/stats_india.txt" +class NaukriJobScraper: + base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}" + headers = { + "authority": "www.naukri.com", + "accept": "application/json", + "accept-language": "en-US,en;q=0.9", + "appid": "109", + "cache-control": "no-cache", + "clientid": "d3skt0p", + "content-type": "application/json", + "cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; 
bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; _ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here + "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE", + "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch", + "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "Windows", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "systemid": "109", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43", + "content-encoding": "gzip", + } + keys_to_extract = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate', + 'mode', 'placeholders'] + + def __init__(self, input_file_path, output_file_path, error_file_path): + self.input_file_path = input_file_path + self.output_file_path = output_file_path + self.error_file_path = error_file_path + self.timeout = 120 + self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {} + + def parse_and_save(self, json_data): + parsed_data = [] + for job in json_data["jobDetails"]: + parsed_item = {field: job.get(field, None) for field in self.keys_to_extract} + parsed_data.append(parsed_item) + + with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile: + csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract) + csv_writer.writerows(parsed_data) + + def scrape(self): + + with 
open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
+            csv_writer.writeheader()
+
+        with open(self.input_file_path, 'r') as file:
+
+            file_read = csv.reader(file)
+            for industry in file_read:
+                industry_read_url = industry[0].replace("\n", "")
+                industry_name = industry[1]
+                industry_q = industry[2]
+                total_pages = 1000
+                start_page = 1
+
+                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
+                while total_pages > 0:
+                    url = self.base_url.format(industry_name, start_page, industry_q)
+
+                    response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
+
+                    print(f"{response.status_code} for {url}")
+
+                    if response.status_code != 200:
+                        print(f"Error with page {start_page} for industry {industry_name}")
+                        # Use a separate name for the error log so the input file handle is not shadowed.
+                        with open(self.error_file_path, "a") as err_log:
+                            err_log.write(f"Error with page {start_page} for industry {industry_name}\n")
+                        time.sleep(10)
+                        continue  # retry the same page after backing off
+
+                    # 200 response: on the first page, derive the page count from the reported job total
+                    data = response.json()
+                    if total_pages == 1000:
+                        total_jobs = data["noOfJobs"]
+                        total_pages = math.ceil(total_jobs / 100)
+
+                    self.parse_and_save(data)
+
+                    total_pages -= 1
+                    start_page += 1
+                    print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
+                    time.sleep(1)
+
+def main():
+
+    start_time = time.time()
+
+    scraper = NaukriJobScraper(input_file, output_file, error_file)
+    scraper.scrape()
+    end_time = time.time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Search program took {duration_hours:.2f} hours to run.")
+    with open(stats_file, "a") as stat:
+        stat.write(f"Search program took {duration_hours:.2f} hours to run.\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..57db5b3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests==2.25.1
+