main
sameer17cs 2023-09-25 14:51:49 +05:30
commit 8101070a79
9 changed files with 735 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
.vscode
data_naukri
scrib

41
naukri/_gulf_location.csv Normal file
View File

@ -0,0 +1,41 @@
Abu Dhabi
Dubai
Sharjah
Ras Al Khaimah
Ajman
Fujairah
Umm Al Qaiwain
Al Ain
Riyadh
Dammam
Jeddah
Makkah
Madinah
Yanbu
Eastern Province
Jubail
Muscat
Salalah
Sohar
Zufar
Doha
Ahmadi
Manama
Alexandria
Algeria
Amman
Baghdad
Beirut
Cairo
Dammam
Egypt
Iraq
Jordan
Lagos
Lebanon
Libya
Middle East
Morocco
Palestine
Somalia
Yemen
1 Abu Dhabi
2 Dubai
3 Sharjah
4 Ras Al Khaimah
5 Ajman
6 Fujairah
7 Umm Al Qaiwain
8 Al Ain
9 Riyadh
10 Dammam
11 Jeddah
12 Makkah
13 Madinah
14 Yanbu
15 Eastern Province
16 Jubail
17 Muscat
18 Salalah
19 Sohar
20 Zufar
21 Doha
22 Ahmadi
23 Manama
24 Alexandria
25 Algeria
26 Amman
27 Baghdad
28 Beirut
29 Cairo
30 Dammam
31 Egypt
32 Iraq
33 Jordan
34 Lagos
35 Lebanon
36 Libya
37 Middle East
38 Morocco
39 Palestine
40 Somalia
41 Yemen

61
naukri/_industry_urls.csv Normal file
View File

@ -0,0 +1,61 @@
https://www.naukri.com/accounting-jobs?xt=catsrch&qi[]=8,accounting,8
https://www.naukri.com/advertising-jobs?xt=catsrch&qi[]=32,advertising,32
https://www.naukri.com/agriculture-jobs?xt=catsrch&qi[]=33,agriculture,33
https://www.naukri.com/animation-jobs?xt=catsrch&qi[]=56,animation,56
https://www.naukri.com/architecture-jobs?xt=catsrch&qi[]=30,architecture,30
https://www.naukri.com/automobile-jobs?xt=catsrch&qi[]=4,automobile,4
https://www.naukri.com/aviation-jobs?xt=catsrch&qi[]=46,aviation,46
https://www.naukri.com/bpo-jobs?xt=catsrch&qi[]=7,bpo,7
https://www.naukri.com/bank-jobs?xt=catsrch&qi[]=14,bank,14
https://www.naukri.com/brewery-jobs?xt=catsrch&qi[]=50,brewery,50
https://www.naukri.com/sanitary-jobs?xt=catsrch&qi[]=60,sanitary,60
https://www.naukri.com/chemical-jobs?xt=catsrch&qi[]=6,chemical,6
https://www.naukri.com/engineering-jobs?xt=catsrch&qi[]=12,engineering,12
https://www.naukri.com/consumer-durables-jobs?xt=catsrch&qi[]=10,consumer-durables,10
https://www.naukri.com/courier-jobs?xt=catsrch&qi[]=18,courier,18
https://www.naukri.com/defence-jobs?xt=catsrch&qi[]=42,defence,42
https://www.naukri.com/teaching-jobs?xt=catsrch&qi[]=26,teaching,26
https://www.naukri.com/electrical-jobs?xt=catsrch&qi[]=55,electrical,55
https://www.naukri.com/export-import-jobs?xt=catsrch&qi[]=13,export-import,13
https://www.naukri.com/fmcg-jobs?xt=catsrch&qi[]=9,fmcg,9
https://www.naukri.com/facility-management-jobs?xt=catsrch&qi[]=47,facility-management,47
https://www.naukri.com/fertilizers-jobs?xt=catsrch&qi[]=41,fertilizers,41
https://www.naukri.com/food-processing-jobs?xt=catsrch&qi[]=57,food-processing,57
https://www.naukri.com/fresher-jobs?xt=catsrch&qi[]=31,fresher,31
https://www.naukri.com/gems-jewellery-jobs?xt=catsrch&qi[]=35,gems-jewellery,35
https://www.naukri.com/glass-jobs?xt=catsrch&qi[]=49,glass,49
https://www.naukri.com/air-conditioning-jobs?xt=catsrch&qi[]=61,air-conditioning,61
https://www.naukri.com/airline-jobs?xt=catsrch&qi[]=2,airline,2
https://www.naukri.com/networking-jobs?xt=catsrch&qi[]=15,networking,15
https://www.naukri.com/information-technology-jobs?xt=catsrch&qi[]=25,information-technology,25
https://www.naukri.com/industrial-jobs?xt=catsrch&qi[]=16,industrial,16
https://www.naukri.com/insurance-jobs?xt=catsrch&qi[]=17,insurance,17
https://www.naukri.com/kpo-jobs?xt=catsrch&qi[]=48,kpo,48
https://www.naukri.com/legal-jobs?xt=catsrch&qi[]=36,legal,36
https://www.naukri.com/media-jobs?xt=catsrch&qi[]=19,media,19
https://www.naukri.com/dotcom-jobs?xt=catsrch&qi[]=19,dotcom,19
https://www.naukri.com/entertainment-jobs?xt=catsrch&qi[]=19,entertainment,19
https://www.naukri.com/medical-jobs?xt=catsrch&qi[]=20,medical,20
https://www.naukri.com/mining-jobs?xt=catsrch&qi[]=54,mining,54
https://www.naukri.com/ngo-jobs?xt=catsrch&qi[]=37,ngo,37
https://www.naukri.com/automation-jobs?xt=catsrch&qi[]=21,automation,21
https://www.naukri.com/oil-and-gas-jobs?xt=catsrch&qi[]=23,oil-and-gas,23
https://www.naukri.com/paper-jobs?xt=catsrch&qi[]=43,paper,43
https://www.naukri.com/pharma-jobs?xt=catsrch&qi[]=22,pharma,22
https://www.naukri.com/printing-jobs?xt=catsrch&qi[]=38,printing,38
https://www.naukri.com/publishing-jobs?xt=catsrch&qi[]=58,publishing,58
https://www.naukri.com/real-estate-jobs?xt=catsrch&qi[]=39,real-estate,39
https://www.naukri.com/recruitment-jobs?xt=catsrch&qi[]=34,recruitment,34
https://www.naukri.com/retail-jobs?xt=catsrch&qi[]=24,retail,24
https://www.naukri.com/security-jobs?xt=catsrch&qi[]=40,security,40
https://www.naukri.com/electronics-jobs?xt=catsrch&qi[]=28,electronics,28
https://www.naukri.com/shipping-jobs?xt=catsrch&qi[]=44,shipping,44
https://www.naukri.com/steel-jobs?xt=catsrch&qi[]=53,steel,53
https://www.naukri.com/consultant-jobs?xt=catsrch&qi[]=52,consultant,52
https://www.naukri.com/telecom-jobs?xt=catsrch&qi[]=27,telecom,27
https://www.naukri.com/textiles-jobs?xt=catsrch&qi[]=3,textiles,3
https://www.naukri.com/tyres-jobs?xt=catsrch&qi[]=45,tyres,45
https://www.naukri.com/water-treatment-jobs?xt=catsrch&qi[]=51,water-treatment,51
https://www.naukri.com/fitness-trainer-jobs?xt=catsrch&qi[]=59,fitness-trainer,59
https://www.naukri.com/ecommerce-jobs?xt=catsrch&qi[]=63,ecommerce,63
https://www.naukri.com/internet-jobs?xt=catsrch&qi[]=63,internet,63
1 https://www.naukri.com/accounting-jobs?xt=catsrch&qi[]=8 accounting 8
2 https://www.naukri.com/advertising-jobs?xt=catsrch&qi[]=32 advertising 32
3 https://www.naukri.com/agriculture-jobs?xt=catsrch&qi[]=33 agriculture 33
4 https://www.naukri.com/animation-jobs?xt=catsrch&qi[]=56 animation 56
5 https://www.naukri.com/architecture-jobs?xt=catsrch&qi[]=30 architecture 30
6 https://www.naukri.com/automobile-jobs?xt=catsrch&qi[]=4 automobile 4
7 https://www.naukri.com/aviation-jobs?xt=catsrch&qi[]=46 aviation 46
8 https://www.naukri.com/bpo-jobs?xt=catsrch&qi[]=7 bpo 7
9 https://www.naukri.com/bank-jobs?xt=catsrch&qi[]=14 bank 14
10 https://www.naukri.com/brewery-jobs?xt=catsrch&qi[]=50 brewery 50
11 https://www.naukri.com/sanitary-jobs?xt=catsrch&qi[]=60 sanitary 60
12 https://www.naukri.com/chemical-jobs?xt=catsrch&qi[]=6 chemical 6
13 https://www.naukri.com/engineering-jobs?xt=catsrch&qi[]=12 engineering 12
14 https://www.naukri.com/consumer-durables-jobs?xt=catsrch&qi[]=10 consumer-durables 10
15 https://www.naukri.com/courier-jobs?xt=catsrch&qi[]=18 courier 18
16 https://www.naukri.com/defence-jobs?xt=catsrch&qi[]=42 defence 42
17 https://www.naukri.com/teaching-jobs?xt=catsrch&qi[]=26 teaching 26
18 https://www.naukri.com/electrical-jobs?xt=catsrch&qi[]=55 electrical 55
19 https://www.naukri.com/export-import-jobs?xt=catsrch&qi[]=13 export-import 13
20 https://www.naukri.com/fmcg-jobs?xt=catsrch&qi[]=9 fmcg 9
21 https://www.naukri.com/facility-management-jobs?xt=catsrch&qi[]=47 facility-management 47
22 https://www.naukri.com/fertilizers-jobs?xt=catsrch&qi[]=41 fertilizers 41
23 https://www.naukri.com/food-processing-jobs?xt=catsrch&qi[]=57 food-processing 57
24 https://www.naukri.com/fresher-jobs?xt=catsrch&qi[]=31 fresher 31
25 https://www.naukri.com/gems-jewellery-jobs?xt=catsrch&qi[]=35 gems-jewellery 35
26 https://www.naukri.com/glass-jobs?xt=catsrch&qi[]=49 glass 49
27 https://www.naukri.com/air-conditioning-jobs?xt=catsrch&qi[]=61 air-conditioning 61
28 https://www.naukri.com/airline-jobs?xt=catsrch&qi[]=2 airline 2
29 https://www.naukri.com/networking-jobs?xt=catsrch&qi[]=15 networking 15
30 https://www.naukri.com/information-technology-jobs?xt=catsrch&qi[]=25 information-technology 25
31 https://www.naukri.com/industrial-jobs?xt=catsrch&qi[]=16 industrial 16
32 https://www.naukri.com/insurance-jobs?xt=catsrch&qi[]=17 insurance 17
33 https://www.naukri.com/kpo-jobs?xt=catsrch&qi[]=48 kpo 48
34 https://www.naukri.com/legal-jobs?xt=catsrch&qi[]=36 legal 36
35 https://www.naukri.com/media-jobs?xt=catsrch&qi[]=19 media 19
36 https://www.naukri.com/dotcom-jobs?xt=catsrch&qi[]=19 dotcom 19
37 https://www.naukri.com/entertainment-jobs?xt=catsrch&qi[]=19 entertainment 19
38 https://www.naukri.com/medical-jobs?xt=catsrch&qi[]=20 medical 20
39 https://www.naukri.com/mining-jobs?xt=catsrch&qi[]=54 mining 54
40 https://www.naukri.com/ngo-jobs?xt=catsrch&qi[]=37 ngo 37
41 https://www.naukri.com/automation-jobs?xt=catsrch&qi[]=21 automation 21
42 https://www.naukri.com/oil-and-gas-jobs?xt=catsrch&qi[]=23 oil-and-gas 23
43 https://www.naukri.com/paper-jobs?xt=catsrch&qi[]=43 paper 43
44 https://www.naukri.com/pharma-jobs?xt=catsrch&qi[]=22 pharma 22
45 https://www.naukri.com/printing-jobs?xt=catsrch&qi[]=38 printing 38
46 https://www.naukri.com/publishing-jobs?xt=catsrch&qi[]=58 publishing 58
47 https://www.naukri.com/real-estate-jobs?xt=catsrch&qi[]=39 real-estate 39
48 https://www.naukri.com/recruitment-jobs?xt=catsrch&qi[]=34 recruitment 34
49 https://www.naukri.com/retail-jobs?xt=catsrch&qi[]=24 retail 24
50 https://www.naukri.com/security-jobs?xt=catsrch&qi[]=40 security 40
51 https://www.naukri.com/electronics-jobs?xt=catsrch&qi[]=28 electronics 28
52 https://www.naukri.com/shipping-jobs?xt=catsrch&qi[]=44 shipping 44
53 https://www.naukri.com/steel-jobs?xt=catsrch&qi[]=53 steel 53
54 https://www.naukri.com/consultant-jobs?xt=catsrch&qi[]=52 consultant 52
55 https://www.naukri.com/telecom-jobs?xt=catsrch&qi[]=27 telecom 27
56 https://www.naukri.com/textiles-jobs?xt=catsrch&qi[]=3 textiles 3
57 https://www.naukri.com/tyres-jobs?xt=catsrch&qi[]=45 tyres 45
58 https://www.naukri.com/water-treatment-jobs?xt=catsrch&qi[]=51 water-treatment 51
59 https://www.naukri.com/fitness-trainer-jobs?xt=catsrch&qi[]=59 fitness-trainer 59
60 https://www.naukri.com/ecommerce-jobs?xt=catsrch&qi[]=63 ecommerce 63
61 https://www.naukri.com/internet-jobs?xt=catsrch&qi[]=63 internet 63

115
naukri/expiry.py Normal file
View File

@ -0,0 +1,115 @@
import requests
import csv
import time
import json
import os
# Global variables — all data lives under data_naukri/ (see .gitignore and the
# sibling scripts).  error_file/stats_file previously pointed at a
# "data_naukri_india/" directory that nothing else creates or uses, which made
# the stats append in main() fail; unified on data_naukri/.
input_file = "data_naukri/old_jobdata.csv"    # previously scraped jobs; col 1 = job id, col 2 = source link
output_file = "data_naukri/expired.csv"       # rows confirmed expired are appended here
error_file = "data_naukri/expiry_error.csv"   # reserved for failed lookups
stats_file = "data_naukri/stats.txt"          # run-duration log, shared with jobdata_india.py
class NaukriExpiryScraper:
    """Re-checks previously scraped naukri.com jobs and records the expired ones.

    Reads rows from `input_file`, probes the job-detail API for each job id,
    and appends rows whose job has expired (HTTP 404, or 303 with the
    `isExpiredJob` flag) to `output_file`.
    """

    # Job-detail API endpoint; {} is the numeric job id.
    base_url = "https://www.naukri.com/jobapi/v4/job/{}"
    # Captured desktop-browser headers, including a recorded session cookie,
    # so the API treats requests as a normal browser session.
    # NOTE(review): the cookie is a snapshot and will eventually go stale —
    # refresh it if the API starts rejecting requests.
    headers = {
        'authority': 'www.naukri.com',
        'accept': 'application/json',
        'accept-language': 'en-US,en;q=0.9',
        'appid': '121',
        'cache-control': 'no-cache, no-store, must-revalidate',
        'content-type': 'application/json',
        'expires': '0',
        'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE',
        'pragma': 'no-cache',
        'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': 'Naukri',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43',
        'x-requested-with': 'XMLHttpRequest',
        # Single long cookie value, split into adjacent literals for readability.
        'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; '
                  'ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; '
                  'HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0'
    }

    def __init__(self, input_file, output_file, error_file):
        """Store file paths and build the optional proxy configuration."""
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file  # NOTE(review): accepted but never written to in this class
        self.timeout = 30             # per-request timeout, seconds
        self.expired_jobs_count = 0   # running tally of expired rows written
        # Route through an HTTP(S) proxy when PROXY_SERVER (host:port) is set.
        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def scrape(self):
        """Probe every input row's job id and append expired rows to the output CSV.

        The output file is recreated with the input's header line, then rows
        are appended as they are confirmed expired.  Transient HTTP failures
        keep the row queued and retry after a pause.
        """
        all_input = []
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            header_line = infile.readline().strip()
            # Recreate the output file with the same header as the input.
            with open(self.output_file, 'w') as file:
                file.write(header_line + "\n")
            reader = csv.reader(infile)
            for row in reader:
                all_input.append(row)
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_input:
                current_row = all_input[0]
                source_link = current_row[2].strip()
                jobid = current_row[1].strip()
                url = self.base_url.format(jobid)
                if source_link == "":
                    print(f"Not checking job without source link, job ID {jobid}")
                    all_input.pop(0)  # Remove the processed job ID
                    continue
                print(f"Remaining to do: {len(all_input)}")
                time.sleep(0.5)  # throttle so we don't hammer the API
                response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
                print(f"{response.status_code} for {url}")
                if response.status_code == 200:
                    print(f"Alive job ID {jobid}")
                    all_input.pop(0)  # Remove the processed job ID
                elif response.status_code == 303:
                    json_response = response.json()
                    if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                        print(f"Expired job ID {jobid} with response 303")
                        writer.writerow(current_row)
                        self.expired_jobs_count += 1
                    else:
                        # 303 without the expired flag: treat the job as alive.
                        print(f"Alive job ID {jobid} with response 303")
                    # Bug fix: the row used to be popped only when the expired
                    # flag was set, so a non-expired 303 re-queried the same
                    # job forever.  Pop unconditionally once handled.
                    all_input.pop(0)
                elif response.status_code == 404:
                    print(f"Expired job ID {jobid} with response 404")
                    writer.writerow(current_row)
                    self.expired_jobs_count += 1
                    all_input.pop(0)  # Remove the processed job ID
                else:
                    # Transient failure (rate limit / 5xx): keep the row queued
                    # and back off before retrying.
                    print(f"Failed to fetch data for job ID {jobid}")
                    time.sleep(10)
def main():
    """Run the expiry scraper end-to-end and log how long it took."""
    started = time.time()
    NaukriExpiryScraper(input_file, output_file, error_file).scrape()
    duration_hours = (time.time() - started) / 3600
    print(f"Expiry program took {duration_hours:.2f} hours to run.")
    # Append the timing to the shared stats log.
    with open(stats_file, "a") as stat:
        stat.write(f"Expiry program took {duration_hours:.2f} hours to run.\n")

if __name__ == "__main__":
    main()

132
naukri/jobdata_gulf.py Normal file
View File

@ -0,0 +1,132 @@
import requests
import csv
import concurrent.futures
# List of URLs to query
# Job-detail API endpoint on naukrigulf; {} is the job id.
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
# Desktop-browser headers the naukrigulf API expects.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}
# Top-level keys copied from each job-detail payload into the output CSV.
keys_to_extract = ['designation','description','company','compensation','industryType','functionalArea','jobSource','location','other','desiredCandidate','contact','isExpired','locationInterlinking']
# Sub-keys of companyDetail / salaryDetail — only used by the commented-out
# extraction branch in main().
company_keys = ['name','details']
salary_key = ['minimumSalary','maximumSalary','currency','label','hideSalary']
# Input file: one naukrigulf job id per line.
rfile = "ME_jobIds.csv"
# Accumulators for the commented-out extraction branch in main().
# NOTE(review): module-level, so they would grow across jobs if re-enabled.
loc_list = []
skill_other =[]
skill_pref = []
def fetch_url(url):
    """Fetch one job's detail JSON from the naukrigulf API.

    `url` is a job id that gets substituted into `base_url`.
    Returns a (payload, status, full_url) tuple: payload is the decoded JSON
    on success or "" on failure; status is the HTTP status code on success or
    the error message string on failure.
    """
    full_url = base_url.format(url)
    try:
        response = requests.get(full_url, headers=headers)
        # Bug fix: .json() raises ValueError on a non-JSON body, which the
        # RequestException handler did not catch and which would crash the
        # worker thread.  Treat it as a fetch failure instead.
        return response.json(), response.status_code, full_url
    except (requests.exceptions.RequestException, ValueError) as e:
        return "", str(e), full_url
def batch_process(urls):
    """Fetch a batch of job ids concurrently via fetch_url.

    Returns a list of (payload, status, url) tuples in completion order —
    the same shape fetch_url produces on both success and failure.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as e:
                # Bug fix: this used to append (url, str(e)) — a 2-tuple whose
                # shape differs from fetch_url's 3-tuple, so the caller's
                # response[2] access raised IndexError.  Keep the shape
                # consistent: ("", error-string, url).
                results.append(("", str(e), url))
    return results
def main():
    """Fetch detail JSON for every job id in `rfile`, in batches, and append
    the extracted fields to a CSV.

    Success rows carry the values of `keys_to_extract`; failures carry the
    error payload in a single column.
    """
    batch_size = 50
    count = 1  # progress counter across all batches
    with open('output_jobs_0209_me.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # Bug fix: the header used to be written unconditionally, so every
        # re-run of this append-mode script inserted a duplicate header row.
        # Only write it when the file is empty.
        if csvfile.tell() == 0:
            csvwriter.writerow(['URL'] + list(keys_to_extract))
        with open(rfile, 'r') as file:
            # One job id per line; strip newlines.
            urls = [row.replace("\n", "") for row in file]
        for i in range(0, len(urls), batch_size):
            batch_results = batch_process(urls[i:i + batch_size])
            for response in batch_results:
                print(count)
                count = count + 1
                if response[1] == 200:
                    job_details = response[0]
                    # Extract the configured keys from the JSON payload.
                    values_to_store = [job_details.get(key, '') for key in keys_to_extract]
                    csvwriter.writerow([response[2]] + values_to_store)
                else:
                    # Failure: response[0] is the error payload/message.
                    print(f"Failed to fetch data for job ID: {response[2]} with {response[0]}")
                    csvwriter.writerow([response[2]] + [response[0]])
    print("Data extraction and CSV writing complete.")

if __name__ == "__main__":
    main()

170
naukri/jobdata_india.py Normal file
View File

@ -0,0 +1,170 @@
import requests
import csv
import time
import json
import os
# Global variables
input_file = "data_naukri/search_result_india.csv"  # search output; col 1 = job id, col 7 = crawl mode
output_file = "data_naukri/jobdata_india.csv"       # transformed job-detail rows are appended here
error_file = "data_naukri/jobdata_error_india.csv"  # reserved for failed ids
stats_file = "data_naukri/stats.txt"                # run statistics log, shared with expiry.py
skip=0  # number of (deduplicated) ids to skip at the start — resume aid after a crash
class NaukriJobDetailScraper:
    """Fetches full job-detail records for ids found by the search scraper and
    writes one flattened CSV row per job."""

    # Job-detail API endpoint; {} is the numeric job id.
    base_url = "https://www.naukri.com/jobapi/v4/job/{}"
    # Captured desktop-browser headers, including a recorded session cookie.
    # NOTE(review): the cookie is a snapshot and will eventually go stale —
    # refresh it if the API starts rejecting requests.
    headers = {
        'authority': 'www.naukri.com',
        'accept': 'application/json',
        'accept-language': 'en-US,en;q=0.9',
        'appid': '121',
        'cache-control': 'no-cache, no-store, must-revalidate',
        'content-type': 'application/json',
        'expires': '0',
        'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE',
        'pragma': 'no-cache',
        'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': 'Naukri',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43',
        'x-requested-with': 'XMLHttpRequest',
        # Single long cookie value, split into adjacent literals for readability.
        'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; '
                  'ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; '
                  'HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0'
    }

    def __init__(self, input_file, output_file, error_file):
        """Store file paths and build the optional proxy configuration."""
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file  # NOTE(review): accepted but never written to in this class
        self.timeout = 30             # per-request timeout, seconds
        self.count = 1                # running count of successfully processed jobs
        # Route through an HTTP(S) proxy when PROXY_SERVER (host:port) is set.
        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def transform_data(self, job_id, url, json_response):
        """Flatten one job-detail API payload into the output row dict.

        job_id: naukri job id (the CSV "Job Key" is re-derived from `url`).
        url: API URL the payload was fetched from.
        json_response: decoded JSON body of a 200 response.
        """
        job_details = json_response.get("jobDetails", {}) or {}
        # Robustness fix: locations / keySkills / employmentType /
        # companyDetail used to be accessed unconditionally and crashed with
        # KeyError/AttributeError on payloads missing them.
        locations = job_details.get("locations") or []
        location_str = ', '.join(item['label'] for item in locations)
        other_skills = (job_details.get("keySkills") or {}).get("other") or []
        skills_str = ", ".join(skill["label"] for skill in other_skills if skill["label"])
        employment_type = job_details.get("employmentType")
        company_detail = job_details.get("companyDetail")
        json_data = {
            "Url": url,
            "Job Key": str(url.split('/')[-1]),
            "Source Link": job_details.get("applyRedirectUrl"),
            "Job Description": job_details.get("description"),
            "Role Category": job_details.get("roleCategory"),
            "Job Industry": job_details.get("industry"),
            "Job Title": job_details.get("title"),
            "Formatted Location Full": location_str,
            "Job Functions": job_details.get("functionalArea"),
            "Company": company_detail.get("name") if company_detail else None,
            "Job Type": employment_type.split(',')[0].strip() if employment_type else None,
            ##Only available in naukri
            "Key Skills": skills_str,
            "Minimum Experience": job_details.get("minimumExperience"),
            "Maximum Experience": job_details.get("maximumExperience"),
            "Salary Detail": job_details.get("salaryDetail"),
        }
        return json_data

    def scrape(self):
        """Read search results, dedupe the 'crawled' job ids, fetch each job's
        detail payload, and append transformed rows to the output CSV."""
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode != "crawled":
                    print("removed non crawled job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)
        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))  # dedupe (order is not preserved)
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        # adjust skip (resume aid — module-level constant)
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        time.sleep(10)
        header_written = False
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                time.sleep(0.5)  # throttle so we don't hammer the API
                response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
                print(f"{response.status_code} for {url}")
                if response.status_code == 200:
                    json_response = response.json()
                    transformed_data = self.transform_data(job_id, url, json_response)
                    # Write the header row once per run, derived from the dict keys.
                    if not header_written:
                        writer.writerow(transformed_data.keys())
                        header_written = True
                    writer.writerow(transformed_data.values())
                    print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                    all_job_ids.pop(0)  # Remove the processed job ID
                    self.count += 1
                elif response.status_code == 303:
                    json_response = response.json()
                    if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                        # Bug fix: this used to print the stale `jobid` left over
                        # from the reading loop instead of the current `job_id`.
                        print(f"Expired job ID {job_id} with response 303")
                    # Bug fix: the id used to be popped only when the expired
                    # flag was set, so a non-expired 303 looped forever.
                    all_job_ids.pop(0)
                elif response.status_code == 404:
                    all_job_ids.pop(0)  # Remove the processed job ID
                    # Bug fix: same stale-variable issue as the 303 branch.
                    print(f"Expired job ID {job_id} with response 404")
                else:
                    # Transient failure: keep the id queued and back off.
                    print(f"Error for job ID {job_id}")
                    time.sleep(10)
def main():
    """Run the job-detail scraper end-to-end and log how long it took."""
    started = time.time()
    NaukriJobDetailScraper(input_file, output_file, error_file).scrape()
    duration_hours = (time.time() - started) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    # Append the timing to the shared stats log.
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")

if __name__ == "__main__":
    main()

95
naukri/search_gulf.py Normal file
View File

@ -0,0 +1,95 @@
import requests
import json
import time
import re
import csv
import math
# Desktop-browser headers the naukrigulf search API expects.
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}
# URLs that returned a non-200 response; printed at the end of main().
error_pages = []
# Keys copied from each search-result job entry.
keys_to_extract = ['designation', 'jobId', 'company','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
# CSV column order: the extracted keys plus the query city appended by parse_and_save().
fields_to_write = ['designation', 'jobId', 'company','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']
# One location per line; each becomes a search query.
input_file = "naukri/_gulf_location.csv"
jobs_per_pages = 50  # page size used for the Offset calculation
# Search API template; placeholders: location, offset, page number.
base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
def parse_and_save(json_data, csv_filename, city):
parsed_data = []
for job in json_data["jobs"]:
parsed_item = {field: job.get(field, None) for field in keys_to_extract}
parsed_item['city'] = city
parsed_data.append(parsed_item)
#parsed_data.extend(city)
with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames= fields_to_write)
csv_writer.writeheader()
csv_writer.writerows(parsed_data)
def main():
    """Scrape naukrigulf.com listings for every location in ``input_file``.

    For each location the search API is paged through 50 jobs at a time and
    every page is appended to one combined CSV. Failed page URLs are collected
    in the module-level ``error_pages`` list and printed at the end.
    """
    output_filename_csv = "output_all_gulf.csv"
    with open(input_file, 'r') as file:
        for city in csv.reader(file):
            location = city[0].replace("\n", "")
            # 1000 is a sentinel budget until the real page count is known.
            total_pages = 1000
            total_page_num = total_pages
            start_page = 1
            pages_known = False
            # Resume point for a previously interrupted "pharma" run.
            if location == "pharma":
                start_page = 173
                total_pages = 22
                total_page_num = 194
                pages_known = True
            while total_pages > 0:
                url = base_url.format(location, (jobs_per_pages * (start_page - 1)), start_page)
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    json_data = response.json()
                    # Bug fix: the page count was previously detected via
                    # `total_pages == 1000`, which breaks (NameError on
                    # total_page_num) when an error decrements the sentinel
                    # before the first successful page. Track it explicitly.
                    if not pages_known:
                        total_jobs = json_data["totalJobsCount"]
                        total_pages = math.ceil(total_jobs / jobs_per_pages)
                        total_page_num = total_pages
                        pages_known = True
                    parse_and_save(json_data, output_filename_csv, location)
                    print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                else:
                    print("Error : ", response.status_code, " at url ", url)
                    error_pages.append(url)
                total_pages = total_pages - 1
                start_page = start_page + 1
    # Bug fix: the old message named a non-existent output_new.json file.
    print(f"Data saved to {output_filename_csv}")
    print(error_pages)
# Entry-point guard: run the Gulf scraper only when executed as a script.
if __name__ == "__main__":
    main()

116
naukri/search_india.py Normal file
View File

@ -0,0 +1,116 @@
import requests
import json
import csv
import os
import time
import math
# Global variables
# CSV of industry search URLs; each row is (url, industry name, qi query id).
input_file = "naukri/_industry_urls.csv"
# Destination CSV for all scraped job rows.
output_file = "data_naukri/search_result_india.csv"
# Log of pages that returned a non-200 response.
error_file = "data_naukri/search_error_india.csv"
# A run-duration summary line is appended here after each run.
stats_file = "data_naukri/stats_india.txt"
class NaukriJobScraper:
    """Scrapes job postings from the naukri.com v3 search API.

    For every industry row in the input CSV, pages through the search results
    100 jobs at a time and appends the extracted fields to the output CSV.
    Pages that fail are logged to the error file and retried after a delay.
    """

    # Search endpoint template; placeholders are (keyword, pageNo, qi[] id).
    base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
    # Headers (including a captured session cookie) cloned from a real browser
    # session so the API accepts the requests. The cookie is account/session
    # specific and expires; it must be refreshed for new runs.
    headers = {
        "authority": "www.naukri.com",
        "accept": "application/json",
        "accept-language": "en-US,en;q=0.9",
        "appid": "109",
        "cache-control": "no-cache",
        "clientid": "d3skt0p",
        "content-type": "application/json",
        "cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; _ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
        "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
        "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
        "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "systemid": "109",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
        # NOTE(review): content-encoding is normally a response header;
        # presumably harmless on a request — confirm it is intentional.
        "content-encoding": "gzip",
    }
    # Fields copied from each job object in the API response.
    keys_to_extract = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
                       'mode', 'placeholders']

    def __init__(self, input_file_path, output_file_path, error_file_path):
        """Store the input/output/error CSV paths and configure networking.

        Honors the PROXY_SERVER environment variable when it is set.
        """
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.error_file_path = error_file_path
        # Per-request timeout in seconds.
        self.timeout = 120
        # Route both schemes through PROXY_SERVER when the env var is set.
        self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}

    def parse_and_save(self, json_data):
        """Append the fields of interest from one API response page to the
        output CSV. The header row is written once, by scrape()."""
        parsed_data = []
        for job in json_data["jobDetails"]:
            # Missing fields are recorded as None rather than raising.
            parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
            parsed_data.append(parsed_item)
        with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writerows(parsed_data)

    def scrape(self):
        """Iterate over every industry in the input CSV and scrape all of its
        result pages, appending rows to the output CSV."""
        # Truncate the output file and write just the header; pages are then
        # appended to it by parse_and_save().
        with open(self.output_file_path, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
            csv_writer.writeheader()
        with open(self.input_file_path, 'r') as file:
            file_read = csv.reader(file)
            for industry in file_read:
                # Input row layout: (search url, industry name, qi query id).
                industry_read_url = industry[0].replace("\n", "")  # currently unused
                industry_name=industry[1]
                industry_q=industry[2]
                # 1000 is a sentinel meaning "real page count not known yet".
                total_pages = 1000
                start_page = 1
                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
                while total_pages > 0:
                    url = self.base_url.format(industry_name, start_page, industry_q)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
                    print(f"{response.status_code} for {url}")
                    if response.status_code != 200:
                        # NOTE(review): start_page/total_pages are not advanced
                        # here, so a permanently failing page is retried forever
                        # — confirm whether that is intended.
                        print(f"Error with page {start_page} for industry {industry_name}")
                        # NOTE(review): this `file` shadows the outer input
                        # handle for the body of the with-block.
                        with open(self.error_file_path, "a") as file:
                            file.write(f"Error with page {start_page} for industry {industry_name}\n")
                        time.sleep(10)
                        continue
                    # if 200 response
                    data = response.json()
                    if(total_pages == 1000):
                        # First successful page: derive the page count from the
                        # reported total job count (100 jobs per page).
                        total_jobs = data["noOfJobs"]
                        total_pages = math.ceil(total_jobs/100)
                    self.parse_and_save(data)
                    # Assuming that you'll break the loop once all pages are scraped:
                    # (Add your logic to update 'total_pages' based on the response)
                    total_pages -= 1
                    start_page += 1
                    print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
                    # Gentle rate limit between successful page fetches.
                    time.sleep(1)
def main():
    """Time a complete run of the industry search scraper and log it.

    Prints the duration and appends the same summary to the stats file.
    """
    t0 = time.time()
    NaukriJobScraper(input_file, output_file, error_file).scrape()
    hours = (time.time() - t0) / 3600
    message = f"Search program took {hours:.2f} hours to run."
    print(message)
    with open(stats_file, "a") as stat:
        stat.write(message + " \n")
# Entry-point guard: run the India scraper only when executed as a script.
if __name__ == "__main__":
    main()

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
requests==2.25.1