prahul11 2023-10-17 15:52:37 +05:30
parent 285491b01a
commit 53a0aee5a2
1 changed file with 11 additions and 10 deletions

View File

@@ -42,7 +42,7 @@ class NaukriJobScraper:
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
"content-encoding": "gzip", "content-encoding": "gzip",
} }
stopcrawl = False # stopcrawl = False
# headers = { # headers = {
# "appid": "109", # "appid": "109",
# "systemid": "109" # "systemid": "109"
@@ -66,12 +66,13 @@ class NaukriJobScraper:
# import json # import json
# dr.write(json.dumps(parsed_data)) # dr.write(json.dumps(parsed_data))
# print(parsed_data) # print(parsed_data)
days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data] # ---------------------------------------
target = "3 Days Ago" # days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
count = days_ago_list.count(target) # target = "3 Days Ago"
percentage = (count / len(days_ago_list)) * 100 # count = days_ago_list.count(target)
if percentage > 60: # percentage = (count / len(days_ago_list)) * 100
self.stopcrawl = True # if percentage > 60:
# self.stopcrawl = True
with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile: with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
@@ -90,13 +91,13 @@ class NaukriJobScraper:
industry_name=industry[1] industry_name=industry[1]
industry_q=industry[2] industry_q=industry[2]
total_pages = 1000 total_pages = 1000
self.stopcrawl = False # self.stopcrawl = False
start_page = 1 start_page = 1
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}") print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
while total_pages > 0: while total_pages > 0:
if self.stopcrawl: # if self.stopcrawl:
total_pages = 0 # total_pages = 0
url = self.base_url.format(industry_name, start_page, industry_q) url = self.base_url.format(industry_name, start_page, industry_q)
try: try:
# print(url) # print(url)