kjh
parent
285491b01a
commit
53a0aee5a2
|
@ -42,7 +42,7 @@ class NaukriJobScraper:
|
||||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
|
||||||
"content-encoding": "gzip",
|
"content-encoding": "gzip",
|
||||||
}
|
}
|
||||||
stopcrawl = False
|
# stopcrawl = False
|
||||||
# headers = {
|
# headers = {
|
||||||
# "appid": "109",
|
# "appid": "109",
|
||||||
# "systemid": "109"
|
# "systemid": "109"
|
||||||
|
@ -66,12 +66,13 @@ class NaukriJobScraper:
|
||||||
# import json
|
# import json
|
||||||
# dr.write(json.dumps(parsed_data))
|
# dr.write(json.dumps(parsed_data))
|
||||||
# print(parsed_data)
|
# print(parsed_data)
|
||||||
days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
|
# ---------------------------------------
|
||||||
target = "3 Days Ago"
|
# days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
|
||||||
count = days_ago_list.count(target)
|
# target = "3 Days Ago"
|
||||||
percentage = (count / len(days_ago_list)) * 100
|
# count = days_ago_list.count(target)
|
||||||
if percentage > 60:
|
# percentage = (count / len(days_ago_list)) * 100
|
||||||
self.stopcrawl = True
|
# if percentage > 60:
|
||||||
|
# self.stopcrawl = True
|
||||||
|
|
||||||
|
|
||||||
with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
|
with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
|
||||||
|
@ -90,13 +91,13 @@ class NaukriJobScraper:
|
||||||
industry_name=industry[1]
|
industry_name=industry[1]
|
||||||
industry_q=industry[2]
|
industry_q=industry[2]
|
||||||
total_pages = 1000
|
total_pages = 1000
|
||||||
self.stopcrawl = False
|
# self.stopcrawl = False
|
||||||
start_page = 1
|
start_page = 1
|
||||||
|
|
||||||
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
|
||||||
while total_pages > 0:
|
while total_pages > 0:
|
||||||
if self.stopcrawl:
|
# if self.stopcrawl:
|
||||||
total_pages = 0
|
# total_pages = 0
|
||||||
url = self.base_url.format(industry_name, start_page, industry_q)
|
url = self.base_url.format(industry_name, start_page, industry_q)
|
||||||
try:
|
try:
|
||||||
# print(url)
|
# print(url)
|
||||||
|
|
Loading…
Reference in New Issue