prahul11 2023-10-17 13:15:22 +05:30
parent 5712d4cf8b
commit ec654ea0be
1 changed file with 15 additions and 1 deletion

@@ -42,7 +42,7 @@ class NaukriJobScraper:
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
"content-encoding": "gzip",
}
stopcrawl = False  # class-level default for the early-stop flag
# headers = {
# "appid": "109",
# "systemid": "109"
@@ -62,6 +62,17 @@ class NaukriJobScraper:
for job in json_data["jobDetails"]:
parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
parsed_data.append(parsed_item)
# with open('r,txt', 'w+', encoding='utf-8', newline='') as dr:
# import json
# dr.write(json.dumps(parsed_data))
# print(parsed_data)
# Stop the crawl once more than 60% of jobs on this page carry the "7 Days Ago" label.
days_ago_list = [x['footerPlaceholderLabel'] for x in parsed_data]
target = "7 Days Ago"
count = days_ago_list.count(target)
# Guard against an empty result page to avoid a ZeroDivisionError.
percentage = (count / len(days_ago_list)) * 100 if days_ago_list else 0
if percentage > 60:
    self.stopcrawl = True
with open(self.output_file_path, "a", newline="", encoding="utf-8") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=self.keys_to_extract)
@@ -79,10 +90,13 @@ class NaukriJobScraper:
industry_name = industry[1]
industry_q = industry[2]
total_pages = 1000
self.stopcrawl = False  # reset the early-stop flag before crawling each industry
start_page = 1
print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
while total_pages > 0:
if self.stopcrawl:
    break  # exit pagination immediately rather than fetching one more page
url = self.base_url.format(industry_name, start_page, industry_q)
try:
# print(url)
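
Note: the change above amounts to a simple early-stop heuristic: once most jobs on a page are a week old, the flag is set and the pagination loop ends. Below is a minimal, self-contained sketch of that logic; `should_stop_crawl`, `threshold`, and the sample data are illustrative names for this sketch only, not identifiers from the repository.

def should_stop_crawl(parsed_data, target="7 Days Ago", threshold=60.0):
    """Return True when more than `threshold` percent of the parsed jobs
    carry the `target` footer label, i.e. the page is mostly week-old posts."""
    labels = [job.get("footerPlaceholderLabel") for job in parsed_data]
    if not labels:  # empty page: nothing to measure, keep crawling
        return False
    percentage = labels.count(target) / len(labels) * 100
    return percentage > threshold

# Example: 3 of 4 jobs (75%) are a week old, so the crawl would stop.
jobs = [{"footerPlaceholderLabel": "7 Days Ago"}] * 3 \
     + [{"footerPlaceholderLabel": "1 Day Ago"}]
assert should_stop_crawl(jobs)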