added scrapy code
parent adfb3a70a1
commit 3b8b51ac06

@ -1,3 +1,5 @@
 .vscode
 data_naukri
 scrib
+data_naukri/
+gulf_data/
@ -1,7 +1,7 @@
 import requests
 import csv
-import concurrent.futures
+from time import sleep, time
+"""
 # List of URLs to query
 base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

@ -28,105 +28,168 @@ headers = {
     'userdata': '|IN'
 }

-keys_to_extract = ['designation','description','company','compensation','industryType','functionalArea','jobSource','location','other','desiredCandidate','contact','isExpired','locationInterlinking']
-company_keys = ['name','details']
-salary_key = ['minimumSalary','maximumSalary','currency','label','hideSalary']
-rfile = "output_all_gulf.csv"
-loc_list = []
-skill_other =[]
-skill_pref = []
-
-def fetch_url(url):
-    try:
-        url = base_url.format(url)
-        response = requests.get(url, headers=headers)
-        return response.json(), response.status_code, url
-    except requests.exceptions.RequestException as e:
-        return "", str(e), url
-
-def batch_process(urls):
-    results = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
-        for future in concurrent.futures.as_completed(future_to_url):
-            url = future_to_url[future]
-            try:
-                result = future.result()
-                results.append(result)
-            except Exception as e:
-                results.append((url, str(e)))
-    return results
+with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
+    outfile_writer = csv.writer(outfile)
+    with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
+        j_read = list(csv.DictReader(jobis))
+        for item in j_read:
+            print(base_url.format(item.get('jobId')))
+            jd_url = base_url.format(item.get('jobId'))
+            sleep(0.5)
+            response = requests.get(base_url.format(item.get('jobId')), headers=headers)
+            if response.status_code == 200:
+                job_data = {
+                    "Url" : jd_url,
+                    "Job Key" : item.get('jobId'),
+                    "Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
+                    "Job Description" : response.json().get('description',''),
+                    "Role Category" :"",
+                    "Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
+                    "Job Title" : response.json().get('designation'),
+                    "Formatted Location Full" : response.json().get('location'),
+                    "Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
+                    "Company" : response.json().get('company', {'name':''}).get('name'),
+                    "Job Type" : response.json().get('employmentType'),
+                    "Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
+                    "Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
+                    "Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
+                    "Salary Detail" : response.json().get('compensation')
+                }
+                if outfile.tell() == 0:
+                    header = job_data.keys()
+                    outfile_writer.writerow(header)
+                outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
+"""
+
+# Global variables
+input_file = "gulf_data/output_all_gulf.csv"
+output_file = "gulf_data/jobdata_gulf.csv"
+error_file = "gulf_data/jobdata_error_gulf.csv"
+stats_file = "gulf_data/stats_gulf.txt"
+skip=0
+
+class NaukriGulfJobDetailScraper:
+    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
+    headers = {
+        'authority': 'www.naukrigulf.com',
+        'accept': 'application/json',
+        'accept-format': 'strict',
+        'accept-language': 'ENGLISH',
+        'appid': '205',
+        'cache-control': 'no-cache',
+        'client-type': 'desktop',
+        'clientid': 'desktop',
+        'device-type': 'desktop',
+        'puppeteer': 'false',
+        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
+        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': 'Windows',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-origin',
+        'systemid': '2323',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
+        'userdata': '|IN'
+    }
+
+    def __init__(self, input_file, output_file, error_file):
+        self.input_file = input_file
+        self.output_file = output_file
+        self.error_file = error_file
+        self.timeout = 30
+        self.count = 1
+        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
+
+    def transform_data(self, job_id, jd_url, json_response):
+        json_data = {
+            "Url" : jd_url,
+            "Job Key" : job_id,
+            "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
+                           json_response.get('contact', {'website': ''}).get('website',''),
+            "Job Description" : json_response.get('description',''),
+            "Role Category" :"",
+            "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
+            "Job Title" : json_response.get('designation'),
+            "Formatted Location Full" : json_response.get('location'),
+            "Job Functions" : ', '.join([x['title'] for x in json_response['fAreaInterlinking']]),
+            "Company" : json_response.get('company', {'name':''}).get('name'),
+            "Job Type" : json_response.get('employmentType'),
+            "Key Skills" : ', '.join([y['title'] for y in json_response['keywordInterlinking']]),
+            "Minimum Experience" : json_response.get('desiredCandidate').get('experience').get('min'),
+            "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
+            "Salary Detail" : json_response.get('compensation')
+        }
+        return json_data
+
+    def scrape(self):
+        with open(self.input_file, 'r', encoding='utf-8') as infile:
+            reader = csv.reader(infile)
+            total_input_count=0
+            all_job_ids = []
+            for row in reader:
+                jobid = row[1].strip()
+                mode = row[7].strip()
+                total_input_count+=1
+                if mode == "POSTED":
+                    print("removed non tagged job with jobid %s" % jobid)
+                    continue
+
+                all_job_ids.append(jobid)
+
+        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
+        all_job_ids = list(set(all_job_ids))
+        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
+        all_job_ids = all_job_ids[skip:]
+        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
+        with open(stats_file, "a") as stat:
+            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
+        sleep(1)
+        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
+            writer = csv.writer(outfile)
+            while all_job_ids:
+                job_id = all_job_ids[0]
+                url = self.base_url.format(job_id)
+                sleep(0.5)
+                try:
+                    sleep(1)
+                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
+                    print(f"{response.status_code} for {url}")
+                    if response.status_code == 200:
+                        json_response = response.json()
+                        transformed_data = self.transform_data(job_id, url, json_response)
+                        if outfile.tell() == 0 :
+                            header = transformed_data.keys()
+                            writer.writerow(header)
+                        writer.writerow(transformed_data.values())
+                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
+                        all_job_ids.pop(0) # Remove the processed job ID
+                        self.count += 1
+                    # / elif response.status_code == 303:
+                    #     json_response = response.json()
+                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
+                    #         print(f"Expired job ID {jobid} with response 303")
+                    #         all_job_ids.pop(0) # Remove the processed job ID
+                    elif response.status_code == 404:
+                        all_job_ids.pop(0) # Remove the processed job ID
+                        print(f"Expired job ID {jobid} with response 404")
+                    else:
+                        print(f"Error for job ID {job_id}")
+                except Exception as n1:
+                    print(str(n1))
+                    pass

 def main():
-    batch_size = 50
-    results = []
-    count = 1
-    # Open a CSV file for writing
-    with open('output_jobs_0309_me.csv', 'a', newline='', encoding='utf-8') as csvfile:
-        csvwriter = csv.writer(csvfile)
-        # Write header to the CSV file
-        csvwriter.writerow(['URL'] + list(keys_to_extract))
-        with open(rfile,'r') as file:
-            csv_reader = csv.reader(file)
-            urls = [row.replace("\n","") for row in file]
-
-        for i in range(0, len(urls), batch_size):
-            batch = urls[i:i+batch_size]
-            batch_results = batch_process(batch)
-            # Make the HTTP GET request
-            #row = row.replace("\n","")
-            #`url = base_url.format(row)`
-            #try:
-            for response in batch_results:
-                print(count)
-                count = count + 1
-                if response[1]== 200:
-                    json_data = response[0]
-                    job_details = json_data
-                    # Extract specific key values from the JSON response
-                    values_to_store = [job_details.get(key, '') for key in keys_to_extract]
-                    """if values_to_store[0]!="":
-                        [values_to_store.append(job_details["companyDetail"].get(key,'')) for key in company_keys]
-                        [values_to_store.append(job_details["salaryDetail"].get(key,'')) for key in salary_key]
-                        for loc in job_details["locations"]:
-                            loc_list.append(loc.get('label',''))
-                        values_to_store.append(loc_list)
-                        for skill in job_details["keySkills"]["other"]:
-                            skill_other.append(skill.get('label',''))
-                        values_to_store.append(skill_other)
-                        for skill in job_details["keySkills"]["preferred"]:
-                            skill_pref.append(skill.get('label',''))
-                        values_to_store.append(skill_pref)
-                    else:
-                        values_to_store[1]=""
-                        values_to_store.append(job_details["companyDetail"])
-                        values_to_store.append(job_details["salaryDetail"])
-                        values_to_store.append(job_details["locations"])
-                        values_to_store.append(job_details["keySkills"])
-                    """
-                    # Write the extracted values to the CSV file
-                    csvwriter.writerow([response[2]] + values_to_store)
-                else:
-                    print(f"Failed to fetch data for job ID: {response[2]} with {response[0]}")
-                    csvwriter.writerow([response[2]] + [response[0]])
-            # except requests.exceptions.RequestException as e:
-            #     csvwriter.writerow([url] + [str(e)])
-    print("Data extraction and CSV writing complete.")
+    start_time = time()
+    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
+    scraper.scrape()
+    end_time = time()
+    duration_hours = (end_time - start_time) / 3600
+    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
+    with open(stats_file, "a") as stat:
+        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")

 if __name__ == "__main__":
     main()
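Note on resuming: skip is applied only after de-duplication (all_job_ids = all_job_ids[skip:]), so a crashed run can be picked up again by setting it to the number of IDs already written, e.g. (illustrative value):

    skip = 1500  # assumed count of job IDs already processed in the previous run

Since list(set(...)) does not preserve order, the slice is only an approximate resume point.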
@ -0,0 +1,113 @@
import requests
import json
import time
import re
import csv
import math

output_filename_csv = "gulf_data/output_all_gulf.csv"
input("remove lien 72 10000 limit wala")
headers = {
    'authority': 'www.naukrigulf.com',
    'accept': 'application/json',
    'accept-format': 'strict',
    'accept-language': 'ENGLISH',
    'appid': '205',
    'cache-control': 'no-cache',
    'client-type': 'desktop',
    'clientid': 'desktop',
    'device-type': 'desktop',
    'puppeteer': 'false',
    'referer': 'https://www.naukrigulf.com/jobs-in-uae',
    'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'systemid': '2323',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
    'userdata': '|IN'
}

error_pages = []
keys_to_extract = ['designation', 'jobId', 'company','Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
fields_to_write = ['designation', 'jobId', 'Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']
input_file = "naukri/_gulf_location.csv"
jobs_per_pages = 50
base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"

def parse_and_save(json_data, csv_filename, city):
    parsed_data = []
    for job in json_data["jobs"]:
        parsed_item = {field: job.get(field, None) for field in keys_to_extract}
        parsed_item['city'] = city
        print("parsed_item ---", parsed_item)
        print(parsed_item.get('company', {'name':''}).get('name'))
        print(parsed_item.get('company', {'id':''}).get('id'))
        print(parsed_item.get('company', {'url':''}).get('url'))
        for key, value in parsed_item.get('company', {'name':'', 'id':'', 'url':''}).items():
            parsed_item["Company" + key] = value
        try:
            parsed_item.pop('company')
        except:
            pass
        # print("updated parsed_item--", parsed_item)
        parsed_data.append(parsed_item)
        #parsed_data.extend(city)

    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        print("csv_filename---", csv_filename)
        csv_writer = csv.DictWriter(csvfile, fieldnames= fields_to_write)
        if csvfile.tell() == 0:
            csv_writer.writeheader()
        csv_writer.writerows(parsed_data)

def main():
    #for page_number in range(1, 4700): # Adjust the range as needed
    with open(input_file, 'r') as file:
        file_read = csv.reader(file)
        file_read = list(file_read)
        for city in file_read:
            city_read_url = city[0].replace("\n","")
            output_data=[]
            total_pages = 1000
            output_filename_json = f"{city[0]}.json"
            output_filename_csv = "gulf_data/output_all_gulf.csv"
            start_page = 1

            # if(city[0] == "pharma"):
            #     start_page = 173
            #     total_pages = 22
            #     total_page_num = 194

            while total_pages>0:
                url = base_url.format(city[0],(jobs_per_pages*(start_page-1)),start_page)
                print("url", url)
                # input()
                response = requests.get(url, headers=headers)

                if response.status_code == 200:
                    json_data = response.json()

                    if(total_pages == 1000):
                        total_jobs = json_data["totalJobsCount"]
                        total_pages = math.ceil(total_jobs/jobs_per_pages)
                        total_page_num = total_pages

                    parse_and_save(json_data, output_filename_csv, city[0])
                    print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                    total_pages = total_pages-1
                    start_page = start_page+1

                else:
                    print("Error : ",response.status_code," at url ",url)
                    error_pages.append(url)
                    total_pages = total_pages-1
                    start_page = start_page+1

    print("Data saved to output_new.json")
    print(error_pages)

if __name__ == "__main__":
    main()
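For reference, the paging arithmetic used above, with assumed figures (city stands for the location string read from the input CSV): a first response reporting totalJobsCount = 4321 gives

    total_pages = math.ceil(4321 / jobs_per_pages)             # 87 pages at 50 jobs per page
    url = base_url.format(city, jobs_per_pages * (3 - 1), 3)   # page 3 -> Offset=100, pageNo=3

so each later request simply advances start_page by one and recomputes the Offset.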
@ -72,7 +72,7 @@ class NaukriJobScraper:
         print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
         while total_pages > 0:
             url = self.base_url.format(industry_name, start_page, industry_q)
+            print(url)
             response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
             print(f"{response.status_code} for {url}")

@ -102,7 +102,6 @@ class NaukriJobScraper:
 def main():
-
     start_time = time.time()
     scraper = NaukriJobScraper(input_file, output_file, error_file)
     scraper.scrape()
     end_time = time.time()
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriGulfDetailItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriGulfDetailSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriGulfDetailDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@ -0,0 +1,23 @@
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
output_file = f'naukri_gulf_detail_{formatted_date}.csv'


class NaukriGulfDetailPipeline:
    def open_spider(self, spider):
        self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')

    def process_item(self, item, spider):
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=item.keys())
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
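The pipeline decides whether to emit the header by checking csvfile.tell() == 0, so appending ('a') to an already-populated CSV on a later run skips the header row. The same pattern in isolation (file name and fields are illustrative):

    import csv
    with open("example.csv", "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["jobId", "designation"])
        if f.tell() == 0:  # true only for a new or empty file
            writer.writeheader()
        writer.writerow({"jobId": "123", "designation": "Engineer"})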
@ -0,0 +1,93 @@
# Scrapy settings for naukri_gulf_detail project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_gulf_detail"

SPIDER_MODULES = ["naukri_gulf_detail.spiders"]
NEWSPIDER_MODULE = "naukri_gulf_detail.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_gulf_detail (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_gulf_detail.middlewares.NaukriGulfDetailSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_gulf_detail.middlewares.NaukriGulfDetailDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_gulf_detail.pipelines.NaukriGulfDetailPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@ -0,0 +1,101 @@
import scrapy
import csv
import logging
from datetime import datetime, timedelta
import pandas as pd

class NaukriGulfDetailSpiderSpider(scrapy.Spider):
    name = "naukri_gulf_detail_spider"
    custom_settings = {
        'DOWNLOAD_DELAY' : 0.5,
        'CONCURRENT_REQUESTS' : 5,
        'ITEM_PIPELINES': {
            'naukri_gulf_detail.pipelines.NaukriGulfDetailPipeline': 300,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_gulf_detail_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    current_date = datetime.now()
    formatted_date = current_date.strftime('%d-%m-%Y')
    yesterday = current_date - timedelta(days=1)
    yesterday_str = yesterday.strftime('%d-%m-%Y')
    yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
    today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
    today_search_df = pd.read_csv(today_search_file)
    yesterday_search_df = pd.read_csv(yesterday_search_file)
    newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
    oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
    newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
    oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
    newresult_df = newresult_df.reset_index(drop=True)
    newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
    oldresult_df = oldresult_df.reset_index(drop=True)
    oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
    input_file = 'gulf_data/new_jobs_gulf.csv'
    print(newresult_df.shape, oldresult_df.shape)

    def start_requests(self):
        headers = {
            'authority': 'www.naukrigulf.com',
            'accept': 'application/json',
            'accept-format': 'strict',
            'accept-language': 'ENGLISH',
            'appid': '205',
            'cache-control': 'no-cache',
            'client-type': 'desktop',
            'clientid': 'desktop',
            'device-type': 'desktop',
            'puppeteer': 'false',
            'referer': 'https://www.naukrigulf.com/jobs-in-uae',
            'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': 'Windows',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': '2323',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
            'userdata': '|IN'
        }
        base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            for row in reader:
                jobid = row.get('jobId').strip()
                mode = row['jobSource'].strip()
                if mode != "POSTED":
                    print(jobid)
                    yield scrapy.Request(base_url.format(jobid), headers=headers,callback=self.parse, meta={
                        'jobid' : jobid
                    })

    def parse(self, response):
        try:
            job_id = response.meta.get('jobid')
            json_response = response.json()
            jd_url = response.url
            json_data = {
                "Url" : jd_url,
                "Job Key" : job_id,
                "Source Link": json_response.get('other', {'tag': ''}).get('tag','') + \
                               json_response.get('contact', {'website': ''}).get('website',''),
                "Job Description" : json_response.get('description',''),
                "Role Category" :"",
                "Job Industry" : ', '.join([t['title'] for t in json_response['industryInterlinking']]),
                "Job Title" : json_response.get('designation'),
                "Formatted Location Full" : json_response.get('location'),
                "Job Functions" : ', '.join([x['title'] for x in json_response['fAreaInterlinking']]),
                "Company" : json_response.get('company', {'name':''}).get('name'),
                "Job Type" : json_response.get('employmentType'),
                "Key Skills" : ', '.join([y['title'] for y in json_response['keywordInterlinking']]),
                "Minimum Experience" : json_response.get('desiredCandidate').get('experience').get('min'),
                "Maximum Experience" : json_response.get('desiredCandidate').get('experience').get('max'),
                "Salary Detail" : json_response.get('compensation'),
                "Country" : json_response.get('compensation',{'country':''}).get('country')
            }
            yield json_data
        except Exception as naukriError:
            self.logger.error(f'An error occured : {str(naukriError)}')
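Assuming the standard layout produced by scrapy startproject, this spider is launched from the project root with scrapy crawl naukri_gulf_detail_spider. Note that the pandas comparison of today's and yesterday's search CSVs sits in the class body, so both files must already exist when the spider module is loaded.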
@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_gulf_detail.settings

[deploy]
#url = http://localhost:6800/
project = naukri_gulf_detail
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriGulfSearchItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriGulfSearchSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriGulfSearchDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@ -0,0 +1,24 @@
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
file_to_write = f'naukri_gulf_search_{formatted_date}.csv'
fields_to_write = ['designation', 'jobId', 'Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies','city']


class NaukriGulfSearchPipeline:
    # print("pipelien ere")
    def open_spider(self, spider):
        self.csvfile = open(file_to_write, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=fields_to_write)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
@ -0,0 +1,93 @@
# Scrapy settings for naukri_gulf_search project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_gulf_search"

SPIDER_MODULES = ["naukri_gulf_search.spiders"]
NEWSPIDER_MODULE = "naukri_gulf_search.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_gulf_search (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_gulf_search.middlewares.NaukriGulfSearchSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_gulf_search.middlewares.NaukriGulfSearchDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_gulf_search.pipelines.NaukriGulfSearchPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@ -0,0 +1,115 @@
import scrapy
import csv
import logging
import json
import math

input_file_path = "static_data/_gulf_location.csv"
# output_filename_csv = "output_all_gulf.csv"

class NaukriGulfSearchSpiderSpider(scrapy.Spider):
    name = "naukri_gulf_search_spider"
    custom_settings = {
        'DOWNLOAD_DELAY' : 1,
        'CONCURRENT_REQUESTS' : 5,
        'ITEM_PIPELINES': {
            'naukri_gulf_search.pipelines.NaukriGulfSearchPipeline': 301,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_gulf_search_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    def __init__(self, *args, **kwargs):
        super(NaukriGulfSearchSpiderSpider, self).__init__(*args, **kwargs)
        self.csv_file = input_file_path
        self.keys_to_extract = ['designation', 'jobId', 'company','Companyname', 'Companyid', 'Companyurl','latestPostedDate','isEasyApply','jobSource','location','jdURL','vacancies']
        self.headers = {
            'authority': 'www.naukrigulf.com',
            'accept': 'application/json',
            'accept-format': 'strict',
            'accept-language': 'ENGLISH',
            'appid': '205',
            'cache-control': 'no-cache',
            'client-type': 'desktop',
            'clientid': 'desktop',
            'device-type': 'desktop',
            'puppeteer': 'false',
            'referer': 'https://www.naukrigulf.com/jobs-in-uae',
            'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': 'Windows',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': '2323',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
            'userdata': '|IN'
        }

    def start_requests(self):
        base_url = "https://www.naukrigulf.com/spapi/jobapi/search?Experience=&Keywords=&KeywordsAr=&Limit=50&Location={}&LocationAr=&Offset={}&SortPreference=&breadcrumb=1&locationId=&nationality=&nationalityLabel=&pageNo={}&srchId='"
        with open(input_file_path, 'r') as file:
            file_read = csv.reader(file)
            file_read = list(file_read)
            for city in file_read:
                total_pages = 1000
                start_page = 1
                jobs_per_pages = 50
                url = base_url.format(city[0],(jobs_per_pages*(start_page-1)),start_page)
                custom_args = {
                    'url' : url,
                    'total_pages':total_pages,
                    'start_page':start_page,
                    'base_url' : base_url,
                    'jobs_per_pages' : jobs_per_pages,
                    'city_name' : city[0]
                }
                yield scrapy.Request(url, headers=self.headers,callback=self.parse , meta=custom_args)

    def parse(self, response):
        # status_code = response.status
        total_pages = response.meta.get('total_pages')
        start_page = response.meta.get('start_page')
        base_url = response.meta.get('base_url')
        url = response.meta.get('url')
        city_name = response.meta.get('city_name')
        jobs_per_pages = response.meta.get('jobs_per_pages')
        while total_pages>0:
            if response.status == 200:
                json_data = response.json()
                if(total_pages == 1000):
                    total_jobs = json_data["totalJobsCount"]
                    total_pages = math.ceil(total_jobs/jobs_per_pages)
                    total_page_num = total_pages
                for job in json_data["jobs"]:
                    parsed_item = {field: job.get(field, None) for field in self.keys_to_extract}
                    parsed_item['city'] = city_name
                    # print("parsed_item ---", parsed_item)
                    # print(parsed_item.get('company', {'name':''}).get('name'))
                    # print(parsed_item.get('company', {'id':''}).get('id'))
                    # print(parsed_item.get('company', {'url':''}).get('url'))
                    for key, value in parsed_item.get('company', {'name':'', 'id':'', 'url':''}).items():
                        parsed_item["Company" + key] = value
                    try:
                        parsed_item.pop('company')
                    except:
                        pass
                    yield parsed_item
                print(f"Processed{url} : {start_page}/{total_page_num}/{total_pages}")
                total_pages = total_pages-1
                start_page = start_page+1
            else:
                print("Error : ",response.status," at url ")
                total_pages = total_pages-1
                start_page = start_page+1
            custom_args = {
                'url' : url,
                'total_pages':total_pages,
                'start_page':start_page,
                'base_url' : base_url,
                'jobs_per_pages' : jobs_per_pages,
                'city_name' : city_name
            }
            yield scrapy.Request(base_url.format(city_name,(jobs_per_pages*(start_page-1)),start_page), headers= self.headers, callback=self.parse, meta=custom_args)
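The search spider would likewise be started with scrapy crawl naukri_gulf_search_spider. Each parse call re-queues later pages through meta; for example, with an assumed city value of "dubai", the follow-up request for page 2 is built as:

    next_url = base_url.format("dubai", jobs_per_pages * (2 - 1), 2)  # Offset=50, pageNo=2
    yield scrapy.Request(next_url, headers=self.headers, callback=self.parse, meta=custom_args)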
@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_gulf_search.settings

[deploy]
#url = http://localhost:6800/
project = naukri_gulf_search
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriIndiaDetailItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriIndiaDetailSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriIndiaDetailDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@@ -0,0 +1,35 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv
from datetime import datetime


current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
output_file = f'india_data/naukri_india_detail_{formatted_date}.csv'
json_data = [
    "Url", "Job Key", "Source Link", "Job Description", "Role Category", "Job Industry", "Job Title",
    "Formatted Location Full", "Job Functions", "Company", "Job Type", "Key Skills",
    "Minimum Experience", "Maximum Experience", "Salary Detail", "Country"]


class NaukriIndiaDetailPipeline:
    def open_spider(self, spider):
        self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        print("written to csv")
        return item

    def close_spider(self, spider):
        self.csvfile.close()
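Note on the pipeline above: process_item hands each yielded dict straight to csv.DictWriter, which by default raises a ValueError if an item carries a key that is not listed in json_data. If the spider's item keys ever drift from that list, a defensive option (an optional variant, not part of this commit) is to construct the writer with extrasaction='ignore' so unknown keys are dropped instead of aborting the crawl:

    # hypothetical hardening of open_spider(); unknown item keys are silently dropped
    self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data, extrasaction='ignore')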
@@ -0,0 +1,93 @@
# Scrapy settings for naukri_india_detail project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_india_detail"

SPIDER_MODULES = ["naukri_india_detail.spiders"]
NEWSPIDER_MODULE = "naukri_india_detail.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_india_detail (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_india_detail.middlewares.NaukriIndiaDetailSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_india_detail.middlewares.NaukriIndiaDetailDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_india_detail.pipelines.NaukriIndiaDetailPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,112 @@
import scrapy
import csv
import logging
from datetime import datetime, timedelta
import pandas as pd


class NaukriIndiaDetailSpiderSpider(scrapy.Spider):
    name = "naukri_india_detail_spider"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {
            'naukri_india_detail.pipelines.NaukriIndiaDetailPipeline': 300,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_india_detail_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    def start_requests(self):
        headers = {
            'authority': 'www.naukri.com',
            'accept': 'application/json',
            'accept-language': 'en-US,en;q=0.9',
            'appid': '121',
            'cache-control': 'no-cache, no-store, must-revalidate',
            'content-type': 'application/json',
            'expires': '0',
            'gid': 'LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE',
            'pragma': 'no-cache',
            'referer': 'https://www.naukri.com/job-listings-ps-technical-consultant-ii-ncr-corporation-india-pvt-ltd-kolkata-mumbai-new-delhi-hyderabad-secunderabad-pune-chennai-bangalore-bengaluru-3-to-6-years-120823501070',
            'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'systemid': 'Naukri',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43',
            'x-requested-with': 'XMLHttpRequest',
'cookie': 'test=naukri.com; _t_ds=14c8c0f01691845374-19414c8c0f0-014c8c0f0; _gcl_au=1.1.1024691843.1691845381; _fbp=fb.1.1691845391563.1521284000; _t_r=1096%2F%2F; __utma=266160400.1059122291.1691845381.1691846963.1691846963.1; __utmc=266160400; __utmz=266160400.1691846963.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _gid=GA1.2.1097790226.1691946960; _cc_id=f102b3f9c375bbb80783e8e09a9c6a4d; panoramaId_expiry=1692033362592; panoramaId=f20d6d50d02ca8dbc9f4835382c2a9fb927ad954299218db60a7d9d7bca09362; panoramaIdType=panoDevice; _abck=EAF8CD87ED06F6FE0D1BE341378082D0~0~YAAQBCozarVfw8GJAQAAvF128gqV/yjff8AT5qkTc7EiVmNlJJ00nD16VEFeJh15q2bYAK8KlnGcPr7zpsi8USVMgui9DaCwoq15n4cW+Z/uKvUfCuUQAwVIKj2qlRT9tghTOfvBgGovWxTjFhD8B8DypZg3xbCBcOfMrTxIG0kml1V3V0teNzxQbKwxZBH+f9SpG1nWjcSqi0MuZ2Lp9njCQTDEXdyNn5FK9QUyBNIgMiZXGCroYN6g9Dqg50awS8p7GDin9O0yaBFnLYXYSSPqsjYlZsOAeZG1YDXhVfCIXFl9Ai4oQwulHEVR4kTx7E/GAxrPUMWKT1MJXJk38d/hHm/khF9WXryyuzBNGqBrHEmzbSK2Apvjhz+Hl7a1CDiFvYOTgurygc0o2F8E4e+o1OudsW0KCA==~-1~-1~-1; bm_sz=ED70280600D61C24AE8779690E6872A4~YAAQBCozardfw8GJAQAAvF128hRM9F6AMuh7Z7SvE3TmgXzJwI6StEga9y2KuTxZ8hXLMtJ7yq1I6ToCvJ1qcBfvYBY/W/7P2A4I+QADScKYSbs6S/S3UE9bL/lKee3NvEuD50tGUHrs59SQGoYdJGMrwml9npvfv+PANc8RaeobLmyx70LjTBajrTQruhnuEqphAnEPph1L6yqffRmta8KALbfw/sFFkvZWRte4uRCRS6IwyvdgNdGzHrvU90Cefnm1sAuK5Hm+F+JUvMVZhEWa/vukCd3Pz7toStN7N4P31cQ=~4539188~3289157; bm_mi=5266EA699B10C54B520AC0C335945591~YAAQBCozaslfw8GJAQAAFV528hRn0Dp7Ng6SjmmpdWbuBqjjlpOIm6e4no+DFPGfNvfuTNj9/tOe0zSzEbnFtWymp3K8PdRZcbO4azXh/4xphXqBeZXTZhE/H/7X6du3KAg3VyrF08jM/O2Hf8/7qtOXVUdBSpd8+mzH3IbW1d10UuiswDenQ6HiNRSkJISdZ8F6lXgGw2kpN3tAHIa9RixcTehrimRMipUgj4pRG/80a+tzAQQcAWUVOFaNoOHZ/C/oL2It920HJrOdtE85yrXx/LMaJlUb1RlHCG2KE/xkNMWpMI/FCimZYyI/DC8yQziKzxoqnP+GPA+JN5dMV76U4jXzYLqPOT5NwoKG7w==~1; ak_bmsc=0F69C083388867249F15237E773039FA~000000000000000000000000000000~YAAQBCozailgw8GJAQAAiGF28hTwkEIwbiDNaA96h/t+HbVduxzp6s1VtAmlm8JZxLg4LfiUPyA15rawjfgm3WgrQVB6GsFlaa+AvUvz1Pz3Q1P9td+LXZ5/+PFIAaTQN/O8SvcNd87eOmguE+T4BLbH5NDBcHEHBngYElDjkyqZkRtJ15EqweEPCpzn6yt+EYc/+sNuZI5/Wqj674CTqW8hmhvDToHdetlr8dh0zmRPwh1xdYnwb4uR6rGuaAIDwfopcqXdroQFVmDwMMXCkLNtTG3jToLxEDo7w/SHlJNK0LhicrXOQLyJu4k7udguvs4/Y+kXOEc04TkLKWa0gHsA+znQId6BT0CK4BFgGPYCMzpn379EH1ucz+mbjpX9p61CvxwEqFWV6O6hXXlbjHDGsuIiuIy3EP+38wb6B+uq2PBPgEmzZjLYjs9aNWGs0of7I0/V+ZL2xQDA2JD5FUXN1sgkl8r6w2sT5Fk1VuHGeorLkpIm0fkysZqAPM2yqJ5zaVkjyI4UENN56Aw79pKKVSkJtT5ALDmr1e+O8keIkg069ipenGburGc1Nw==; __gads=ID=da661383a92cc2b7:T=1691845731:RT=1691990009:S=ALNI_Ma5kdU-yCfi5vupriJnuuWUWmE_SQ; __gpi=UID=00000c2b451ccc2b:T=1691845731:RT=1691990009:S=ALNI_MZHpbDDCgSCaDcBTqfNHzHEDKk0JQ; jd=110823008324; _ga=GA1.2.1059122291.1691845381; cto_bundle=IfSELF9LbTF0TnAzamN1d2ZSSm5EMkdYekFhWDNJeElkOCUyQkElMkZ2RTRJNTFBNG95WENmVlBEV01wV3ZPSXB0dWpTZVFBZHZWQmt6WjVHTUpWNWEwQURTeWRaMWVGbyUyQjclMkZpSm5aNFZia0ZjcGklMkJFcSUyQlg2R3I3bUJkazJnaVN0cURyTUpGWUxQOHR6TFpBcDF6QU1MckFOdlg2cEElM0QlM0Q; _gat_UA-182658-1=1; bm_sv=33FDCB0BB2381FFCB1DA9B35AB25F10B~YAAQHSozaj2kUsGJAQAAFWF48hR1ZxWD9bmTihvsJwSN5urYMQoBOXsjILmBLpCp5Y8Wb2d+v8S1IsgfaFAjzZQJDWWGsM4VZOUHvjeEwqyhpkf95fegyYjUANSip9pcOY7JcbsJ3QemjclSynJdM2yjQovH+L9XiBHdKYFWDfacLicV2AGOtFikI1gVDGLSEqegx2bUuwmuQAlECM+lqj//OIwitlvDTMj9WCs40ybqG4D7o+JDWSXPBMYddaEqDw==~1; 
HOWTORT=ul=1691990122615&r=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1&hd=1691990122806&cl=1691990019014&nu=https%3A%2F%2Fwww.naukri.com%2Fjob-listings-sales-teamlease-services-limited-kolkata-west-bengal-pune-maharashtra-ahmedabad-chennai-tamil-nadu-rajkot-gujarat-jaipur-rajasthan-bangalore-bengaluru-karnataka-delhi-ncr-mumbai-all-areas-0-to-0-years-110823008324%3Fsrc%3Dgnbjobs_homepage_srch%26sid%3D16918479690248153%26xp%3D1%26px%3D1; _ga_K2YBNZVRLL=GS1.1.1691989990.4.1.1691990122.60.0.0'
        }

        current_date = datetime.now()
        formatted_date = current_date.strftime('%d-%m-%Y')
        yesterday = current_date - timedelta(days=1)
        yesterday_str = yesterday.strftime('%d-%m-%Y')
        yesterday_search_file = f'india_data/naukri_india_search_{yesterday_str}.csv'
        today_search_file = f'india_data/naukri_india_search_{formatted_date}.csv'
        today_search_df = pd.read_csv(today_search_file)
        yesterday_search_df = pd.read_csv(yesterday_search_file)
        # Jobs present only in today's dump are new; jobs present only in yesterday's dump have expired.
        newresult_df = pd.merge(today_search_df, yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
        oldresult_df = pd.merge(yesterday_search_df, today_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
        newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
        oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
        newresult_df = newresult_df.reset_index(drop=True)
        newresult_df.to_csv('india_data/new_jobs_india.csv', index=False)
        oldresult_df = oldresult_df.reset_index(drop=True)
        oldresult_df.to_csv('india_data/expired_jobs_india.csv', index=False)
        input_file = 'india_data/new_jobs_india.csv'
        print(newresult_df.shape, oldresult_df.shape)

        with open(input_file, 'r') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            for row in csv_reader:
                if row['mode'] == "crawled":
                    jobId = row['jobId']
                    url = "https://www.naukri.com/jobapi/v4/job/{}".format(jobId)
                    print(url)
                    yield scrapy.Request(url, headers=headers, callback=self.parse)
        # for url in self.start_urls:
        #     yield scrapy.Request(url, headers=headers, callback=self.parse)
        # url = "https://www.naukri.com/jobapi/v4/job/260923007828"
        # yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        try:
            url = response.url
            print(f'processing {url}')
            # print(response.text)
            # input("---------------")
            response = response.json()
            job_details = response.get("jobDetails", {})
            location_arr = [item['label'] for item in job_details["locations"]]
            location_str = ', '.join(location_arr)
            skills_arr = [skill["label"] for skill in job_details.get("keySkills")["other"] if skill["label"]]
            skills_str = ", ".join(skills_arr)
            json_data = {
                "Url": url,
                "Job Key": str(url.split('/')[-1]),
                "Source Link": job_details.get("applyRedirectUrl"),
                "Job Description": job_details.get("description"),
                "Role Category": job_details.get("roleCategory"),
                "Job Industry": job_details.get("industry"),
                "Job Title": job_details.get("title"),
                "Formatted Location Full": location_str,
                "Job Functions": job_details.get("functionalArea"),
                "Company": job_details.get("companyDetail", {}).get("name") if job_details.get("companyDetail") else None,
                "Job Type": job_details.get("employmentType").split(',')[0].strip(),

                ## Only available in naukri
                "Key Skills": skills_str,
                "Minimum Experience": job_details.get("minimumExperience"),
                "Maximum Experience": job_details.get("maximumExperience"),
                "Salary Detail": job_details.get("salaryDetail"),
                "Country": "India"
            }

            yield json_data
        except Exception as naukriError:
            self.logger.error(f'An error occurred: {str(naukriError)}')
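For context, start_requests above splits today's and yesterday's search dumps with a pandas left anti-join: rows only in today's file are treated as new jobs, rows only in yesterday's file as expired ones. A minimal sketch of that pattern on made-up jobIds (illustration only, not part of the commit):

    import pandas as pd

    today = pd.DataFrame({'jobId': [1, 2, 3]})
    yesterday = pd.DataFrame({'jobId': [2, 3, 4]})

    # present today, absent yesterday -> new (jobId 1)
    new = (pd.merge(today, yesterday, on='jobId', how='left', indicator=True)
             .query('_merge == "left_only"').drop(columns=['_merge']))
    # present yesterday, absent today -> expired (jobId 4)
    expired = (pd.merge(yesterday, today, on='jobId', how='left', indicator=True)
                 .query('_merge == "left_only"').drop(columns=['_merge']))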
@@ -0,0 +1,38 @@
2023-10-06 18:16:08 [scrapy.core.engine] ERROR: Scraper close failure
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 892, in _runCallbacks
    current.result = callback( # type: ignore[misc]
  File "C:\Rahul code\scrapy for naukri\server scraper\naukri_india_detail\naukri_india_detail\pipelines.py", line 35, in close_spider
    self.csvfile.close()
AttributeError: 'NaukriIndiaDetailPipeline' object has no attribute 'csvfile'
2023-10-06 18:16:08 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method CoreStats.spider_closed of <scrapy.extensions.corestats.CoreStats object at 0x000001E30744A7D0>>
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 348, in maybeDeferred_coro
    result = f(*args, **kw)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\extensions\corestats.py", line 30, in spider_closed
    elapsed_time = finish_time - self.start_time
TypeError: unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'
2023-10-06 18:16:08 [twisted] CRITICAL: Unhandled error in Deferred:
2023-10-06 18:16:08 [twisted] CRITICAL:
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_detail_06-10-2023.csv'
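The traceback above has a single root cause: open_spider tried to append to india_data/naukri_india_detail_06-10-2023.csv before the india_data directory existed, so self.csvfile was never assigned; close_spider then raised the AttributeError, and CoreStats failed because the spider never finished opening. A hedged fix for the pipeline (an assumed hardening, not what this commit ships; it reuses the module-level output_file and json_data defined above) is to create the directory up front and guard close_spider:

    import csv
    import os

    class NaukriIndiaDetailPipeline:
        def open_spider(self, spider):
            # create the output directory if it is missing instead of crashing
            os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
            self.csvfile = open(output_file, 'a', newline='', encoding='utf-8')
            self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
            if self.csvfile.tell() == 0:
                self.csv_writer.writeheader()

        def close_spider(self, spider):
            # only close the file if open_spider managed to create it
            if getattr(self, 'csvfile', None):
                self.csvfile.close()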
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_india_detail.settings

[deploy]
#url = http://localhost:6800/
project = naukri_india_detail
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaukriIndiaSearchItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class NaukriIndiaSearchSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class NaukriIndiaSearchDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
@@ -0,0 +1,32 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv
from datetime import datetime

current_date = datetime.now()
formatted_date = current_date.strftime('%d-%m-%Y')
file_to_write = f'india_data/naukri_india_search_{formatted_date}.csv'
json_data = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
             'mode', 'placeholders']


class NaukriIndiaSearchPipeline:
    def open_spider(self, spider):
        self.csvfile = open(file_to_write, 'a', newline='', encoding='utf-8')
        self.csv_writer = csv.DictWriter(self.csvfile, fieldnames=json_data)
        if self.csvfile.tell() == 0:
            self.csv_writer.writeheader()

    def process_item(self, item, spider):
        self.csv_writer.writerow(item)
        return item

    def close_spider(self, spider):
        self.csvfile.close()
@@ -0,0 +1,93 @@
# Scrapy settings for naukri_india_search project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "naukri_india_search"

SPIDER_MODULES = ["naukri_india_search.spiders"]
NEWSPIDER_MODULE = "naukri_india_search.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "naukri_india_search (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "naukri_india_search.middlewares.NaukriIndiaSearchSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "naukri_india_search.middlewares.NaukriIndiaSearchDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "naukri_india_search.pipelines.NaukriIndiaSearchPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,125 @@
import scrapy
import csv
import logging
import json
import math

input_file_path = "static_data/_industry_urls.csv"
headers = {
    "authority": "www.naukri.com",
    "accept": "application/json",
    "accept-language": "en-US,en;q=0.9",
    "appid": "109",
    "cache-control": "no-cache",
    "clientid": "d3skt0p",
    "content-type": "application/json",
"cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; 
_ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
    "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
    "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
    "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "systemid": "109",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
    "content-encoding": "gzip",
}


class NaukriIndiaSearchSpiderSpider(scrapy.Spider):
    name = "naukri_india_search_spider"
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {
            'naukri_india_search.pipelines.NaukriIndiaSearchPipeline': 301,
        },
        'LOG_LEVEL': 'ERROR',
        'LOG_FILE': 'naukri_india_search_error.log',
        'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    }

    custom_urls = []

    def start_requests(self):
        headers = {
            "authority": "www.naukri.com",
            "accept": "application/json",
            "accept-language": "en-US,en;q=0.9",
            "appid": "109",
            "cache-control": "no-cache",
            "clientid": "d3skt0p",
            "content-type": "application/json",
"cookie": "_t_ds=21836c671691564336-4621836c67-021836c67; jd=280323907884; _gcl_au=1.1.1767756339.1691564338; test=naukri.com; G_ENABLED_IDPS=google; _cc_id=c7a22b66b0e8b76ba5b1ab973ac2c4e2; _fbp=fb.1.1691586951863.1688541664; MYNAUKRI[UNID]=6decd0ec6dac4ea7adf498fd9aea1b02; MYNAUKBMS[TOTALEXP]=.; MYNAUKBMS[MISC]=%7CX%7C-1%3A-1.-1%7CX%7C-1%3A-1.-1; PHPSESSID=7r1itb4rb4a5vp75h16aj1p50j; PS=0e9c712cbbee09d64d62ed464ccf1ed68d69b9c8b8e0879f86ac8078180ed768ff003c62a2e1a36431b890266d0ecd01; _t_ds=21836c671691564336-4621836c67-021836c67; ACTIVE=1691746049; __utma=266160400.222629415.1691564339.1691747172.1691747172.1; __utmc=266160400; __utmz=266160400.1691747172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _t_s=direct; _gid=GA1.2.404208624.1692184309; _t_r=1091%2F%2F; _abck=17DF08AA6008335BFF57EC3D4F31C60A~0~YAAQBCozaovbVfWJAQAAyqlV/wqIPgjcUjD+7ht0W00DSxyvraAK8+dtCE9YPqwS+IJPRVvvHPVL4ZLzQ7cfGNXzfh3k+y2VLqP+s+cPut62fApHUtFEmbTrUNVNv9Zeq9lwI+e8zd1DsioeBQtdUG+kzSHGWky6sPhziobMkx1B7W04IwUfACS7Ve5fYBCJU5dbtVRjeDAoNXmctQPJApkPdaddRMuoeq4qCZcW/bb8bGR+nwyO8+ZBPpQqoBpZrIhpG66AkcOcsLIfBHMfb8E/1dUZyDcFEO4Y7P41NVSIGgF8BzyGksJsa+IlaCXYrz0MDX0QiHXyiozYmEocQYKeTOwkMlmoHq/+X8XLt70g2LvMc0Zszor74PL7ymsDvPRLoDCvPinCf4Uk844KKItZ6menX46Tpg==~-1~-1~-1; bm_sz=BD37187E9CC624B5599566E84E218D81~YAAQBCozao3bVfWJAQAAyqlV/xQaFSd0F+spatEEAmhMi6P20wPSNyvyqwLIgOZIqPyzNpNoeCiq27hIuVDssDqyYLJipRkLmTgJhtRpBI/UkMYHO1gve7KT27FIcZLAPM1GlmudVfZr/vsBgNU7vcq7YlESrOQUNFkdARzI9cnEHl0Uwh+TdW+jSx/uvvgN860EXQYxvgQFPwHcF6K1HLhnThG6W3LrVsKEnltKEJsWzq73YGJhtHR2gk/c2Rn2rsnlBSKkon06k/bBUNpImVfGIv57NluTzAf4HUKBL2dBFfo=~4272181~3684401; bm_mi=840B9E1760640F737B07DF6916477F14~YAAQBCozar8fV/WJAQAAemdo/xR295FqGfoDgkXCgp3Zs538VapFXehFbhWVc0uLC2Z7cfCczehDlj6/WNkwuGUEm6AQ+a2VS9H1cL3cF+vXFUomXcwhU4fmjNruimtgH2vNc8+t07S6CFswop+vgQr50vwaRKAobfsJi0jKNELyQOdgxf0EQ+vH31DwtJMCeNMFIlZxXSznSOUZ9VRY/HSFsMgPHu3ChcKnhfJhUpS2VEkwwh8FjyNNsp08Nc8B85Vbpq3PCTz1kpFWCIeBDDVthrtnKITPzciYZy5e2VhvJWKi+2iRyOVeXbLbCphszroTewz5d6Sd4RhwOg==~1; _gat_UA-182658-1=1; ak_bmsc=DC184FF5F5CF7CEC60DE28CF4A04B43E~000000000000000000000000000000~YAAQBCozakggV/WJAQAAo2xo/xST717WQAIeCYOI3htLys7gWAfwL6/uNZtCJv6fAyFBYEcPf/0asPA8yD7eyVNXLvegM9qh5IquUPoSFJH3Sjz7JyPcySdejoqwoRGhg4rYROybASf1olGEy4PNPGBCBwTi+KUhkVCkHEaDWiDa/feuQddoB3nWBPui267IP17/01afcmBsBA+xz5PFn+OVIp7pIHrsWwa3Z+QoA3+9ZTSs+D/jXsBCsrJojd8U6Ho8NPfgfUyNOJo0SzFIQbcLy5TmAQHEYBCLhYgkRJjGPRSOqEYCtOenp5WzQHRisSQUU837xfVnr42Pc9xoW73pafQv/pQiuB64SrdhVtABVsSWchE5RuqwnPPIBf6cjJWLNb71p+Is6F6zcvVmSIvx2wZO0QmLQ2pfXr6Lh+jcBNPcod8pLbWG5U5RPHQAVi0nGPOYS+3mcrkGCiTrteqyLmSEOGvThutsOfl5Kog6h78tCaHhfhnZt1mmPkanCex2CHjeuT4FESOf83XFCLDVT9v0VAh962a9KQ==; __gads=ID=85c2a6341a8344ec:T=1691641263:RT=1692207181:S=ALNI_MZnP35P-PINdjwxcv-SNoWRMxbz8w; __gpi=UID=00000c29ed221036:T=1691641263:RT=1692207181:S=ALNI_Majbvns7DTxm-L8Fcvi-v_e7zQCvA; bm_sv=743032F92D532DCFC228BE5DB12014CF~YAAQBCozarIgV/WJAQAAQnJo/xRLr5g+qzbOInTUPStEJ+njAToV8zwOvBbHEEF9WGABP3ObKrNGr0FSALH8SsyJxhCnJZP72tWp4RJ8IMvpVkNNNye2Kc0n+U9VxZhSg9RKvKTn/DwW5x0lwY6guqb4wJwZIND/pUfBqdWUPp77qF4rYSeBEg/no94nGlmXUVUY4GqTDj6hCo6XIBbTIg1BGSdrLjFRTjpKu9aRX0ScDPSxuyMe7KPZSsOGY1AL~1; cto_bundle=TYhEE19xSDJxQk1qdTBuR3hYWDklMkJ3SWhPZmRkcjg3TnYyREN1dUpHaDBlbWJoME40OTVBelNlZ3J3TnhjVmZhSTNTTXl2U2JjSWhIM29aaWJHMyUyQkIlMkJPUmZKaGNBRkJLQVNHU1FYWFlleTFVJTJGTWduTkppQzJzMW1SOFJyRWNEdndENkklMkJ6M25jaFpaJTJCUmdUOWNMY2Z3TlolMkJ3QSUzRCUzRA; HOWTORT=ul=1692207219428&r=https%3A%2F%2Fwww.naukri.com%2Faccounting-jobs%3Fxt%3Dcatsrch%26amp%3Bqi%255b%255d%3D8&hd=1692207219607; _ga=GA1.1.222629415.1691564339; 
_ga_K2YBNZVRLL=GS1.1.1692207181.10.1.1692207220.21.0.0", # Add your cookie value here
            "gid": "LOCATION,INDUSTRY,EDUCATION,FAREA_ROLE",
            "referer": "https://www.naukri.com/fresher-jobs?src=gnbjobs_homepage_srch",
            "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "systemid": "109",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.43",
            "content-encoding": "gzip",
        }

        with open(input_file_path, 'r') as file:
            file_read = csv.reader(file)
            for industry in list(file_read):
                # industry_read_url = industry[0].replace("\n", "")
                industry_name = industry[1]
                industry_q = industry[2]
                base_url = "https://www.naukri.com/jobapi/v3/search?noOfResults=100&urlType=search_by_keyword&searchType=adv&keyword={}&pageNo={}&xt=catsrch&qi\[\]={}"
                total_pages = 1000
                start_page = 1
                custom_args = {
                    'industry_name': industry[1],
                    'industry_q': industry[2],
                    'base_url': base_url,
                    'total_pages': 1000,
                    'start_page': 1
                }
                print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")
                url = base_url.format(industry_name, start_page, industry_q)
                yield scrapy.Request(url, headers=headers, callback=self.parse, meta=custom_args)

    def parse(self, response):
        keys_to_extract = ['title', 'jobId', 'footerPlaceholderLabel', 'companyName', 'companyId', 'jdURL', 'createdDate',
                           'mode', 'placeholders']
        total_pages = response.meta.get('total_pages')
        start_page = response.meta.get('start_page')
        base_url = response.meta.get('base_url')
        industry_name = response.meta.get('industry_name')
        industry_q = response.meta.get('industry_q')

        if total_pages == 1000:
            total_jobs = response.json()["noOfJobs"]
            total_pages = math.ceil(total_jobs / 100)

        try:
            # parsed_data = []
            for job in response.json()["jobDetails"]:
                parsed_item = {field: job.get(field, None) for field in keys_to_extract}
                # parsed_data.append(parsed_item)
                yield parsed_item

            total_pages -= 1
            start_page += 1
            custom_args = {
                'industry_name': industry_name,
                'industry_q': industry_q,
                'base_url': base_url,
                'total_pages': total_pages,
                'start_page': start_page
            }
            self.custom_urls.append(base_url.format(industry_name, start_page, industry_q))
            print(f"Starting for industry: {industry_name}, total pages: {total_pages}, start page: {start_page}")

            for url in self.custom_urls:
                yield scrapy.Request(url=url, headers=headers, callback=self.parse, meta=custom_args)
            # next_page = base_url.format(industry_name, start_page, industry_q)
            # yield response.follow(next_page, callback=self.parse, meta={'my_arg': custom_args})

        except Exception as naukriError:
            self.logger.error(f'An error occurred: {str(naukriError)}')
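A note on the pagination in parse above: it appends the next page URL to the shared self.custom_urls list and then re-yields the whole list on every response, relying on Scrapy's duplicate-request filter to discard pages that were already scheduled. A leaner alternative (a sketch only, not what the commit does) is to yield just the next page for the current industry while pages remain:

    # hypothetical replacement for the custom_urls loop at the end of parse()
    if total_pages > 0:
        next_page = base_url.format(industry_name, start_page, industry_q)
        yield scrapy.Request(next_page, headers=headers, callback=self.parse,
                             meta={'industry_name': industry_name, 'industry_q': industry_q,
                                   'base_url': base_url, 'total_pages': total_pages,
                                   'start_page': start_page})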
@@ -0,0 +1,38 @@
2023-10-06 17:09:58 [scrapy.core.engine] ERROR: Scraper close failure
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 892, in _runCallbacks
    current.result = callback( # type: ignore[misc]
  File "C:\Rahul code\scrapy for naukri\server scraper\naukri_india_search\naukri_india_search\pipelines.py", line 32, in close_spider
    self.csvfile.close()
AttributeError: 'NaukriIndiaSearchPipeline' object has no attribute 'csvfile'
2023-10-06 17:09:58 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method CoreStats.spider_closed of <scrapy.extensions.corestats.CoreStats object at 0x000001DD214DB070>>
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 348, in maybeDeferred_coro
    result = f(*args, **kw)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\extensions\corestats.py", line 30, in spider_closed
    elapsed_time = finish_time - self.start_time
TypeError: unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'
2023-10-06 17:09:58 [twisted] CRITICAL: Unhandled error in Deferred:
2023-10-06 17:09:58 [twisted] CRITICAL:
Traceback (most recent call last):
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "C:\Users\prahul\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 160, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
FileNotFoundError: [Errno 2] No such file or directory: 'india_data/naukri_india_search_06-10-2023.csv'
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = naukri_india_search.settings

[deploy]
#url = http://localhost:6800/
project = naukri_india_search
@@ -0,0 +1,61 @@
https://www.naukri.com/accounting-jobs?xt=catsrch&qi[]=8,accounting,8
https://www.naukri.com/advertising-jobs?xt=catsrch&qi[]=32,advertising,32
https://www.naukri.com/agriculture-jobs?xt=catsrch&qi[]=33,agriculture,33
https://www.naukri.com/animation-jobs?xt=catsrch&qi[]=56,animation,56
https://www.naukri.com/architecture-jobs?xt=catsrch&qi[]=30,architecture,30
https://www.naukri.com/automobile-jobs?xt=catsrch&qi[]=4,automobile,4
https://www.naukri.com/aviation-jobs?xt=catsrch&qi[]=46,aviation,46
https://www.naukri.com/bpo-jobs?xt=catsrch&qi[]=7,bpo,7
https://www.naukri.com/bank-jobs?xt=catsrch&qi[]=14,bank,14
https://www.naukri.com/brewery-jobs?xt=catsrch&qi[]=50,brewery,50
https://www.naukri.com/sanitary-jobs?xt=catsrch&qi[]=60,sanitary,60
https://www.naukri.com/chemical-jobs?xt=catsrch&qi[]=6,chemical,6
https://www.naukri.com/engineering-jobs?xt=catsrch&qi[]=12,engineering,12
https://www.naukri.com/consumer-durables-jobs?xt=catsrch&qi[]=10,consumer-durables,10
https://www.naukri.com/courier-jobs?xt=catsrch&qi[]=18,courier,18
https://www.naukri.com/defence-jobs?xt=catsrch&qi[]=42,defence,42
https://www.naukri.com/teaching-jobs?xt=catsrch&qi[]=26,teaching,26
https://www.naukri.com/electrical-jobs?xt=catsrch&qi[]=55,electrical,55
https://www.naukri.com/export-import-jobs?xt=catsrch&qi[]=13,export-import,13
https://www.naukri.com/fmcg-jobs?xt=catsrch&qi[]=9,fmcg,9
https://www.naukri.com/facility-management-jobs?xt=catsrch&qi[]=47,facility-management,47
https://www.naukri.com/fertilizers-jobs?xt=catsrch&qi[]=41,fertilizers,41
https://www.naukri.com/food-processing-jobs?xt=catsrch&qi[]=57,food-processing,57
https://www.naukri.com/fresher-jobs?xt=catsrch&qi[]=31,fresher,31
https://www.naukri.com/gems-jewellery-jobs?xt=catsrch&qi[]=35,gems-jewellery,35
https://www.naukri.com/glass-jobs?xt=catsrch&qi[]=49,glass,49
https://www.naukri.com/air-conditioning-jobs?xt=catsrch&qi[]=61,air-conditioning,61
https://www.naukri.com/airline-jobs?xt=catsrch&qi[]=2,airline,2
https://www.naukri.com/networking-jobs?xt=catsrch&qi[]=15,networking,15
https://www.naukri.com/information-technology-jobs?xt=catsrch&qi[]=25,information-technology,25
https://www.naukri.com/industrial-jobs?xt=catsrch&qi[]=16,industrial,16
https://www.naukri.com/insurance-jobs?xt=catsrch&qi[]=17,insurance,17
https://www.naukri.com/kpo-jobs?xt=catsrch&qi[]=48,kpo,48
https://www.naukri.com/legal-jobs?xt=catsrch&qi[]=36,legal,36
https://www.naukri.com/media-jobs?xt=catsrch&qi[]=19,media,19
https://www.naukri.com/dotcom-jobs?xt=catsrch&qi[]=19,dotcom,19
https://www.naukri.com/entertainment-jobs?xt=catsrch&qi[]=19,entertainment,19
https://www.naukri.com/medical-jobs?xt=catsrch&qi[]=20,medical,20
https://www.naukri.com/mining-jobs?xt=catsrch&qi[]=54,mining,54
https://www.naukri.com/ngo-jobs?xt=catsrch&qi[]=37,ngo,37
https://www.naukri.com/automation-jobs?xt=catsrch&qi[]=21,automation,21
https://www.naukri.com/oil-and-gas-jobs?xt=catsrch&qi[]=23,oil-and-gas,23
https://www.naukri.com/paper-jobs?xt=catsrch&qi[]=43,paper,43
https://www.naukri.com/pharma-jobs?xt=catsrch&qi[]=22,pharma,22
https://www.naukri.com/printing-jobs?xt=catsrch&qi[]=38,printing,38
https://www.naukri.com/publishing-jobs?xt=catsrch&qi[]=58,publishing,58
https://www.naukri.com/real-estate-jobs?xt=catsrch&qi[]=39,real-estate,39
https://www.naukri.com/recruitment-jobs?xt=catsrch&qi[]=34,recruitment,34
https://www.naukri.com/retail-jobs?xt=catsrch&qi[]=24,retail,24
https://www.naukri.com/security-jobs?xt=catsrch&qi[]=40,security,40
https://www.naukri.com/electronics-jobs?xt=catsrch&qi[]=28,electronics,28
https://www.naukri.com/shipping-jobs?xt=catsrch&qi[]=44,shipping,44
https://www.naukri.com/steel-jobs?xt=catsrch&qi[]=53,steel,53
https://www.naukri.com/consultant-jobs?xt=catsrch&qi[]=52,consultant,52
https://www.naukri.com/telecom-jobs?xt=catsrch&qi[]=27,telecom,27
https://www.naukri.com/textiles-jobs?xt=catsrch&qi[]=3,textiles,3
https://www.naukri.com/tyres-jobs?xt=catsrch&qi[]=45,tyres,45
https://www.naukri.com/water-treatment-jobs?xt=catsrch&qi[]=51,water-treatment,51
https://www.naukri.com/fitness-trainer-jobs?xt=catsrch&qi[]=59,fitness-trainer,59
https://www.naukri.com/ecommerce-jobs?xt=catsrch&qi[]=63,ecommerce,63
https://www.naukri.com/internet-jobs?xt=catsrch&qi[]=63,internet,63