import csv
from time import sleep, time

import requests

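# Legacy one-off version of this scraper, kept below inside a string literal
# for reference only; it is never executed.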
"""
|
2023-09-28 08:54:56 +00:00
|
|
|
# List of URLs to query
|
|
|
|
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
|
|
|
|
|
|
|
|
headers = {
|
|
|
|
'authority': 'www.naukrigulf.com',
|
|
|
|
'accept': 'application/json',
|
|
|
|
'accept-format': 'strict',
|
|
|
|
'accept-language': 'ENGLISH',
|
|
|
|
'appid': '205',
|
|
|
|
'cache-control': 'no-cache',
|
|
|
|
'client-type': 'desktop',
|
|
|
|
'clientid': 'desktop',
|
|
|
|
'device-type': 'desktop',
|
|
|
|
'puppeteer': 'false',
|
|
|
|
'referer': 'https://www.naukrigulf.com/jobs-in-uae',
|
|
|
|
'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
|
|
|
|
'sec-ch-ua-mobile': '?0',
|
|
|
|
'sec-ch-ua-platform': 'Windows',
|
|
|
|
'sec-fetch-dest': 'empty',
|
|
|
|
'sec-fetch-mode': 'cors',
|
|
|
|
'sec-fetch-site': 'same-origin',
|
|
|
|
'systemid': '2323',
|
|
|
|
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
|
|
|
|
'userdata': '|IN'
|
|
|
|
}
|
|
|
|
|
2023-10-09 08:15:53 +00:00
|
|
|
with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
|
|
|
|
outfile_writer = csv.writer(outfile)
|
|
|
|
with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
|
|
|
|
j_read = list(csv.DictReader(jobis))
|
|
|
|
for item in j_read:
|
|
|
|
print(base_url.format(item.get('jobId')))
|
|
|
|
jd_url = base_url.format(item.get('jobId'))
|
|
|
|
sleep(0.5)
|
|
|
|
response = requests.get(base_url.format(item.get('jobId')), headers=headers)
|
|
|
|
if response.status_code == 200:
|
|
|
|
job_data = {
|
|
|
|
"Url" : jd_url,
|
|
|
|
"Job Key" : item.get('jobId'),
|
|
|
|
"Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
|
|
|
|
"Job Description" : response.json().get('description',''),
|
|
|
|
"Role Category" :"",
|
|
|
|
"Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
|
|
|
|
"Job Title" : response.json().get('designation'),
|
|
|
|
"Formatted Location Full" : response.json().get('location'),
|
|
|
|
"Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
|
|
|
|
"Company" : response.json().get('company', {'name':''}).get('name'),
|
|
|
|
"Job Type" : response.json().get('employmentType'),
|
|
|
|
"Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
|
|
|
|
"Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
|
|
|
|
"Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
|
|
|
|
"Salary Detail" : response.json().get('compensation')
|
|
|
|
}
|
|
|
|
if outfile.tell() == 0:
|
|
|
|
header = job_data.keys()
|
|
|
|
outfile_writer.writerow(header)
|
|
|
|
outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
|
|
|
|
"""
# Global configuration
input_file = "gulf_data/output_all_gulf.csv"
output_file = "gulf_data/jobdata_gulf.csv"
error_file = "gulf_data/jobdata_error_gulf.csv"
stats_file = "gulf_data/stats_gulf.txt"
skip = 0  # number of job IDs to skip from the front of the queue
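# Note: the gulf_data/ directory and input_file must exist before running;
# the output and stats files are opened in append mode and created on demand.
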
class NaukriGulfJobDetailScraper:
    """Fetch job-detail JSON for each job ID and flatten it into CSV rows."""

    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"

    # Browser-like headers captured from an Edge 117 session; the sec-ch-ua
    # and user-agent values are pinned to that version and may need refreshing.
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }
    def __init__(self, input_file, output_file, error_file):
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file  # currently unused; reserved for failure logging
        self.timeout = 30  # per-request timeout in seconds
        self.count = 1
        # Optional proxy support (requires `import os`), kept for reference:
        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
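    # A sketch of the response shape transform_data assumes (field names taken
    # from the lookups below; the values shown are illustrative, not from a
    # real response):
    #   {
    #     "designation": "...", "description": "...", "location": "...",
    #     "employmentType": "...", "compensation": "...",
    #     "other": {"tag": "..."}, "contact": {"website": "..."},
    #     "company": {"name": "..."},
    #     "desiredCandidate": {"experience": {"min": 2, "max": 5}},
    #     "industryInterlinking": [{"title": "..."}],
    #     "fAreaInterlinking": [{"title": "..."}],
    #     "keywordInterlinking": [{"title": "..."}]
    #   }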
    def transform_data(self, job_id, jd_url, json_response):
        """Flatten the job-detail JSON into a single CSV-ready dict."""
        json_data = {
            "Url": jd_url,
            "Job Key": job_id,
            "Source Link": json_response.get('other', {'tag': ''}).get('tag', '') +
                           json_response.get('contact', {'website': ''}).get('website', ''),
            "Job Description": json_response.get('description', ''),
            "Role Category": "",
            # The interlinking lists may be absent; default to an empty list.
            "Job Industry": ', '.join(t['title'] for t in json_response.get('industryInterlinking', [])),
            "Job Title": json_response.get('designation'),
            "Formatted Location Full": json_response.get('location'),
            "Job Functions": ', '.join(x['title'] for x in json_response.get('fAreaInterlinking', [])),
            "Company": json_response.get('company', {'name': ''}).get('name'),
            "Job Type": json_response.get('employmentType'),
            "Key Skills": ', '.join(y['title'] for y in json_response.get('keywordInterlinking', [])),
            # Guard each level: 'desiredCandidate' or 'experience' may be missing.
            "Minimum Experience": json_response.get('desiredCandidate', {}).get('experience', {}).get('min'),
            "Maximum Experience": json_response.get('desiredCandidate', {}).get('experience', {}).get('max'),
            "Salary Detail": json_response.get('compensation')
        }
        return json_data
    def scrape(self):
        # Pass 1: collect job IDs from the search-results CSV. The second
        # column (index 1) holds the job ID and the eighth (index 7) the
        # posting mode; rows marked "POSTED" (non-tagged jobs) are skipped.
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode == "POSTED":
                    print("removed non-tagged job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)

        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))  # de-duplicate
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape: {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        sleep(1)

        # Pass 2: fetch each job's detail JSON and append one CSV row per job.
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                try:
                    sleep(1.5)  # throttle requests (originally two sleeps of 0.5 s and 1 s)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    print(f"{response.status_code} for {url}")
                    if response.status_code == 200:
                        json_response = response.json()
                        transformed_data = self.transform_data(job_id, url, json_response)
                        if outfile.tell() == 0:  # empty file: write the header row first
                            writer.writerow(transformed_data.keys())
                        writer.writerow(transformed_data.values())
                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                        all_job_ids.pop(0)  # remove the processed job ID
                        self.count += 1
                    # elif response.status_code == 303:
                    #     json_response = response.json()
                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                    #         print(f"Expired job ID {job_id} with response 303")
                    #         all_job_ids.pop(0)  # remove the expired job ID
                    elif response.status_code == 404:
                        all_job_ids.pop(0)  # remove the expired job ID
                        print(f"Expired job ID {job_id} with response 404")
                    else:
                        # Any other status leaves the ID at the head of the
                        # queue, so it is retried on the next iteration.
                        print(f"Error for job ID {job_id}")
                except Exception as exc:
                    # Network and JSON-decode errors are logged; the ID stays
                    # queued and is retried on the next iteration.
                    print(str(exc))
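    # The error_file passed to __init__ is never written. A minimal sketch of
    # how failures could be recorded (hypothetical helper, not called anywhere
    # yet; the except/else branches above would need to invoke it):
    def log_error(self, job_id, reason):
        """Append a failed job ID and the failure reason to the error CSV."""
        with open(self.error_file, 'a', newline='', encoding='utf-8') as errfile:
            csv.writer(errfile).writerow([job_id, reason])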
def main():
    start_time = time()
    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")
if __name__ == "__main__":
    main()