# compete_jobs/naukri/jobdata_gulf_r.py

import requests
import csv
from time import sleep, time
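
# NOTE: the triple-quoted block below is an earlier one-off version of this scraper,
# disabled by wrapping it in a string literal; the active implementation is the
# NaukriGulfJobDetailScraper class further down.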
"""
# List of URLs to query
base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
headers = {
'authority': 'www.naukrigulf.com',
'accept': 'application/json',
'accept-format': 'strict',
'accept-language': 'ENGLISH',
'appid': '205',
'cache-control': 'no-cache',
'client-type': 'desktop',
'clientid': 'desktop',
'device-type': 'desktop',
'puppeteer': 'false',
'referer': 'https://www.naukrigulf.com/jobs-in-uae',
'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'systemid': '2323',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
'userdata': '|IN'
}
with open("o.csv", 'a+', newline='', encoding='utf-8') as outfile:
outfile_writer = csv.writer(outfile)
with open("output_all_gulf old.csv", 'r', encoding="utf-8", newline='') as jobis:
j_read = list(csv.DictReader(jobis))
for item in j_read:
print(base_url.format(item.get('jobId')))
jd_url = base_url.format(item.get('jobId'))
sleep(0.5)
response = requests.get(base_url.format(item.get('jobId')), headers=headers)
if response.status_code == 200:
job_data = {
"Url" : jd_url,
"Job Key" : item.get('jobId'),
"Source Link": response.json().get('other', {'tag': ''}).get('tag',''),
"Job Description" : response.json().get('description',''),
"Role Category" :"",
"Job Industry" : ', '.join([t['title'] for t in response.json()['industryInterlinking']]),
"Job Title" : response.json().get('designation'),
"Formatted Location Full" : response.json().get('location'),
"Job Functions" : ', '.join([x['title'] for x in response.json()['fAreaInterlinking']]),
"Company" : response.json().get('company', {'name':''}).get('name'),
"Job Type" : response.json().get('employmentType'),
"Key Skills" : ', '.join([y['title'] for y in response.json()['keywordInterlinking']]),
"Minimum Experience" : response.json().get('desiredCandidate').get('experience').get('min'),
"Maximum Experience" : response.json().get('desiredCandidate').get('experience').get('max'),
"Salary Detail" : response.json().get('compensation')
}
if outfile.tell() == 0:
header = job_data.keys()
outfile_writer.writerow(header)
outfile_writer.writerow([str(z).replace('\n','').strip() for z in job_data.values()])
"""

# Global variables
input_file = "gulf_data/output_all_gulf.csv"     # job ids to scrape (see scrape() for the expected columns)
output_file = "gulf_data/jobdata_gulf.csv"       # scraped job details are appended here
error_file = "gulf_data/jobdata_error_gulf.csv"  # reserved for failed ids (currently unused)
stats_file = "gulf_data/stats_gulf.txt"
skip = 0  # number of already-processed ids to skip when resuming a run

class NaukriGulfJobDetailScraper:
    base_url = "https://www.naukrigulf.com/spapi/jobs/{}"
    headers = {
        'authority': 'www.naukrigulf.com',
        'accept': 'application/json',
        'accept-format': 'strict',
        'accept-language': 'ENGLISH',
        'appid': '205',
        'cache-control': 'no-cache',
        'client-type': 'desktop',
        'clientid': 'desktop',
        'device-type': 'desktop',
        'puppeteer': 'false',
        'referer': 'https://www.naukrigulf.com/jobs-in-uae',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'systemid': '2323',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.12',
        'userdata': '|IN'
    }

    def __init__(self, input_file, output_file, error_file):
        self.input_file = input_file
        self.output_file = output_file
        self.error_file = error_file
        self.timeout = 30
        self.count = 1
        # self.proxies = {"http": f"http://{proxy_server}", "https": f"http://{proxy_server}"} if (proxy_server := os.environ.get("PROXY_SERVER")) else {}
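
    # Fields consumed from the /spapi/jobs/{id} JSON response (as observed in this
    # script, not from published API docs): description, designation, location,
    # employmentType, compensation, desiredCandidate.experience, company.name,
    # other.tag, contact.website, and the *Interlinking lists of {'title': ...} dicts.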
    def transform_data(self, job_id, jd_url, json_response):
        source_value1 = json_response.get('other', {'tag': ''}).get('tag', '')
        source_value2 = json_response.get('contact', {'website': ''}).get('website', '')
        jd = json_response.get('description', '')
        desired_profile = json_response.get('desiredCandidate')
        experience = (desired_profile or {}).get('experience') or {}
        valid_pairs = None
        if desired_profile:
            valid_pairs = [(key, value) for key, value in desired_profile.items()
                           if value is not None and value != '' and key != 'experience']
        if valid_pairs:
            # Append the remaining desiredCandidate fields to the job description as HTML.
            html_output = '<br><h3 class="heading">Desired Candidate Profile</h3><br>'
            for key, value in valid_pairs:
                html_output += f"<strong>{key.title()}:</strong> <br>{value}<br>"
            jd += html_output
        json_data = {
            "Url": jd_url,
            "Job Key": "g_" + str(job_id),
            # "Source Link": json_response.get('other', {'tag': ''}).get('tag', '') + \
            #                json_response.get('contact', {'website': ''}).get('website', ''),
            "Source Link": source_value1 or source_value2 or '',
            # "Job Description": json_response.get('description', ''),
            "Job Description": jd,
            "Role Category": "",
            "Job Industry": ', '.join([t['title'] for t in json_response.get('industryInterlinking', [])]),
            "Job Title": json_response.get('designation'),
            "Formatted Location Full": json_response.get('location'),
            "Job Functions": ', '.join([x['title'] for x in json_response.get('fAreaInterlinking', [])]),
            "Company": json_response.get('company', {'name': ''}).get('name'),
            "Job Type": json_response.get('employmentType'),
            "Key Skills": ', '.join([y['title'] for y in json_response.get('keywordInterlinking', [])]),
            "Minimum Experience": experience.get('min'),
            "Maximum Experience": experience.get('max'),
            "Salary Detail": json_response.get('compensation'),
            "Country": (json_response.get('compensation') or {}).get('country')
        }
        return json_data

    def scrape(self):
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            total_input_count = 0
            all_job_ids = []
            for row in reader:
                jobid = row[1].strip()
                mode = row[7].strip()
                total_input_count += 1
                if mode == "POSTED":
                    print("removed non tagged job with jobid %s" % jobid)
                    continue
                all_job_ids.append(jobid)
        print(f"Size of raw all_job_ids: {len(all_job_ids)}")
        all_job_ids = list(set(all_job_ids))
        print(f"Size of unique all_job_ids: {len(all_job_ids)}")
        all_job_ids = all_job_ids[skip:]
        print(f"Total input: {total_input_count}, Valid ids to scrape {len(all_job_ids)}")
        with open(stats_file, "a") as stat:
            stat.write(f"Search Found: {total_input_count}, Valid for scraping: {len(all_job_ids)}\n")
        sleep(1)
        with open(self.output_file, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            while all_job_ids:
                job_id = all_job_ids[0]
                url = self.base_url.format(job_id)
                sleep(0.5)
                try:
                    sleep(1)
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    print(f"{response.status_code} for {url}")
                    if response.status_code == 200:
                        json_response = response.json()
                        transformed_data = self.transform_data(job_id, url, json_response)
                        if outfile.tell() == 0:
                            header = transformed_data.keys()
                            writer.writerow(header)
                        writer.writerow(transformed_data.values())
                        print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
                        all_job_ids.pop(0)  # Remove the processed job ID
                        self.count += 1
                    # elif response.status_code == 303:
                    #     json_response = response.json()
                    #     if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
                    #         print(f"Expired job ID {job_id} with response 303")
                    #         all_job_ids.pop(0)  # Remove the processed job ID
                    elif response.status_code == 404:
                        all_job_ids.pop(0)  # Remove the expired job ID
                        print(f"Expired job ID {job_id} with response 404")
                    else:
                        print(f"Error for job ID {job_id}")
                except Exception as n1:
                    # Network/parse errors are logged and the id is retried on the next pass.
                    print(str(n1))


def main():
    start_time = time()
    scraper = NaukriGulfJobDetailScraper(input_file, output_file, error_file)
    scraper.scrape()
    end_time = time()
    duration_hours = (end_time - start_time) / 3600
    print(f"Jobdata program took {duration_hours:.2f} hours to run.")
    with open(stats_file, "a") as stat:
        stat.write(f"Jobdata program took {duration_hours:.2f} hours to run.\n")


if __name__ == "__main__":
    main()
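
# Usage note (assumed invocation; not documented in the original script):
#   python jobdata_gulf_r.py
# The input CSV is read positionally: column index 1 is taken as the job id and
# column index 7 as the posting mode ("POSTED" rows are skipped). To resume an
# interrupted run, set `skip` above; note that ids pass through set(), so their
# order is not guaranteed to be stable between runs.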