# compete_jobs/jobstreet/jst_malay_detail.py

import requests
from csv import DictWriter
import pandas as pd
from time import sleep
# Columns written to the job-detail CSV; extra keys in each row are dropped
# via DictWriter's extrasaction='ignore'.
column = [
    "id",
    "pageUrl",
    "company",
    "jobTitle",
    "jobDetail",
    "location",
    "applyurl",
    "isExternal",
    "isExpired",
    "isConfidential",
    "isClassified",
    "accountNum",
    "advertisementId",
    "subAccount",
    "adType",
    "salary",
    "sourceCountry"
]
def jstMalayJobDetailScraper(search_file, jd_file):
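    """Fetch job details from JobStreet Malaysia's GraphQL endpoint for every
    id listed in `search_file` and append one CSV row per job to `jd_file`."""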
    url = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true"
    # Browser-like headers. Content-Length is intentionally omitted because
    # requests computes it from the JSON payload; 'br' is left out of
    # Accept-Encoding since requests only decodes brotli when the brotli
    # package is installed.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/json',
        'Origin': 'https://www.jobstreet.com.my',
        'Referer': 'https://www.jobstreet.com.my/',
        'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }
    # The search-stage CSV must provide `id` and `sourceCountryCode` columns.
    jobIds = pd.read_csv(search_file)
    jobIds['id'] = jobIds['id'].astype(str)
    jobIds = jobIds[['id', 'sourceCountryCode']].values.tolist()
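    # GraphQL query captured from the JobStreet job-detail page. It selects the
    # header (title, company, salary), applyUrl, description HTML, company
    # snapshot, and location fields that are flattened into the CSV below.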
    query = "query getJobDetail($jobId: String, $locale: String, $country: String, $candidateId: ID, $solVisitorId: String, $flight: String) {\n jobDetail(\n jobId: $jobId\n locale: $locale\n country: $country\n candidateId: $candidateId\n solVisitorId: $solVisitorId\n flight: $flight\n ) {\n id\n pageUrl\n jobTitleSlug\n applyUrl {\n url\n isExternal\n }\n isExpired\n isConfidential\n isClassified\n accountNum\n advertisementId\n subAccount\n showMoreJobs\n adType\n header {\n banner {\n bannerUrls {\n large\n }\n }\n salary {\n max\n min\n type\n extraInfo\n currency\n isVisible\n }\n logoUrls {\n small\n medium\n large\n normal\n }\n jobTitle\n company {\n name\n url\n slug\n advertiserId\n }\n review {\n rating\n numberOfReviewer\n }\n expiration\n postedDate\n postedAt\n isInternship\n }\n companyDetail {\n companyWebsite\n companySnapshot {\n avgProcessTime\n registrationNo\n employmentAgencyPersonnelNumber\n employmentAgencyNumber\n telephoneNumber\n workingHours\n website\n facebook\n size\n dressCode\n nearbyLocations\n }\n companyOverview {\n html\n }\n videoUrl\n companyPhotos {\n caption\n url\n }\n }\n jobDetail {\n summary\n jobDescription {\n html\n }\n jobRequirement {\n careerLevel\n yearsOfExperience\n qualification\n fieldOfStudy\n industryValue {\n value\n label\n }\n skills\n employmentType\n languages\n postedDate\n closingDate\n jobFunctionValue {\n code\n name\n children {\n code\n name\n }\n }\n benefits\n }\n whyJoinUs\n }\n location {\n location\n locationId\n omnitureLocationId\n }\n sourceCountry\n }\n}\n"
    with open(jd_file, 'a+', encoding='utf-8', newline='') as jfile:
        j_writer = DictWriter(jfile, fieldnames=column, extrasaction='ignore')
        err = 0  # consecutive failures for the job id at the head of the queue
        while jobIds:
            print(jobIds[0])
            variables = {
                "jobId": jobIds[0][0],
                "country": jobIds[0][1],  # e.g. "my"
                "locale": "en",
                "candidateId": "",
                "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653"
            }
            data = {
                'query': query,
                'variables': variables
            }
            try:
                response = requests.post(url, json=data, headers=headers, timeout=20)
                print(response.status_code)
                if response.status_code == 200:
                    result = response.json()
                    # Write the header row only once, when the file is still empty.
                    if jfile.tell() == 0:
                        j_writer.writeheader()
                    if result['data']['jobDetail']:
                        detail = result['data']['jobDetail']
                        # Flatten the nested fields the CSV expects; `or {}`
                        # guards against keys that come back as null.
                        job = {
                            'isExternal': (detail.get('applyUrl') or {}).get('isExternal'),
                            'applyurl': (detail.get('applyUrl') or {}).get('url'),
                            'jobDetail': ((detail.get('jobDetail') or {}).get('jobDescription') or {}).get('html'),
                            'company': detail['header']['company']['name'],
                            'jobTitle': detail['header']['jobTitle'],
                            'location': ', '.join(xy['location'] for xy in detail['location']),
                            'salary': detail['header']['salary']
                        }
                        # Flattened values override the raw nested ones.
                        job2 = {**detail, **job}
                        j_writer.writerow(job2)
                    del jobIds[0]
                    err = 0
                else:
                    err += 1
                    if err > 3:
                        # Give up on this job id after repeated failures so the
                        # loop cannot spin forever on a bad id.
                        del jobIds[0]
                        err = 0
                sleep(2)
            except Exception as mdetail:
                print("request failed:", mdetail)
                err += 1
                if err > 3:
                    del jobIds[0]
                    err = 0
                sleep(2)
if __name__ == "__main__":
    jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malay.csv")
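
# Expected shape of the search-stage input CSV (values below are hypothetical,
# for illustration only; the scraper reads just `id` and `sourceCountryCode`):
#   id,sourceCountryCode
#   123456789,my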