import requests
from csv import DictWriter
import pandas as pd
from time import sleep

# CSV header for the scraped job-detail output file.
column = [
    "isExternal", "Url", "Job Key", "Source Link", "Job Description",
    "Role Category", "Job Industry", "Job Title", "Formatted Location Full",
    "Job Functions", "Company", "Job Type", "Key Skills",
    "Minimum Experience", "Maximum Experience", "Salary Detail",
]


def jstSGJobDetailScraper(search_file, jd_file):
    """Scrape JobStreet Singapore job details via the GraphQL API.

    Reads job ids from ``search_file`` (CSV with at least ``id`` and
    ``sourceCountryCode`` columns) and appends one CSV row per job to
    ``jd_file``.  The output header is written only when ``jd_file`` is
    empty, so repeated runs append without duplicating the header.

    Each job is retried on a non-200 response; after 4 consecutive
    failures the job is skipped so the loop cannot hang forever.
    """
    url = "https://xapi.supercharge-srp.co/job-search/graphql?country=sg&isSmartSearch=true"
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        # NOTE(review): hard-coded Content-Length copied from a browser capture;
        # requests normally computes this itself — confirm the server tolerates it.
        'Content-Length': '2361',
        'Content-Type': 'application/json',
        'Origin': 'https://www.jobstreet.com.sg',
        'Referer': 'https://www.jobstreet.com.sg/',
        'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    }

    # Work queue of [id, sourceCountryCode] pairs; ids forced to str so the
    # GraphQL $jobId: String variable is never sent as a number.
    jobIds = pd.read_csv(search_file)
    jobIds['id'] = jobIds['id'].astype(str)
    jobIds = jobIds[['id', 'sourceCountryCode']].values.tolist()

    query = "query getJobDetail($jobId: String, $locale: String, $country: String, $candidateId: ID, $solVisitorId: String, $flight: String) {\n jobDetail(\n jobId: $jobId\n locale: $locale\n country: $country\n candidateId: $candidateId\n solVisitorId: $solVisitorId\n flight: $flight\n ) {\n id\n pageUrl\n jobTitleSlug\n applyUrl {\n url\n isExternal\n }\n isExpired\n isConfidential\n isClassified\n accountNum\n advertisementId\n subAccount\n showMoreJobs\n adType\n header {\n banner {\n bannerUrls {\n large\n }\n }\n salary {\n max\n min\n type\n extraInfo\n currency\n isVisible\n }\n logoUrls {\n small\n medium\n large\n normal\n }\n jobTitle\n company {\n name\n url\n slug\n advertiserId\n }\n review {\n rating\n numberOfReviewer\n }\n expiration\n postedDate\n postedAt\n isInternship\n }\n companyDetail {\n companyWebsite\n companySnapshot {\n avgProcessTime\n registrationNo\n employmentAgencyPersonnelNumber\n employmentAgencyNumber\n telephoneNumber\n workingHours\n website\n facebook\n size\n dressCode\n nearbyLocations\n }\n companyOverview {\n html\n }\n videoUrl\n companyPhotos {\n caption\n url\n }\n }\n jobDetail {\n summary\n jobDescription {\n html\n }\n jobRequirement {\n careerLevel\n yearsOfExperience\n qualification\n fieldOfStudy\n industryValue {\n value\n label\n }\n skills\n employmentType\n languages\n postedDate\n closingDate\n jobFunctionValue {\n code\n name\n children {\n code\n name\n }\n }\n benefits\n }\n whyJoinUs\n }\n location {\n location\n locationId\n omnitureLocationId\n }\n sourceCountry\n }\n}\n"

    with open(jd_file, 'a+', encoding='utf-8', newline='') as jfile:
        j_writer = DictWriter(jfile, fieldnames=column, extrasaction='ignore')
        # Bug fix: the failure counter must survive across loop iterations.
        # The original reset `err = 0` inside the loop, so the `err > 3`
        # give-up branch was unreachable and a failing job retried forever.
        err = 0
        while jobIds:
            print(jobIds[0])
            variables = {
                "jobId": jobIds[0][0],
                "country": jobIds[0][1],  # e.g. "sg"
                "locale": "en",
                "candidateId": "",
                "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653",
            }
            data = {'query': query, 'variables': variables}
            response = requests.post(url, json=data, headers=headers, timeout=20)
            print(response.status_code)
            if response.status_code == 200:
                result = response.json()
                # Write the header only when the output file is still empty.
                if jfile.tell() == 0:
                    j_writer.writeheader()
                detail = result.get('data', {'jobDetail': ''}).get('jobDetail')
                if detail:
                    # Nested payload sections; `or {}` guards against null
                    # values in the GraphQL response (a missing job still
                    # returns a row with blank fields rather than crashing).
                    jd = detail.get('jobDetail') or {}
                    req = jd.get('jobRequirement') or {}
                    try:
                        job_industry = (req.get('industryValue') or {}).get('label')
                    except AttributeError:
                        job_industry = ""
                    head = detail.get('header') or {}
                    job = {
                        'isExternal': (detail.get('applyUrl') or {}).get('isExternal'),
                        'Url': detail.get('pageUrl'),
                        "Job Description": (jd.get('jobDescription') or {}).get('html'),
                        'Company': (head.get('company') or {}).get('name'),
                        "Job Title": head.get('jobTitle'),
                        "Formatted Location Full": ', '.join(
                            xy.get('location', '') for xy in (detail.get('location') or [])),
                        # Bug fix: original default was the SET {'salary', ''},
                        # which has no .get() and crashed when 'header' was absent.
                        "Salary Detail": head.get('salary'),
                        # `or ''` prevents TypeError when the id is missing.
                        "Job Key": "sg_" + (detail.get('id') or ''),
                        "Source Link": (detail.get('applyUrl') or {}).get('url'),
                        "Role Category": "",
                        "Job Industry": job_industry,
                        # Bug fix: original subscripted a possibly-None
                        # jobRequirement with ['jobFunctionValue'] -> TypeError.
                        "Job Functions": ', '.join(
                            yu['name'] for yu in (req.get('jobFunctionValue') or [])),
                        "Job Type": req.get('employmentType'),
                        "Key Skills": "",
                        "Minimum Experience": req.get('yearsOfExperience'),
                    }
                    j_writer.writerow(job)
                # Job handled (written or confirmed empty): consume it and
                # reset the retry counter for the next job.
                del jobIds[0]
                err = 0
            else:
                print(err)
                err += 1
                print('after update ', err)
                if err > 3:
                    # Give up on this job after 4 consecutive failures.
                    del jobIds[0]
                    err = 0
            sleep(2)  # polite delay between API calls


if __name__ == "__main__":
    # Bug fix: original called jstMalayJobDetailScraper, which does not
    # exist in this file — the script died with NameError before doing work.
    jstSGJobDetailScraper("testdata_jst_sing.csv", "test_data_singdetail.csv")