# compete_jobs/jobstreet/jst_malay_detail.py
import requests
from csv import DictWriter
import pandas as pd
from time import sleep
# Header columns for the job-detail output CSV.  The order here is the
# column order DictWriter emits, so do not reorder without migrating
# existing output files.
column = [
    "isExternal", "Url", "Job Key", "Source Link",
    "Job Description", "Role Category", "Job Industry", "Job Title",
    "Formatted Location Full", "Job Functions", "Company", "Job Type",
    "Key Skills", "Minimum Experience", "Maximum Experience", "Salary Detail",
]
def jstMalayJobDetailScraper(search_file, jd_file):
url = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true"
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Content-Length': '2361',
'Content-Type': 'application/json',
'Origin': 'https://www.jobstreet.com.my',
'Referer': 'https://www.jobstreet.com.my/',
'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'cross-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
jobIds = pd.read_csv(search_file)
jobIds['id'] = jobIds['id'].astype(str)
jobIds = jobIds[['id', 'sourceCountryCode']].values.tolist()
query = "query getJobDetail($jobId: String, $locale: String, $country: String, $candidateId: ID, $solVisitorId: String, $flight: String) {\n jobDetail(\n jobId: $jobId\n locale: $locale\n country: $country\n candidateId: $candidateId\n solVisitorId: $solVisitorId\n flight: $flight\n ) {\n id\n pageUrl\n jobTitleSlug\n applyUrl {\n url\n isExternal\n }\n isExpired\n isConfidential\n isClassified\n accountNum\n advertisementId\n subAccount\n showMoreJobs\n adType\n header {\n banner {\n bannerUrls {\n large\n }\n }\n salary {\n max\n min\n type\n extraInfo\n currency\n isVisible\n }\n logoUrls {\n small\n medium\n large\n normal\n }\n jobTitle\n company {\n name\n url\n slug\n advertiserId\n }\n review {\n rating\n numberOfReviewer\n }\n expiration\n postedDate\n postedAt\n isInternship\n }\n companyDetail {\n companyWebsite\n companySnapshot {\n avgProcessTime\n registrationNo\n employmentAgencyPersonnelNumber\n employmentAgencyNumber\n telephoneNumber\n workingHours\n website\n facebook\n size\n dressCode\n nearbyLocations\n }\n companyOverview {\n html\n }\n videoUrl\n companyPhotos {\n caption\n url\n }\n }\n jobDetail {\n summary\n jobDescription {\n html\n }\n jobRequirement {\n careerLevel\n yearsOfExperience\n qualification\n fieldOfStudy\n industryValue {\n value\n label\n }\n skills\n employmentType\n languages\n postedDate\n closingDate\n jobFunctionValue {\n code\n name\n children {\n code\n name\n }\n }\n benefits\n }\n whyJoinUs\n }\n location {\n location\n locationId\n omnitureLocationId\n }\n sourceCountry\n }\n}\n"
with open(jd_file, 'a+', encoding='utf-8', newline='') as jfile:
j_writer = DictWriter(jfile, fieldnames= column, extrasaction='ignore')
while jobIds:
print(jobIds[0])
variables = {
"jobId": jobIds[0][0],
"country":jobIds[0][1], #"my",
"locale": "en",
"candidateId": "",
"solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653"
}
data = {
'query': query,
'variables': variables
}
err =0
# try:
response = requests.post(url, json=data, headers=headers, timeout=20)
print(response.status_code)
# print(response.text)
if response.status_code == 200:
print("yes 200")
result = response.json()
# print("result", result)
if jfile.tell() == 0:
j_writer.writeheader()
if result.get('data', {'jobDetail' :''}).get('jobDetail'):
print(result.get('data', {'jobDetail' :''}).get('jobDetail'))
# print(result['data']['jobDetail'])
try :
job_industry = result['data']['jobDetail']\
.get('jobDetail', {'jobRequirement':""})\
.get('jobRequirement', {"industryValue":""})\
.get('industryValue', {'label':""})\
.get('label')
except :
job_industry = ""
job = {
'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'),
'Url' : result['data']['jobDetail'].get('pageUrl'),
"Job Description" :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}})
.get('jobDescription',{'html':''}).get('html'),
'Company' : result['data']['jobDetail'].get('header', {'company':""}).get('company', {'name':""}).get('name'),
"Job Title" : result['data']['jobDetail'].get('header', {'jobTitle' : ""}).get('jobTitle'),
"Formatted Location Full": ', '.join(xy.get('location','') for xy in result['data']['jobDetail'].get('location', [])),
"Salary Detail" : result['data']['jobDetail'].get('header', {'salary', ''}).get('salary'),
"Job Key": result['data']['jobDetail'].get('id'),
"Source Link" : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'),
"Role Category":"",
"Job Industry": job_industry,
"Job Functions" : ', '.join([yu['name'] for yu in result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement')['jobFunctionValue']]),
"Job Type" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"employmentType":""}).get('employmentType'),
"Key Skills":"",
"Minimum Experience" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"yearsOfExperience":""}).get('yearsOfExperience'),
# "Salary Detail" : result['data']['jobDetail']['header']['salary']
}
j_writer.writerow(job)
del jobIds[0]
else:
print(err)
err += 1
print('after update ', err)
if err >3:
del jobIds[0]
sleep(2)
# except Exception as mdetail:
# print(mdetail)
# err += 1
# print("Exception erro ", err)
# if err >3:
# del jobIds[0]
if __name__ == "__main__":
    # Ad-hoc smoke run against local test CSVs; appends results to the
    # detail file rather than overwriting it.
    jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malaydetail.csv")