jty
parent
db4ebf12e7
commit
4a242874b8
|
@ -0,0 +1,125 @@
|
||||||
|
import requests
|
||||||
|
from csv import DictWriter
|
||||||
|
import pandas as pd
|
||||||
|
from time import sleep
|
||||||
|
# class JSTIdJobDetailScraper:
|
||||||
|
# id
|
||||||
|
# Field names (and order) for the job-detail CSV written by
# jstIdJobDetailScraper; its DictWriter uses extrasaction='ignore', so any
# key not listed here is silently dropped from the output row.
column = [
    "isExternal", "Url", "Job Key", "Source Link",
    "Job Description", "Role Category", "Job Industry", "Job Title",
    "Formatted Location Full", "Job Functions", "Company", "Job Type",
    "Key Skills", "Minimum Experience", "Maximum Experience", "Salary Detail",
]
|
||||||
|
|
||||||
|
def jstIdJobDetailScraper(search_file, jd_file):
    """Scrape full job details from the JobStreet Indonesia GraphQL API.

    Reads job ids from *search_file* (a CSV with 'id' and 'sourceCountryCode'
    columns, as produced by the search scraper) and appends one CSV row per
    job to *jd_file*. A header row is written only when the output file is
    empty, so repeated runs keep appending to the same file.

    Args:
        search_file: path to the CSV of job ids to fetch details for.
        jd_file: path of the CSV file to append job-detail rows to.
    """

    def dig(mapping, *keys):
        # Walk nested dicts defensively, returning None as soon as any level
        # is missing or is not a dict (the API returns null sub-objects for
        # some jobs). This replaces the original's chained .get() calls,
        # several of which passed *sets* as defaults ({'salary', ''}) and
        # crashed with AttributeError whenever the parent key was absent.
        current = mapping
        for key in keys:
            if not isinstance(current, dict):
                return None
            current = current.get(key)
        return current

    url = "https://xapi.supercharge-srp.co/job-search/graphql?country=id&isSmartSearch=true"
    # 'Content-Length' is deliberately not set here: requests computes it
    # from the JSON body when preparing the request, so the original
    # hard-coded value ('2361') was always overwritten anyway.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/json',
        'Origin': 'https://www.jobstreet.com.id',
        'Referer': 'https://www.jobstreet.com.id/',
        'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    jobIds = pd.read_csv(search_file)
    # Ids must be strings for the GraphQL variable; read_csv infers ints.
    jobIds['id'] = jobIds['id'].astype(str)
    jobIds = jobIds[['id', 'sourceCountryCode']].values.tolist()

    query = "query getJobDetail($jobId: String, $locale: String, $country: String, $candidateId: ID, $solVisitorId: String, $flight: String) {\n jobDetail(\n jobId: $jobId\n locale: $locale\n country: $country\n candidateId: $candidateId\n solVisitorId: $solVisitorId\n flight: $flight\n ) {\n id\n pageUrl\n jobTitleSlug\n applyUrl {\n url\n isExternal\n }\n isExpired\n isConfidential\n isClassified\n accountNum\n advertisementId\n subAccount\n showMoreJobs\n adType\n header {\n banner {\n bannerUrls {\n large\n }\n }\n salary {\n max\n min\n type\n extraInfo\n currency\n isVisible\n }\n logoUrls {\n small\n medium\n large\n normal\n }\n jobTitle\n company {\n name\n url\n slug\n advertiserId\n }\n review {\n rating\n numberOfReviewer\n }\n expiration\n postedDate\n postedAt\n isInternship\n }\n companyDetail {\n companyWebsite\n companySnapshot {\n avgProcessTime\n registrationNo\n employmentAgencyPersonnelNumber\n employmentAgencyNumber\n telephoneNumber\n workingHours\n website\n facebook\n size\n dressCode\n nearbyLocations\n }\n companyOverview {\n html\n }\n videoUrl\n companyPhotos {\n caption\n url\n }\n }\n jobDetail {\n summary\n jobDescription {\n html\n }\n jobRequirement {\n careerLevel\n yearsOfExperience\n qualification\n fieldOfStudy\n industryValue {\n value\n label\n }\n skills\n employmentType\n languages\n postedDate\n closingDate\n jobFunctionValue {\n code\n name\n children {\n code\n name\n }\n }\n benefits\n }\n whyJoinUs\n }\n location {\n location\n locationId\n omnitureLocationId\n }\n sourceCountry\n }\n}\n"

    with open(jd_file, 'a+', encoding='utf-8', newline='') as jfile:
        j_writer = DictWriter(jfile, fieldnames=column, extrasaction='ignore')
        # BUG FIX: the original reset err to 0 at the top of every loop
        # iteration, so the "give up after 3 failures" guard could never
        # fire and one persistently failing id stalled the scraper forever.
        err = 0
        while jobIds:
            print(jobIds[0])
            variables = {
                "jobId": jobIds[0][0],
                "country": jobIds[0][1],  # e.g. "id"
                "locale": "en",
                "candidateId": "",
                "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653"
            }
            data = {'query': query, 'variables': variables}

            response = requests.post(url, json=data, headers=headers, timeout=20)
            print(response.status_code)

            if response.status_code == 200:
                result = response.json()
                if jfile.tell() == 0:
                    j_writer.writeheader()

                detail = dig(result, 'data', 'jobDetail')
                if detail:
                    locations = detail.get('location') or []
                    functions = dig(detail, 'jobDetail', 'jobRequirement',
                                    'jobFunctionValue') or []
                    job = {
                        'isExternal': dig(detail, 'applyUrl', 'isExternal'),
                        'Url': detail.get('pageUrl'),
                        "Job Description": dig(detail, 'jobDetail',
                                               'jobDescription', 'html'),
                        'Company': dig(detail, 'header', 'company', 'name'),
                        "Job Title": dig(detail, 'header', 'jobTitle'),
                        "Formatted Location Full": ', '.join(
                            xy.get('location', '') for xy in locations),
                        "Salary Detail": dig(detail, 'header', 'salary'),
                        "Job Key": detail.get('id'),
                        "Source Link": dig(detail, 'applyUrl', 'url'),
                        "Role Category": "",
                        "Job Industry": dig(detail, 'jobDetail',
                                            'jobRequirement', 'industryValue',
                                            'label') or "",
                        "Job Functions": ', '.join(
                            yu.get('name', '') for yu in functions),
                        "Job Type": dig(detail, 'jobDetail', 'jobRequirement',
                                        'employmentType'),
                        "Key Skills": "",
                        "Minimum Experience": dig(detail, 'jobDetail',
                                                  'jobRequirement',
                                                  'yearsOfExperience'),
                    }
                    j_writer.writerow(job)
                # BUG FIX: a 200 response whose jobDetail was empty (e.g. an
                # expired posting) previously never removed the id from the
                # queue, looping on it forever. Any 200 now consumes the id.
                del jobIds[0]
                err = 0  # success: reset the consecutive-failure counter
            else:
                err += 1
                print('request failed, consecutive errors:', err)
                if err > 3:
                    # Give up on this id after 4 consecutive failures.
                    del jobIds[0]
                    err = 0
                sleep(2)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Read the ids produced by the search scraper and append the
    # corresponding detail rows to the companion CSV.
    search_csv = "testdata_jst_id.csv"
    detail_csv = "testdata_jst_id_detail.csv"
    jstIdJobDetailScraper(search_csv, detail_csv)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
import requests
|
||||||
|
from math import ceil
|
||||||
|
from csv import DictWriter
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
def search_jst_id(csv_file):
    """Page through the JobStreet Indonesia GraphQL search API and append the
    basic fields of every job listing (plus the company name) to *csv_file*.

    The real page count is taken from the first successful response
    (30 results per page); until then a placeholder of 2 keeps the loop
    alive. A header row is written only when the file is empty, so repeated
    runs keep appending.

    Args:
        csv_file: path of the CSV file to append search results to.
    """
    total_pages = 2  # placeholder until the API reports the real total
    current_page = 1
    url = 'https://xapi.supercharge-srp.co/job-search/graphql?country=id&isSmartSearch=true'
    # 'Content-Length' is deliberately not set here: requests computes it
    # from the JSON body, so the original hard-coded '3408' was always
    # overwritten anyway.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/json',
        'Origin': 'https://www.jobstreet.com.id',
        'Referer': 'https://www.jobstreet.com.id/',
        'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }

    query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n"

    # BUG FIX: the original reset error_count to 0 at the top of every loop
    # iteration, so the "skip page after 3 errors" guard could never fire
    # and a persistently failing page stalled the scraper forever.
    error_count = 0
    while current_page <= total_pages:
        sleep(1)  # be polite to the API between pages
        variables = {
            "keyword": "",
            "jobFunctions": [],
            "locations": [],
            "salaryType": 1,
            "jobTypes": [],
            "createdAt": None,
            "careerLevels": [],
            "page": current_page,
            "country": "id",
            "sVi": "",
            "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653",
            "categories": [],
            "workTypes": [],
            "userAgent": "Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/118.0.0.0%20Safari/537.36",
            "industries": [],
            "locale": "en"
        }
        data = {
            'query': query,
            'variables': variables
        }
        try:
            response = requests.post(url, json=data, headers=headers, timeout=20)
            if response.status_code == 200:
                result = response.json()
                # 30 results per page; refresh the page count every response
                # in case the total changes mid-scrape.
                total_pages = ceil(result['data']['jobs']['totalJobs'] / 30)
                print('total pages', total_pages)
                jobs = result['data']['jobs']['jobs']
                if len(jobs) > 0:
                    # Surface the nested company name as its own column in
                    # front of the first six raw job fields.
                    column = ['Company_Name'] + list(jobs[0].keys())[:6]
                    with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile:
                        writer = DictWriter(csvfile, fieldnames=column, extrasaction='ignore')
                        if csvfile.tell() == 0:
                            writer.writeheader()
                        for a_job in jobs:
                            a_job['Company_Name'] = a_job['companyMeta']['name']
                            writer.writerow(a_job)

                current_page += 1
                error_count = 0  # success: reset the consecutive-error counter
                print(f"scraping page {current_page} of {total_pages}")
            else:
                print(f"Request failed with status code {response.status_code}: {response.text}")
                error_count += 1
                if error_count > 3:
                    # Skip the page after 4 consecutive failures.
                    current_page += 1
                    error_count = 0
        except Exception as request_error:
            # Network errors and malformed JSON are counted like HTTP
            # failures so a dead connection cannot loop forever either.
            print(request_error)
            error_count += 1
            if error_count > 3:
                current_page += 1
                error_count = 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
search_jst_id("testdata_jst_id.csv")
|
Loading…
Reference in New Issue