prahul11 2023-10-31 21:07:28 +05:30
parent 0756a8671f
commit 1ba8f09a78
3 changed files with 85 additions and 151 deletions

View File

@@ -1,90 +0,0 @@
import requests
from math import ceil
from csv import DictWriter
from time import sleep
def search_jst_malay(csv_file):
    """Scrape JobStreet Malaysia search results and append them to a CSV file.

    Pages through the GraphQL search endpoint (30 jobs per page). The true
    page count is discovered from the first successful response's
    ``totalJobs`` field; until then a placeholder of 2 keeps the loop alive.
    One CSV row is written per job, with the nested company name flattened
    into an extra ``Company_Name`` column. The header row is written only
    when the file is empty, so repeated runs append cleanly.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to append results to.
    """
    total_pages = 2   # placeholder until the API reports the real count
    current_page = 1
    url = 'https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true'
    # NOTE: no hard-coded 'Content-Length' here -- the original sent a fixed
    # '3408', which is wrong for variable-size JSON bodies; requests computes
    # the correct value for every request on its own.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Content-Type': 'application/json',
        'Origin': 'https://www.jobstreet.com.my',
        'Referer': 'https://www.jobstreet.com.my/',
        'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }
    # GraphQL query reproduced verbatim -- the endpoint expects this exact text.
    query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n"
    # BUGFIX: the failure counter used to be re-zeroed at the top of every
    # loop iteration, so the `error_count > 3` skip below could never fire
    # and a permanently failing page retried forever. It now lives outside
    # the loop and is reset only after a successful page.
    error_count = 0
    while current_page <= total_pages:
        sleep(1)  # be polite to the API between page requests
        variables = {"keyword": "",
                     "jobFunctions": [],
                     "locations": [],
                     "salaryType": 1,
                     "jobTypes": [],
                     "createdAt": None,
                     "careerLevels": [],
                     "page": current_page,
                     "country": "my",
                     "sVi": "",
                     "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653",
                     "categories": [], "workTypes": [],
                     "userAgent": "Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/118.0.0.0%20Safari/537.36",
                     "industries": [],
                     "locale": "en"}
        data = {
            'query': query,
            'variables': variables
        }
        try:
            # timeout stops a dead connection from hanging the scraper forever
            response = requests.post(url, json=data, headers=headers, timeout=20)
            if response.status_code == 200:
                result = response.json()
                jobs_payload = result['data']['jobs']
                # 30 results per page, so true page count = ceil(totalJobs / 30)
                total_pages = ceil(jobs_payload['totalJobs'] / 30)
                print('total pages', total_pages)
                if len(jobs_payload['jobs']) > 0:
                    column = list(jobs_payload['jobs'][0].keys()) + ['Company_Name']
                    with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile:
                        writer = DictWriter(csvfile, fieldnames=column)
                        if csvfile.tell() == 0:  # empty file -> write header once
                            writer.writeheader()
                        for a_job in jobs_payload['jobs']:
                            # flatten nested company name into its own column
                            a_job['Company_Name'] = a_job['companyMeta']['name']
                            writer.writerow(a_job)
                current_page += 1
                error_count = 0  # page succeeded -> reset failure counter
                print(f"scraping page {current_page} of {total_pages}")
            else:
                print(f"Request failed with status code {response.status_code}: {response.text}")
                error_count += 1
                if error_count > 3:
                    # give up on this page after 4 consecutive failures
                    current_page += 1
                    error_count = 0
        except Exception as malayError:
            # boundary handler: log and keep paging rather than abort the run
            print(malayError)
            error_count += 1
            if error_count > 3:
                current_page += 1
                error_count = 0
# Manual test entry point: scrape search results into a throwaway CSV file.
if __name__ == "__main__":
    search_jst_malay("testdata_jst_malay2.csv")

View File

@@ -5,25 +5,24 @@ from time import sleep
# class JSTMalayJobDetailScraper: # class JSTMalayJobDetailScraper:
# id # id
column = [ column = [
"id", "isExternal",
"pageUrl", "Url",
"company", "Job Key",
"jobTitle", "Source Link",
"jobDetail", "Job Description",
"location", "Role Category",
"applyurl", "Job Industry",
"isExternal", "Job Title",
"isExpired", "Formatted Location Full",
"isConfidential", "Job Functions",
"isClassified", "Company",
"accountNum", "Job Type",
"advertisementId", "Key Skills",
"subAccount", "Minimum Experience",
"adType", "Maximum Experience",
"salary", "Salary Detail"
"company", ]
"sourceCountry"
]
def jstMalayJobDetailScraper(search_file, jd_file): def jstMalayJobDetailScraper(search_file, jd_file):
url = "https://xapi.supercharge-srp.co/job-search/graphql?country=sg&isSmartSearch=true" url = "https://xapi.supercharge-srp.co/job-search/graphql?country=sg&isSmartSearch=true"
headers = { headers = {
@@ -62,40 +61,65 @@ def jstMalayJobDetailScraper(search_file, jd_file):
'variables': variables 'variables': variables
} }
err =0 err =0
try: # try:
response = requests.post(url, json=data, headers=headers, timeout=20) response = requests.post(url, json=data, headers=headers, timeout=20)
print(response.status_code) print(response.status_code)
# print(response.text) # print(response.text)
if response.status_code == 200: if response.status_code == 200:
result = response.json() print("yes 200")
# print("result", result) result = response.json()
if jfile.tell() == 0: # print("result", result)
j_writer.writeheader() if jfile.tell() == 0:
if result['data']['jobDetail']: j_writer.writeheader()
job = { if result.get('data', {'jobDetail' :''}).get('jobDetail'):
'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'), print(result.get('data', {'jobDetail' :''}).get('jobDetail'))
'applyurl' : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'), # print(result['data']['jobDetail'])
'jobDetail' :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}}).get('jobDescription',{'html':''}).get('html'), try :
'company' : result['data']['jobDetail']['header']['company']['name'], job_industry = result['data']['jobDetail']\
"jobTitle" : result['data']['jobDetail']['header']['jobTitle'], .get('jobDetail', {'jobRequirement':""})\
"location": ', '.join(xy['location'] for xy in result['data']['jobDetail']['location']), .get('jobRequirement', {"industryValue":""})\
'salary' : result['data']['jobDetail']['header']['salary'] .get('industryValue', {'label':""})\
} .get('label')
job2 = {**result['data']['jobDetail'] , **job} except :
# print(job2) job_industry = ""
j_writer.writerow(job2)
del jobIds[0]
else: job = {
err += 1 'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'),
if err >3: 'Url' : result['data']['jobDetail'].get('pageUrl'),
continue "Job Description" :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}})
sleep(2) .get('jobDescription',{'html':''}).get('html'),
except Exception as mdetail: 'Company' : result['data']['jobDetail'].get('header', {'company':""}).get('company', {'name':""}).get('name'),
pass "Job Title" : result['data']['jobDetail'].get('header', {'jobTitle' : ""}).get('jobTitle'),
"Formatted Location Full": ', '.join(xy.get('location','') for xy in result['data']['jobDetail'].get('location', [])),
"Salary Detail" : result['data']['jobDetail'].get('header', {'salary', ''}).get('salary'),
"Job Key": result['data']['jobDetail'].get('id'),
"Source Link" : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'),
"Role Category":"",
"Job Industry": job_industry,
"Job Functions" : ', '.join([yu['name'] for yu in result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement')['jobFunctionValue']]),
"Job Type" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"employmentType":""}).get('employmentType'),
"Key Skills":"",
"Minimum Experience" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"yearsOfExperience":""}).get('yearsOfExperience'),
# "Salary Detail" : result['data']['jobDetail']['header']['salary']
}
j_writer.writerow(job)
del jobIds[0]
else:
print(err)
err += 1
print('after update ', err)
if err >3:
del jobIds[0]
sleep(2)
# except Exception as mdetail:
# print(mdetail)
# err += 1
# print("Exception erro ", err)
# if err >3:
# del jobIds[0]
if __name__ == "__main__": if __name__ == "__main__":
jstMalayJobDetailScraper("testdata_jst_sg.csv", "test_data_sg.csv") jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malaydetail.csv")

View File

@@ -24,7 +24,7 @@ def search_jst_malay(csv_file):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
} }
query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n 
id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n" query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n 
isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n"
while total_pages > 0: while current_page <= total_pages:
sleep(1) sleep(1)
variables = {"keyword":"", variables = {"keyword":"",
"jobFunctions":[], "jobFunctions":[],
@@ -47,10 +47,8 @@ def search_jst_malay(csv_file):
} }
error_count = 0 error_count = 0
try: try:
response = requests.post(url, json=data, headers=headers) response = requests.post(url, json=data, headers=headers, timeout=20)
if response.status_code == 200: if response.status_code == 200:
# The request was successful
result = response.json() result = response.json()
# print(result['data']['jobs']['totalJobs']) # print(result['data']['jobs']['totalJobs'])
# print(type(result['data']['jobs']['totalJobs'])) # print(type(result['data']['jobs']['totalJobs']))
@@ -59,17 +57,19 @@ def search_jst_malay(csv_file):
print('total pages', ceil(result['data']['jobs']['totalJobs']/30)) print('total pages', ceil(result['data']['jobs']['totalJobs']/30))
total_pages = ceil(result['data']['jobs']['totalJobs']/30) total_pages = ceil(result['data']['jobs']['totalJobs']/30)
if len(result['data']['jobs']['jobs']) > 0 : if len(result['data']['jobs']['jobs']) > 0 :
column = list(result['data']['jobs']['jobs'][0].keys()) + ['Company_Name'] column = ['Company_Name'] + list(result['data']['jobs']['jobs'][0].keys())[:6]
print("writing to file")
with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile: with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile:
writer = DictWriter(csvfile, fieldnames=column) writer = DictWriter(csvfile, fieldnames=column, extrasaction='ignore')
if csvfile.tell() == 0: if csvfile.tell() == 0:
writer.writeheader() writer.writeheader()
print("writing each array")
for a_job in result['data']['jobs']['jobs']: for a_job in result['data']['jobs']['jobs']:
a_job['Company_Name'] = a_job['companyMeta']['name'] a_job['Company_Name'] = a_job['companyMeta']['name']
writer.writerow(a_job) writer.writerow(a_job)
current_page += 1 current_page += 1
total_pages -=1 # total_pages -=1
print(f"scraping page {current_page} of {total_pages}") print(f"scraping page {current_page} of {total_pages}")
else: else:
@@ -77,14 +77,14 @@ def search_jst_malay(csv_file):
error_count +=1 error_count +=1
if error_count > 3: if error_count > 3:
current_page +=1 current_page +=1
total_pages -=1 # total_pages -=1
except Exception as malayError: except Exception as malayError:
print(malayError) print(malayError)
error_count+=1 error_count+=1
if error_count > 3: if error_count > 3:
current_page +=1 current_page +=1
total_pages -=1 # total_pages -=1
if __name__ == "__main__": if __name__ == "__main__":
search_jst_malay("testdata_jst_sg.csv") search_jst_malay("testdata_jst_malay2.csv")