prahul11 2023-10-31 17:49:57 +05:30
parent a64bb2f1b0
commit 0756a8671f
2 changed files with 85 additions and 61 deletions

View File

@ -5,25 +5,24 @@ from time import sleep
# class JSTMalayJobDetailScraper: # class JSTMalayJobDetailScraper:
# id # id
column = [ column = [
"id",
"pageUrl",
"company",
"jobTitle",
"jobDetail",
"location",
"applyurl",
"isExternal", "isExternal",
"isExpired", "Url",
"isConfidential", "Job Key",
"isClassified", "Source Link",
"accountNum", "Job Description",
"advertisementId", "Role Category",
"subAccount", "Job Industry",
"adType", "Job Title",
"salary", "Formatted Location Full",
"company", "Job Functions",
"sourceCountry" "Company",
"Job Type",
"Key Skills",
"Minimum Experience",
"Maximum Experience",
"Salary Detail"
] ]
def jstMalayJobDetailScraper(search_file, jd_file): def jstMalayJobDetailScraper(search_file, jd_file):
url = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true" url = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true"
headers = { headers = {
@ -62,40 +61,65 @@ def jstMalayJobDetailScraper(search_file, jd_file):
'variables': variables 'variables': variables
} }
err =0 err =0
try: # try:
response = requests.post(url, json=data, headers=headers, timeout=20) response = requests.post(url, json=data, headers=headers, timeout=20)
print(response.status_code) print(response.status_code)
# print(response.text) # print(response.text)
if response.status_code == 200: if response.status_code == 200:
print("yes 200")
result = response.json() result = response.json()
# print("result", result) # print("result", result)
if jfile.tell() == 0: if jfile.tell() == 0:
j_writer.writeheader() j_writer.writeheader()
if result['data']['jobDetail']: if result.get('data', {'jobDetail' :''}).get('jobDetail'):
print(result.get('data', {'jobDetail' :''}).get('jobDetail'))
# print(result['data']['jobDetail'])
try :
job_industry = result['data']['jobDetail']\
.get('jobDetail', {'jobRequirement':""})\
.get('jobRequirement', {"industryValue":""})\
.get('industryValue', {'label':""})\
.get('label')
except :
job_industry = ""
job = { job = {
'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'), 'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'),
'applyurl' : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'), 'Url' : result['data']['jobDetail'].get('pageUrl'),
'jobDetail' :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}}).get('jobDescription',{'html':''}).get('html'), "Job Description" :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}})
'company' : result['data']['jobDetail']['header']['company']['name'], .get('jobDescription',{'html':''}).get('html'),
"jobTitle" : result['data']['jobDetail']['header']['jobTitle'], 'Company' : result['data']['jobDetail'].get('header', {'company':""}).get('company', {'name':""}).get('name'),
"location": ', '.join(xy['location'] for xy in result['data']['jobDetail']['location']), "Job Title" : result['data']['jobDetail'].get('header', {'jobTitle' : ""}).get('jobTitle'),
'salary' : result['data']['jobDetail']['header']['salary'] "Formatted Location Full": ', '.join(xy.get('location','') for xy in result['data']['jobDetail'].get('location', [])),
"Salary Detail" : result['data']['jobDetail'].get('header', {'salary', ''}).get('salary'),
"Job Key": result['data']['jobDetail'].get('id'),
"Source Link" : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'),
"Role Category":"",
"Job Industry": job_industry,
"Job Functions" : ', '.join([yu['name'] for yu in result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement')['jobFunctionValue']]),
"Job Type" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"employmentType":""}).get('employmentType'),
"Key Skills":"",
"Minimum Experience" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"yearsOfExperience":""}).get('yearsOfExperience'),
# "Salary Detail" : result['data']['jobDetail']['header']['salary']
} }
job2 = {**result['data']['jobDetail'] , **job} j_writer.writerow(job)
# print(job2)
j_writer.writerow(job2)
del jobIds[0] del jobIds[0]
else: else:
print(err)
err += 1 err += 1
print('after update ', err)
if err >3: if err >3:
continue del jobIds[0]
sleep(2) sleep(2)
except Exception as mdetail: # except Exception as mdetail:
pass # print(mdetail)
# err += 1
# print("Exception erro ", err)
# if err >3:
# del jobIds[0]
if __name__ == "__main__": if __name__ == "__main__":
jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malay.csv") jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malaydetail.csv")

View File

@ -24,7 +24,7 @@ def search_jst_malay(csv_file):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
} }
query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n" query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n"
while total_pages > 0: while current_page <= total_pages:
sleep(1) sleep(1)
variables = {"keyword":"", variables = {"keyword":"",
"jobFunctions":[], "jobFunctions":[],
@ -47,10 +47,8 @@ def search_jst_malay(csv_file):
} }
error_count = 0 error_count = 0
try: try:
response = requests.post(url, json=data, headers=headers) response = requests.post(url, json=data, headers=headers, timeout=20)
if response.status_code == 200: if response.status_code == 200:
# The request was successful
result = response.json() result = response.json()
# print(result['data']['jobs']['totalJobs']) # print(result['data']['jobs']['totalJobs'])
# print(type(result['data']['jobs']['totalJobs'])) # print(type(result['data']['jobs']['totalJobs']))
@ -59,17 +57,19 @@ def search_jst_malay(csv_file):
print('total pages', ceil(result['data']['jobs']['totalJobs']/30)) print('total pages', ceil(result['data']['jobs']['totalJobs']/30))
total_pages = ceil(result['data']['jobs']['totalJobs']/30) total_pages = ceil(result['data']['jobs']['totalJobs']/30)
if len(result['data']['jobs']['jobs']) > 0 : if len(result['data']['jobs']['jobs']) > 0 :
column = list(result['data']['jobs']['jobs'][0].keys()) + ['Company_Name'] column = ['Company_Name'] + list(result['data']['jobs']['jobs'][0].keys())[:6]
print("writing to file")
with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile: with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile:
writer = DictWriter(csvfile, fieldnames=column) writer = DictWriter(csvfile, fieldnames=column, extrasaction='ignore')
if csvfile.tell() == 0: if csvfile.tell() == 0:
writer.writeheader() writer.writeheader()
print("writing each array")
for a_job in result['data']['jobs']['jobs']: for a_job in result['data']['jobs']['jobs']:
a_job['Company_Name'] = a_job['companyMeta']['name'] a_job['Company_Name'] = a_job['companyMeta']['name']
writer.writerow(a_job) writer.writerow(a_job)
current_page += 1 current_page += 1
total_pages -=1 # total_pages -=1
print(f"scraping page {current_page} of {total_pages}") print(f"scraping page {current_page} of {total_pages}")
else: else:
@ -77,14 +77,14 @@ def search_jst_malay(csv_file):
error_count +=1 error_count +=1
if error_count > 3: if error_count > 3:
current_page +=1 current_page +=1
total_pages -=1 # total_pages -=1
except Exception as malayError: except Exception as malayError:
print(malayError) print(malayError)
error_count+=1 error_count+=1
if error_count > 3: if error_count > 3:
current_page +=1 current_page +=1
total_pages -=1 # total_pages -=1
if __name__ == "__main__": if __name__ == "__main__":
search_jst_malay("testdata_jst_malay.csv") search_jst_malay("testdata_jst_malay2.csv")