From 1ba8f09a78627d4189e9d0116461646cfd914e7e Mon Sep 17 00:00:00 2001 From: prahul11 Date: Tue, 31 Oct 2023 21:07:28 +0530 Subject: [PATCH] jhg --- jobstreet/jst_malay_search2.py | 90 ------------------------ jobstreet/jst_sg_detail.py | 124 ++++++++++++++++++++------------- jobstreet/jst_sg_search.py | 22 +++--- 3 files changed, 85 insertions(+), 151 deletions(-) delete mode 100644 jobstreet/jst_malay_search2.py diff --git a/jobstreet/jst_malay_search2.py b/jobstreet/jst_malay_search2.py deleted file mode 100644 index 3cea48e..0000000 --- a/jobstreet/jst_malay_search2.py +++ /dev/null @@ -1,90 +0,0 @@ -import requests -from math import ceil -from csv import DictWriter -from time import sleep - -def search_jst_malay(csv_file): - total_pages = 2 - current_page = 1 - url = 'https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true' - headers = { - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9', - 'Content-Length': '3408', - 'Content-Type': 'application/json', - 'Origin': 'https://www.jobstreet.com.my', - 'Referer': 'https://www.jobstreet.com.my/', - 'Sec-Ch-Ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', - 'Sec-Ch-Ua-Mobile': '?0', - 'Sec-Ch-Ua-Platform': '"Windows"', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'cross-site', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' - } - query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n" - while current_page <= total_pages: - sleep(1) - variables = {"keyword":"", - "jobFunctions":[], - "locations":[], - "salaryType":1, - "jobTypes":[], - "createdAt":None, - "careerLevels":[], - "page": current_page, - "country":"my", - "sVi":"", - "solVisitorId":"7d3f7e5c-471e-411d-8a82-d8d29a303653", - "categories":[],"workTypes":[], - "userAgent":"Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/118.0.0.0%20Safari/537.36", - "industries":[], - "locale":"en"} - data = { - 'query': query, - 'variables': variables - } - error_count = 0 - try: - response = requests.post(url, json=data, headers=headers) - if response.status_code == 200: - - # The request was successful - result = response.json() - # print(result['data']['jobs']['totalJobs']) - # print(type(result['data']['jobs']['totalJobs'])) - # print(result['data']['jobs']['totalJobs']/30) - # print(result['data']['jobs']['jobs']) - print('total pages', ceil(result['data']['jobs']['totalJobs']/30)) - total_pages = ceil(result['data']['jobs']['totalJobs']/30) - if len(result['data']['jobs']['jobs']) > 0 : - column = list(result['data']['jobs']['jobs'][0].keys()) + ['Company_Name'] - with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile: - writer = DictWriter(csvfile, fieldnames=column) - if csvfile.tell() == 0: - writer.writeheader() - for a_job in result['data']['jobs']['jobs']: - a_job['Company_Name'] = a_job['companyMeta']['name'] - writer.writerow(a_job) - - current_page += 1 - # total_pages -=1 - print(f"scraping page {current_page} of {total_pages}") - else: - - print(f"Request failed with status code {response.status_code}: {response.text}") - error_count +=1 - if error_count > 3: - current_page +=1 - # total_pages -=1 - except Exception as malayError: - print(malayError) - error_count+=1 - if error_count > 3: - current_page +=1 - # total_pages -=1 - - -if __name__ == "__main__": - search_jst_malay("testdata_jst_malay2.csv") \ No newline at end of file diff --git a/jobstreet/jst_sg_detail.py b/jobstreet/jst_sg_detail.py index da77550..99bf2dc 100644 --- a/jobstreet/jst_sg_detail.py +++ b/jobstreet/jst_sg_detail.py @@ -5,25 +5,24 @@ from time import sleep # class JSTMalayJobDetailScraper: # id column = [ - "id", - "pageUrl", - "company", - "jobTitle", - "jobDetail", - "location", - "applyurl", - "isExternal", - "isExpired", - "isConfidential", - "isClassified", - "accountNum", - "advertisementId", - "subAccount", - "adType", - "salary", - "company", - "sourceCountry" -] + "isExternal", + "Url", + "Job Key", + "Source Link", + "Job Description", + "Role Category", + "Job Industry", + "Job Title", + "Formatted Location Full", + "Job Functions", + "Company", + "Job Type", + "Key Skills", + "Minimum Experience", + "Maximum Experience", + "Salary Detail" + ] + def jstMalayJobDetailScraper(search_file, jd_file): url = "https://xapi.supercharge-srp.co/job-search/graphql?country=sg&isSmartSearch=true" headers = { @@ -62,40 +61,65 @@ def jstMalayJobDetailScraper(search_file, jd_file): 'variables': variables } err =0 - try: - response = requests.post(url, json=data, headers=headers, timeout=20) - print(response.status_code) - # print(response.text) + # try: + response = requests.post(url, json=data, headers=headers, timeout=20) + print(response.status_code) + # print(response.text) - if response.status_code == 200: - result = response.json() - # print("result", result) - if jfile.tell() == 0: - j_writer.writeheader() - if result['data']['jobDetail']: - job = { - 'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'), - 'applyurl' : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'), - 'jobDetail' :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}}).get('jobDescription',{'html':''}).get('html'), - 'company' : result['data']['jobDetail']['header']['company']['name'], - "jobTitle" : result['data']['jobDetail']['header']['jobTitle'], - "location": ', '.join(xy['location'] for xy in result['data']['jobDetail']['location']), - 'salary' : result['data']['jobDetail']['header']['salary'] - } - job2 = {**result['data']['jobDetail'] , **job} - # print(job2) - j_writer.writerow(job2) - del jobIds[0] + if response.status_code == 200: + print("yes 200") + result = response.json() + # print("result", result) + if jfile.tell() == 0: + j_writer.writeheader() + if result.get('data', {'jobDetail' :''}).get('jobDetail'): + print(result.get('data', {'jobDetail' :''}).get('jobDetail')) + # print(result['data']['jobDetail']) + try : + job_industry = result['data']['jobDetail']\ + .get('jobDetail', {'jobRequirement':""})\ + .get('jobRequirement', {"industryValue":""})\ + .get('industryValue', {'label':""})\ + .get('label') + except : + job_industry = "" - else: - err += 1 - if err >3: - continue - sleep(2) - except Exception as mdetail: - pass + job = { + 'isExternal': result['data']['jobDetail'].get('applyUrl',{'isExternal':''}).get('isExternal'), + 'Url' : result['data']['jobDetail'].get('pageUrl'), + "Job Description" :result['data']['jobDetail'].get('jobDetail', {'jobDescription':{'html'}}) + .get('jobDescription',{'html':''}).get('html'), + 'Company' : result['data']['jobDetail'].get('header', {'company':""}).get('company', {'name':""}).get('name'), + "Job Title" : result['data']['jobDetail'].get('header', {'jobTitle' : ""}).get('jobTitle'), + "Formatted Location Full": ', '.join(xy.get('location','') for xy in result['data']['jobDetail'].get('location', [])), + "Salary Detail" : result['data']['jobDetail'].get('header', {'salary', ''}).get('salary'), + "Job Key": result['data']['jobDetail'].get('id'), + "Source Link" : result['data']['jobDetail'].get('applyUrl',{'url':''}).get('url'), + "Role Category":"", + "Job Industry": job_industry, + "Job Functions" : ', '.join([yu['name'] for yu in result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement')['jobFunctionValue']]), + "Job Type" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"employmentType":""}).get('employmentType'), + "Key Skills":"", + "Minimum Experience" : result['data']['jobDetail'].get('jobDetail', {'jobRequirement':""}).get('jobRequirement', {"yearsOfExperience":""}).get('yearsOfExperience'), + # "Salary Detail" : result['data']['jobDetail']['header']['salary'] + } + j_writer.writerow(job) + del jobIds[0] + else: + print(err) + err += 1 + print('after update ', err) + if err >3: + del jobIds[0] + sleep(2) + # except Exception as mdetail: + # print(mdetail) + # err += 1 + # print("Exception erro ", err) + # if err >3: + # del jobIds[0] if __name__ == "__main__": - jstMalayJobDetailScraper("testdata_jst_sg.csv", "test_data_sg.csv") + jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malaydetail.csv") diff --git a/jobstreet/jst_sg_search.py b/jobstreet/jst_sg_search.py index 84b1c4d..ebf6385 100644 --- a/jobstreet/jst_sg_search.py +++ b/jobstreet/jst_sg_search.py @@ -24,7 +24,7 @@ def search_jst_malay(csv_file): 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' } query = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n" - while total_pages > 0: + while current_page <= total_pages: sleep(1) variables = {"keyword":"", "jobFunctions":[], @@ -47,10 +47,8 @@ def search_jst_malay(csv_file): } error_count = 0 try: - response = requests.post(url, json=data, headers=headers) - if response.status_code == 200: - - # The request was successful + response = requests.post(url, json=data, headers=headers, timeout=20) + if response.status_code == 200: result = response.json() # print(result['data']['jobs']['totalJobs']) # print(type(result['data']['jobs']['totalJobs'])) @@ -59,17 +57,19 @@ def search_jst_malay(csv_file): print('total pages', ceil(result['data']['jobs']['totalJobs']/30)) total_pages = ceil(result['data']['jobs']['totalJobs']/30) if len(result['data']['jobs']['jobs']) > 0 : - column = list(result['data']['jobs']['jobs'][0].keys()) + ['Company_Name'] + column = ['Company_Name'] + list(result['data']['jobs']['jobs'][0].keys())[:6] + print("writing to file") with open(csv_file, 'a+', newline='', encoding='utf-8') as csvfile: - writer = DictWriter(csvfile, fieldnames=column) + writer = DictWriter(csvfile, fieldnames=column, extrasaction='ignore') if csvfile.tell() == 0: writer.writeheader() + print("writing each array") for a_job in result['data']['jobs']['jobs']: a_job['Company_Name'] = a_job['companyMeta']['name'] writer.writerow(a_job) current_page += 1 - total_pages -=1 + # total_pages -=1 print(f"scraping page {current_page} of {total_pages}") else: @@ -77,14 +77,14 @@ def search_jst_malay(csv_file): error_count +=1 if error_count > 3: current_page +=1 - total_pages -=1 + # total_pages -=1 except Exception as malayError: print(malayError) error_count+=1 if error_count > 3: current_page +=1 - total_pages -=1 + # total_pages -=1 if __name__ == "__main__": - search_jst_malay("testdata_jst_sg.csv") \ No newline at end of file + search_jst_malay("testdata_jst_malay2.csv") \ No newline at end of file