diff --git a/jobstreet/jst_malay_detail.py b/jobstreet/jst_malay_detail.py
new file mode 100644
index 0000000..09b8ece
--- /dev/null
+++ b/jobstreet/jst_malay_detail.py
@@ -0,0 +1,145 @@
+"""Fetch JobStreet Malaysia job details for the ids found by the search step."""
+
+from csv import DictWriter
+from time import sleep
+
+import pandas as pd
+import requests
+
+# Output CSV columns, in order. Payload keys that are not listed here are
+# dropped by the writer (extrasaction="ignore").
+column = [
+    "id",
+    "pageUrl",
+    "company",
+    "jobTitle",
+    "jobDetail",
+    "location",
+    "applyurl",
+    "isExternal",
+    "isExpired",
+    "isConfidential",
+    "isClassified",
+    "accountNum",
+    "advertisementId",
+    "subAccount",
+    "adType",
+    "sourceCountry",
+]
+
+API_URL = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true"
+DEFAULT_COUNTRY = "my"
+REQUEST_TIMEOUT = 20  # seconds
+MAX_ATTEMPTS = 4  # tries per job before it is skipped
+RETRY_DELAY = 2  # seconds between tries
+
+# Browser-like request headers. Content-Length is left out on purpose:
+# requests derives it from the JSON body.
+# NOTE(review): "br" is advertised; confirm the runtime can decode Brotli.
+HEADERS = {
+    "Accept": "*/*",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Content-Type": "application/json",
+    "Origin": "https://www.jobstreet.com.my",
+    "Referer": "https://www.jobstreet.com.my/",
+    "Sec-Ch-Ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
+    "Sec-Ch-Ua-Mobile": "?0",
+    "Sec-Ch-Ua-Platform": '"Windows"',
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "cross-site",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+}
+
+JOB_DETAIL_QUERY = "query getJobDetail($jobId: String, $locale: String, $country: String, $candidateId: ID, $solVisitorId: String, $flight: String) {\n jobDetail(\n jobId: $jobId\n locale: $locale\n country: $country\n candidateId: $candidateId\n solVisitorId: $solVisitorId\n flight: $flight\n ) {\n id\n pageUrl\n jobTitleSlug\n applyUrl {\n url\n isExternal\n }\n isExpired\n isConfidential\n isClassified\n accountNum\n advertisementId\n subAccount\n showMoreJobs\n adType\n header {\n banner {\n bannerUrls {\n large\n }\n }\n salary {\n max\n min\n type\n extraInfo\n currency\n isVisible\n }\n logoUrls {\n small\n medium\n large\n normal\n }\n jobTitle\n company {\n name\n url\n slug\n advertiserId\n }\n review {\n rating\n numberOfReviewer\n }\n expiration\n postedDate\n postedAt\n isInternship\n }\n companyDetail {\n companyWebsite\n companySnapshot {\n avgProcessTime\n registrationNo\n employmentAgencyPersonnelNumber\n employmentAgencyNumber\n telephoneNumber\n workingHours\n website\n facebook\n size\n dressCode\n nearbyLocations\n }\n companyOverview {\n html\n }\n videoUrl\n companyPhotos {\n caption\n url\n }\n }\n jobDetail {\n summary\n jobDescription {\n html\n }\n jobRequirement {\n careerLevel\n yearsOfExperience\n qualification\n fieldOfStudy\n industryValue {\n value\n label\n }\n skills\n employmentType\n languages\n postedDate\n closingDate\n jobFunctionValue {\n code\n name\n children {\n code\n name\n }\n }\n benefits\n }\n whyJoinUs\n }\n location {\n location\n locationId\n omnitureLocationId\n }\n sourceCountry\n }\n}\n"
+
+
+def _fetch_job_detail(job_id, country):
+    """Return the ``jobDetail`` payload of one job, or ``None`` if unavailable.
+
+    Makes up to ``MAX_ATTEMPTS`` requests, pausing ``RETRY_DELAY`` seconds
+    after each HTTP or transport failure.
+    """
+    payload = {
+        "query": JOB_DETAIL_QUERY,
+        "variables": {
+            "jobId": job_id,
+            "country": country,
+            "locale": "en",
+            "candidateId": "",
+            "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653",
+        },
+    }
+    for attempt in range(1, MAX_ATTEMPTS + 1):
+        try:
+            response = requests.post(
+                API_URL, json=payload, headers=HEADERS, timeout=REQUEST_TIMEOUT
+            )
+            print(response.status_code)
+            if response.status_code == 200:
+                return (response.json().get("data") or {}).get("jobDetail")
+        except (requests.RequestException, ValueError) as error:
+            print(f"job {job_id}: attempt {attempt} failed: {error}")
+        if attempt < MAX_ATTEMPTS:
+            sleep(RETRY_DELAY)
+    print(f"job {job_id}: skipped after {MAX_ATTEMPTS} failed attempts")
+    return None
+
+
+def _flatten_job_detail(detail):
+    """Return ``detail`` extended with the flattened CSV columns.
+
+    Nested objects may be missing or ``null`` in the response, so each lookup
+    falls back to an empty container instead of raising.
+    """
+    apply_url = detail.get("applyUrl") or {}
+    description = (detail.get("jobDetail") or {}).get("jobDescription") or {}
+    header = detail.get("header") or {}
+    company = header.get("company") or {}
+    places = detail.get("location") or []
+    flat = {
+        "isExternal": apply_url.get("isExternal"),
+        "applyurl": apply_url.get("url"),
+        "jobDetail": description.get("html"),
+        "company": company.get("name"),
+        "jobTitle": header.get("jobTitle"),
+        "location": ", ".join(
+            place["location"] for place in places if place and place.get("location")
+        ),
+    }
+    return {**detail, **flat}
+
+
+def jstMalayJobDetailScraper(search_file, jd_file):
+    """Append the detail of every job listed in ``search_file`` to ``jd_file``.
+
+    Args:
+        search_file: CSV with at least the columns ``id`` and
+            ``sourceCountryCode``.
+        jd_file: CSV to append to; the header row is written only when the
+            file is still empty.
+
+    Each job is processed exactly once. A job whose request keeps failing is
+    skipped, so a single bad id cannot stall the run.
+    """
+    jobs = pd.read_csv(search_file)
+    jobs["id"] = jobs["id"].astype(str)
+    job_rows = jobs[["id", "sourceCountryCode"]].values.tolist()
+
+    with open(jd_file, "a+", encoding="utf-8", newline="") as jfile:
+        writer = DictWriter(jfile, fieldnames=column, extrasaction="ignore")
+        if jfile.tell() == 0:
+            writer.writeheader()
+        for job_id, country in job_rows:
+            # A blank country cell is read as NaN, which is not valid JSON.
+            if not isinstance(country, str) or not country:
+                country = DEFAULT_COUNTRY
+            print([job_id, country])
+            detail = _fetch_job_detail(job_id, country)
+            if detail:
+                writer.writerow(_flatten_job_detail(detail))
+
+
+if __name__ == "__main__":
+    jstMalayJobDetailScraper("testdata_jst_malay.csv", "test_data_malay.csv")
diff --git a/jobstreet/jst_malay_search.py b/jobstreet/jst_malay_search.py
new file mode 100644
index 0000000..79e7e64
--- /dev/null
+++ b/jobstreet/jst_malay_search.py
@@ -0,0 +1,125 @@
+"""Scrape the JobStreet Malaysia job-search listing into a CSV file."""
+
+from csv import DictWriter
+from math import ceil
+from time import sleep
+
+import requests
+
+API_URL = "https://xapi.supercharge-srp.co/job-search/graphql?country=my&isSmartSearch=true"
+PAGE_SIZE = 30  # jobs per page assumed when deriving the page count
+REQUEST_TIMEOUT = 20  # seconds
+MAX_ATTEMPTS = 4  # tries per page before it is skipped
+RETRY_DELAY = 2  # seconds between tries
+
+# Browser-like request headers. Content-Length is left out on purpose:
+# requests derives it from the JSON body.
+# NOTE(review): "br" is advertised; confirm the runtime can decode Brotli.
+HEADERS = {
+    "Accept": "*/*",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Content-Type": "application/json",
+    "Origin": "https://www.jobstreet.com.my",
+    "Referer": "https://www.jobstreet.com.my/",
+    "Sec-Ch-Ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
+    "Sec-Ch-Ua-Mobile": "?0",
+    "Sec-Ch-Ua-Platform": '"Windows"',
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "cross-site",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+}
+
+# Search filters sent with every request; only "page" varies.
+SEARCH_VARIABLES = {
+    "keyword": "",
+    "jobFunctions": [],
+    "locations": [],
+    "salaryType": 1,
+    "jobTypes": [],
+    "createdAt": None,
+    "careerLevels": [],
+    "country": "my",
+    "sVi": "",
+    "solVisitorId": "7d3f7e5c-471e-411d-8a82-d8d29a303653",
+    "categories": [],
+    "workTypes": [],
+    "userAgent": "Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/118.0.0.0%20Safari/537.36",
+    "industries": [],
+    "locale": "en",
+}
+
+JOB_SEARCH_QUERY = "query getJobs($country: String, $locale: String, $keyword: String, $createdAt: String, $jobFunctions: [Int], $categories: [String], $locations: [Int], $careerLevels: [Int], $minSalary: Int, $maxSalary: Int, $salaryType: Int, $candidateSalary: Int, $candidateSalaryCurrency: String, $datePosted: Int, $jobTypes: [Int], $workTypes: [String], $industries: [Int], $page: Int, $pageSize: Int, $companyId: String, $advertiserId: String, $userAgent: String, $accNums: Int, $subAccount: Int, $minEdu: Int, $maxEdu: Int, $edus: [Int], $minExp: Int, $maxExp: Int, $seo: String, $searchFields: String, $candidateId: ID, $isDesktop: Boolean, $isCompanySearch: Boolean, $sort: String, $sVi: String, $duplicates: String, $flight: String, $solVisitorId: String) {\n jobs(\n country: $country\n locale: $locale\n keyword: $keyword\n createdAt: $createdAt\n jobFunctions: $jobFunctions\n categories: $categories\n locations: $locations\n careerLevels: $careerLevels\n minSalary: $minSalary\n maxSalary: $maxSalary\n salaryType: $salaryType\n candidateSalary: $candidateSalary\n candidateSalaryCurrency: $candidateSalaryCurrency\n datePosted: $datePosted\n jobTypes: $jobTypes\n workTypes: $workTypes\n industries: $industries\n page: $page\n pageSize: $pageSize\n companyId: $companyId\n advertiserId: $advertiserId\n userAgent: $userAgent\n accNums: $accNums\n subAccount: $subAccount\n minEdu: $minEdu\n edus: $edus\n maxEdu: $maxEdu\n minExp: $minExp\n maxExp: $maxExp\n seo: $seo\n searchFields: $searchFields\n candidateId: $candidateId\n isDesktop: $isDesktop\n isCompanySearch: $isCompanySearch\n sort: $sort\n sVi: $sVi\n duplicates: $duplicates\n flight: $flight\n solVisitorId: $solVisitorId\n ) {\n total\n totalJobs\n relatedSearchKeywords {\n keywords\n type\n totalJobs\n }\n solMetadata\n suggestedEmployer {\n name\n totalJobs\n }\n queryParameters {\n key\n searchFields\n pageSize\n }\n experiments {\n flight\n }\n jobs {\n id\n adType\n sourceCountryCode\n isStandout\n companyMeta {\n id\n advertiserId\n isPrivate\n name\n logoUrl\n slug\n }\n jobTitle\n jobUrl\n jobTitleSlug\n description\n employmentTypes {\n code\n name\n }\n sellingPoints\n locations {\n code\n name\n slug\n children {\n code\n name\n slug\n }\n }\n categories {\n code\n name\n children {\n code\n name\n }\n }\n postingDuration\n postedAt\n salaryRange {\n currency\n max\n min\n period\n term\n }\n salaryVisible\n bannerUrl\n isClassified\n solMetadata\n }\n }\n}\n"
+
+
+def _fetch_page(page):
+    """Return the ``data.jobs`` payload of ``page``, or ``None`` if unavailable.
+
+    Makes up to ``MAX_ATTEMPTS`` requests, pausing ``RETRY_DELAY`` seconds
+    after each failure.
+    """
+    payload = {
+        "query": JOB_SEARCH_QUERY,
+        "variables": {**SEARCH_VARIABLES, "page": page},
+    }
+    for attempt in range(1, MAX_ATTEMPTS + 1):
+        try:
+            response = requests.post(
+                API_URL, json=payload, headers=HEADERS, timeout=REQUEST_TIMEOUT
+            )
+            if response.status_code == 200:
+                listing = (response.json().get("data") or {}).get("jobs")
+                if listing is not None:
+                    return listing
+                print(f"page {page}: response carried no job data")
+            else:
+                print(f"Request failed with status code {response.status_code}: {response.text}")
+        except (requests.RequestException, ValueError) as error:
+            print(f"page {page}: attempt {attempt} failed: {error}")
+        if attempt < MAX_ATTEMPTS:
+            sleep(RETRY_DELAY)
+    return None
+
+
+def _append_jobs(csv_file, jobs):
+    """Append ``jobs`` to ``csv_file``, writing the header if the file is empty."""
+    fieldnames = list(jobs[0]) + ["Company_Name"]
+    with open(csv_file, "a+", newline="", encoding="utf-8") as csvfile:
+        writer = DictWriter(csvfile, fieldnames=fieldnames, extrasaction="ignore")
+        if csvfile.tell() == 0:
+            writer.writeheader()
+        for job in jobs:
+            # Tolerate a missing or null companyMeta instead of aborting the page.
+            job["Company_Name"] = (job.get("companyMeta") or {}).get("name")
+            writer.writerow(job)
+
+
+def search_jst_malay(csv_file):
+    """Append every job of the JobStreet Malaysia search listing to ``csv_file``.
+
+    Pages are requested in order, starting at 1. The page count is derived
+    from ``totalJobs`` in each response; the run ends after the last page, at
+    the first empty page, or when the very first page cannot be fetched. A
+    later page that keeps failing is skipped.
+    """
+    current_page = 1
+    total_pages = 1  # refined after every successful response
+    while current_page <= total_pages:
+        print(f"scraping page {current_page}")
+        listing = _fetch_page(current_page)
+        if listing is None:
+            print(f"page {current_page}: skipped after {MAX_ATTEMPTS} failed attempts")
+            current_page += 1
+            continue
+        total_pages = ceil((listing.get("totalJobs") or 0) / PAGE_SIZE)
+        print("total pages", total_pages)
+        jobs = listing.get("jobs") or []
+        if not jobs:
+            break
+        _append_jobs(csv_file, jobs)
+        current_page += 1
+
+
+if __name__ == "__main__":
+    search_jst_malay("testdata_jst_malay.csv")