commit 7e18ef91ff
parent a401cf37fc

    added logger
@@ -3,6 +3,11 @@ import csv
 import time
 import json
 import os
+import logging
+
+# Configure the logging settings
+logging.basicConfig(filename='search_india_error.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger()
 
 # Global variables
 input_file = "data_naukri/search_result_india.csv"
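For reference, a minimal sketch of what this configuration does at runtime: messages at ERROR level and above are appended to search_india_error.log, each prefixed with a timestamp and level name, while lower-severity messages are dropped. The sample URL and message below are hypothetical, not from the commit.

import logging

# Same settings as the diff above: errors go to a file with a
# "timestamp - level - message" prefix.
logging.basicConfig(filename='search_india_error.log', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

logger.info("ignored: below the ERROR threshold")  # filtered out by level=ERROR
logger.error("https://example.invalid/job/123\nhypothetical timeout\n")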
@@ -116,41 +121,44 @@ class NaukriJobDetailScraper:
         url = self.base_url.format(job_id)
 
         time.sleep(0.5)
-        response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
-
-        print(f"{response.status_code} for {url}")
-
-        if response.status_code == 200:
-            json_response = response.json()
-
-            transformed_data = self.transform_data(job_id, url, json_response)
-
-            # Write the header row if needed
-            if not header_written:
-                header = transformed_data.keys()
-                writer.writerow(header)
-                header_written = True
-
-            writer.writerow(transformed_data.values())
-            print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
-            all_job_ids.pop(0)  # Remove the processed job ID
-            self.count += 1
-
-        elif response.status_code == 303:
-            json_response = response.json()
-
-            if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
-                print(f"Expired job ID {job_id} with response 303")
-                all_job_ids.pop(0)  # Remove the processed job ID
-                self.count += 1
-
-        elif response.status_code == 404:
-            all_job_ids.pop(0)  # Remove the processed job ID
-            print(f"Expired job ID {job_id} with response 404")
-
-        else:
-            print(f"Error for job ID {job_id}")
-            time.sleep(10)
+        try:
+            response = requests.get(url, headers=self.headers, timeout=self.timeout, proxies=self.proxies)
+
+            print(f"{response.status_code} for {url}")
+
+            if response.status_code == 200:
+                json_response = response.json()
+
+                transformed_data = self.transform_data(job_id, url, json_response)
+
+                # Write the header row if needed
+                if not header_written:
+                    header = transformed_data.keys()
+                    writer.writerow(header)
+                    header_written = True
+
+                writer.writerow(transformed_data.values())
+                print(f"Processed job ID {job_id}. Count: {self.count}, Remaining: {len(all_job_ids)}")
+                all_job_ids.pop(0)  # Remove the processed job ID
+                self.count += 1
+
+            elif response.status_code == 303:
+                json_response = response.json()
+
+                if json_response.get('metaSearch', {}).get('isExpiredJob') == '1':
+                    print(f"Expired job ID {job_id} with response 303")
+                    all_job_ids.pop(0)  # Remove the processed job ID
+                    self.count += 1
+
+            elif response.status_code == 404:
+                all_job_ids.pop(0)  # Remove the processed job ID
+                print(f"Expired job ID {job_id} with response 404")
+
+            else:
+                print(f"Error for job ID {job_id}")
+                time.sleep(10)
+        except Exception as e1:
+            logging.error(url + '\n' + str(e1) + '\n')
 
 
 def main():
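The point of the new try/except is that a single failed request (timeout, DNS error, connection reset, proxy failure) no longer aborts the whole scrape: the URL and the exception text are written to the error log and control returns to the surrounding loop. Because all_job_ids.pop(0) is only reached on success, a job ID that failed presumably stays at the head of the queue and is retried on the next pass. A standalone sketch of the same pattern under those assumptions (the URL and timeout below are hypothetical):

import logging
import requests

logging.basicConfig(filename='search_india_error.log', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def fetch_or_log(url, timeout=10):
    """Return the response, or None after logging the failure."""
    try:
        return requests.get(url, timeout=timeout)
    except Exception as e1:
        # Mirrors the diff: record the URL and the exception, keep going.
        logging.error(url + '\n' + str(e1) + '\n')
        return None

# Hypothetical usage: the caller drops a job ID from its queue only on success.
print(fetch_or_log("https://example.invalid/job/123"))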
@@ -105,7 +105,7 @@ class NaukriJobScraper:
             print(f"Industry: {industry_name}, pages remaining: {total_pages}, start page: {start_page}")
             time.sleep(1)
         except Exception as e1:
-            logging.error(url + '\n' + str(e1))
+            logging.error(url + '\n' + str(e1) + '\n')
 
 
 def main():
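The only change in this last hunk is the trailing '\n'. Since logging already terminates each record with a newline, the extra one leaves a blank separator line between consecutive entries in the log file, matching what the detail scraper's new except block writes. A small sketch of the difference (the message text is hypothetical):

import logging

logging.basicConfig(filename='search_india_error.log', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

url, e1 = 'https://example.invalid/jobs?start=1', 'hypothetical timeout'
logging.error(url + '\n' + str(e1))         # old: next record starts immediately
logging.error(url + '\n' + str(e1) + '\n')  # new: blank line between records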