From 902933f5d623562f1abfc2eda6753d542d42dd43 Mon Sep 17 00:00:00 2001
From: prahul11
Date: Mon, 9 Oct 2023 15:31:50 +0530
Subject: [PATCH] only today file

---
 .../spiders/naukri_gulf_detail_spider.py | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/server scraper/naukri_gulf_detail/naukri_gulf_detail/spiders/naukri_gulf_detail_spider.py b/server scraper/naukri_gulf_detail/naukri_gulf_detail/spiders/naukri_gulf_detail_spider.py
index f4d0a32..d9394e9 100644
--- a/server scraper/naukri_gulf_detail/naukri_gulf_detail/spiders/naukri_gulf_detail_spider.py
+++ b/server scraper/naukri_gulf_detail/naukri_gulf_detail/spiders/naukri_gulf_detail_spider.py
@@ -17,24 +17,24 @@ class NaukriGulfDetailSpiderSpider(scrapy.Spider):
         'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
     }

-    current_date = datetime.now()
-    formatted_date = current_date.strftime('%d-%m-%Y')
-    yesterday = current_date - timedelta(days=1)
-    yesterday_str = yesterday.strftime('%d-%m-%Y')
-    yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
-    today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
-    today_search_df = pd.read_csv(today_search_file)
-    yesterday_search_df = pd.read_csv(yesterday_search_file)
-    newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
-    oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
-    newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
-    oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
-    newresult_df = newresult_df.reset_index(drop=True)
-    newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
-    oldresult_df = oldresult_df.reset_index(drop=True)
-    oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
+    # current_date = datetime.now()
+    # formatted_date = current_date.strftime('%d-%m-%Y')
+    # yesterday = current_date - timedelta(days=1)
+    # yesterday_str = yesterday.strftime('%d-%m-%Y')
+    # yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
+    # today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
+    # today_search_df = pd.read_csv(today_search_file)
+    # yesterday_search_df = pd.read_csv(yesterday_search_file)
+    # newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
+    # oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
+    # newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
+    # oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
+    # newresult_df = newresult_df.reset_index(drop=True)
+    # newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
+    # oldresult_df = oldresult_df.reset_index(drop=True)
+    # oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
     input_file = 'gulf_data/new_jobs_gulf.csv'
-    print(newresult_df.shape, oldresult_df.shape)
+    # print(newresult_df.shape, oldresult_df.shape)

     def start_requests(self):
         headers = {
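
Note: the block this patch comments out performs pandas left anti-joins on jobId to split today's search results into new and expired listings before the spider only consumes gulf_data/new_jobs_gulf.csv. Below is a minimal standalone sketch of that comparison step, assuming it is run as a separate pre-processing script outside the spider; the script itself and its variable names are illustrative, while the merge/query logic mirrors the commented-out lines.

    # compare_gulf_jobs.py (hypothetical standalone script, not part of this patch)
    from datetime import datetime, timedelta

    import pandas as pd

    current_date = datetime.now()
    formatted_date = current_date.strftime('%d-%m-%Y')
    yesterday_str = (current_date - timedelta(days=1)).strftime('%d-%m-%Y')

    today_search_df = pd.read_csv(f'gulf_data/naukri_gulf_search_{formatted_date}.csv')
    yesterday_search_df = pd.read_csv(f'gulf_data/naukri_gulf_search_{yesterday_str}.csv')

    # Left anti-join: jobIds present today but absent yesterday are new listings.
    new_df = (pd.merge(today_search_df, yesterday_search_df, on='jobId', how='left',
                       suffixes=('', '_y'), indicator=True)
              .query('_merge == "left_only"')
              .drop(columns=['_merge'])
              .drop_duplicates(subset='jobId', keep='first')
              .reset_index(drop=True))

    # Reverse anti-join: jobIds present yesterday but absent today have expired.
    expired_df = (pd.merge(yesterday_search_df, today_search_df, on='jobId', how='left',
                           suffixes=('', '_y'), indicator=True)
                  .query('_merge == "left_only"')
                  .drop(columns=['_merge'])
                  .drop_duplicates(subset='jobId', keep='first')
                  .reset_index(drop=True))

    new_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
    expired_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
    print(new_df.shape, expired_df.shape)

Keeping this step out of the spider means the class body no longer reads yesterday's search file at import time; the spider, per the subject line, only needs today's output file (gulf_data/new_jobs_gulf.csv).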