only today file
parent
d1e98fac17
commit
902933f5d6
|
@ -17,24 +17,24 @@ class NaukriGulfDetailSpiderSpider(scrapy.Spider):
|
||||||
'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
|
'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
|
||||||
}
|
}
|
||||||
|
|
||||||
current_date = datetime.now()
|
# current_date = datetime.now()
|
||||||
formatted_date = current_date.strftime('%d-%m-%Y')
|
# formatted_date = current_date.strftime('%d-%m-%Y')
|
||||||
yesterday = current_date - timedelta(days=1)
|
# yesterday = current_date - timedelta(days=1)
|
||||||
yesterday_str = yesterday.strftime('%d-%m-%Y')
|
# yesterday_str = yesterday.strftime('%d-%m-%Y')
|
||||||
yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
|
# yesterday_search_file = f'gulf_data/naukri_gulf_search_{yesterday_str}.csv'
|
||||||
today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
|
# today_search_file = f'gulf_data/naukri_gulf_search_{formatted_date}.csv'
|
||||||
today_search_df = pd.read_csv(today_search_file)
|
# today_search_df = pd.read_csv(today_search_file)
|
||||||
yesterday_search_df = pd.read_csv(yesterday_search_file)
|
# yesterday_search_df = pd.read_csv(yesterday_search_file)
|
||||||
newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
|
# newresult_df = pd.merge(today_search_df , yesterday_search_df, on='jobId', how='left', suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
|
||||||
oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
|
# oldresult_df = pd.merge(yesterday_search_df, today_search_df , on='jobId', how='left',suffixes=('', '_y'), indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
|
||||||
newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
|
# newresult_df = newresult_df.drop_duplicates(subset="jobId", keep="first")
|
||||||
oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
|
# oldresult_df = oldresult_df.drop_duplicates(subset="jobId", keep="first")
|
||||||
newresult_df = newresult_df.reset_index(drop=True)
|
# newresult_df = newresult_df.reset_index(drop=True)
|
||||||
newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
|
# newresult_df.to_csv('gulf_data/new_jobs_gulf.csv', index=False)
|
||||||
oldresult_df = oldresult_df.reset_index(drop=True)
|
# oldresult_df = oldresult_df.reset_index(drop=True)
|
||||||
oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
|
# oldresult_df.to_csv('gulf_data/expired_jobs_gulf.csv', index=False)
|
||||||
input_file = 'gulf_data/new_jobs_gulf.csv'
|
input_file = 'gulf_data/new_jobs_gulf.csv'
|
||||||
print(newresult_df.shape, oldresult_df.shape)
|
# print(newresult_df.shape, oldresult_df.shape)
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
headers = {
|
headers = {
|
||||||
|
|
Loading…
Reference in New Issue