Untitled
unknown
python
2 years ago
7.6 kB
1
Indexable
Never
"""Scrape event labels from a logged-in dashboard page and export them to CSV.

The script logs in with Selenium, parses the rendered page with
BeautifulSoup, runs one extractor per page section and writes one CSV row
per event label to ``extract_experia.csv``.
"""

# The wildcard import stays first so that the explicit imports and the
# definitions below take precedence over any same-named export.
# NOTE(review): insurance_banner, invest_banner, on_store, ex_offers and
# super_stores are not defined in this file; presumably they come from
# scrape_modules -- confirm.
from scrape_modules import *  # noqa: F401,F403

import os
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Column order of the exported CSV.
_OUTPUT_COLUMNS = [
    "Page Title",
    "Page URL",
    "Main Label",
    "Event Category",
    "Event Action",
    "Event Label",
    "css-selector",
]


def string_encoder(text):
    """Return *text* wrapped in literal double quotes."""
    return f'"{text}"'


def unzip_values(curr_dict, key_list, main_eventList):
    """Flatten a ``{key: values}`` mapping into two parallel lists.

    For every element of ``curr_dict[key]`` the key is appended to
    *key_list* and the element to *main_eventList*. A value that cannot be
    iterated is treated as a single event and appended as-is.

    Both lists are mutated in place and also returned.
    """
    for key in curr_dict:
        try:
            for event in curr_dict[key]:
                key_list.append(key)
                main_eventList.append(event)
        except TypeError:
            # Non-iterable value: record it as one event for this key.
            print("Individual Event")
            key_list.append(key)
            main_eventList.append(curr_dict[key])
    return key_list, main_eventList


def flat_list(list_):
    """Flatten one level of nesting: list items are expanded, others kept."""
    unpack_list = []
    for item in list_:
        if isinstance(item, list):
            unpack_list.extend(item)
        else:
            unpack_list.append(item)
    return unpack_list


def top_card(soup):
    """Extract the wallet cards from the first ``.wallet_card_row``.

    Returns:
        ``(sections, css_selector, "top_card")`` where *sections* maps each
        card title to the list of its link texts.

    Raises:
        IndexError: if the page has no ``.wallet_card_row`` element, or a
            card has no ``.wallet_card_title``.
    """
    # Assigned before the loop so it is defined even when there are no cards.
    css_select = ".wallet_card_row"
    sectional_div = {}
    cards = soup.select(css_select)[0].select(".Bajaj_Finserv_Wallet")
    for card in cards:
        section_title = card.select(".wallet_card_title")[0].text.strip()
        subsection = []
        try:
            for link in card.select(".listwalltcard")[0].find_all("a"):
                subsection.append(link.find("p").text)
        except (IndexError, AttributeError):
            # No link list (or a link without a <p>): fall back to the text
            # of the <i> inside the first ".supercard" element.
            subsection.append(soup.select(".supercard")[0].find("i").text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "top_card"


def rel_card(soup):
    """Extract the relation boxes from the first ``.rela_items_bar``.

    Returns:
        ``(sections, css_selector, "relations_card")`` where *sections* maps
        each box title to the anchor texts of its ``.bottom_tags`` elements.
        Tags without an anchor are skipped.

    Raises:
        IndexError: if ``.rela_items_bar`` or a box's ``.rel_details`` is
            missing.
    """
    css_select = ".my_relation"
    sectional_div = {}
    for box in soup.select(".rela_items_bar")[0].select(".rel_item_box"):
        section_title = box.select(".rel_details")[0].find("p").text.strip()
        subsection = []
        # Iterate the tags themselves rather than counting the box's child
        # nodes, so every tag is visited exactly once.
        for tag in box.select(".bottom_tags"):
            link = tag.find("a")
            if link is not None:
                subsection.append(link.text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "relations_card"


def billRe(soup):
    """Extract the bill-payment links from the first ``.paybillbox``.

    Returns:
        ``(sections, css_selector, "bills_card")`` where *sections* has a
        single key, the section heading, mapped to the list of link texts.
    """
    css_select = ".BHFL_Box_Fullbox"
    heading_box = soup.select(css_select)[0].select(".pageall_title")[0]
    section_heading = heading_box.find("p").text.strip()
    links = soup.select(".paybillbox")[0].find_all("a")
    sections = [link.find("p").text.strip() for link in links]
    return {section_heading: sections}, css_select, "bills_card"


def slider(soup):
    """Extract one title/link-text pair per ``.cbl_init`` element.

    Returns:
        ``(sections, css_selector, "slider_card")``. Unlike the other
        extractors, each value in *sections* is a single string (the text of
        the first anchor), not a list.
    """
    css_select = ".cbl_init"
    sectional_div = {}
    for slide in soup.select(css_select):
        sectional_div[slide.find("p").text.strip()] = slide.find("a").text.strip()
    return sectional_div, css_select, "slider_card"


def _slick_section(soup, banner_index, slick_index):
    """Return ``{banner title: [<strong> texts]}`` for one carousel.

    The title is the first <p> of the ``banner_index``-th
    ``.Shop_EMI_store_BG`` element; the events are the <strong> texts of the
    ``slick_index``-th ``.slick-list`` element.
    """
    main_title = soup.select(".Shop_EMI_store_BG")[banner_index].find("p").text.strip()
    strong_tags = soup.select(".slick-list")[slick_index].find_all("strong")
    return {main_title: [tag.text.strip() for tag in strong_tags]}


def emi_store(soup):
    """Extract the carousel titled by the first ``.Shop_EMI_store_BG``.

    Returns:
        ``(sections, css_selector, "emistore_card")``.
    """
    return _slick_section(soup, 0, 2), ".Shop_EMI_store_BG", "emistore_card"


def hot_offer(soup):
    """Extract the carousel titled by the second ``.Shop_EMI_store_BG``.

    Returns:
        ``(sections, css_selector, "hot_offers")``.
    """
    return _slick_section(soup, 1, 3), ".slick-list", "hot_offers"


def _require_env(name):
    """Return environment variable *name*, exiting with a message if unset."""
    value = os.environ.get(name)
    if not value:
        raise SystemExit(f"Environment variable {name} must be set")
    return value


def _fetch_page(url, username, password, chromedriver_path):
    """Log in at *url* and return the rendered page as a BeautifulSoup tree."""
    browser = webdriver.Chrome(chromedriver_path)
    try:
        browser.get(url)
        browser.find_element(By.NAME, "uname").send_keys(username)
        browser.find_element(By.NAME, "psw").send_keys(password)
        browser.find_element(By.XPATH, '//button[text()="Login"]').click()
        time.sleep(8)
        browser.refresh()
        delay = 5  # seconds
        try:
            # NOTE(review): "slick-track" is looked up as an element id; if it
            # is actually a class name this wait always times out -- confirm.
            WebDriverWait(browser, delay).until(
                lambda drv: drv.find_element(By.ID, "slick-track")
            )
            print("Page is ready!")
        except TimeoutException:
            # Best effort: continue with whatever has rendered so far.
            print("Exception: find ID")
        page_source = browser.page_source
    finally:
        # Always end the driver session, even when login or loading fails.
        browser.quit()
    return BeautifulSoup(page_source, "html5lib")


def _build_rows(out_array, page_title, page_url):
    """Return one quoted CSV row per event label found by the extractors.

    Each entry of *out_array* is ``(sections, css_selector, category)``. A
    section value that is not a list (as returned by ``slider``) counts as a
    single label, so every column stays aligned with its label.
    """
    rows = []
    for entry in out_array:
        sections, css_selector, category = entry[0], entry[1], entry[2]
        action = str(category) + "_click"
        for main_label, labels in sections.items():
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                fields = (
                    page_title,
                    page_url,
                    main_label,
                    category,
                    action,
                    label,
                    css_selector,
                )
                rows.append([string_encoder(field) for field in fields])
    return rows


def main():
    """Scrape the page and write ``extract_experia.csv``."""
    # Credentials and target URL are read from the environment instead of
    # being hard-coded in the script.
    username = _require_env("WRI_USERNAME")
    password = _require_env("WRI_PASSWORD")
    wri_url = _require_env("WRI_URL")
    chromedriver_path = "chromedriver"

    soup = _fetch_page(wri_url, username, password, chromedriver_path)
    # Raw title: every field is quoted exactly once when the rows are built.
    wri_Pg = soup.title.text

    out_array = [
        top_card(soup),
        rel_card(soup),
        billRe(soup),
        slider(soup),
        emi_store(soup),
        hot_offer(soup),
        insurance_banner(soup),
        invest_banner(soup),
        on_store(soup),
        ex_offers(soup),
        super_stores(soup),
    ]

    final_df = pd.DataFrame(
        _build_rows(out_array, wri_Pg, wri_url), columns=_OUTPUT_COLUMNS
    )
    final_df.to_csv("extract_experia.csv")


if __name__ == "__main__":
    main()