from scrape_modules import *
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
def string_encoder(text):
    """Return *text* formatted as a string and wrapped in double quotes."""
    return '"{}"'.format(text)
def unzip_values(curr_dict, key_list, main_eventList):
    """Expand a mapping of key -> events into two parallel lists.

    For every key whose value is iterable, one ``(key, item)`` pair is
    appended per item of the value.  A value that cannot be iterated is
    treated as a single event: "Individual Event" is printed and the value
    itself is appended once.

    Args:
        curr_dict: Mapping whose values are iterables of events or single
            non-iterable events.
        key_list: List that receives one key per event (mutated in place).
        main_eventList: List that receives the events (mutated in place).

    Returns:
        The tuple ``(key_list, main_eventList)`` - the same list objects
        that were passed in.

    Note:
        Strings are iterable, so a string value contributes one entry per
        character.
    """
    for key, events in curr_dict.items():
        # Only "this value is not iterable" selects the single-event path;
        # any other error (including one raised mid-iteration) propagates
        # instead of being silently swallowed.
        try:
            iterator = iter(events)
        except TypeError:
            print("Individual Event")
            key_list.append(key)
            main_eventList.append(events)
            continue
        for event in iterator:
            key_list.append(key)
            main_eventList.append(event)
    return key_list, main_eventList
def flat_list(list_):
    """Flatten *list_* by exactly one level.

    Elements that are lists (including instances of list subclasses) are
    replaced by their items; every other element - strings and tuples
    included - is kept as-is.  Nested lists deeper than one level are not
    expanded further.

    Args:
        list_: Iterable of elements, some of which may be lists.

    Returns:
        A new list with one level of list nesting removed.
    """
    flattened = []
    for item in list_:
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)
    return flattened
def top_card(soup):
    """Collect the link labels of every card in the first wallet card row.

    Each ``.Bajaj_Finserv_Wallet`` element inside the first
    ``.wallet_card_row`` becomes one section, keyed by the stripped text of
    its first ``.wallet_card_title``.  The section's events are the ``<p>``
    texts of the anchors inside the card's first ``.listwalltcard``.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A tuple ``(sections, css_selector, "top_card")`` where ``sections``
        maps each card title to its list of event labels.

    Raises:
        IndexError: If the page has no ``.wallet_card_row``, a card has no
            ``.wallet_card_title``, or the fallback is needed and the page
            has no ``.supercard``.
    """
    # Bound before the loop so the return value is defined even when the
    # row contains no cards.
    css_select = ".wallet_card_row"
    sectional_div = {}
    cards = soup.select(css_select)[0].select(".Bajaj_Finserv_Wallet")
    for card in cards:
        section_title = card.select(".wallet_card_title")[0].text.strip()
        subsection = []
        try:
            for link in card.select(".listwalltcard")[0].findAll("a"):
                subsection.append(link.find("p").text)
        except (IndexError, AttributeError):
            # The card has no link list (IndexError) or an anchor without a
            # <p> (AttributeError): fall back to the text of the first <i>
            # inside the page's first ".supercard".  Labels collected before
            # the failure are kept.
            subsection.append(soup.select(".supercard")[0].find("i").text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "top_card"
def rel_card(soup):
    """Collect the tag links of every relation box on the page.

    Each ``.rel_item_box`` inside the first ``.rela_items_bar`` becomes one
    section, keyed by the stripped text of the first ``<p>`` in its first
    ``.rel_details``.  The section's events are the stripped texts of the
    first ``<a>`` inside each of the box's ``.bottom_tags`` elements.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A tuple ``(sections, ".my_relation", "relations_card")`` where
        ``sections`` maps each box title to its list of link labels.

    Raises:
        IndexError: If the page has no ``.rela_items_bar`` or a box has no
            ``.rel_details``.
        AttributeError: If a box's ``.rel_details`` contains no ``<p>``.
    """
    css_select = ".my_relation"
    sectional_div = {}
    boxes = soup.select(".rela_items_bar")[0].select(".rel_item_box")
    for box in boxes:
        section_title = box.select(".rel_details")[0].find("p").text.strip()
        subsection = []
        # Walk the matched tags themselves rather than counting the box's
        # children and waiting for an out-of-range index to end the loop.
        for tag in box.select(".bottom_tags"):
            link = tag.find("a")
            if link is not None:  # a tag without a link contributes nothing
                subsection.append(link.text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "relations_card"
def billRe(soup):
    """Collect the labels of the bill-payment links.

    The section key is the stripped text of the first ``<p>`` inside the
    first ``.pageall_title`` of the first ``.BHFL_Box_Fullbox``; the events
    are the stripped ``<p>`` texts of every anchor in the first
    ``.paybillbox``.

    Returns:
        A tuple ``(sections, ".BHFL_Box_Fullbox", "bills_card")``.
    """
    container = soup.select(".BHFL_Box_Fullbox")[0]
    heading = container.select(".pageall_title")[0].find("p").text.strip()
    bill_links = soup.select(".paybillbox")[0].findAll("a")
    labels = [link.find("p").text.strip() for link in bill_links]
    return {heading: labels}, ".BHFL_Box_Fullbox", "bills_card"
def slider(soup):
    """Map each slider entry's heading to its link text.

    For every ``.cbl_init`` element the stripped text of its first ``<p>``
    is the key and the stripped text of its first ``<a>`` is the value.
    Unlike the list-valued sections of the other scrapers, each value here
    is a single string.

    Returns:
        A tuple ``(sections, ".cbl_init", "slider_card")``.
    """
    selector = ".cbl_init"
    entries = {}
    for slide in soup.select(selector):
        link_text = slide.find("a").text.strip()
        heading = slide.find("p").text.strip()
        entries[heading] = link_text
    return entries, selector, "slider_card"
def emi_store(soup):
    """Collect the ``<strong>`` labels of the third ``.slick-list`` carousel.

    The section key is the stripped text of the first ``<p>`` inside the
    first ``.Shop_EMI_store_BG`` element.

    Returns:
        A tuple ``(sections, ".Shop_EMI_store_BG", "emistore_card")``.
    """
    selector = ".Shop_EMI_store_BG"
    heading = soup.select(selector)[0].find("p").text.strip()
    carousel = soup.select(".slick-list")[2]
    labels = [item.text.strip() for item in carousel.findAll("strong")]
    return {heading: labels}, selector, "emistore_card"
def hot_offer(soup):
    """Collect the ``<strong>`` labels of the fourth ``.slick-list`` carousel.

    The section key is the stripped text of the first ``<p>`` inside the
    second ``.Shop_EMI_store_BG`` element.

    Returns:
        A tuple ``(sections, ".slick-list", "hot_offers")``.
    """
    heading = soup.select(".Shop_EMI_store_BG")[1].find("p").text.strip()
    carousel = soup.select(".slick-list")[3]
    labels = [item.text.strip() for item in carousel.findAll("strong")]
    return {heading: labels}, ".slick-list", "hot_offers"
if __name__ == "__main__":
    # Script entry point: log in, capture the rendered page, run every
    # section scraper over it and write one CSV row per scraped event.
    import os

    from selenium.webdriver.common.by import By

    # Credentials and the target URL come from the environment so that no
    # secrets live in the source; a missing variable fails fast with KeyError.
    username = os.environ["EXPERIA_USERNAME"]
    password = os.environ["EXPERIA_PASSWORD"]
    wri_url = os.environ["EXPERIA_URL"]
    chromedriver_path = "chromedriver"

    browser = webdriver.Chrome(chromedriver_path)
    try:
        browser.get(wri_url)
        browser.find_element(By.NAME, "uname").send_keys(username)
        browser.find_element(By.NAME, "psw").send_keys(password)
        browser.find_element(By.XPATH, '//button[text()="Login"]').click()
        time.sleep(8)  # fixed pause before reloading the page
        browser.refresh()
        delay = 5  # seconds
        try:
            WebDriverWait(browser, delay).until(
                lambda drv: drv.find_element(By.ID, "slick-track")
            )
            print("Page is ready!")
        except TimeoutException:
            # Best effort: the page source is still captured after a timeout.
            print("Exception: find ID")
        page_source = browser.page_source
    finally:
        # Always end the driver session, even when a step above raised.
        browser.quit()

    soup = BeautifulSoup(page_source, "html5lib")
    page_title = string_encoder(soup.title.text)  # quoted once, like every other column
    page_url = string_encoder(wri_url)

    # Each scraper returns (sections dict, css selector, event category).
    out_array = [top_card(soup), rel_card(soup), billRe(soup), slider(soup),
                 emi_store(soup), hot_offer(soup), insurance_banner(soup),
                 invest_banner(soup), on_store(soup), ex_offers(soup),
                 super_stores(soup)]

    columns = ["Page Title", "Page URL", "Main Label", "Event Category",
               "Event Action", "Event Label", "css-selector"]
    rows = []
    for result in out_array:
        sections, css_selector, category = result[0], result[1], result[2]
        action = str(category) + "_click"
        for main_label, events in sections.items():
            # slider() maps a label to a single string rather than a list;
            # wrap it so it yields exactly one row and the columns stay aligned.
            if not isinstance(events, list):
                events = [events]
            for event in events:
                rows.append([
                    page_title,
                    page_url,
                    string_encoder(main_label),
                    string_encoder(category),
                    string_encoder(action),
                    string_encoder(event),
                    string_encoder(css_selector),
                ])

    final_df = pd.DataFrame(rows, columns=columns)
    final_df.to_csv("extract_experia.csv")