# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# 3 years ago
# 7.6 kB
# 2
# Indexable
from scrape_modules import *

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

import time

def string_encoder(text):
    """Return ``text`` wrapped in a pair of double-quote characters.

    The value is formatted with its default string formatting. No escaping
    is done: double quotes already present in ``text`` are left untouched.
    """
    return '"{}"'.format(text)

def unzip_values(curr_dict, key_list, main_eventList):
    """Flatten a ``key -> value(s)`` mapping into two parallel lists.

    For every key in ``curr_dict``:

    * if the value is iterable, each element is appended to
      ``main_eventList`` and the key is appended to ``key_list`` once per
      element;
    * otherwise the value is treated as a single event: the key and the
      value are appended once and ``"Individual Event"`` is printed.

    Note that strings are iterable, so a string value contributes one entry
    per character.

    Args:
        curr_dict: Mapping whose values are either iterables of events or
            single non-iterable events.
        key_list: List that receives the (repeated) keys; mutated in place.
        main_eventList: List that receives the events; mutated in place.

    Returns:
        The same ``(key_list, main_eventList)`` objects that were passed in.
    """
    for key in curr_dict:
        value = curr_dict[key]
        # Only the iterability probe is guarded, so unrelated errors (and
        # KeyboardInterrupt) are no longer silently swallowed.
        try:
            events = iter(value)
        except TypeError:
            print("Individual Event")
            key_list.append(key)
            main_eventList.append(value)
            continue
        for event in events:
            key_list.append(key)
            main_eventList.append(event)
    return key_list, main_eventList

def flat_list(list_):
    """Flatten ``list_`` by one level and return the result as a new list.

    Elements whose type is exactly ``list`` are spliced into the output;
    every other element (including tuples, strings and list subclasses) is
    kept as a single item. Nested lists deeper than one level stay nested.
    """
    flattened = []
    for element in list_:
        if type(element) is list:
            flattened.extend(element)
        else:
            flattened.append(element)
    return flattened

def top_card(soup):
    """Collect the click labels of every wallet card in the top card row.

    Looks at the first ``.wallet_card_row`` element and, for each
    ``.Bajaj_Finserv_Wallet`` card inside it, maps the stripped
    ``.wallet_card_title`` text to the list of ``<p>`` texts found inside
    the ``<a>`` elements of the card's first ``.listwalltcard``.

    If a card has no ``.listwalltcard`` (or one of its links has no
    ``<p>``), the stripped ``<i>`` text of the first ``.supercard`` element
    is appended to that card's list instead; labels gathered before the
    failure are kept.

    Args:
        soup: Parsed page (BeautifulSoup-like object offering ``select``).

    Returns:
        ``(sectional_div, css_select, "top_card")`` where ``sectional_div``
        maps card title -> list of labels and ``css_select`` is
        ``".wallet_card_row"``.

    Raises:
        IndexError: If the page has no ``.wallet_card_row`` element, a card
            has no ``.wallet_card_title``, or the ``.supercard`` fallback is
            needed but absent.
    """
    # Defined before the loop so that a row without any cards still returns
    # a valid selector instead of failing with UnboundLocalError.
    css_select = ".wallet_card_row"
    sectional_div = {}
    cards = soup.select(css_select)[0].select('.Bajaj_Finserv_Wallet')
    for card in cards:
        section_title = card.select(".wallet_card_title")[0].text.strip()
        subsection = []
        try:
            for link in card.select('.listwalltcard')[0].findAll('a'):
                subsection.append(link.find("p").text)
        except (IndexError, AttributeError):
            # No link list (IndexError) or a link without <p>
            # (AttributeError on None): fall back to the supercard text.
            subsection.append(soup.select(".supercard")[0].find("i").text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "top_card"

def rel_card(soup):
    """Collect the action-link labels of every relationship box.

    Looks at the first ``.rela_items_bar`` element and, for each
    ``.rel_item_box`` inside it, maps the stripped ``<p>`` text of the box's
    first ``.rel_details`` element to the stripped ``<a>`` texts of all of
    its ``.bottom_tags`` elements. A ``.bottom_tags`` element without an
    ``<a>`` is skipped.

    Args:
        soup: Parsed page (BeautifulSoup-like object offering ``select``).

    Returns:
        ``(sectional_div, css_select, "relations_card")`` where
        ``sectional_div`` maps box title -> list of link labels and
        ``css_select`` is ``".my_relation"``.

    Raises:
        IndexError: If the page has no ``.rela_items_bar`` or a box has no
            ``.rel_details`` element.
        AttributeError: If a ``.rel_details`` element has no ``<p>``.
    """
    css_select = ".my_relation"
    sectional_div = {}
    for box in soup.select(".rela_items_bar")[0].select('.rel_item_box'):
        section_title = box.select(".rel_details")[0].find("p").text.strip()
        subsection = []
        # Iterate the tags themselves. The previous version looped over the
        # number of children of the box and relied on a swallowed IndexError
        # to stop, which could drop tags when the box had fewer children
        # than tags.
        for tag in box.select(".bottom_tags"):
            link = tag.find("a")
            if link is not None:
                subsection.append(link.text.strip())
        sectional_div[section_title] = subsection
    return sectional_div, css_select, "relations_card"

def billRe(soup):
    """Collect the labels of the bill-payment links.

    The section heading is the stripped ``<p>`` text of the first
    ``.pageall_title`` inside the first ``.BHFL_Box_Fullbox``. The labels
    are the stripped ``<p>`` texts of every ``<a>`` in the first
    ``.paybillbox`` element.

    Returns:
        ``({heading: [labels...]}, ".BHFL_Box_Fullbox", "bills_card")``.
    """
    css_select = ".BHFL_Box_Fullbox"
    heading_node = soup.select(css_select)[0].select(".pageall_title")[0].find("p")
    section_heading = heading_node.text.strip()

    bill_links = soup.select(".paybillbox")[0].findAll("a")
    labels = [link.find("p").text.strip() for link in bill_links]

    return {section_heading: labels}, css_select, "bills_card"

def slider(soup):
    """Map each slider card's title to the text of its link.

    For every ``.cbl_init`` element, the stripped text of its first ``<p>``
    becomes the key and the stripped text of its first ``<a>`` becomes the
    value. The value is a single string, not a list; a repeated title keeps
    the last link text.

    Returns:
        ``(sectional_div, ".cbl_init", "slider_card")``.
    """
    css_select = ".cbl_init"
    sectional_div = {}
    for card in soup.select(css_select):
        link_text = card.find("a").text.strip()
        title = card.find("p").text.strip()
        sectional_div[title] = link_text
    return sectional_div, css_select, "slider_card"

def emi_store(soup):
    """Collect the item labels of the EMI-store carousel.

    The title is the stripped text of the first ``<p>`` in the first
    ``.Shop_EMI_store_BG`` element. The labels are the stripped texts of
    every ``<strong>`` inside the third ``.slick-list`` element (index 2).

    Returns:
        ``({title: [labels...]}, ".Shop_EMI_store_BG", "emistore_card")``.
    """
    css_select = ".Shop_EMI_store_BG"
    main_title = soup.select(css_select)[0].find("p").text.strip()

    carousel = soup.select(".slick-list")[2]
    event_list = [item.text.strip() for item in carousel.findAll("strong")]

    return {main_title: event_list}, css_select, "emistore_card"

def hot_offer(soup):
    """Collect the item labels of the hot-offers carousel.

    The title is the stripped text of the first ``<p>`` in the second
    ``.Shop_EMI_store_BG`` element (index 1). The labels are the stripped
    texts of every ``<strong>`` inside the fourth ``.slick-list`` element
    (index 3).

    Returns:
        ``({title: [labels...]}, ".slick-list", "hot_offers")``.
    """
    title_node = soup.select(".Shop_EMI_store_BG")[1].find("p")
    main_title = title_node.text.strip()

    css_select = ".slick-list"
    offers = soup.select(css_select)[3].findAll("strong")
    event_list = [offer.text.strip() for offer in offers]

    return {main_title: event_list}, css_select, "hot_offers"



# Column order of the generated CSV.
_CSV_COLUMNS = [
    "Page Title",
    "Page URL",
    "Main Label",
    "Event Category",
    "Event Action",
    "Event Label",
    "css-selector",
]


def _load_page_soup(url, username, password, chromedriver_path):
    """Log in through Chrome and return the parsed page as BeautifulSoup.

    Fills the ``uname`` / ``psw`` fields, clicks the "Login" button, waits,
    refreshes, and then waits up to 5 seconds for an element with id
    ``slick-track`` before grabbing the page source. A timeout on that last
    wait is only reported; the page source is parsed anyway.
    """
    # Imported locally so this block does not depend on edits to the file's
    # import section. ``find_element(By...)`` replaces the
    # ``find_element_by_*`` helpers, which newer Selenium releases removed.
    from selenium.webdriver.common.by import By

    # NOTE(review): passing the driver path positionally works on older
    # Selenium releases; newer ones expect a ``Service`` object. Confirm
    # against the installed Selenium version.
    browser = webdriver.Chrome(chromedriver_path)
    try:
        browser.get(url)
        browser.find_element(By.NAME, "uname").send_keys(username)
        browser.find_element(By.NAME, "psw").send_keys(password)
        browser.find_element(By.XPATH, '//button[text()="Login"]').click()

        time.sleep(8)
        browser.refresh()
        delay = 5  # seconds
        try:
            WebDriverWait(browser, delay).until(
                lambda drv: drv.find_element(By.ID, "slick-track")
            )
            print("Page is ready!")
        except TimeoutException:
            print("Exception: find ID")

        return BeautifulSoup(browser.page_source, 'html5lib')
    finally:
        # Always end the browser session, even when login or parsing fails.
        browser.quit()


def _collect_event_rows(out_array, page_title, page_url):
    """Expand extractor results into one row (dict) per event label.

    Each item of ``out_array`` is indexed as ``(labels_by_title,
    css_selector, category)``. Every label yields one row carrying the
    quoted page title, page URL, main label (the dict key), category,
    ``<category>_click`` action, the label itself and the CSS selector.
    """
    encoded_title = string_encoder(page_title)
    encoded_url = string_encoder(page_url)
    rows = []
    for result in out_array:
        labels_by_title, css_selector, category = result[0], result[1], result[2]
        action = str(category) + "_click"
        for main_label, labels in labels_by_title.items():
            # Some extractors (e.g. ``slider``) map a title to one string
            # rather than a list. Treat that as a single label so all
            # columns stay aligned row by row.
            if not isinstance(labels, (list, tuple)):
                labels = [labels]
            for label in labels:
                rows.append({
                    "Page Title": encoded_title,
                    "Page URL": encoded_url,
                    "Main Label": string_encoder(main_label),
                    "Event Category": string_encoder(category),
                    "Event Action": string_encoder(action),
                    "Event Label": string_encoder(label),
                    "css-selector": string_encoder(css_selector),
                })
    return rows


def main():
    """Scrape the logged-in page and write its click events to a CSV file.

    Credentials and the target URL are read from the ``WRI_USERNAME``,
    ``WRI_PASSWORD`` and ``WRI_URL`` environment variables (the original
    script had unparseable placeholders here). The result is written to
    ``extract_experia.csv`` in the current directory.
    """
    import os

    try:
        username = os.environ["WRI_USERNAME"]
        password = os.environ["WRI_PASSWORD"]
        wri_url = os.environ["WRI_URL"]
    except KeyError as missing:
        raise SystemExit(
            f"Missing required environment variable: {missing}"
        ) from None
    chromedriver_path = "chromedriver"

    soup = _load_page_soup(wri_url, username, password, chromedriver_path)

    # The title is quoted exactly once, inside _collect_event_rows.
    page_title = soup.title.text if soup.title is not None else ""

    out_array = [
        top_card(soup), rel_card(soup), billRe(soup), slider(soup),
        emi_store(soup), hot_offer(soup), insurance_banner(soup),
        invest_banner(soup), on_store(soup), ex_offers(soup),
        super_stores(soup),
    ]

    rows = _collect_event_rows(out_array, page_title, wri_url)
    final_df = pd.DataFrame(rows, columns=_CSV_COLUMNS)
    final_df.to_csv("extract_experia.csv")


if __name__ == "__main__":
    main()