Untitled

mail@pastecode.io avatar
unknown
python
a year ago
5.6 kB
5
Indexable
Never
# importing models
from pandas_profiling import ProfileReport
from bs4 import BeautifulSoup
import pandas as pd
import requests
import csv
import re

# Root URL of the Maroof website.
root = 'https://maroof.sa'
'''
Each store in Maroof has a page whose URL is a combination of the root
and the store's id number in Maroof. For example, https://maroof.sa/7855
has two parts: the root (https://maroof.sa) and the store's id in
Maroof (7855).
'''
# Generate a store URL for every id in the range 1000-1500 (inclusive).
stores_urls = [f"{root}/{i}" for i in range(1000, 1501)]

# Empty containers, one per extracted field; each scraped page appends
# its values here so everything can later be written to a single CSV.
store_names: list = []            # store display names
busenss_typs: list = []           # business types
commercial_records: list = []     # commercial-record numbers (None for e-stores)
business_chanels: list = []       # business/contact channels
maroof_numbers: list = []         # last five digits of each Maroof number
ratings: list = []                # store ratings on Maroof
number_raters: list = []          # how many users rated each store
about_stores: list = []           # short store descriptions
stores_websites: list = []        # store website URLs
stores_accounts: list = []        # per-store sets of contact channels
maroof_numbers_websites: list = []  # Maroof ids paired with websites
email_addresses: list = []        # extracted e-mail addresses
store_emails_ids: list = []       # Maroof ids paired with e-mails

'''
The loop below downloads each generated store page, parses its HTML,
and appends every extracted field to the matching container list so all
the data can later be merged into one CSV file.
'''
# Compiled once, outside the loop: strips any HTML tag from a fragment.
# (The original recompiled this pattern on every page.)
TAG_RE = re.compile(r'<.*?>')

for url in stores_urls:
    # Request the HTML content of this store's page on Maroof.
    result = requests.get(url)
    soup_maroof = BeautifulSoup(result.content, 'lxml')
    stores_website = soup_maroof.find('a', {'id': "websiteURLAnchor"})
    maroof_number = soup_maroof.find_all('span', {'class': 'maarof-number'})
    accounts = soup_maroof.find_all('p', {'class': "text-primary"})
    if maroof_number:
        accounts.append(maroof_number[0])

    # Extract all contact channels of the store.
    if accounts:
        temp_account = []
        for account in accounts:
            temp_account.append(TAG_RE.sub('', str(account.contents[0])))
        # BUG FIX: the original appended the (still growing) set on every
        # inner iteration, storing several partial copies per store.
        # Exactly one de-duplicated set per store is appended here.
        stores_accounts.append(set(temp_account))

    # Extract the store's website and, when a Maroof number exists, record
    # its last five digits alongside it (used later as a merge key).
    if stores_website:
        stores_websites.append(TAG_RE.sub('', str(stores_website.contents[0])))
        if maroof_number:
            maroof_numbers_websites.append(maroof_number[0].text[-5:])

    # Extract the store name.
    text_primary = soup_maroof.find('h3', {'class': 'media-heading text-primary'})
    if text_primary:
        store_names.append(str(text_primary.contents[1].text))

    # Extract the business type (first <p> inside the first <div>).
    busenss_typ = soup_maroof.div.p
    if busenss_typ:
        busenss_typs.append(str(busenss_typ.contents[0]))

    # Extract the commercial record ("السجل التجاري") and business channel.
    divs = soup_maroof.find_all('div', class_='media-body media-body--width')
    for media_body in divs:
        commercial_record = media_body.find_all('span')[2].text
        # The "e-store" marker means the store has no commercial record.
        if commercial_record == ' متجر إلكتروني  ':
            commercial_records.append(None)
        else:
            commercial_records.append(commercial_record)
        business_chanels.append(media_body.find_all('span')[-1].text)

    # Extract the Maroof number (last five digits only).
    # NOTE: reuses the find_all() result from above instead of querying again.
    if maroof_number:
        maroof_numbers.append(maroof_number[0].text[-5:])

    # Extract the store's rating on the Maroof website.
    rating_num = soup_maroof.find_all('span', {'class': 'rating-num'})
    if rating_num:
        ratings.append(rating_num[1].text)

    # Extract the number of users who rated the store on Maroof.
    number_rater = soup_maroof.find('div', {'class': 'h3 text-primary ma'})
    if number_rater:
        number_raters.append(number_rater.text[:-9])

    # Brief info about what the store does/provides.
    about_store = soup_maroof.find('div', {'class': 'col-xs-12 withScroll'})
    if about_store:
        cleaned = str(about_store.contents[0]).replace("\r\n                ", "")
        about_stores.append(TAG_RE.sub('', cleaned))

# Extract email addresses and Maroof store ids from the contact-channel sets.
EMAIL_RE = re.compile(r"[A-Za-z0-9.\-+_]+@[A-Za-z0-9.\-+_]+\.[A-Za-z]+")

for account in stores_accounts:
    for channel in account:
        # BUG FIX: the original compared whole findall() result lists
        # against the accumulator ("if emails not in email_addresses"),
        # then flattened to each list's first element afterwards.
        # Deduplicate individual addresses directly instead.
        for email in EMAIL_RE.findall(channel):
            if email not in email_addresses:
                email_addresses.append(email)
        # A channel containing "رقم معروف" carries the store's Maroof id
        # in its last five characters.
        if "رقم معروف" in channel:
            store_emails_ids.append(channel[-5:])

# Assemble the scraped columns into data frames, merge them on the
# five-digit Maroof id, and write the combined table to a CSV file.
storesData = pd.DataFrame({
    "store_name": store_names,
    "busenss_type": busenss_typs,
    "commercial_record": commercial_records,
    "business_chanel": business_chanels,
    "maroof_number": maroof_numbers,
    "rating": ratings,
    "number_raters": number_raters,
    "about_stores": about_stores,
})

websites_maroofID = pd.DataFrame({
    "maroof_numbers_websites": maroof_numbers_websites,
    "stores_website": stores_websites,
})

stores_emails = pd.DataFrame({
    'store_ids': store_emails_ids,
    'email_addresses': email_addresses,
})

# Right merge: keep every row that has an email record...
Both_DFs = pd.merge(websites_maroofID, stores_emails,
                    how='right', left_on=['maroof_numbers_websites'],
                    right_on=['store_ids'])

# ...and then keep every store scraped into storesData.
stores_DF = pd.merge(Both_DFs, storesData,
                     how='right', left_on=['store_ids'],
                     right_on=['maroof_number'])

# The join keys are redundant after the merges.
stores_DF = stores_DF.drop(columns=['maroof_numbers_websites', 'store_ids'])
stores_DF.to_csv("stores_DF.csv")
stores_DF