Untitled
unknown
python
2 years ago
5.6 kB
8
Indexable
# importing modules
from pandas_profiling import ProfileReport
from bs4 import BeautifulSoup
import pandas as pd
import requests
import csv
import logging
import re

# Scrape store profile pages from the Maroof website and export the
# extracted fields to "stores_DF.csv".
#
# Every store is parsed into ONE record (a dict holding all output columns,
# with None for anything the page does not provide).  Building the table from
# per-store records keeps the columns aligned; collecting each field in its
# own list breaks as soon as a single page lacks a single element, because
# pandas refuses to build a DataFrame from lists of different lengths.

logger = logging.getLogger(__name__)

# Root URL of the Maroof website.
root = 'https://maroof.sa'

# Each store in Maroof has a page whose URL is the root followed by the
# store's id in Maroof, for example https://maroof.sa/7855 is made of the
# root (https://maroof.sa) and the store's id (7855).
# Generate the store URLs for the ids 1000-1500 (inclusive).
stores_urls = ["{}/{}".format(root, store_id) for store_id in range(1000, 1501)]

# Seconds to wait for a response before giving up on a page, so that one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Compiled once instead of on every loop iteration.
TAG_RE = re.compile(r'<.*?>')
EMAIL_RE = re.compile(r"[A-Za-z0-9\.\-+_]+@[A-Za-z0-9\.\-+_]+\.[A-Za-z]+")

# Text found in the commercial-record slot when the page shows no record;
# such stores get None as their commercial record.
NO_COMMERCIAL_RECORD_TEXT = ' متجر إلكتروني '

# Columns of the exported table, in output order.
OUTPUT_COLUMNS = [
    "stores_website",
    "email_addresses",
    "store_name",
    "busenss_type",
    "commercial_record",
    "business_chanel",
    "maroof_number",
    "rating",
    "number_raters",
    "about_stores",
]


def first_child_text(node):
    """Return the first child of *node* as a string with HTML tags removed.

    Returns None when *node* is None or has no children.
    """
    if node is None or not node.contents:
        return None
    return TAG_RE.sub('', str(node.contents[0]))


def find_email(soup):
    """Return the first email address found in the store's contact entries.

    The contact entries are the ``<p class="text-primary">`` elements; they
    are scanned in document order so the result is deterministic.  Returns
    None when no entry contains an email address.
    """
    for entry in soup.find_all('p', {'class': "text-primary"}):
        entry_text = first_child_text(entry)
        if entry_text is None:
            continue
        found = EMAIL_RE.search(entry_text)
        if found:
            return found.group(0)
    return None


def parse_store_page(soup):
    """Extract one store record from a parsed Maroof page.

    Returns a dict with every key of OUTPUT_COLUMNS (None for fields the page
    does not provide), or None when the page has neither a store heading nor
    a Maroof number and is therefore treated as not being a store page.
    """
    heading = soup.find('h3', {'class': 'media-heading text-primary'})
    number_span = soup.find('span', {'class': 'maarof-number'})
    if heading is None and number_span is None:
        return None

    record = dict.fromkeys(OUTPUT_COLUMNS)

    # Store website
    record["stores_website"] = first_child_text(
        soup.find('a', {'id': "websiteURLAnchor"}))

    # Email address
    record["email_addresses"] = find_email(soup)

    # Store name: the second child of the heading element.
    if heading is not None and len(heading.contents) > 1:
        record["store_name"] = str(heading.contents[1].text)

    # Business type: first child of the first <p> inside the first <div>.
    first_div = soup.div
    type_node = first_div.p if first_div is not None else None
    if type_node is not None and type_node.contents:
        record["busenss_type"] = str(type_node.contents[0])

    # Commercial record (third <span>) and business channel (last <span>)
    # of the store's media body.
    media_body = soup.find('div', class_='media-body media-body--width')
    if media_body is not None:
        spans = media_body.find_all('span')
        if len(spans) > 2:
            commercial_record = spans[2].text
            if commercial_record != NO_COMMERCIAL_RECORD_TEXT:
                record["commercial_record"] = commercial_record
        if spans:
            record["business_chanel"] = spans[-1].text

    # Maroof number: the last five characters of the span text
    # (presumably the numeric id plus padding -- TODO confirm on a live page).
    if number_span is not None:
        record["maroof_number"] = number_span.text[-5:]

    # Store rating on Maroof: the second "rating-num" span.
    rating_spans = soup.find_all('span', {'class': 'rating-num'})
    if len(rating_spans) > 1:
        record["rating"] = rating_spans[1].text

    # Number of raters of the store on Maroof; the trailing nine characters
    # are dropped (presumably a fixed label -- TODO confirm on a live page).
    raters_node = soup.find('div', {'class': 'h3 text-primary ma'})
    if raters_node is not None:
        record["number_raters"] = raters_node.text[:-9]

    # Brief info about what each store does/provides.
    about_node = soup.find('div', {'class': 'col-xs-12 withScroll'})
    if about_node is not None and about_node.contents:
        about_text = str(about_node.contents[0]).replace("\r\n ", "")
        record["about_stores"] = TAG_RE.sub('', about_text)

    return record


def scrape_stores(urls):
    """Download every URL in *urls* and return the list of store records.

    Pages that cannot be downloaded are logged and skipped, and pages that do
    not look like a store page are skipped, so a single bad page does not
    abort the whole run.
    """
    records = []
    # One session for all requests so connections are reused.
    with requests.Session() as session:
        for url in urls:
            try:
                result = session.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as error:
                logger.warning("Skipping %s: %s", url, error)
                continue
            soup_maroof = BeautifulSoup(result.content, 'lxml')
            record = parse_store_page(soup_maroof)
            if record is not None:
                records.append(record)
    return records


# Kept at module level (no __main__ guard) so the script behaves the same
# whether it is run as a file or pasted into a notebook cell, where the
# trailing expression displays the resulting table.
# Store the data in a data frame and then convert it to a CSV file.
stores_DF = pd.DataFrame(scrape_stores(stores_urls), columns=OUTPUT_COLUMNS)
stores_DF.to_csv("stores_DF.csv")
stores_DF
Editor is loading...