import csv
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag
from requests import get
def get_products(soup: BeautifulSoup):
    """Collect every product element from all category grids on the page.

    Looks up each ``div`` with class ``uk-grid uk-grid-medium`` in *soup*
    and, inside each of them, every ``div`` with class
    ``uk-flex mzr-tc-group-item``.

    Returns:
        A flat list of the product elements, in document order of their
        enclosing category grids.
    """
    return [
        item
        for grid in soup.find_all('div', class_='uk-grid uk-grid-medium')
        for item in grid.find_all('div', class_='uk-flex mzr-tc-group-item')
    ]
def get_links_and_names(soup):
    """Extract the absolute URL and display name of every product group link.

    Scans *soup* for ``a`` elements with class ``mzr-tc-group-item-href``.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        A ``(links, names)`` tuple of two parallel lists: the absolute URLs
        and the corresponding anchor texts.
    """
    base_url = 'https://health-diet.ru'
    links = []
    names = []
    for anchor in soup.find_all('a', class_='mzr-tc-group-item-href'):
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skipping it keeps
            # the two lists parallel instead of crashing on None + str.
            continue
        # urljoin copes with root-relative, relative and already-absolute
        # hrefs, where plain concatenation would produce a broken URL.
        links.append(urljoin(base_url, href))
        names.append(anchor.text)
    return links, names
def get_table(link, timeout=5):
    """Download *link* and return its calorie table element.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run; the
            default matches the timeout used for the index page in ``main``.

    Returns:
        The result of ``soup.find`` for the table with the expected class
        string. Callers treat a falsy result as "no table on this page".
    """
    page = get(link, timeout=timeout).text
    soup = BeautifulSoup(page, 'lxml')
    return soup.find(
        'table',
        class_='uk-table mzr-tc-group-table uk-table-hover uk-table-striped uk-table-condensed',
    )
def get_tables(links):
    """Fetch the table for every URL in *links*.

    Prints a progress message, then calls ``get_table`` once per link in
    order.

    Returns:
        A list with one ``get_table`` result per link, in the same order.
    """
    print('Getting tables...')
    return [get_table(url) for url in links]
def get_table_elements(table: Tag) -> dict[str, list]:
    """Extract the header texts and the stripped cell texts from *table*.

    Args:
        table: Parsed table element, or a falsy value when the page had no
            table.

    Returns:
        A dict with two keys:

        * ``'headers'`` - text of every ``th`` inside ``thead``.
        * ``'rows'`` - one list per ``tr`` inside ``tbody``, holding the
          whitespace-stripped text of each ``td``.

        Both values are ``None`` when *table* is falsy. A table that lacks a
        ``thead`` or ``tbody`` yields an empty list for that part instead of
        raising ``AttributeError``.
    """
    if not table:
        # noinspection PyTypeChecker
        return {'headers': None, 'rows': None}

    thead = table.find('thead')
    tbody = table.find('tbody')

    # Build fresh lists rather than overwriting the lookup results in place.
    headers = [] if thead is None else [th.text for th in thead.find_all('th')]
    rows = []
    if tbody is not None:
        for tr in tbody.find_all('tr'):
            rows.append([td.text.strip() for td in tr.find_all('td')])
    return {'headers': headers, 'rows': rows}
def save_files(names, tables, out_dir='data'):
    """Write each table to ``<out_dir>/<name>.csv``.

    Args:
        names: File stems, one per table.
        tables: Table elements parallel to *names*; each is passed to
            ``get_table_elements``. Pairs are formed with ``zip``, so any
            surplus entries in the longer sequence are ignored.
        out_dir: Directory that receives the CSV files. It is created when
            missing, so a fresh checkout does not fail on the first write.

    A table that yields no headers still produces an (empty) file.
    """
    os.makedirs(out_dir, exist_ok=True)
    for name, table in zip(names, tables):
        # Parse before opening so a parsing error leaves no stray empty file.
        elements = get_table_elements(table)
        headers = elements['headers']
        rows = elements['rows']
        path = os.path.join(out_dir, f'{name}.csv')
        with open(path, 'w', encoding='utf-8', newline='') as file:
            if headers:
                writer = csv.writer(file)
                writer.writerow(headers)
                writer.writerows(rows)
def main():
    """Scrape the calorie tables from health-diet.ru and save them as CSV.

    Downloads the index page, collects the group links and names from it,
    fetches the table behind every link, and writes one CSV file per name.
    Progress messages are printed along the way.
    """
    src = get('https://health-diet.ru/table_calorie/', timeout=5).text
    soup = BeautifulSoup(src, 'lxml')
    print('Getting links...')
    links, names = get_links_and_names(soup)
    # get_tables() prints its own 'Getting tables...' line, so printing it
    # here as well would duplicate the message.
    tables = get_tables(links)
    print('Writing Files...')
    save_files(names, tables)
    print('Done!')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()