Untitled
python
a month ago
2.6 kB
1
Indexable
Never
import csv
import re
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import Tag
from requests import RequestException, get

# Site root used to resolve the hrefs found on the index page.
BASE_URL = 'https://health-diet.ru'
# Index page that links to every category table.
INDEX_URL = 'https://health-diet.ru/table_calorie/'
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 5
# Directory that receives one CSV file per category.
OUTPUT_DIR = Path('data')
# Exact class attribute of the data table on a category page.
TABLE_CLASS = 'uk-table mzr-tc-group-table uk-table-hover uk-table-striped uk-table-condensed'
# Characters that are illegal (or path separators) in file names on common
# file systems, plus ASCII control characters such as newlines and tabs.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def _safe_filename(name: str) -> str:
    """Return *name* stripped and with file-system-unsafe characters replaced by '_'."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', name.strip()).strip()
    # An all-whitespace link text would otherwise produce the file '.csv'.
    return cleaned or 'unnamed'


def get_products(soup: BeautifulSoup):
    """Return every product item ``div`` found inside the category grids of *soup*.

    Looks up each ``div`` with class ``uk-grid uk-grid-medium`` and collects
    the ``div`` elements with class ``uk-flex mzr-tc-group-item`` beneath it,
    in document order, as one flat list.
    """
    categories = soup.find_all('div', class_='uk-grid uk-grid-medium')
    return [
        product
        for category in categories
        for product in category.find_all('div', class_='uk-flex mzr-tc-group-item')
    ]


def get_links_and_names(soup):
    """Extract category page URLs and their link texts from the index page.

    Returns:
        A ``(links, names)`` pair of equally long lists; ``links[i]`` is the
        absolute URL and ``names[i]`` the anchor text of the same
        ``a.mzr-tc-group-item-href`` element.
    """
    links = []
    names = []
    for item in soup.find_all('a', class_='mzr-tc-group-item-href'):
        href = item.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skipping it (and
            # its name) keeps the two lists aligned.
            continue
        links.append(urljoin(BASE_URL, href))
        names.append(item.text)
    return links, names


def get_table(link):
    """Download *link* and return its data table.

    Returns:
        The ``table`` element whose class attribute equals ``TABLE_CLASS``,
        or ``None`` when the page has no such table or the request failed.
    """
    try:
        page = get(link, timeout=REQUEST_TIMEOUT).text
    except RequestException as error:
        # One unreachable page should not discard every table fetched so far;
        # save_files already copes with a missing (None) table.
        print(f'Failed to fetch {link}: {error}')
        return None
    soup = BeautifulSoup(page, 'lxml')
    return soup.find('table', class_=TABLE_CLASS)


def get_tables(links):
    """Fetch the data table for every URL in *links*, preserving order."""
    print('Getting tables...')
    return [get_table(link) for link in links]


def get_table_elements(table: Tag) -> dict[str, Optional[list]]:
    """Split *table* into header texts and stripped cell texts.

    Returns:
        ``{'headers': [...], 'rows': [[...], ...]}``. Both values are ``None``
        when *table* is ``None``; a table lacking ``thead`` or ``tbody``
        yields an empty list for the corresponding key.
    """
    if table is None:
        return {'headers': None, 'rows': None}

    head = table.find('thead')
    headers = [] if head is None else [cell.text for cell in head.find_all('th')]

    body = table.find('tbody')
    rows = []
    if body is not None:
        rows = [
            [cell.text.strip() for cell in line.find_all('td')]
            for line in body.find_all('tr')
        ]
    return {'headers': headers, 'rows': rows}


def save_files(names, tables):
    """Write each table to ``data/<name>.csv`` (UTF-8).

    *names* and *tables* are paired positionally. A missing table still
    produces an (empty) file, so every category name is represented on disk.
    """
    # open() does not create parent directories.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for name, table in zip(names, tables):
        elements = get_table_elements(table)
        headers = elements['headers']
        rows = elements['rows']
        path = OUTPUT_DIR / f'{_safe_filename(name)}.csv'
        with open(path, 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            if headers:
                writer.writerow(headers)
            if rows:
                writer.writerows(rows)


def main():
    """Scrape every calorie table linked from the index page into CSV files."""
    response = get(INDEX_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # reporting 'Done!' with nothing written.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    print('Getting links...')
    links, names = get_links_and_names(soup)
    # get_tables announces 'Getting tables...' itself.
    tables = get_tables(links)
    print('Writing Files...')
    save_files(names, tables)
    print('Done!')


if __name__ == '__main__':
    main()