# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 2.6 kB
# 2
# Indexable
import csv
from bs4 import BeautifulSoup
from requests import get

from bs4.element import Tag


def get_products(soup: BeautifulSoup):
    """Collect the product elements from every category grid in *soup*.

    Each ``div`` with class ``uk-grid uk-grid-medium`` is searched for
    ``div`` elements with class ``uk-flex mzr-tc-group-item``.

    Returns:
        A single flat list of the matched product elements, grid by grid,
        in the order ``find_all`` yields them.
    """
    return [
        entry
        for grid in soup.find_all('div', class_='uk-grid uk-grid-medium')
        for entry in grid.find_all('div', class_='uk-flex mzr-tc-group-item')
    ]


def get_links_and_names(soup):
    """Extract category page URLs and their display names from *soup*.

    Every ``a`` element with class ``mzr-tc-group-item-href`` contributes
    one link (its ``href`` prefixed with the site root) and one name (its
    text content, unmodified).

    Returns:
        A ``(links, names)`` tuple of two lists whose entries match by index.
    """
    site_root = 'https://health-diet.ru'
    anchors = soup.find_all('a', class_='mzr-tc-group-item-href')

    links = [site_root + anchor.get('href') for anchor in anchors]
    names = [anchor.text for anchor in anchors]
    return links, names


def get_table(link, timeout=5):
    """Download *link* and return the calorie table element found on it.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Defaults to 5, the same value ``main`` uses for the index
            page.

    Returns:
        The result of looking up the ``table`` with the expected class
        list; ``None`` when the page contains no such table.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would freeze the whole scraping run.
    html = get(link, timeout=timeout).text
    soup = BeautifulSoup(html, 'lxml')
    return soup.find(
        'table',
        class_='uk-table mzr-tc-group-table uk-table-hover uk-table-striped uk-table-condensed',
    )


def get_tables(links):
    """Fetch the calorie table for every URL in *links*.

    Args:
        links: Iterable of page URLs.

    Returns:
        A list holding one ``get_table`` result per link, in input order.
        Entries are ``None`` for pages where no table was found.
    """
    # No progress message here: main() prints 'Getting tables...' right
    # before calling this function, so printing it again duplicated the line.
    return [get_table(link) for link in links]


def get_table_elements(table: "Tag | None") -> dict[str, list]:
    """Split a table element into header labels and data rows.

    Args:
        table: The ``table`` element to read, or ``None`` when the page
            had no table.

    Returns:
        A dict with the keys ``'headers'`` and ``'rows'``. Both values are
        ``None`` when *table* is falsy. Otherwise ``'headers'`` is the list
        of ``th`` texts and ``'rows'`` is a list of rows, each a list of
        whitespace-stripped ``td`` texts. A table without ``thead`` or
        ``tbody`` yields an empty list for that part instead of raising
        ``AttributeError``.
    """
    if not table:
        # noinspection PyTypeChecker
        return {'headers': None, 'rows': None}

    # find() returns None for a missing section, so guard before descending.
    head = table.find('thead')
    if head is None:
        headers = []
    else:
        headers = [cell.text for cell in head.find_all('th')]

    body = table.find('tbody')
    if body is None:
        rows = []
    else:
        rows = [
            [cell.text.strip() for cell in tr.find_all('td')]
            for tr in body.find_all('tr')
        ]

    return {'headers': headers, 'rows': rows}


def save_files(names, tables):
    """Write each table to ``data/<name>.csv``.

    Args:
        names: Names used to build the output file names.
        tables: Table elements (or ``None``) paired with *names* by
            position; pairing stops at the shorter of the two sequences.

    One file is created per pair. It receives the header line followed by
    the data rows when the table yielded headers; otherwise it stays empty.
    """
    from pathlib import Path  # local import: the file header has no pathlib

    out_dir = Path('data')
    # open() does not create missing directories, so ensure it exists first.
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, table in zip(names, tables):
        # Names come from page text: trim surrounding whitespace and replace
        # path separators so the file always lands directly inside data/.
        safe_name = name.strip().replace('/', '_').replace('\\', '_')
        elements = get_table_elements(table)
        headers = elements['headers']
        rows = elements['rows']

        with open(out_dir / f'{safe_name}.csv', 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)

            if headers:
                writer.writerow(headers)
                for row in rows:
                    writer.writerow(row)


def main():
    """Scrape the calorie tables index and save every table as a CSV file.

    Downloads the index page, collects the linked pages and their names,
    fetches the table from each page and hands everything to ``save_files``.
    Progress is reported on standard output.
    """
    response = get('https://health-diet.ru/table_calorie/', timeout=5)
    index_soup = BeautifulSoup(response.text, 'lxml')

    print('Getting links...')
    page_links, page_names = get_links_and_names(index_soup)

    print('Getting tables...')
    page_tables = get_tables(page_links)

    print('Writing Files...')
    save_files(page_names, page_tables)
    print('Done!')


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()