Untitled
unknown
plain_text
2 years ago
1.6 kB
3
Indexable
"""Scrape poems from poesiacastellana.es and save them to an Excel file.

The script walks the chronological poet listing page by page, follows each
poet's link to their poem listing, downloads every poem and finally writes
one row per poem (Author, Title, Text) to a spreadsheet.
"""

from urllib.parse import urljoin

import bs4
import pandas as pd
import requests
from tqdm import tqdm

base_url = 'https://www.poesiacastellana.es/'

# Column layout shared by every DataFrame this script produces.
COLUMNS = ['Author', 'Title', 'Text']

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would hang the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# Inclusive range of listing pages to scrape. Raise LAST_PAGE to scrape more.
FIRST_PAGE = 1
LAST_PAGE = 2

# Path of the spreadsheet to write. Change this to your own file name.
# Writing .xlsx files requires an Excel engine such as openpyxl.
OUTPUT_FILE = 'poems.xlsx'


def _fetch_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.content, 'html.parser')


def _listing_rows(soup):
    """Return the data rows of the 'newspaper-b' table, skipping the first row.

    Returns an empty list when the page has no such table, so callers can
    iterate the result unconditionally.
    """
    table = soup.find('table', {'id': 'newspaper-b'})
    if table is None:
        return []
    return table.find_all('tr')[1:]


def get_authors_and_urls(page_number):
    """Return ``(author, author_url)`` tuples found on one listing page.

    Args:
        page_number: value passed as the ``pagina`` query parameter of the
            chronological poet listing.

    Returns:
        A list of ``(author name, href)`` tuples taken from the second link
        of every table row. Rows without a usable second link are skipped.

    Raises:
        requests.RequestException: if the listing page cannot be fetched.
    """
    url = urljoin(base_url, f'poetas-cronologico.php?pagina={page_number}')
    soup = _fetch_soup(url)

    output = []
    for row in _listing_rows(soup):
        links = row.find_all('a')
        # The author link is the second anchor of the row.
        if len(links) < 2 or not links[1].get('href'):
            continue
        output.append((links[1].text, links[1]['href']))
    return output


def get_poem_texts(author, author_url):
    """Download every poem listed on an author's page.

    Args:
        author: author name, stored in the ``Author`` column of each row.
        author_url: href of the author's poem listing; resolved against
            ``base_url``.

    Returns:
        A DataFrame with columns ``Author``, ``Title`` and ``Text`` holding
        one row per poem that could be retrieved. Poems whose page fails to
        download or contains no ``div.poema`` element are reported and
        skipped.

    Raises:
        requests.RequestException: if the author's listing page itself
            cannot be fetched.
    """
    print(f'getting poems for {author}...')
    soup = _fetch_soup(urljoin(base_url, author_url))

    records = []
    for row in tqdm(_listing_rows(soup)):
        link = row.find('a')
        if link is None or not link.get('href'):
            continue
        title = link.text

        try:
            poem_soup = _fetch_soup(urljoin(base_url, link['href']))
        except requests.RequestException as err:
            # One unreachable poem should not abort the whole author;
            # tqdm.write keeps the progress bar intact while reporting it.
            tqdm.write(f'could not fetch {title!r}: {err}')
            continue

        poem_div = poem_soup.find('div', {'class': 'poema'})
        if poem_div is None:
            tqdm.write(f'no poem text found for {title!r}, skipping')
            continue

        records.append({'Author': author, 'Title': title, 'Text': poem_div.text})

    # Build the frame once instead of concatenating inside the loop, which
    # copies all accumulated rows on every iteration.
    return pd.DataFrame(records, columns=COLUMNS)


def scrape_poems(first_page=FIRST_PAGE, last_page=LAST_PAGE):
    """Scrape all poems of the authors on listing pages first_page..last_page.

    Both bounds are inclusive. Returns a single DataFrame with columns
    ``Author``, ``Title`` and ``Text``; it is empty when nothing was found.
    """
    frames = []
    for page_number in range(first_page, last_page + 1):
        for author, author_url in get_authors_and_urls(page_number):
            frames.append(get_poem_texts(author, author_url))

    if not frames:
        return pd.DataFrame(columns=COLUMNS)
    return pd.concat(frames, ignore_index=True)


def main():
    """Run the scrape and write the result to ``OUTPUT_FILE``."""
    df = scrape_poems()
    df.to_excel(OUTPUT_FILE, index=False)


if __name__ == '__main__':
    main()
Editor is loading...