# Untitled
# unknown
# plain_text
# 2 years ago
# 1.6 kB
# 4
# Indexable
import requests
import bs4
import pandas as pd
from tqdm import tqdm
# Root URL of the site; the relative links scraped from its pages are appended to it.
base_url = 'https://www.poesiacastellana.es/'
# Module-level accumulator, initially empty, that the script's main loop
# extends with the poems scraped for each author (one row per poem).
df = pd.DataFrame(columns=['Author', 'Title', 'Text'])
def get_authors_and_urls(page_number):
    """Return the (author name, author page link) pairs listed on one index page.

    Fetches page ``page_number`` of the site's chronological poet listing and
    reads the table whose id is ``newspaper-b``, skipping its first row
    (presumably the header). The second link of each remaining row supplies
    both the author name (link text) and the author page address (``href``,
    returned exactly as it appears in the page).

    Args:
        page_number: Value substituted into the ``pagina`` query parameter.

    Returns:
        A list of ``(author, author_url)`` tuples. Empty when the page has no
        listing table; rows without a usable second link are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    url = f'https://www.poesiacastellana.es/poetas-cronologico.php?pagina={page_number}'
    # A timeout keeps a stalled connection from hanging the scrape forever.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', {'id': 'newspaper-b'})
    if table is None:
        return []

    output = []
    for row in table.find_all('tr')[1:]:
        links = row.find_all('a')
        if len(links) < 2:
            continue
        author_link = links[1]
        author_url = author_link.get('href')
        if author_url is None:
            continue
        output.append((author_link.text, author_url))
    return output
def get_poem_texts(author, author_url):
    """Download every poem linked from an author's page.

    Fetches the author page, reads the table whose id is ``newspaper-b``
    (skipping its first row, presumably the header), follows the first link in
    each remaining row and extracts the text of the ``div`` with class
    ``poema`` from the linked page.

    Args:
        author: Author name, copied into the ``Author`` column of every row.
        author_url: Link to the author's page, resolved against ``base_url``.

    Returns:
        A DataFrame with columns ``Author``, ``Title`` and ``Text``, one row
        per poem, indexed 0..n-1. Empty (columns only) when the author page
        has no poem table. Rows without a link and poem pages without a
        ``poema`` block are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status for the author page or any poem page.
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import urljoin

    print(f'getting poems for {author}...')
    columns = ['Author', 'Title', 'Text']

    # urljoin gives the same result as base_url + href for plain relative
    # links, and also copes with root-relative or absolute hrefs.
    page = requests.get(urljoin(base_url, author_url), timeout=30)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', {'id': 'newspaper-b'})
    if table is None:
        return pd.DataFrame(columns=columns)

    # Collect plain records and build the frame once at the end; calling
    # pd.concat on every iteration re-copies all earlier rows each time.
    records = []
    for row in tqdm(table.find_all('tr')[1:]):
        link = row.find('a')
        if link is None or link.get('href') is None:
            continue
        title = link.text
        poem_page = requests.get(urljoin(base_url, link['href']), timeout=30)
        poem_page.raise_for_status()
        poem_soup = bs4.BeautifulSoup(poem_page.content, 'html.parser')
        poem_block = poem_soup.find('div', {'class': 'poema'})
        if poem_block is None:
            # Report and move on rather than aborting the whole author.
            print(f'no poem text found for {title!r}; skipping')
            continue
        records.append({'Author': author, 'Title': title, 'Text': poem_block.text})

    return pd.DataFrame(records, columns=columns)
# Path of the spreadsheet to write; change it to the file name you want.
# NOTE: DataFrame.to_excel needs an Excel writer engine (e.g. openpyxl) installed.
OUTPUT_FILE = 'poems.xlsx'

# Gather one frame per author and concatenate once at the end, instead of
# re-copying the growing result after every author.
frames = []
for i in range(1, 3):  # Change this range if you want to scrape more pages
    for author, author_url in get_authors_and_urls(i):
        frames.append(get_poem_texts(author, author_url))

# Leave out empty frames so they cannot affect the dtypes of the result; when
# nothing was scraped, df stays the empty frame it already was.
frames = [frame for frame in frames if not frame.empty]
if frames:
    df = pd.concat(frames, ignore_index=True)

df.to_excel(OUTPUT_FILE, index=False)