Untitled

mail@pastecode.io avatar
unknown
plain_text
7 months ago
1.6 kB
0
Indexable
Never

from urllib.parse import urljoin

import bs4
import pandas as pd
import requests
from tqdm import tqdm

# Site root; the relative links scraped from the listing pages are appended to it.
base_url = 'https://www.poesiacastellana.es/'

# Result table, one row per poem (author name, poem title, poem text). Starts empty.
df = pd.DataFrame(columns=['Author', 'Title', 'Text'])

def get_authors_and_urls(page_number):
	"""Scrape one page of the chronological poet index.

	Args:
		page_number: Value for the ``pagina`` query parameter of the index page.

	Returns:
		A list of ``(author, author_url)`` tuples, one per usable table row.
		``author`` is the text of the row's second link and ``author_url`` is
		that link's ``href`` exactly as it appears in the page. The list is
		empty when the page contains no ``newspaper-b`` table.

	Raises:
		requests.RequestException: On a connection problem, a timeout, or an
			HTTP error status (``requests.HTTPError``).
	"""
	url = f'{base_url}poetas-cronologico.php?pagina={page_number}'
	# Without a timeout, requests waits forever on a stalled connection.
	page = requests.get(url, timeout=30)
	# Fail loudly on 4xx/5xx instead of trying to parse an error page.
	page.raise_for_status()
	soup = bs4.BeautifulSoup(page.content, 'html.parser')
	table = soup.find('table', {'id': 'newspaper-b'})
	if table is None:
		return []
	output = []
	# The first <tr> is skipped (presumably the header row).
	for row in table.find_all('tr')[1:]:
		links = row.find_all('a')
		# The author link is the second anchor in the row; skip rows without one.
		if len(links) < 2 or not links[1].get('href'):
			continue
		author_link = links[1]
		output.append((author_link.text, author_link['href']))
	return output
	
def get_poem_texts(author, author_url):
	"""Download every poem listed on an author's page.

	Args:
		author: Author name; printed as progress and stored in the ``Author``
			column of every returned row.
		author_url: Link to the author's page, resolved against ``base_url``.

	Returns:
		A DataFrame with columns ``Author``, ``Title`` and ``Text``, one row per
		poem that could be read. It is empty when the author page has no
		``newspaper-b`` table.

	Raises:
		requests.RequestException: On a connection problem, a timeout, or an
			HTTP error status (``requests.HTTPError``).
	"""
	print(f'getting poems for {author}...')
	columns = ['Author', 'Title', 'Text']
	# Without a timeout, requests waits forever on a stalled connection.
	page = requests.get(urljoin(base_url, author_url), timeout=30)
	page.raise_for_status()
	soup = bs4.BeautifulSoup(page.content, 'html.parser')
	table = soup.find('table', {'id': 'newspaper-b'})
	if table is None:
		return pd.DataFrame(columns=columns)
	# Collect plain tuples and build the frame once at the end; calling
	# pd.concat on every iteration re-copies all earlier rows each time.
	records = []
	# The first <tr> is skipped (presumably the header row).
	for row in tqdm(table.find_all('tr')[1:]):
		link = row.find('a')
		if link is None or not link.get('href'):
			continue
		title = link.text
		poem_page = requests.get(urljoin(base_url, link['href']), timeout=30)
		poem_page.raise_for_status()
		poem_soup = bs4.BeautifulSoup(poem_page.content, 'html.parser')
		poem_div = poem_soup.find('div', {'class': 'poema'})
		if poem_div is None:
			# One page without poem markup should not abort the whole scrape.
			# tqdm.write prints without breaking the progress bar.
			tqdm.write(f'no poem text found for "{title}", skipping')
			continue
		records.append((author, title, poem_div.text))
	return pd.DataFrame(records, columns=columns)

# Walk the author index pages and collect one DataFrame of poems per author.
frames = []
for i in range(1,3): # Change this range if you want to scrape more pages
	for author, author_url in get_authors_and_urls(i):
		frames.append(get_poem_texts(author, author_url))

# Concatenate once at the end; doing it per author re-copies all earlier rows.
# Empty frames are dropped, and if nothing was scraped the initial empty `df`
# is written so the output still has the column headers.
frames = [frame for frame in frames if not frame.empty]
if frames:
	df = pd.concat(frames, ignore_index=True)

output_file = 'poems.xlsx' # Change this to your output file name
df.to_excel(output_file, index=False)