Untitled
unknown
plain_text
a year ago
854 B
4
Indexable
import re

import requests
from bs4 import BeautifulSoup


def scrape_text(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The text extracted from the page's HTML when the server answers with
        status 200; otherwise the literal string
        ``"Failed to scrape the website"``.

    Raises:
        requests.exceptions.RequestException: Connection errors and timeouts
            raised by ``requests.get`` are not caught here and propagate to
            the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported through the sentinel string that
    # existing callers may already compare against, so it is kept verbatim.
    if response.status_code != 200:
        return "Failed to scrape the website"

    # Parse the raw bytes so BeautifulSoup can detect the page encoding.
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()


def preprocess_text(text):
    """Strip URLs and non-word characters from *text*.

    Two passes are applied in order:

    1. Every run starting with ``http`` or ``www.`` and continuing up to the
       next whitespace is removed.
    2. Every remaining non-word character (anything matched by ``\\W``,
       including whitespace and punctuation) is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Replacements are one-for-one, so runs of
        punctuation or whitespace become runs of spaces rather than being
        collapsed.
    """
    # The dot after "www" is escaped so that only a literal "www." prefix is
    # treated as a URL; an unescaped dot would also delete ordinary words
    # that merely contain "www" followed by more letters.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Replace each remaining special character with a space.
    text = re.sub(r'\W', ' ', text)
    return text
Editor is loading...
Leave a Comment