Untitled
unknown
plain_text
2 years ago
854 B
7
Indexable
import requests
from bs4 import BeautifulSoup
import re
def scrape_text(url, timeout=10):
    """Fetch *url* and return the text content of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, so a finite default is used.

    Returns:
        The text extracted from the parsed HTML when the server answers
        with status 200; otherwise the string
        ``"Failed to scrape the website"``.

    Note:
        Network-level failures (connection errors, timeouts) are not caught
        here; whatever ``requests`` raises propagates to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported through the sentinel message
    # instead of an exception, matching what existing callers expect.
    if response.status_code != 200:
        return "Failed to scrape the website"

    # Parse the raw bytes with the built-in parser and flatten the document
    # to its text content.
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()
def preprocess_text(text):
    """Strip URLs and special characters from *text*.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which every URL-like token (starting with
        ``http`` or ``www.``) has been removed and every remaining
        non-word character has been replaced by a single space. Word
        characters (letters, digits, underscore) are kept unchanged.
    """
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix counts; unescaped, it would also swallow ordinary words
    # that merely contain "www" followed by any character.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Replace each non-word character with a space (one-for-one, so runs of
    # punctuation become runs of spaces).
    text = re.sub(r'\W', ' ', text)
    return text
Leave a Comment