# Untitled

import requests
from bs4 import BeautifulSoup
import re

def scrape_text(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it an unresponsive server
            would block the caller indefinitely.

    Returns:
        The text extracted from the page by BeautifulSoup's ``get_text()``
        when the server answers with HTTP 200; otherwise the literal string
        ``"Failed to scrape the website"``.

    Raises:
        requests.exceptions.RequestException: Network-level failures
            (connection errors, timeouts, invalid URLs) are not caught here
            and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported through the sentinel string that
    # callers already check for, rather than through an exception.
    if response.status_code != 200:
        return "Failed to scrape the website"

    # Hand BeautifulSoup the raw bytes so it can detect the encoding itself.
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()

def preprocess_text(text):
    """Strip URLs and punctuation from *text*.

    Two passes are applied in order:

    1. URLs are deleted: anything starting with ``http://`` or ``https://``,
       or with a literal ``www.``, up to the next whitespace character.
    2. Every remaining non-word character (anything that ``\\W`` matches,
       including whitespace and punctuation) is replaced by a single space.
       Runs of spaces are not collapsed, and underscores are kept because
       they count as word characters.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Require the scheme separator and escape the dot after "www" so that
    # ordinary words such as "httpd" or "wwwhat" are not mistaken for URLs.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Replace every special (non-word) character with a space.
    text = re.sub(r'\W', ' ', text)
    return text
# Editor is loading...