Ext-scrapy.py

import requests
import os
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def download_file(url, save_directory):
    # Send a GET request to the URL
    response = requests.get(url)

    # Extract the file name from the URL; fall back to "index.html" when the
    # URL path ends in "/" and has no basename
    filename = os.path.basename(urlparse(url).path) or "index.html"

    # Save the file under save_directory with the extracted filename
    file_path = os.path.join(save_directory, filename)
    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File saved successfully to: {file_path}")

def download_website(url, save_directory, depth=0, max_depth=3):
    # Send a GET request to the URL being crawled
    response = requests.get(url)
    # Use .get() so a missing Content-Type header does not raise a KeyError
    content_type = response.headers.get("Content-Type", "")

    # Create the save directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Check if the response is HTML and the recursion limit has not been reached
    if content_type.startswith("text/html") and depth < max_depth:
        # Save the HTML content
        download_file(url, save_directory)

        # Extract all link targets from the HTML content, skipping <a> tags
        # that have no href attribute
        soup = BeautifulSoup(response.content, "html.parser")
        links = [link.get("href") for link in soup.find_all("a") if link.get("href")]

        # Recursively download each linked page
        for link in links:
            # Construct the absolute URL for the linked page
            absolute_link = urljoin(url, link)
            # Recursively call the function to download linked pages
            download_website(absolute_link, save_directory, depth + 1, max_depth)

    else:
        # If it's not an HTML page (or the depth limit was reached), save the file directly
        download_file(url, save_directory)
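
# The crawler above follows every <a> link it finds, including links that point
# to other sites. A minimal, hypothetical helper for keeping the crawl on the
# starting host (the name is_same_domain is an assumption, not part of the
# original script); it could be used to filter absolute_link before recursing,
# e.g. `if is_same_domain(url, absolute_link): ...`
def is_same_domain(start_url, candidate_url):
    # urlparse(...).netloc is the "host[:port]" portion of the URL
    return urlparse(start_url).netloc == urlparse(candidate_url).netloc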

# Specify the URL of the website you want to clone
url = "https://www.example.com"

# Specify the directory where you want to save the downloaded files
save_directory = "path/to/save-directory"

# Set the maximum depth of recursion
max_depth = 3

download_website(url, save_directory, max_depth=max_depth)
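
# Because pages often link back to each other, the recursion above can fetch
# the same URL many times before max_depth runs out. Below is a rough sketch of
# a deduplicating variant; the function name download_website_once and the
# visited-set parameter are illustrative assumptions, not part of the original
# script. It reuses download_file() for saving and only changes the recursion.
def download_website_once(url, save_directory, depth=0, max_depth=3, visited=None):
    if visited is None:
        visited = set()
    # Skip URLs that were already fetched during this crawl
    if url in visited:
        return
    visited.add(url)

    response = requests.get(url)
    content_type = response.headers.get("Content-Type", "")
    os.makedirs(save_directory, exist_ok=True)

    # Always save the current resource, then recurse only through HTML pages
    download_file(url, save_directory)
    if content_type.startswith("text/html") and depth < max_depth:
        soup = BeautifulSoup(response.content, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                download_website_once(urljoin(url, href), save_directory,
                                       depth + 1, max_depth, visited)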