import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
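
# A simple website cloner: download_website() fetches the page at `url`,
# follows the <a href> links it finds (up to `max_depth` levels deep), and
# saves every fetched file into `save_directory` via download_file().
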
def download_file(url, save_directory):
    # Send a GET request to the URL
    response = requests.get(url)

    # Extract the file name from the URL; fall back to "index.html" for
    # URLs whose path is empty or ends in a trailing slash
    filename = os.path.basename(urlparse(url).path) or "index.html"

    # Save the file under save_directory with the extracted filename
    file_path = os.path.join(save_directory, filename)
    with open(file_path, "wb") as file:
        file.write(response.content)
    print(f"File saved successfully to: {file_path}")

def download_website(url, save_directory, depth=0, max_depth=3):
    # Send a GET request to the main URL
    response = requests.get(url)
    # Use .get() so a missing Content-Type header doesn't raise a KeyError
    content_type = response.headers.get("Content-Type", "")

    # Create the save directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Check whether the response is HTML and the depth limit hasn't been reached
    if content_type.startswith("text/html") and depth < max_depth:
        # Save the HTML content
        download_file(url, save_directory)

        # Extract all the links from the HTML content, skipping <a> tags
        # that have no href attribute
        soup = BeautifulSoup(response.content, "html.parser")
        links = [link.get("href") for link in soup.find_all("a") if link.get("href")]

        # Recursively download each linked page
        for link in links:
            # Construct the absolute URL for the linked page
            absolute_link = urljoin(url, link)
            # Skip non-HTTP(S) links such as mailto: or javascript: URLs
            if urlparse(absolute_link).scheme not in ("http", "https"):
                continue
            # Recursively call the function to download linked pages
            download_website(absolute_link, save_directory, depth + 1, max_depth)
    else:
        # If it's not an HTML page (or the depth limit is reached),
        # save the file directly
        download_file(url, save_directory)

# Specify the URL of the website you want to clone
url = "https://www.example.com"
# Specify the directory where you want to save the downloaded files
save_directory = "path/to/save-directory"
# Set the maximum depth of recursion
max_depth = 3
download_website(url, save_directory, max_depth=max_depth)