ext-scrapy.py
import os
from urllib.parse import urljoin, urlparse, urldefrag

import requests
from bs4 import BeautifulSoup  # required for the link extraction below


def download_file(url, save_directory):
    # Send a GET request to the URL
    response = requests.get(url)

    # Extract the file name from the URL path; fall back to a default
    # name for URLs that end in "/" and therefore have no basename
    filename = os.path.basename(urlparse(url).path) or "index.html"

    # Save the file under save_directory with the extracted filename
    file_path = os.path.join(save_directory, filename)
    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File saved successfully to: {file_path}")


def download_website(url, save_directory, visited=None):
    # Track visited URLs so pages that link back to each other
    # do not cause infinite recursion
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)

    # Send a GET request to the main URL
    response = requests.get(url)
    content_type = response.headers.get("Content-Type", "")

    # Create the save directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Check if the response is HTML
    if content_type.startswith("text/html"):
        # Save the HTML content
        download_file(url, save_directory)

        # Extract all the links from the HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        links = [a.get("href") for a in soup.find_all("a") if a.get("href")]

        # Recursively download each linked page
        for link in links:
            # Construct the absolute URL and drop any #fragment so the
            # same page is not fetched once per anchor
            absolute_link = urldefrag(urljoin(url, link))[0]

            # Stay on the same host; this also skips mailto:/javascript: links
            if urlparse(absolute_link).netloc != urlparse(url).netloc:
                continue

            # Recursively call the function to download linked pages
            download_website(absolute_link, save_directory, visited)
    else:
        # If it's not an HTML page, save the file directly
        download_file(url, save_directory)


# Specify the URL of the website you want to clone
url = "https://www.example.com"

# Specify the directory where you want to save the downloaded files
save_directory = "path/to/save-directory"

download_website(url, save_directory)
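
# The flat naming scheme above stores every page under its basename, so
# /a/index.html and /b/index.html would overwrite each other. Below is a
# minimal sketch of an alternative, assuming you want the local tree to
# mirror the URL structure; local_path_for is a hypothetical helper, not
# part of the original script, and it reuses os/urlparse imported above.
def local_path_for(url, save_directory):
    parsed = urlparse(url)
    # Treat a trailing "/" (or an empty path) as the directory index
    path = parsed.path if os.path.basename(parsed.path) else parsed.path + "index.html"
    return os.path.join(save_directory, parsed.netloc, path.lstrip("/"))


# download_file would then create the parent directories before writing:
#     file_path = local_path_for(url, save_directory)
#     os.makedirs(os.path.dirname(file_path), exist_ok=True)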