Untitled
unknown
plain_text
17 days ago
980 B
3
Indexable
Never
import os
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup


def test_links(url, link_endings, *, timeout=30):
    """Return the links on a page whose URL ends with one of the given suffixes.

    The page at ``url`` is fetched and parsed with ``html.parser``. Every
    ``<a>`` tag carrying an ``href`` attribute is examined; an href whose
    scheme is not ``http``/``https`` is joined onto ``url`` first, so
    relative links come back as absolute URLs.

    Args:
        url: Address of the page to scan.
        link_endings: Iterable of suffix strings (e.g. ``['.pdf', '.zip']``).
            A link is kept when its full URL string ends with any of them.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A flat list of matching URLs, in document order. Duplicates are kept.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Materialise once: str.endswith accepts a tuple directly, and a one-shot
    # iterable would otherwise be exhausted after the first anchor.
    endings = tuple(link_endings)

    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']

        # Make sure the href is a complete URL.
        if urllib.parse.urlparse(href).scheme not in ('https', 'http'):
            href = urllib.parse.urljoin(url, href)

        # Keep only links that end with one of the desired extensions.
        if href.endswith(endings):
            links.append(href)

    return links


def download_file(url, folder):
    # NOTE(review): this definition is cut off in the reviewed text; only the
    # lines visible there are reproduced here, unchanged.
    # Extract file name from URL
    filename = os.path.basename(urllib.parse.urlparse(url).path)
Leave a Comment