Untitled
unknown
plain_text
a year ago
980 B
11
Indexable
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import urllib.request
def test_links(url, link_endings, timeout=30):
    """Collect the links on a page that point at files with wanted endings.

    Fetches ``url``, parses the response body as HTML and inspects every
    ``<a>`` tag that carries an ``href`` attribute. An href whose scheme is
    not ``http``/``https`` is resolved against ``url`` before it is checked.

    Args:
        url: Address of the page to scan.
        link_endings: A single suffix string or an iterable of suffix
            strings, e.g. ``[".pdf", ".zip"]``.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block indefinitely.

    Returns:
        A flat list of matching link URLs in document order. Duplicates
        are kept.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.RequestException: For connection problems or
            when ``timeout`` expires.
    """
    # Normalise the endings once: a bare string means one suffix (not a
    # sequence of characters), and a tuple both survives repeated use
    # (unlike a generator) and is what str.endswith accepts directly.
    if isinstance(link_endings, str):
        endings = (link_endings,)
    else:
        endings = tuple(link_endings)

    # Fetch the page and fail loudly on HTTP error statuses.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']

        # Make sure the href is a complete URL.
        if urllib.parse.urlparse(href).scheme not in ('https', 'http'):
            href = urllib.parse.urljoin(url, href)

        # Check the path component as well as the full URL so that links
        # such as "file.pdf?v=2" or "file.pdf#page=3" are still recognised.
        path = urllib.parse.urlparse(href).path
        if href.endswith(endings) or path.endswith(endings):
            links.append(href)

    return links
def download_file(url, folder):
# Extract file name from URL
filename = os.path.basename(urllib.parse.urlparse(url).path)
Editor is loading...
Leave a Comment