Untitled

mail@pastecode.io avatar
unknown
plain_text
17 days ago
980 B
3
Indexable
Never
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import urllib.request


def test_links(url, link_endings, timeout=10):
    """Return the links on the page at *url* that end with a given suffix.

    The page is fetched and parsed, and every ``<a>`` tag carrying an
    ``href`` attribute is inspected. An href whose scheme is not ``http``
    or ``https`` is resolved against *url* before the suffix check.

    Args:
        url: Address of the page to scan; also the base used to resolve
            relative hrefs.
        link_endings: Iterable of string suffixes (e.g. ``['.pdf', '.zip']``).
            A link is kept when its full URL ends with any of them.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits forever on an unresponsive host.

    Returns:
        A flat list of matching URLs in document order; duplicates are kept.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Materialise the suffixes once: str.endswith accepts a tuple, and a
    # one-shot iterable (such as a generator) would otherwise be exhausted
    # after the first link was checked.
    endings = tuple(link_endings)

    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']

        # Anything that is not already an http(s) URL is resolved against
        # the page URL.
        if urllib.parse.urlparse(href).scheme not in ('https', 'http'):
            href = urllib.parse.urljoin(url, href)

        if href.endswith(endings):
            links.append(href)

    return links


def download_file(url, folder):
    """Work out the file name that *url* points at.

    NOTE(review): the visible body only computes the file name. ``folder``
    is not used and nothing is returned, so the remainder of this function
    appears to be missing -- TODO confirm against the full source.
    """
    # Extract file name from URL: the last segment of the URL's path
    # component (only ``.path`` is used, so query string and fragment are
    # not part of the name).
    filename = os.path.basename(urllib.parse.urlparse(url).path)
Leave a Comment