from datetime import datetime
from tqdm import tqdm
import requests
import re
import time
from collections import deque
from urllib.parse import urlparse
import os
from urllib.robotparser import RobotFileParser

def check_robots_txt(agent, url):
    """Check whether `agent` may fetch `url` according to the site's robots.txt."""
    reason = ''
    can_fetch = False
    parsed = urlparse(url)
    # robots.txt always lives at the site root, not relative to the page being checked
    robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    try:
        response = requests.get(robots_txt_url)
        if response.status_code == 200:
            rules = response.text.split('\n')
            applicable_rules = [rule for rule in rules if 'User-agent:' in rule or 'Disallow:' in rule]
            record_for_agent = False
            for rule in applicable_rules:
                if 'User-agent:' in rule:
                    user_agent = rule.split(':', 1)[1].strip()
                    record_for_agent = (user_agent == agent or user_agent == '*')
                if record_for_agent and 'Disallow:' in rule:
                    disallowed_path = rule.split(':', 1)[1].strip()
                    # An empty Disallow value means the agent may fetch everything
                    if disallowed_path and parsed.path.startswith(disallowed_path):
                        reason = f"Disallowed by robots.txt under the path: {disallowed_path}"
                        return False, reason
            can_fetch = True
        else:
            reason = 'Unable to fetch robots.txt'
    except Exception as e:
        reason = f'An error occurred: {str(e)}'

    return can_fetch, reason
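
# Alternative sketch: the standard library's RobotFileParser (imported above) performs the
# same permission check without hand-parsing the rules. This is only a sketch of the
# equivalent call; the crawler below keeps using check_robots_txt, which also returns a
# human-readable reason string.
def check_robots_txt_with_parser(agent, url):
    parsed = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    parser.read()  # fetch and parse robots.txt
    return parser.can_fetch(agent, url)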

def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def write_links_to_txt(links, filename, mode='a'):
    with open(filename, mode, encoding='utf-8') as file:
        for link in links:
            file.write(link + '\n')

def generate_seasons(start_year, end_year):
    seasons = []
    for year in range(start_year, end_year + 1):
        season = f"{year}-{year + 1}"
        seasons.append(season)
    return seasons
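
# For reference, seasons follow fbref's "YYYY-YYYY" naming, e.g.:
#   generate_seasons(2022, 2023) -> ['2022-2023', '2023-2024']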

def generate_url(season, league_id=9):
    return f"https://fbref.com/en/comps/{league_id}/{season}/stats/{season}-Premier-League-Stats"

def extract_premier_league_links(html_content):
    regex_pattern_player = r'<td class="left .*?data-append-csv=".*?" data-stat="player".*?><a href="(/en/players/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+)">[^<]+</a></td>'
    
    player_urls = re.findall(regex_pattern_player, html_content)
    
    player_urls = [f"https://fbref.com{link}" for link in player_urls]

    return player_urls
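
# A more robust alternative sketch: parse the player cells with BeautifulSoup instead of
# matching raw HTML with a regex, which breaks if fbref reorders attributes. This assumes
# the third-party beautifulsoup4 package; the crawler below still uses the regex version.
def extract_premier_league_links_bs4(html_content):
    from bs4 import BeautifulSoup  # local import so the script still runs without bs4 installed
    soup = BeautifulSoup(html_content, 'html.parser')
    cells = soup.find_all('td', attrs={'data-stat': 'player'})
    links = [cell.a['href'] for cell in cells if cell.a is not None]
    return [f"https://fbref.com{link}" for link in links]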



def crawler(start_season, end_season, output_directory='output'):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    output_file = os.path.join(output_directory, f"players_{int(time.time())}.txt")
    existing_links_file = os.path.join(output_directory, "existing_links.txt")

    # Initialize the user agent, request headers, crawl queue, and visited set
    user_agent = 'WebScraper for University project: Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    queue = deque()
    visited = set()

    # Load previously scraped links so reruns can skip them; create the file if it is missing
    if os.path.exists(existing_links_file):
        with open(existing_links_file, "r") as f:
            visited = set(line.strip() for line in f if line.strip())
    else:
        with open(existing_links_file, "w") as f:
            f.write('')

    # Create the timestamped output file that will hold the scraped player pages
    with open(output_file, "w", encoding='utf-8') as f:
        f.write('')

    # Counters and trackers
    scraped_for_current_season = 0
    total_links_for_current_season = 0
    current_season = None
    seasons = generate_seasons(start_season, end_season)
    season_urls = [generate_url(season) for season in seasons]
    queue.append(season_urls.pop(0))  # Start with the first season URL

    while queue or season_urls:  # Continue until both queue and season_urls are empty
        if not queue and season_urls:  # If queue is empty but there are more seasons
            queue.append(season_urls.pop(0)) 
        url = queue.popleft()

        # Update the current season and reset the per-season counters when it changes
        new_season = next((s for s in seasons if s in url), None)
        if new_season and new_season != current_season:
            current_season = new_season
            scraped_for_current_season = 0
            total_links_for_current_season = 0

        if url in visited:
            continue

        can_fetch, reason = check_robots_txt(user_agent, url)
        if not can_fetch:
            print(f"Cannot scrape {url}. Reason: {reason}")
            with open(os.path.join(output_directory, "unscrapable_links.txt"), "a") as f:
                f.write(f"{url}\n")
            continue

        print(f"Scraping {url}")
        response = requests.get(url, headers=headers)
        html_content = response.text


        visited.add(url)

        if any(season in url for season in seasons):
            # This is a season page: queue every player link that has not been visited yet
            player_links = extract_premier_league_links(html_content)
            # dict.fromkeys drops duplicate rows (e.g. players listed for more than one squad) while keeping order
            new_player_links = [link for link in dict.fromkeys(player_links) if link not in visited]
            queue.extend(new_player_links)
            total_links_for_current_season += len(new_player_links)
            print(f"Found {len(new_player_links)} new player links.")
        else:
            # This is a player page: record the URL and store its HTML
            with open(existing_links_file, "a") as f:
                f.write(f"{url}\n")
            with open(output_file, "a", encoding='utf-8') as f:
                f.write(html_content)
                f.write("\n===PAGE DELIMITER===\n")
            scraped_for_current_season += 1  # Only increment the counter for player pages
            print(f"Scraped {scraped_for_current_season}/{total_links_for_current_season} from season {current_season}")


        # Politeness delay between requests
        time.sleep(6)

    print("Scraping completed.")


if __name__ == "__main__":
    crawler(start_season=2022, end_season=datetime.now().year)
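    # Illustrative alternatives (not run): a single past season, or a custom output folder
    #   crawler(start_season=2010, end_season=2010)
    #   crawler(start_season=2022, end_season=2023, output_directory='data/fbref_html')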