from datetime import datetime
import requests
import re
import time
import os
from collections import deque
from urllib.parse import urljoin, urlparse


def check_robots_txt(agent, url):
    """Check the site's robots.txt and return (can_fetch, reason) for the given URL."""
    reason = ''
    can_fetch = False
    robots_txt_url = urljoin(url, '/robots.txt')  # robots.txt lives at the site root
    try:
        response = requests.get(robots_txt_url)
        if response.status_code == 200:
            rules = response.text.split('\n')
            applicable_rules = [rule for rule in rules
                                if 'User-agent:' in rule or 'Disallow:' in rule]
            record_for_agent = False
            for rule in applicable_rules:
                if 'User-agent:' in rule:
                    user_agent = rule.split(':', 1)[1].strip()
                    record_for_agent = (user_agent == agent or user_agent == '*')
                if record_for_agent and 'Disallow:' in rule:
                    disallowed_path = rule.split(':', 1)[1].strip()
                    # Compare the URL's path against the disallowed prefix
                    if disallowed_path and urlparse(url).path.startswith(disallowed_path):
                        reason = f"Disallowed by robots.txt under the path: {disallowed_path}"
                        return False, reason
            can_fetch = True
        else:
            reason = 'Unable to fetch robots.txt'
    except Exception as e:
        reason = f'An error occurred: {str(e)}'
    return can_fetch, reason


def create_directory(directory):
    """Create a directory if it does not already exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)


def write_links_to_txt(links, filename, mode='a'):
    """Append (or write) a list of links to a text file, one per line."""
    with open(filename, mode, encoding='utf-8') as file:
        for link in links:
            file.write(link + '\n')


def generate_seasons(start_year, end_year):
    """Build season strings such as '2022-2023' for every year in the range."""
    seasons = []
    for year in range(start_year, end_year + 1):
        seasons.append(f"{year}-{year + 1}")
    return seasons


def generate_url(season, league_id=9):
    """Build the fbref.com season-stats URL for the Premier League (league id 9)."""
    return f"https://fbref.com/en/comps/{league_id}/{season}/stats/{season}-Premier-League-Stats"


def extract_premier_league_links(html_content):
    """Pull player-page links out of a season stats page."""
    regex_pattern_player = (
        r'<td class="left .*?data-append-csv=".*?" data-stat="player".*?>'
        r'<a href="(/en/players/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+)">[^<]+</a></td>'
    )
    player_urls = re.findall(regex_pattern_player, html_content)
    return [f"https://fbref.com{link}" for link in player_urls]


def crawler(start_season, end_season, output_directory='output'):
    create_directory(output_directory)
    output_file = os.path.join(output_directory, f"players_{int(time.time())}.txt")
    existing_links_file = os.path.join(output_directory, "existing_links.txt")

    # Initialize headers, queue, and visited set
    headers = {'User-Agent': 'WebScraper for University project: Mozilla/5.0'}
    queue = deque()
    visited = set()

    # Load previously scraped links (if any) so they are skipped on re-runs
    if os.path.exists(existing_links_file):
        with open(existing_links_file, "r") as f:
            visited = set(line.strip() for line in f)

    # Start a fresh output file for this run
    with open(output_file, "w", encoding='utf-8') as f:
        f.write('')

    # Counters and trackers
    scraped_for_current_season = 0
    total_links_for_current_season = 0
    current_season = None

    seasons = generate_seasons(start_season, end_season)
    season_urls = [generate_url(season) for season in seasons]
    queue.append(season_urls.pop(0))  # Start with the first season URL

    while queue or season_urls:  # Continue until both queue and season_urls are empty
        if not queue and season_urls:  # Queue drained, move on to the next season
            queue.append(season_urls.pop(0))
        url = queue.popleft()

        # Update the current season whenever the URL belongs to a new one
        new_season = next((s for s in seasons if s in url), None)
        if new_season and new_season != current_season:
            current_season = new_season
            scraped_for_current_season = 0  # Reset counter when the season changes

        if url in visited:
            continue

        can_fetch, reason = check_robots_txt("Mozilla/5.0", url)
        if not can_fetch:
            print(f"Cannot scrape {url}. Reason: {reason}")
            with open("unscrapable_links.txt", "a") as f:
                f.write(f"{url}\n")
            continue

        print(f"Scraping {url}")
        response = requests.get(url, headers=headers)
        html_content = response.text
        visited.add(url)

        if any(season in url for season in seasons):
            # Season page: queue every player link that has not been visited yet
            player_links = extract_premier_league_links(html_content)
            new_player_links = [link for link in player_links if link not in visited]
            queue.extend(new_player_links)
            total_links_for_current_season += len(new_player_links)
            print(f"Found {len(new_player_links)} new player links.")
        else:
            # Player page: record the URL and store its HTML content
            with open(existing_links_file, "a") as f:
                f.write(f"{url}\n")
            with open(output_file, "a", encoding='utf-8') as f:
                f.write(html_content)
                f.write("\n===PAGE DELIMITER===\n")
            scraped_for_current_season += 1  # Only count player pages
            print(f"Scraped {scraped_for_current_season}/{total_links_for_current_season} from season {current_season}")

        time.sleep(6)  # Pause between requests to avoid hammering the server

    print("Scraping completed.")


if __name__ == "__main__":
    crawler(start_season=2022, end_season=datetime.now().year)
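
The robots.txt rules above are parsed by hand. A minimal alternative sketch, assuming the same user-agent string the crawler already sends: the standard library's urllib.robotparser.RobotFileParser can make the same allow/deny decision, although it does not produce the human-readable reason string that check_robots_txt returns. The helper name below is hypothetical, not part of the script.

# Alternative sketch (assumption, not part of the crawler above): let the
# standard library parse robots.txt instead of splitting the file manually.
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def can_fetch_with_robotparser(agent, url):
    """Return True if robots.txt at the URL's host allows `agent` to fetch `url`."""
    parser = RobotFileParser()
    parser.set_url(urljoin(url, '/robots.txt'))  # e.g. https://fbref.com/robots.txt
    parser.read()                                # download and parse robots.txt
    return parser.can_fetch(agent, url)

# Hypothetical usage inside crawler(), replacing the check_robots_txt() call:
# if not can_fetch_with_robotparser("Mozilla/5.0", url):
#     continue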