from datetime import datetime
import requests
import re
import time
from collections import deque
from urllib.parse import urlparse
import os
from urllib.robotparser import RobotFileParser
def check_robots_txt(agent, url):
    """Return (can_fetch, reason) after checking the site's robots.txt for the given agent."""
    reason = ''
    can_fetch = False
    # robots.txt lives at the site root, not relative to the page URL
    parsed = urlparse(url)
    robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        response = requests.get(robots_txt_url, timeout=10)
        if response.status_code == 200:
            rules = response.text.split('\n')
            applicable_rules = [rule for rule in rules if 'User-agent:' in rule or 'Disallow:' in rule]
            record_for_agent = False
            for rule in applicable_rules:
                if 'User-agent:' in rule:
                    user_agent = rule.split(':', 1)[1].strip()
                    record_for_agent = (user_agent == agent or user_agent == '*')
                elif record_for_agent and 'Disallow:' in rule:
                    disallowed_path = rule.split(':', 1)[1].strip()
                    # An empty Disallow permits everything; otherwise compare the URL path
                    if disallowed_path and parsed.path.startswith(disallowed_path):
                        reason = f"Disallowed by robots.txt under the path: {disallowed_path}"
                        return False, reason
            can_fetch = True
        else:
            reason = 'Unable to fetch robots.txt'
    except Exception as e:
        reason = f'An error occurred: {str(e)}'
    return can_fetch, reason
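# A simpler alternative, sketched here: the stdlib urllib.robotparser
# (imported above) handles User-agent matching and Allow/Disallow
# precedence itself, at the cost of the human-readable `reason` string.
def check_robots_txt_stdlib(agent, url):
    parsed = urlparse(url)
    parser = RobotFileParser(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    parser.read()  # fetches and parses robots.txt
    return parser.can_fetch(agent, url)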
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
def write_links_to_txt(links, filename, mode='a'):
    with open(filename, mode, encoding='utf-8') as file:
        for link in links:
            file.write(link + '\n')
def generate_seasons(start_year, end_year):
    seasons = []
    for year in range(start_year, end_year + 1):
        season = f"{year}-{year + 1}"
        seasons.append(season)
    return seasons
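# For illustration: generate_seasons(2021, 2023) returns
# ['2021-2022', '2022-2023', '2023-2024'].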
def generate_url(season, league_id=9):
    # league_id 9 is the Premier League competition id on fbref.com
    return f"https://fbref.com/en/comps/{league_id}/{season}/stats/{season}-Premier-League-Stats"
def extract_premier_league_links(html_content):
    # Match player profile links in stats-table cells marked data-stat="player"
    regex_pattern_player = r'<td class="left .*?data-append-csv=".*?" data-stat="player".*?><a href="(/en/players/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+)">[^<]+</a></td>'
    player_urls = re.findall(regex_pattern_player, html_content)
    player_urls = [f"https://fbref.com{link}" for link in player_urls]
    return player_urls
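# Regexes over HTML are brittle; a sketch of the same extraction with an HTML
# parser, assuming the third-party beautifulsoup4 package is installed. Note
# that fbref serves some stat tables inside HTML comments, which a DOM parser
# may skip; the regex above scans the raw text and is unaffected by that.
def extract_player_links_bs4(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    links = set()
    # Player cells carry data-stat="player" and link to /en/players/...
    for anchor in soup.select('td[data-stat="player"] a[href^="/en/players/"]'):
        links.add(f"https://fbref.com{anchor['href']}")
    return sorted(links)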
def crawler(start_season, end_season, output_directory='output'):
    create_directory(output_directory)
    output_file = os.path.join(output_directory, f"players_{int(time.time())}.txt")
    existing_links_file = os.path.join(output_directory, "existing_links.txt")
    # Initialize headers, queue, and visited set
    headers = {'User-Agent': 'WebScraper for University project: Mozilla/5.0'}
    queue = deque()
    visited = set()
    # Load previously scraped links so they are not fetched again
    if os.path.exists(existing_links_file):
        with open(existing_links_file, "r") as f:
            visited = set(line.strip() for line in f)
    # Start a fresh output file for this run
    with open(output_file, "w", encoding='utf-8') as f:
        f.write('')
    # Counters and trackers
    scraped_for_current_season = 0
    total_links_for_current_season = 0
    current_season = None
    seasons = generate_seasons(start_season, end_season)
    season_urls = [generate_url(season) for season in seasons]
    queue.append(season_urls.pop(0))  # Start with the first season URL
    while queue or season_urls:  # Continue until both queue and season_urls are empty
        if not queue and season_urls:  # If queue is empty but there are more seasons
            queue.append(season_urls.pop(0))
        url = queue.popleft()
        # Update current season
        new_season = next((s for s in seasons if s in url), None)
        if new_season and new_season != current_season:
            current_season = new_season
            # Reset per-season counters when the season changes
            scraped_for_current_season = 0
            total_links_for_current_season = 0
        if url in visited:
            continue
        can_fetch, reason = check_robots_txt("Mozilla/5.0", url)
        if not can_fetch:
            print(f"Cannot scrape {url}. Reason: {reason}")
            with open("unscrapable_links.txt", "a") as f:
                f.write(f"{url}\n")
            continue
        print(f"Scraping {url}")
        response = requests.get(url, headers=headers, timeout=30)
        html_content = response.text
        visited.add(url)
        if any(season in url for season in seasons):
            # This is a season page: queue its player links
            player_links = extract_premier_league_links(html_content)
            new_player_links = [link for link in player_links if link not in visited]
            queue.extend(new_player_links)
            total_links_for_current_season += len(new_player_links)
            print(f"Found {len(new_player_links)} new player links.")
        else:
            # This is a player page: record the URL and store its HTML
            with open(existing_links_file, "a") as f:
                f.write(f"{url}\n")
            with open(output_file, "a", encoding='utf-8') as f:
                f.write(html_content)
                f.write("\n===PAGE DELIMITER===\n")
            scraped_for_current_season += 1  # Only increment counter for player pages
            print(f"Scraped {scraped_for_current_season}/{total_links_for_current_season} from season {current_season}")
        time.sleep(6)  # Polite crawl delay (~10 requests/minute) to respect fbref's rate limits
    print("Scraping completed.")
if __name__ == "__main__":
crawler(start_season=2022, end_season=datetime.now().year)Editor is loading...
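# Downstream sketch: split the combined output file back into per-player HTML
# documents using the delimiter written by crawler(). The path argument is
# hypothetical, e.g. "output/players_<timestamp>.txt" from a previous run.
def split_pages(path):
    with open(path, encoding='utf-8') as f:
        pages = f.read().split("\n===PAGE DELIMITER===\n")
    return [page for page in pages if page.strip()]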