import scrapy
from mal_scraper.items import MalScraperItem
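
# The MalScraperItem imported above is defined in mal_scraper/items.py, which is
# not shown here. The spider assumes it is a scrapy.Item declaring a
# scrapy.Field() for every key assigned in the callbacks below (title, mal_id,
# episodes, characters, staffs, and so on). A minimal sketch of that assumed
# definition:
#
#     import scrapy
#
#     class MalScraperItem(scrapy.Item):
#         title = scrapy.Field()
#         mal_id = scrapy.Field()
#         episodes = scrapy.Field()
#         # ...one Field() per key used by the spider
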
class MalSpiderSpider(scrapy.Spider):
    name = "mal_spider"
    allowed_domains = ["myanimelist.net"]
    start_urls = ["https://myanimelist.net/anime/season/archive"]

    def parse(self, response):
        season_url_items = response.css('tr a')
        # Only the first season in the archive is crawled for now; drop the
        # slice to crawl every season.
        for season_url_item in [season_url_items[0]]:
            season_url = season_url_item.attrib['href']
            season_year = season_url_item.css('::text').get()
            # yield scrapy.Request(
            #     url=season_url,
            #     callback=self.seasonal_anime_page,
            #     meta={'playwright': True})
            yield response.follow(
                url=season_url,
                callback=self.seasonal_anime_page
            )

    def seasonal_anime_page(self, response):
        seasonal_shows_lists = response.css("div.js-seasonal-anime-list")
        seasonal_show_urls = []
        for seasonal_shows_list in seasonal_shows_lists:
            anime_header = seasonal_shows_list.css("div.anime-header::text").get()
            # Skip the "TV (Continuing)" block so only shows belonging to this
            # season are collected.
            if anime_header != "TV (Continuing)":
                seasonal_show_urls.extend(seasonal_shows_list.css("h2.h2_anime_title a"))
        for title_link in seasonal_show_urls:
            seasonal_show_url = title_link.attrib["href"]
            yield response.follow(url=seasonal_show_url, callback=self.anime_page)

    def anime_page(self, response):
        # From this page we want: title, id from the URL, type, episodes,
        # status, aired, premiered, producers, licensors, studios, source,
        # genres, themes, demographic, duration, age rating, score and the
        # number of scoring users, ranked, members, favorites, official site,
        # external resources, streaming platforms, synopsis, and
        # prequel/sequel/alternative versions with their ids.
        mal_item = MalScraperItem()
        title = response.css('div.h1-title h1.title-name strong::text').get()
        poster_img_url = response.xpath('//img[@itemprop="image"]').attrib['data-src']
        mal_url = response.url
        mal_id = response.url.split("/")[4]
        sidebar_information = response.css('div.spaceit_pad')
        # Alternative names
        synonym_name = None
        japanese_name = None
        english_name = None
        show_type = None
        total_episodes = None
        status = None
        airing_start_date = None
        airing_finish_date = None
        premiered = None
        broadcast = None
        producers = None
        licensors = None
        studios = None
        source = None
        genres = None
        themes = None
        demographic = None
        duration_in_minutes = None
        age_rating = None
        score = None
        members = None
        official_site = None
        anidb_url = None
        ann_url = None
        wikipedia_url = None
        description = response.xpath("//p[@itemprop='description']/node()").getall()
        description = "".join(description)
        for name in sidebar_information[:3]:
            language = name.css('span.dark_text::text').get().strip()
            alternative_title = name.css("::text")[2].get()
            if language == "Japanese:":
                japanese_name = alternative_title
            elif language == "English:":
                english_name = alternative_title
            elif language == "Synonyms:":
                synonym_name = alternative_title
        for info in sidebar_information:
            info_key_text = info.css('span.dark_text::text').get()
            info_value_text = info.css("::text")
            if info_key_text == "Type:":
                show_type = info_value_text[3].get()
            elif info_key_text == "Episodes:":
                total_episodes = info_value_text[2].get()
            elif info_key_text == "Status:":
                status = info_value_text[2].get()
            elif info_key_text == "Aired:":
                aired = info_value_text[2].get()
                aired = aired.split(" to ")
                airing_start_date = aired[0]
                if len(aired) > 1:
                    airing_finish_date = aired[1]
                    if "?" in airing_finish_date:
                        airing_finish_date = None
                else:
                    airing_finish_date = None
            elif info_key_text == "Premiered:":
                premiered = info_value_text[3].get()
            elif info_key_text == "Broadcast:":
                broadcast = info_value_text[2].get()
            elif info_key_text == "Producers:":
                if "None found" not in info_value_text[2].get():
                    producers = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Licensors:":
                if "None found" not in info_value_text[2].get():
                    licensors = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Studios:":
                if "None found" not in info_value_text[2].get():
                    studios = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Source:":
                source = "".join([i.get() for i in info_value_text[2:]])
            elif info_key_text == "Genres:" or info_key_text == "Genre:":
                genres = ", ".join([i.get() for i in info.css("a::text")])
            elif info_key_text == "Themes:" or info_key_text == "Theme:":
                themes = ", ".join([i.get() for i in info.css("a::text")])
            elif info_key_text == "Demographic:":
                demographic = info_value_text[3].get()
            elif info_key_text == "Duration:":
                duration_in_minutes = info_value_text[2].get()
                if "Unknown" in duration_in_minutes:
                    duration_in_minutes = None
            elif info_key_text == "Rating:":
                age_rating = info_value_text[2].get()
                if "None" in age_rating:
                    age_rating = None
            elif info_key_text == "Score:":
                score = info_value_text[3].get()
            elif info_key_text == "Members:":
                members = info_value_text[2].get()
        # Get external resource links
        external_links_list = response.css("div.external_links a")
        for link in external_links_list:
            link_text = link.css("::text").get()
            url = link.attrib['href']
            if link_text == "Official Site":
                official_site = url
            elif link_text == "AniDB":
                anidb_url = url
            elif link_text == "ANN":
                ann_url = url
            elif link_text == "Wikipedia":
                if "en.wikipedia.org" in url:
                    wikipedia_url = url
        def get_related_media(row):
            """Take the <td> selectors of a related-anime row and return a
            list of {"title", "mal_id"} dicts for every show it links to."""
            shows = row[1].css("td a")
            show_list = []
            for show in shows:
                show_id = show.attrib["href"].split("/")[2]
                show_title = show.css("::text").get()
                anime = {"title": show_title, "mal_id": show_id}
                show_list.append(anime)
            return show_list
        related_media_table = response.css("table.anime_detail_related_anime tr")
        sequels = None
        prequels = None
        other_shows = None
        side_stories = None
        alternative_versions = None
        character_shows = None
        spin_offs = None
        alternative_setting_shows = None
        summary_shows = None
        for row in related_media_table:
            row = row.css("td")
            left_row_data = row[0].css("::text").get()
            if "Sequel" in left_row_data:
                sequels = get_related_media(row)
            if "Prequel" in left_row_data:
                prequels = get_related_media(row)
            if "Other" in left_row_data:
                other_shows = get_related_media(row)
            if "Side story" in left_row_data:
                side_stories = get_related_media(row)
            if "Alternative version" in left_row_data:
                alternative_versions = get_related_media(row)
            if "Character" in left_row_data:
                character_shows = get_related_media(row)
            if "Spin-off" in left_row_data:
                spin_offs = get_related_media(row)
            if "Alternative setting" in left_row_data:
                alternative_setting_shows = get_related_media(row)
            if "Summary" in left_row_data:
                summary_shows = get_related_media(row)
mal_item["title"] = title
mal_item["poster_img_url"] = poster_img_url
mal_item["mal_url"] = mal_url
mal_item["mal_id"] = mal_id
mal_item["synonym_name"] = synonym_name
mal_item["japanese_name"] = japanese_name
mal_item["english_name"] = english_name
mal_item["show_type"] = show_type
mal_item["total_episodes"] = total_episodes
mal_item["status"] = status
mal_item["airing_start_date"] = airing_start_date
mal_item["airing_finish_date"] = airing_finish_date
mal_item["premiered"] = premiered
mal_item["broadcast"] = broadcast
mal_item["producers"] = producers
mal_item["licensors"] = licensors
mal_item["studios"] = studios
mal_item["source"] = source
mal_item["genres"] = genres
mal_item["themes"] = themes
mal_item["demographic"] = demographic
mal_item["duration_in_minutes"] = duration_in_minutes
mal_item["age_rating"] = age_rating
mal_item["score"] = score
mal_item["members"] = members
mal_item["official_site"] = official_site
mal_item["anidb_url"] = anidb_url
mal_item["ann_url"] = ann_url
mal_item["wikipedia_url"] = wikipedia_url
mal_item["description"] = description
mal_item["prequels"] = prequels
mal_item["sequels"] = sequels
mal_item["other_shows"] = other_shows
mal_item["side_stories"] = side_stories
mal_item["alternative_versions"] = alternative_versions
mal_item["character_shows"] = character_shows
mal_item["spin_offs"] = spin_offs
mal_item["alternative_setting_shows"] = alternative_setting_shows
mal_item["summary_shows"] = summary_shows
character_staff_page_url = response.url + "/characters"
episodes_page_url = response.url + "/episode"
yield response.follow(character_staff_page_url,
callback=self.character_page,
meta={"mal_item": mal_item,
"episodes_page_url": episodes_page_url})

    def character_page(self, response):
        """Collect characters and staff, then move on to the episode list."""
        mal_item = response.meta["mal_item"]
        characters_tables = response.css('div.anime-character-container table.js-anime-character-table')
        characters = []
        # Loop over the character tables to collect character info
        for table in characters_tables:
            table_data = table.css("td.borderClass")[1:]
            character = table_data[0].css("td.borderClass div")[2:]
            character_name = character[0].css("a h3::text").get()
            character_first_name = None
            character_last_name = None
            if "," in character_name:
                character_first_name = character_name.split(",")[1].strip()
                character_last_name = character_name.split(",")[0].strip()
            else:
                character_first_name = character_name
            character_role = character[1].css("::text").get()
            voice_actors = table_data[1].css("tr.js-anime-character-va-lang")
            character_voice_actors = []
            for actor in voice_actors:
                va_name = actor.css("div.spaceit_pad")[0]
                va_name = va_name.css("a::text").get()
                va_first_name = None
                va_last_name = None
                if "," in va_name:
                    va_first_name = va_name.split(",")[1].strip()
                    va_last_name = va_name.split(",")[0].strip()
                else:
                    va_first_name = va_name
                va_lang = actor.css("div.spaceit_pad")[1]
                va_lang = va_lang.css("::text").get()
                va_name_lang = {"va_first_name": va_first_name,
                                "va_last_name": va_last_name,
                                "va_lang": va_lang,
                                }
                character_voice_actors.append(va_name_lang)
            character_info = {
                "character_first_name": character_first_name,
                "character_last_name": character_last_name,
                "character_role": character_role,
                "character_voice_actors": character_voice_actors}
            characters.append(character_info)
        staff_tables = response.xpath("//div[contains(concat(' ', @class, ' '), ' rightside ')]/table")
        staffs = []
        for table in staff_tables:
            name_role = table.css("td")[1]
            staff_name = name_role.css("a::text").get()
            staff_role = name_role.css("div small::text").get()
            staff_first_name = None
            staff_last_name = None
            staff_roles = None
            if "," in staff_name:
                staff_first_name = staff_name.split(",")[1].strip()
                staff_last_name = staff_name.split(",")[0].strip()
            else:
                staff_first_name = staff_name
            if "," in staff_role:
                staff_roles = [role.strip() for role in staff_role.split(",")]
            else:
                staff_roles = [staff_role]
            staff_with_roles = {"staff_first_name": staff_first_name,
                                "staff_last_name": staff_last_name,
                                "staff_roles": staff_roles}
            staffs.append(staff_with_roles)
        mal_item["characters"] = characters
        mal_item["staffs"] = staffs
        # Movies have no episode list, so their item is finished here;
        # everything else continues to the episode pages.
        if mal_item["show_type"] and "Movie" not in mal_item["show_type"]:
            episodes_page_url = response.meta["episodes_page_url"]
            yield response.follow(episodes_page_url,
                                  callback=self.episodes_page,
                                  meta={"mal_item": mal_item})
        else:
            mal_item["episodes"] = None
            yield mal_item

    def episodes_page(self, response):
        mal_item = response.meta["mal_item"]
        mal_item["episodes"] = []
        episode_urls = [a.attrib["href"] for a in response.css("td.episode-title a")]
        if not episode_urls:
            yield mal_item
            return
        # Walk the episode pages one at a time so the item is yielded only
        # once, after the last episode has been scraped.
        yield response.follow(episode_urls[0],
                              callback=self.episode_main_page,
                              meta={"mal_item": mal_item,
                                    "remaining_episode_urls": episode_urls[1:]})

    def episode_main_page(self, response):
        mal_item = response.meta["mal_item"]
        has_multi_episodes_info = response.css("div.has-multi-episodes-info")
        episode_number = has_multi_episodes_info.css("h2 span::text").get()
        episode_title = has_multi_episodes_info.css("h2::text").getall()[1]
        # other_names = has_multi_episodes_info.css("p::text").get()
        # romaji_name = other_names.split("(")[0]
        # japanese_name = other_names.split("(")[1].split(")")[0]
        duration_aired_text = has_multi_episodes_info.css("div")[2].css("::text").getall()
        duration = duration_aired_text[2]
        air_date = duration_aired_text[4].split("(")[0]
        synopsis = None
        if response.css('div.badresult').get() is not None:
            synopsis = response.xpath("//div[@class='pt8 pb8']/node()").getall()
            synopsis = "".join(synopsis[2:])
        episode = {"number": episode_number,
                   "title": episode_title,
                   "duration": duration,
                   "air_date": air_date,
                   "synopsis": synopsis}
        mal_item["episodes"].append(episode)
        # Follow the next episode page if any remain; otherwise the item is complete.
        remaining_episode_urls = response.meta["remaining_episode_urls"]
        if remaining_episode_urls:
            yield response.follow(remaining_episode_urls[0],
                                  callback=self.episode_main_page,
                                  meta={"mal_item": mal_item,
                                        "remaining_episode_urls": remaining_episode_urls[1:]})
        else:
            yield mal_item
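
# A typical way to run this spider (assuming the usual Scrapy project layout
# with the mal_scraper package on the path and feed/export settings of your
# choice) would be, from the project root:
#
#     scrapy crawl mal_spider -O anime.json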