import scrapy

from mal_scraper.items import MalScraperItem


class MalSpiderSpider(scrapy.Spider):
    name = "mal_spider"
    allowed_domains = ["myanimelist.net"]
    start_urls = ["https://myanimelist.net/anime/season/archive"]

    def parse(self, response):
        season_url_items = response.css('tr a')
        # Only the first archive link is followed for now.
        for season_url_item in [season_url_items[0]]:
            season_url = season_url_item.attrib['href']
            season_year = season_url_item.css('::text').get()
            # yield scrapy.Request(
            #     url=season_url,
            #     callback=self.seasonal_anime_page,
            #     meta={'playwright': True})
            yield response.follow(
                url=season_url,
                callback=self.seasonal_anime_page
            )

    def seasonal_anime_page(self, response):
        seasonal_shows_lists = response.css("div.js-seasonal-anime-list")
        seasonal_show_urls = []
        for seasonal_shows_list in seasonal_shows_lists:
            anime_header = seasonal_shows_list.css("div.anime-header::text").get()
            # Skip shows that are only continuing from a previous season.
            if anime_header != "TV (Continuing)":
                seasonal_show_urls.extend(
                    seasonal_shows_list.css("h2.h2_anime_title a"))
        for title_link in seasonal_show_urls:
            seasonal_show_url = title_link.attrib["href"]
            yield response.follow(url=seasonal_show_url, callback=self.anime_page)

    def anime_page(self, response):
        # From this page we want: title, id from the URL, type, episodes,
        # status, aired, premiered, producers, licensors,
        # studios, source, genres, themes, demographic, duration, rating,
        # score & number of users who rated, ranked, members, favorites,
        # official site, external resources,
        # streaming platforms, synopsis, and prequel/sequel/alternative shows with ids.
        mal_item = MalScraperItem()

        title = response.css('div.h1-title h1.title-name strong::text').get()
        poster_img_url = response.xpath('//img[@itemprop="image"]').attrib['data-src']
        mal_url = response.url
        mal_id = response.url.split("/")[4]

        sidebar_information = response.css('div.spaceit_pad')

        # Alternative names
        synonym_name = None
        japanese_name = None
        english_name = None
        show_type = None
        total_episodes = None
        status = None
        airing_start_date = None
        airing_finish_date = None
        premiered = None
        broadcast = None
        producers = None
        licensors = None
        studios = None
        source = None
        genres = None
        themes = None
        demographic = None
        duration_in_minutes = None
        age_rating = None
        score = None
        members = None
        official_site = None
        anidb_url = None
        ann_url = None
        wikipedia_url = None

        description = response.xpath("//p[@itemprop='description']/node()").getall()
        description = "".join(description)

        # The first three sidebar rows hold the alternative titles.
        for name in sidebar_information[:3]:
            language = name.css('span.dark_text::text').get().strip()
            alternative_title = name.css("::text")[2].get()
            if language == "Japanese:":
                japanese_name = alternative_title
            elif language == "English:":
                english_name = alternative_title
            elif language == "Synonyms:":
                synonym_name = alternative_title

        for info in sidebar_information:
            info_key_text = info.css('span.dark_text::text').get()
            info_value_text = info.css("::text")
            if info_key_text == "Type:":
                show_type = info_value_text[3].get()
            elif info_key_text == "Episodes:":
                total_episodes = info_value_text[2].get()
            elif info_key_text == "Status:":
                status = info_value_text[2].get()
            elif info_key_text == "Aired:":
                aired = info_value_text[2].get()
                aired = aired.split(" to ")
                airing_start_date = aired[0]
                if len(aired) > 1:
                    airing_finish_date = aired[1]
                    if "?" in airing_finish_date:
                        airing_finish_date = None
                else:
                    airing_finish_date = None
            elif info_key_text == "Premiered:":
                premiered = info_value_text[3].get()
            elif info_key_text == "Broadcast:":
                broadcast = info_value_text[2].get()
            elif info_key_text == "Producers:":
                if "None found" not in info_value_text[2].get():
                    producers = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Licensors:":
                if "None found" not in info_value_text[2].get():
                    licensors = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Studios:":
                if "None found" not in info_value_text[2].get():
                    studios = "".join([i.get() for i in info_value_text[3:]])
            elif info_key_text == "Source:":
                source = "".join([i.get() for i in info_value_text[2:]])
            elif info_key_text == "Genres:" or info_key_text == "Genre:":
                genres = ", ".join([i.get() for i in info.css("a::text")])
            elif info_key_text == "Themes:" or info_key_text == "Theme:":
                themes = ", ".join([i.get() for i in info.css("a::text")])
            elif info_key_text == "Demographic:":
                demographic = info_value_text[3].get()
            elif info_key_text == "Duration:":
                duration_in_minutes = info_value_text[2].get()
                if "Unknown" in duration_in_minutes:
                    duration_in_minutes = None
            elif info_key_text == "Rating:":
                age_rating = info_value_text[2].get()
                if "None" in age_rating:
                    age_rating = None
            elif info_key_text == "Score:":
                score = info_value_text[3].get()
            elif info_key_text == "Members:":
                members = info_value_text[2].get()

        # Get external resource links.
        external_links_list = response.css("div.external_links a")
        for link in external_links_list:
            link_text = link.css("::text").get()
            url = link.attrib['href']
            if link_text == "Official Site":
                official_site = url
            elif link_text == "AniDB":
                anidb_url = url
            elif link_text == "ANN":
                ann_url = url
            elif link_text == "Wikipedia":
                if "en.wikipedia.org" in url:
                    wikipedia_url = url

        def get_related_media(row):
            """Take a related-anime row selector and return a list of all shows in it."""
            shows = row[1].css("td a")
            show_list = []
            for show in shows:
                show_id = show.attrib["href"].split("/")[2]
                show_title = show.css("::text").get()
                anime = {"title": show_title, "mal_id": show_id}
                show_list.append(anime)
            return show_list

        related_media_table = response.css("table.anime_detail_related_anime tr")
        sequels = None
        prequels = None
        other_shows = None
        side_stories = None
        alternative_versions = None
        character_shows = None
        spin_offs = None
        alternative_setting_shows = None
        summary_shows = None

        for row in related_media_table:
            row = row.css("td")
            left_row_data = row[0].css("::text").get()
            if "Sequel" in left_row_data:
                sequels = get_related_media(row)
            if "Prequel" in left_row_data:
                prequels = get_related_media(row)
            if "Other" in left_row_data:
                other_shows = get_related_media(row)
            if "Side story" in left_row_data:
                side_stories = get_related_media(row)
            if "Alternative version" in left_row_data:
                alternative_versions = get_related_media(row)
            if "Character" in left_row_data:
                character_shows = get_related_media(row)
            if "Spin-off" in left_row_data:
                spin_offs = get_related_media(row)
            if "Alternative setting" in left_row_data:
                alternative_setting_shows = get_related_media(row)
            if "Summary" in left_row_data:
                summary_shows = get_related_media(row)

        mal_item["title"] = title
        mal_item["poster_img_url"] = poster_img_url
        mal_item["mal_url"] = mal_url
        mal_item["mal_id"] = mal_id
        mal_item["synonym_name"] = synonym_name
        mal_item["japanese_name"] = japanese_name
        mal_item["english_name"] = english_name
        mal_item["show_type"] = show_type
        mal_item["total_episodes"] = total_episodes
        mal_item["status"] = status
        mal_item["airing_start_date"] = airing_start_date
        mal_item["airing_finish_date"] = airing_finish_date
        mal_item["premiered"] = premiered
        mal_item["broadcast"] = broadcast
        mal_item["producers"] = producers
        mal_item["licensors"] = licensors
        mal_item["studios"] = studios
        mal_item["source"] = source
        mal_item["genres"] = genres
        mal_item["themes"] = themes
        mal_item["demographic"] = demographic
        mal_item["duration_in_minutes"] = duration_in_minutes
        mal_item["age_rating"] = age_rating
        mal_item["score"] = score
        mal_item["members"] = members
        mal_item["official_site"] = official_site
        mal_item["anidb_url"] = anidb_url
        mal_item["ann_url"] = ann_url
        mal_item["wikipedia_url"] = wikipedia_url
        mal_item["description"] = description
        mal_item["prequels"] = prequels
        mal_item["sequels"] = sequels
        mal_item["other_shows"] = other_shows
        mal_item["side_stories"] = side_stories
        mal_item["alternative_versions"] = alternative_versions
        mal_item["character_shows"] = character_shows
        mal_item["spin_offs"] = spin_offs
        mal_item["alternative_setting_shows"] = alternative_setting_shows
        mal_item["summary_shows"] = summary_shows

        character_staff_page_url = response.url + "/characters"
        episodes_page_url = response.url + "/episode"
        yield response.follow(character_staff_page_url,
                              callback=self.character_page,
                              meta={"mal_item": mal_item,
                                    "episodes_page_url": episodes_page_url})

    def character_page(self, response):
        """Collect characters and staff, then continue to the episode list."""
        mal_item = response.meta["mal_item"]
        characters_tables = response.css(
            'div.anime-character-container table.js-anime-character-table')
        characters = []

        # Loop for getting character info.
        for table in characters_tables:
            table_data = table.css("td.borderClass")[1:]
            character = table_data[0].css("td.borderClass div")[2:]
            character_name = character[0].css("a h3::text").get()
            character_first_name = None
            character_last_name = None
            if "," in character_name:
                character_first_name = character_name.split(",")[1]
                character_last_name = character_name.split(",")[0]
            else:
                character_first_name = character_name
            character_role = character[1].css("::text").get()

            voice_actors = table_data[1].css("tr.js-anime-character-va-lang")
            character_voice_actors = []
            for actor in voice_actors:
                va_name = actor.css("div.spaceit_pad")[0]
                va_name = va_name.css("a::text").get()
                va_first_name = None
                va_last_name = None
                if "," in va_name:
                    va_first_name = va_name.split(",")[1]
                    va_last_name = va_name.split(",")[0]
                else:
                    va_first_name = va_name
                va_lang = actor.css("div.spaceit_pad")[1]
                va_lang = va_lang.css("::text").get()
                va_name_lang = {"va_first_name": va_first_name,
                                "va_last_name": va_last_name,
                                "va_lang": va_lang}
                character_voice_actors.append(va_name_lang)

            character_info = {
                "character_first_name": character_first_name,
                "character_last_name": character_last_name,
                "character_role": character_role,
                "character_voice_actors": character_voice_actors}
            characters.append(character_info)

        staff_tables = response.xpath(
            "//div[contains(concat(' ', @class, ' '), ' rightside ')]/table")
        staffs = []
        for table in staff_tables:
            name_role = table.css("td")[1]
            staff_name = name_role.css("a::text").get()
            staff_role = name_role.css("div small::text").get()
            staff_first_name = None
            staff_last_name = None
            staff_roles = None
            if "," in staff_name:
                staff_first_name = staff_name.split(",")[1]
                staff_last_name = staff_name.split(",")[0]
            else:
                staff_first_name = staff_name
            if "," in staff_role:
                staff_roles = staff_role.split(",")
            else:
                staff_roles = [staff_role]
            staff_with_roles = {"staff_first_name": staff_first_name,
                                "staff_last_name": staff_last_name,
                                "staff_roles": staff_roles}
            staffs.append(staff_with_roles)

        mal_item["characters"] = characters
        mal_item["staffs"] = staffs

        if "Movie" not in mal_item["show_type"]:
            episodes_page_url = response.meta["episodes_page_url"]
            yield response.follow(episodes_page_url,
                                  callback=self.episodes_page,
                                  meta={"mal_item": mal_item})
        else:
            # Movies have no episode list, so the item is complete here.
            mal_item["episodes"] = None
            yield mal_item

    def episodes_page(self, response):
        mal_item = response.meta["mal_item"]
        mal_item["episodes"] = []
        episode_urls = response.css("td.episode-title a")
        # The shared item is also returned from each episode callback below.
        for url in episode_urls:
            url = url.attrib["href"]
            yield response.follow(url,
                                  callback=self.episode_main_page,
                                  meta={"mal_item": mal_item})
        yield mal_item

    def episode_main_page(self, response):
        mal_item = response.meta["mal_item"]
        # TO-DO: scrape episodes
        has_multi_episodes_info = response.css("div.has-multi-episodes-info")
        episode_number = has_multi_episodes_info.css("h2 span::text").get()
        episode_title = has_multi_episodes_info.css("h2::text").getall()[1]
        # other_names = has_multi_episodes_info.css("p::text").get()
        # romaji_name = other_names.split("(")[0]
        # japanese_name = other_names.split("(")[1].split(")")[0]
        duration_aired_text = has_multi_episodes_info.css("div")[2].css("::text").getall()
        duration = duration_aired_text[2]
        air_date = duration_aired_text[4].split("(")[0]
        synopsis = None
        if response.css('div.badresult').get() is not None:
            synopsis = response.xpath("//div[@class='pt8 pb8']/node()").getall()
            synopsis = "".join(synopsis[2:])
        episode = {"number": episode_number,
                   "title": episode_title,
                   "duration": duration,
                   "air_date": air_date,
                   "synopsis": synopsis}
        mal_item["episodes"].append(episode)
        return mal_item
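
The spider imports MalScraperItem from mal_scraper.items, which is not included in this paste. Below is a minimal sketch of what that item class would have to declare, using exactly the keys assigned in the spider; the plain scrapy.Field() declarations and the grouping comments are assumptions, only the field names are taken from the code above.

# mal_scraper/items.py -- hypothetical sketch, not part of the spider file above.
# Every key the spider assigns must be declared, otherwise scrapy.Item raises KeyError.
import scrapy


class MalScraperItem(scrapy.Item):
    # identity and titles
    title = scrapy.Field()
    poster_img_url = scrapy.Field()
    mal_url = scrapy.Field()
    mal_id = scrapy.Field()
    synonym_name = scrapy.Field()
    japanese_name = scrapy.Field()
    english_name = scrapy.Field()
    # sidebar information
    show_type = scrapy.Field()
    total_episodes = scrapy.Field()
    status = scrapy.Field()
    airing_start_date = scrapy.Field()
    airing_finish_date = scrapy.Field()
    premiered = scrapy.Field()
    broadcast = scrapy.Field()
    producers = scrapy.Field()
    licensors = scrapy.Field()
    studios = scrapy.Field()
    source = scrapy.Field()
    genres = scrapy.Field()
    themes = scrapy.Field()
    demographic = scrapy.Field()
    duration_in_minutes = scrapy.Field()
    age_rating = scrapy.Field()
    score = scrapy.Field()
    members = scrapy.Field()
    # external links and synopsis
    official_site = scrapy.Field()
    anidb_url = scrapy.Field()
    ann_url = scrapy.Field()
    wikipedia_url = scrapy.Field()
    description = scrapy.Field()
    # related media
    prequels = scrapy.Field()
    sequels = scrapy.Field()
    other_shows = scrapy.Field()
    side_stories = scrapy.Field()
    alternative_versions = scrapy.Field()
    character_shows = scrapy.Field()
    spin_offs = scrapy.Field()
    alternative_setting_shows = scrapy.Field()
    summary_shows = scrapy.Field()
    # characters, staff and episodes
    characters = scrapy.Field()
    staffs = scrapy.Field()
    episodes = scrapy.Field()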
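Assuming the usual Scrapy project layout (this spider under mal_scraper/spiders/, with items.py and settings.py beside it), the crawl can be started from the project root and exported to a feed, for example:

scrapy crawl mal_spider -o seasonal_anime.json

Note that parse() currently follows only the first link in the season archive (the [season_url_items[0]] slice), and the commented-out scrapy.Request shows an earlier attempt to route that request through Playwright; dropping the slice makes the spider walk every listed season.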