Untitled

 avatar
unknown
plain_text
2 years ago
16 kB
4
Indexable
import scrapy
from mal_scraper.items import MalScraperItem


class MalSpiderSpider(scrapy.Spider):
    """Scrapy spider that starts from the MyAnimeList season archive page."""

    # Spider name used to select this spider when running Scrapy.
    name = "mal_spider"
    # Domains the spider is allowed to crawl.
    allowed_domains = ["myanimelist.net"]
    # Entry point: the archive page listing the season links.
    start_urls = ["https://myanimelist.net/anime/season/archive"]

    def parse(self, response):
        """Follow the first season link found on the season archive page.

        Only the first ``tr a`` link of the page is followed.
        NOTE(review): this looks like a development-time limit; widen the
        slice below to crawl more seasons.

        Args:
            response: Response for the season archive page.

        Yields:
            A request for the season page, handled by
            ``seasonal_anime_page``. Nothing is yielded when the page has
            no matching link.
        """
        season_links = response.css("tr a")

        # Slicing (instead of indexing with [0]) keeps a page without any
        # matching link from raising IndexError.
        for season_link in season_links[:1]:
            season_url = season_link.attrib.get("href")
            if season_url is None:
                # An anchor without href cannot be followed.
                continue

            yield response.follow(
                url=season_url,
                callback=self.seasonal_anime_page,
            )


    def seasonal_anime_page(self, response):
        """Follow every show on a season page except the continuing TV shows.

        The page is organised in ``div.js-seasonal-anime-list`` sections,
        each with an ``div.anime-header`` caption. Every section whose
        caption is not ``"TV (Continuing)"`` has its title links followed.

        Args:
            response: Response for one season page.

        Yields:
            One request per show title link, handled by ``anime_page``.
        """
        for show_list in response.css("div.js-seasonal-anime-list"):
            header = show_list.css("div.anime-header::text").get()
            # Compare the stripped caption so surrounding whitespace in the
            # markup cannot let the continuing shows slip through.
            if header is not None and header.strip() == "TV (Continuing)":
                continue

            # ``::attr(href)`` only returns anchors that actually carry an
            # href, so there is nothing to guard against afterwards.
            for show_url in show_list.css("h2.h2_anime_title a::attr(href)").getall():
                yield response.follow(url=show_url, callback=self.anime_page)


    def anime_page(self, response):
        """Scrape an anime detail page into a ``MalScraperItem``.

        Reads the title, poster URL, alternative titles, sidebar
        information, external links, description and related-media table.
        The partially filled item is then passed through ``meta`` to
        ``character_page`` together with the URL of the episode list.

        Values are stored as the raw text nodes found on the page. A field
        whose node is missing is left as ``None`` instead of raising, so a
        page with an unusual layout does not abort the whole item.

        Args:
            response: Response for an anime page. The component at index 4
                of the URL split on ``/`` is stored as ``mal_id``.

        Yields:
            The request for the ``/characters`` page of this anime.
        """

        def text_at(texts, index):
            """Return ``texts[index]``, or ``None`` when there is no such node."""
            return texts[index] if index < len(texts) else None

        def get_related_media(row):
            """Return ``{"title", "mal_id"}`` dicts for the links in the
            second cell of a related-media row."""
            show_list = []
            for show in row[1].css("td a"):
                href = show.attrib.get("href", "")
                # Take the first purely numeric path segment as the id; this
                # gives the same result as a fixed position for relative
                # links (/anime/<id>/<slug>) and also works for absolute ones.
                show_id = next(
                    (part for part in href.split("/") if part.isdigit()), None
                )
                show_list.append(
                    {"title": show.css("::text").get(), "mal_id": show_id}
                )
            return show_list

        sidebar_information = response.css("div.spaceit_pad")

        # Keys are inserted in the order in which they are written to the
        # item below.
        data = {
            "title": response.css(
                "div.h1-title h1.title-name strong::text"
            ).get(),
            # ``attrib`` is an empty dict when no image matches, so ``get``
            # yields None instead of raising KeyError.
            "poster_img_url": response.xpath(
                '//img[@itemprop="image"]'
            ).attrib.get("data-src"),
            "mal_url": response.url,
            "mal_id": response.url.split("/")[4],
        }
        data.update(dict.fromkeys((
            "synonym_name", "japanese_name", "english_name", "show_type",
            "total_episodes", "status", "airing_start_date",
            "airing_finish_date", "premiered", "broadcast", "producers",
            "licensors", "studios", "source", "genres", "themes",
            "demographic", "duration_in_minutes", "age_rating", "score",
            "members", "official_site", "anidb_url", "ann_url",
            "wikipedia_url",
        )))
        data["description"] = "".join(
            response.xpath("//p[@itemprop='description']/node()").getall()
        )
        data.update(dict.fromkeys((
            "prequels", "sequels", "other_shows", "side_stories",
            "alternative_versions", "character_shows", "spin_offs",
            "alternative_setting_shows", "summary_shows",
        )))

        # Alternative titles: only the first three sidebar entries are
        # inspected.
        title_fields = {
            "Japanese:": "japanese_name",
            "English:": "english_name",
            "Synonyms:": "synonym_name",
        }
        for entry in sidebar_information[:3]:
            language = entry.css("span.dark_text::text").get()
            key = title_fields.get(language.strip()) if language else None
            if key is not None:
                data[key] = text_at(entry.css("::text").getall(), 2)

        # Sidebar rows whose value is a single text node at a fixed position.
        simple_fields = {
            "Type:": ("show_type", 3),
            "Episodes:": ("total_episodes", 2),
            "Status:": ("status", 2),
            "Premiered:": ("premiered", 3),
            "Broadcast:": ("broadcast", 2),
            "Demographic:": ("demographic", 3),
            "Score:": ("score", 3),
            "Members:": ("members", 2),
        }
        # Rows listing companies; the page shows "None found" when empty.
        company_fields = {
            "Producers:": "producers",
            "Licensors:": "licensors",
            "Studios:": "studios",
        }
        # Rows listing tags as links; singular and plural labels both occur.
        tag_fields = {
            "Genres:": "genres",
            "Genre:": "genres",
            "Themes:": "themes",
            "Theme:": "themes",
        }

        for info in sidebar_information:
            label = info.css("span.dark_text::text").get()
            texts = info.css("::text").getall()

            if label in simple_fields:
                key, index = simple_fields[label]
                data[key] = text_at(texts, index)
            elif label in company_fields:
                marker = text_at(texts, 2)
                if marker is not None and "None found" not in marker:
                    data[company_fields[label]] = "".join(texts[3:])
            elif label in tag_fields:
                data[tag_fields[label]] = ", ".join(
                    info.css("a::text").getall()
                )
            elif label == "Aired:":
                aired = text_at(texts, 2)
                if aired is None:
                    continue
                start, *rest = aired.split(" to ")
                data["airing_start_date"] = start
                finish = rest[0] if rest else None
                # A "?" end date means the show has no known finish date.
                if finish is not None and "?" in finish:
                    finish = None
                data["airing_finish_date"] = finish
            elif label == "Source:":
                data["source"] = "".join(texts[2:])
            elif label == "Duration:":
                duration = text_at(texts, 2)
                if duration is not None and "Unknown" in duration:
                    duration = None
                data["duration_in_minutes"] = duration
            elif label == "Rating:":
                rating = text_at(texts, 2)
                if rating is not None and "None" in rating:
                    rating = None
                data["age_rating"] = rating

        # External resource links. These do not depend on the sidebar rows,
        # so they are read once, outside the sidebar loop.
        link_fields = {
            "Official Site": "official_site",
            "AniDB": "anidb_url",
            "ANN": "ann_url",
        }
        for link in response.css("div.external_links a"):
            link_text = link.css("::text").get()
            url = link.attrib.get("href")
            if url is None:
                continue
            if link_text in link_fields:
                data[link_fields[link_text]] = url
            elif link_text == "Wikipedia" and "en.wikipedia.org" in url:
                # Only the English Wikipedia article is kept.
                data["wikipedia_url"] = url

        # Related media: the first cell names the relation, the second one
        # holds the links. Every matching relation name is checked
        # independently, exactly like a series of plain ``if`` statements.
        relation_fields = (
            ("Sequel", "sequels"),
            ("Prequel", "prequels"),
            ("Other", "other_shows"),
            ("Side story", "side_stories"),
            ("Alternative version", "alternative_versions"),
            ("Character", "character_shows"),
            ("Spin-off", "spin_offs"),
            ("Alternative setting", "alternative_setting_shows"),
            ("Summary", "summary_shows"),
        )
        for table_row in response.css("table.anime_detail_related_anime tr"):
            cells = table_row.css("td")
            if len(cells) < 2:
                continue
            relation = cells[0].css("::text").get()
            if relation is None:
                continue
            for needle, key in relation_fields:
                if needle in relation:
                    data[key] = get_related_media(cells)

        mal_item = MalScraperItem()
        for key, value in data.items():
            mal_item[key] = value

        # Avoid a double slash should the page URL end with one.
        base_url = response.url.rstrip("/")
        character_staff_page_url = base_url + "/characters"
        episodes_page_url = base_url + "/episode"

        yield response.follow(character_staff_page_url,
                              callback=self.character_page,
                              meta={"mal_item": mal_item,
                                    "episodes_page_url": episodes_page_url})


    def character_page(self, response):
        """Add characters, voice actors and staff to the item, then continue.

        Reads ``mal_item`` from ``response.meta``, stores the scraped lists
        under ``"characters"`` and ``"staffs"`` and then either

        * yields the finished item when the show type contains ``"Movie"``
          (``"episodes"`` is set to ``None``), or
        * requests the episode list (``meta["episodes_page_url"]``) and
          hands the item on to ``episodes_page``.

        Args:
            response: Response for the ``/characters`` page of an anime.

        Yields:
            The finished item for movies, otherwise the episode-list request.
        """

        def split_name(full_name):
            """Split ``"Last, First"`` into ``(first, last)``.

            A name without a comma is returned as the first name with
            ``None`` as last name; a missing name gives ``(None, None)``.
            """
            if full_name is None:
                return None, None
            if "," not in full_name:
                return full_name.strip(), None
            parts = full_name.split(",")
            # Strip the space that follows the comma in "Last, First".
            return parts[1].strip(), parts[0].strip()

        mal_item = response.meta["mal_item"]

        characters = []
        character_tables = response.css(
            'div.anime-character-container table.js-anime-character-table'
        )
        for table in character_tables:
            # The first cell is skipped; the next one holds the character
            # and the one after it the voice actors.
            cells = table.css("td.borderClass")[1:]
            if not cells:
                continue
            details = cells[0].css("td.borderClass div")[2:]
            if not details:
                continue

            character_first_name, character_last_name = split_name(
                details[0].css("a h3::text").get()
            )
            character_role = (
                details[1].css("::text").get() if len(details) > 1 else None
            )

            voice_actor_rows = (
                cells[1].css("tr.js-anime-character-va-lang")
                if len(cells) > 1 else []
            )
            character_voice_actors = []
            for actor in voice_actor_rows:
                pads = actor.css("div.spaceit_pad")
                if not pads:
                    continue
                # Both name parts are computed per actor, so a last name can
                # never leak from the previous actor into this one.
                va_first_name, va_last_name = split_name(
                    pads[0].css("a::text").get()
                )
                va_lang = (
                    pads[1].css("::text").get() if len(pads) > 1 else None
                )
                character_voice_actors.append({
                    "va_first_name": va_first_name,
                    "va_last_name": va_last_name,
                    "va_lang": va_lang,
                })

            characters.append({
                "character_first_name": character_first_name,
                "character_last_name": character_last_name,
                "character_role": character_role,
                "character_voice_actors": character_voice_actors,
            })

        staffs = []
        staff_tables = response.xpath(
            "//div[contains(concat(' ', @class, ' '), ' rightside ')]/table"
        )
        for table in staff_tables:
            cells = table.css("td")
            if len(cells) < 2:
                continue
            name_role = cells[1]

            staff_first_name, staff_last_name = split_name(
                name_role.css("a::text").get()
            )
            staff_role = name_role.css("div small::text").get()
            # ``split`` returns a one-element list when there is no comma.
            staff_roles = (
                [role.strip() for role in staff_role.split(",")]
                if staff_role is not None else None
            )

            staffs.append({
                "staff_first_name": staff_first_name,
                "staff_last_name": staff_last_name,
                "staff_roles": staff_roles,
            })

        mal_item["characters"] = characters
        mal_item["staffs"] = staffs

        show_type = mal_item["show_type"]
        if show_type is not None and "Movie" in show_type:
            # Movies have no episode list: the item is complete here and has
            # to be emitted, otherwise it would be dropped silently.
            mal_item["episodes"] = None
            yield mal_item
            return

        yield response.follow(response.meta["episodes_page_url"],
                              callback=self.episodes_page,
                              meta={"mal_item": mal_item})


    def episodes_page(self, response):
        """Start scraping the episodes linked from an anime's episode list.

        The episode pages are visited one after another. The item is
        emitted exactly once, after the last episode has been handled (or
        immediately when the list has no episode links), so the item never
        reaches the pipelines half filled or more than once.

        Only the links present on this response are followed.

        Args:
            response: Response for the episode list; ``meta["mal_item"]``
                carries the item built so far.

        Yields:
            The request for the first episode, or the item itself when
            there are no episode links.
        """
        mal_item = response.meta["mal_item"]
        mal_item["episodes"] = []

        hrefs = response.css("td.episode-title a::attr(href)").getall()
        # Absolute URLs are needed because later requests are created
        # without a response to resolve against; dict.fromkeys drops
        # duplicate links while keeping the page order.
        episode_urls = list(
            dict.fromkeys(response.urljoin(href) for href in hrefs)
        )

        yield self._next_episode_step(mal_item, episode_urls)

    def _next_episode_step(self, mal_item, pending_urls):
        """Return the request for the next pending episode, or the finished
        item when no episode is left."""
        if not pending_urls:
            return mal_item

        return scrapy.Request(
            pending_urls[0],
            callback=self.episode_main_page,
            errback=self._episode_failed,
            # The URLs were de-duplicated already; a request dropped by the
            # duplicate filter would break the chain and lose the item.
            dont_filter=True,
            meta={"mal_item": mal_item,
                  "pending_episode_urls": pending_urls[1:]},
        )

    def _episode_failed(self, failure):
        """Skip an episode whose request failed and continue with the next."""
        request = failure.request
        self.logger.warning("Skipping episode %s: %r", request.url, failure.value)
        yield self._next_episode_step(
            request.meta["mal_item"],
            request.meta.get("pending_episode_urls", []),
        )

    def episode_main_page(self, response):
        """Scrape one episode page and append it to the item's episodes.

        Missing nodes give ``None`` for the affected field instead of
        raising, so one unusual episode page cannot lose the whole item.

        Args:
            response: Response for an episode page; ``meta["mal_item"]``
                carries the item and ``meta["pending_episode_urls"]`` the
                episode URLs still to visit.

        Yields:
            The request for the next episode, or the finished item after
            the last one.
        """
        mal_item = response.meta["mal_item"]
        episode_info = response.css("div.has-multi-episodes-info")

        episode_number = episode_info.css("h2 span::text").get()
        title_texts = episode_info.css("h2::text").getall()
        episode_title = title_texts[1] if len(title_texts) > 1 else None

        info_divs = episode_info.css("div")
        duration_aired_text = (
            info_divs[2].css("::text").getall() if len(info_divs) > 2 else []
        )
        duration = (
            duration_aired_text[2] if len(duration_aired_text) > 2 else None
        )
        air_date = (
            duration_aired_text[4].split("(")[0]
            if len(duration_aired_text) > 4 else None
        )

        synopsis = None
        # NOTE(review): ``div.badresult`` is assumed to be the placeholder
        # shown when an episode has no synopsis, so the synopsis is read
        # only when that placeholder is absent - confirm against the page.
        if response.css("div.badresult").get() is None:
            synopsis_nodes = response.xpath(
                "//div[@class='pt8 pb8']/node()"
            ).getall()
            # The first two nodes precede the synopsis text itself.
            synopsis = "".join(synopsis_nodes[2:]) or None

        mal_item["episodes"].append({
            "number": episode_number,
            "title": episode_title,
            "duration": duration,
            "air_date": air_date,
            "synopsis": synopsis,
        })

        yield self._next_episode_step(
            mal_item, response.meta.get("pending_episode_urls", [])
        )
Editor is loading...