Untitled

mail@pastecode.io avatar
unknown
plain_text
4 years ago
1.8 kB
4
Indexable
Never
import scrapy
 
 
class PregnancySpider(scrapy.Spider):
    """Spider that walks a forum thread listing and scrapes thread comments.

    ``parse`` handles listing pages: it schedules one request per thread link
    it finds and then follows the listing's pagination link.  ``parse_post``
    handles a thread page and yields one item per comment element.
    """

    name = 'pregnancy'
    allowed_domains = ['pregnancyforum.momtastic.com']
    start_urls = ['https://pregnancyforum.momtastic.com/forums/pregnancy-chat.97']

    def parse(self, response):
        """Schedule a request for every thread on a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects: one per thread link (handled by
            ``parse_post``) and, when available, one for the next listing
            page (handled by ``parse``).
        """
        for title_block in response.xpath("//div[@class='titleText']"):
            post_url = title_block.xpath(
                ".//h3/a[@class='PreviewTooltip']/@href"
            ).get()
            post_title = title_block.xpath(
                ".//h3/a[@class='PreviewTooltip']/text()"
            ).get()
            if post_url:
                yield scrapy.Request(
                    url=response.urljoin(post_url),
                    callback=self.parse_post,
                    # The thread title travels with the request; parse_post
                    # does not currently read it.
                    meta={
                        'post': post_title,
                    },
                )

        # The pagination link is taken to be the SECOND anchor with
        # class="text".  NOTE(review): which anchor that is depends on the
        # site's markup -- confirm against a live page.  Indexing is guarded
        # so a page with fewer than two such anchors ends the crawl instead
        # of raising IndexError.
        nav_links = response.xpath("//a[@class='text']/@href").getall()
        next_page = nav_links[1] if len(nav_links) > 1 else None

        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_post(self, response):
        """Yield one item per comment found on a thread page.

        Args:
            response: The downloaded thread page.

        Yields:
            dict with keys ``url`` (the thread page URL), ``comments`` (a
            list holding the whitespace-normalized comment text), ``author``
            and ``date`` (each ``None`` when the element is missing).
        """
        post_url = response.url
        # The class attribute is matched exactly, including its trailing
        # spaces, so this only selects elements whose markup is identical.
        comments = response.xpath("//li[@class='sectionMain message     ']")
        for comment in comments:
            # normalize-space() is applied to the first text node selected by
            # .//blockquote/text(); getall() wraps the result in a list.
            body = comment.xpath("normalize-space(.//blockquote/text())").getall()
            author = comment.xpath(".//a[@class='username author']/text()").get()
            date = comment.xpath(
                ".//a[@class='datePermalink']/span[@class='DateTime']/text()"
            ).get()
            yield {
                'url': post_url,
                'comments': body,
                'author': author,
                'date' : date
            }

        # NOTE: only the requested thread page is scraped; further pages of
        # the same thread are not followed.