Untitled
unknown
plain_text
5 years ago
1.8 kB
7
Indexable
import scrapy


class PregnancySpider(scrapy.Spider):
    """Scrape comments from the momtastic "pregnancy-chat" forum section.

    ``parse`` walks the thread listing pages and schedules one request per
    thread link; ``parse_post`` yields one dict per comment found on the
    first page of each thread.
    """

    name = 'pregnancy'
    allowed_domains = ['pregnancyforum.momtastic.com']
    start_urls = ['https://pregnancyforum.momtastic.com/forums/pregnancy-chat.97']

    def parse(self, response):
        """Yield a request per thread on a listing page, then follow pagination.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects: one per thread (handled by
            ``parse_post``) and, when present, one for the next listing page
            (handled by ``parse`` again).
        """
        for thread in response.xpath("//div[@class='titleText']"):
            post_url = thread.xpath(".//h3/a[@class='PreviewTooltip']/@href").get()
            post_title = thread.xpath(".//h3/a[@class='PreviewTooltip']/text()").get()
            if not post_url:
                continue
            yield scrapy.Request(
                url=response.urljoin(post_url),
                callback=self.parse_post,
                # The title travels with the request under the 'post' key;
                # parse_post does not currently read it.
                meta={'post': post_title},
            )

        # The second <a class="text"> link is used as the next-page link
        # (presumably the first one is something else -- verify against the
        # live markup). Guard the index so a page with fewer than two such
        # links ends pagination instead of raising IndexError.
        nav_links = response.xpath("//a[@class='text']/@href").getall()
        next_page = nav_links[1] if len(nav_links) > 1 else None
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_post(self, response):
        """Yield one item per comment on a thread page.

        Args:
            response: The downloaded thread page.

        Yields:
            dict with keys ``'url'`` (the thread page URL), ``'comments'``
            (a one-element list holding the whitespace-normalized text of the
            first text node under the comment's blockquote), ``'author'`` and
            ``'date'`` (each a string, or ``None`` when the element is
            missing).
        """
        post_url = response.url
        # The trailing space in the class value is intentional: the XPath
        # compares the whole attribute string exactly.
        for comment in response.xpath("//li[@class='sectionMain message ']"):
            # normalize-space() over a node-set stringifies only its first
            # node, so only the first blockquote text node is captured.
            body = comment.xpath("normalize-space(.//blockquote/text())").getall()
            author = comment.xpath(".//a[@class='username author']/text()").get()
            date = comment.xpath(
                ".//a[@class='datePermalink']/span[@class='DateTime']/text()"
            ).get()
            yield {
                'url': post_url,
                'comments': body,
                'author': author,
                'date': date,
            }
        # NOTE: pagination inside a thread is not followed; only the first
        # page of each thread is scraped.
Editor is loading...