Untitled
unknown
plain_text
5 years ago
1.8 kB
8
Indexable
import scrapy
class PregnancySpider(scrapy.Spider):
    """Scrape forum threads and their messages from pregnancyforum.momtastic.com.

    The crawl starts at the listing page in ``start_urls``. ``parse`` walks
    the listing pages and schedules one request per thread; ``parse_post``
    turns every message of a thread page into an item.
    """

    name = 'pregnancy'
    allowed_domains = ['pregnancyforum.momtastic.com']
    start_urls = ['https://pregnancyforum.momtastic.com/forums/pregnancy-chat.97']

    def parse(self, response):
        """Schedule a request for each thread on a listing page, then paginate.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` per thread link (handled by ``parse_post``,
            with the thread title in ``meta['post']``), followed by at most
            one ``scrapy.Request`` for another listing page (handled by
            ``parse``).
        """
        for entry in response.xpath("//div[@class='titleText']"):
            post_url = entry.xpath(".//h3/a[@class='PreviewTooltip']/@href").get()
            # Kept under its own name so the loop variable is not overwritten.
            title = entry.xpath(".//h3/a[@class='PreviewTooltip']/text()").get()
            if post_url:
                yield scrapy.Request(
                    url=response.urljoin(post_url),
                    callback=self.parse_post,
                    meta={
                        'post': title,
                    },
                )

        # The second matching link is the one followed.
        # NOTE(review): presumably this is the "next page" link of the page
        # navigation -- confirm against the site's markup.
        nav_links = response.xpath("//a[@class='text']/@href").getall()
        # Indexing [1] unconditionally raised IndexError on pages with fewer
        # than two such links; stop paginating quietly in that case instead.
        if len(nav_links) > 1 and nav_links[1]:
            yield scrapy.Request(
                url=response.urljoin(nav_links[1]),
                callback=self.parse,
            )

    def parse_post(self, response):
        """Yield one item per message found on a thread page.

        Args:
            response: The downloaded thread page.

        Yields:
            A dict with the keys ``url`` (the thread page URL), ``comments``
            (a list holding the whitespace-normalized message text),
            ``author`` and ``date``. ``author`` and ``date`` are ``None``
            when the corresponding element is missing.
        """
        post_url = response.url
        # The trailing space inside the class value is intentional: the
        # selector is an exact match on the attribute.
        for message in response.xpath("//li[@class='sectionMain message ']"):
            yield {
                'url': post_url,
                'comments': message.xpath(
                    "normalize-space(.//blockquote/text())"
                ).getall(),
                'author': message.xpath(
                    ".//a[@class='username author']/text()"
                ).get(),
                'date': message.xpath(
                    ".//a[@class='datePermalink']/span[@class='DateTime']/text()"
                ).get(),
            }
        # NOTE: pagination inside a thread is not followed, so only the
        # requested thread page is scraped. An earlier attempt that followed
        # the first "//a[@class='text']" link was left disabled.