Untitled
unknown
python
2 years ago
3.0 kB
14
Indexable
"""Spider logic and scraping code for complaint pages.

The XPath selectors and JSON paths used below were determined by inspecting
the target website, so they must be re-checked whenever the site's markup
changes.
"""
import json

import pydash
import scrapy
from scrapy.spiders import CrawlSpider

from complaintscraper.items.ComplaintItem import ComplaintItem
# Not referenced in this module's visible code; kept because removing a
# file-level import could break code that relies on it.
from complaintscraper.utils.DataCleaning import DataCleaning  # noqa: F401


class ComplaintScraper(CrawlSpider):
    """Crawl complaint listing pages and emit one ``ComplaintItem`` per complaint.

    Flow: ``start_requests`` fetches every URL in ``start_urls``;
    ``parse_complaint`` follows each complaint link found on a listing page;
    ``parse_model_complaint`` turns a complaint detail page into an item.
    """

    name = "ComplaintScraper"

    # The original dict listed 'CONCURRENT_REQUESTS_PER_DOMAIN' twice (1, then
    # 10). Python keeps only the last duplicate key, so 10 was the value in
    # effect; the dead first entry has been removed.
    custom_settings = {
        'CONCURRENT_REQUESTS': 10,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
        'DOWNLOAD_DELAY': 1,
    }

    start_urls = [
        "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=1",
        "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=2",
        "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=3",
    ]

    # Prefix of the path, inside the embedded page JSON, under which all
    # complaint attributes read by this spider live.
    _COMPLAINT_PATH = "props.pageProps.complaint"

    def start_requests(self):
        """Yield one request per listing URL, handled by ``parse_complaint``."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_complaint, dont_filter=True)

    def parse_complaint(self, response):
        """Yield a request for every complaint linked from a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects whose callback is
            ``parse_model_complaint``.
        """
        # NOTE(review): "bJdtis" looks like a generated CSS class name and may
        # change when the site is redeployed -- confirm against the live page.
        for row in response.xpath('//div[contains(@class,"bJdtis")]'):
            link = row.xpath("./a/@href").get()
            if not link:
                # Without this guard urljoin(None) resolves to the listing
                # page itself, which would then be parsed as a complaint.
                continue
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_model_complaint,
                dont_filter=True,
            )

    def get_data(self, data, query):
        """Return the value at dotted path ``query`` in ``data``, or ``None``.

        Args:
            data: Parsed JSON structure to look into.
            query: Path understood by ``pydash.get`` (e.g. ``"a.b.c"``).
        """
        return pydash.get(data, query, None)

    def _complaint_field(self, data, field):
        """Return attribute ``field`` of the complaint object inside ``data``."""
        return self.get_data(data, self._COMPLAINT_PATH + "." + field)

    @staticmethod
    def _extract_tags(text_nodes):
        """Return the text nodes located between the ID and status labels.

        Args:
            text_nodes: Text nodes of the complaint content container, in
                document order.

        Returns:
            The slice starting two entries after ``"ID:"`` and ending before
            ``"Status da reclamação:"``; an empty list when either label is
            missing.
        """
        try:
            # +2 skips the "ID:" label and the entry right after it
            # (presumably the ID value itself -- verify against the page).
            start = text_nodes.index("ID:") + 2
            end = text_nodes.index("Status da reclamação:")
        except ValueError:
            return []
        return text_nodes[start:end]

    def parse_model_complaint(self, response):
        """Build a ``ComplaintItem`` from a complaint detail page.

        Most fields come from the JSON embedded in the element with id
        ``__NEXT_DATA__``; ``tags`` and ``status`` are read from the rendered
        markup. Pages without a usable JSON payload are logged and skipped.

        Args:
            response: The downloaded complaint detail page.

        Yields:
            A single populated ``ComplaintItem``.
        """
        raw_payload = response.xpath('//*[@id="__NEXT_DATA__"]//text()').get()
        if raw_payload is None:
            self.logger.warning(
                "No __NEXT_DATA__ payload on %s; skipping page", response.url
            )
            return
        try:
            data = json.loads(raw_payload)
        except json.JSONDecodeError:
            self.logger.warning(
                "Malformed __NEXT_DATA__ payload on %s; skipping page",
                response.url,
            )
            return

        complaintItem = ComplaintItem()
        complaintItem['id'] = self._complaint_field(data, "legacyId")
        complaintItem['title'] = self._complaint_field(data, "title")
        complaintItem['solved'] = self._complaint_field(data, "solved")
        complaintItem['description'] = self._complaint_field(data, "description")
        complaintItem['url'] = response.url

        content_container = response.xpath(
            '//*[contains(@data-testid, "complaint-content-container")]//text()'
        ).extract()
        complaintItem['tags'] = self._extract_tags(content_container)
        complaintItem['status'] = response.xpath(
            '//*[contains(@data-testid, "complaint-status")]//text()'
        ).extract()

        complaintItem['userCity'] = self._complaint_field(data, "userCity")
        complaintItem['userState'] = self._complaint_field(data, "userState")
        complaintItem['creation_date'] = self._complaint_field(data, "created")
        # NOTE: the original author remarked that "interactions" contain the
        # customer and company replies; they are not extracted by this spider.
        complaintItem['deal_again'] = self._complaint_field(data, "dealAgain")
        complaintItem['score'] = self._complaint_field(data, "score")
        yield complaintItem
Editor is loading...