Puppeteer.js
Script to scrape tweets from the CNN account.
unknown
javascript
a year ago
3.6 kB
32
Indexable
Never
const puppeteer = require('puppeteer-extra');

/**
 * Entry point: opens the CNN timeline in Brave, then scrapes the rendered
 * tweets while the page is being scrolled. The browser is always closed,
 * even when navigation, scrolling or scraping fails.
 */
(async () => {
  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/brave',
    headless: false,
    defaultViewport: null,
    ignoreDefaultArgs: ['--disable-extensions'],
    args: ['--start-maximized', '--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://twitter.com/CNN', { waitUntil: 'networkidle2' });

    // Flipped once scrolling ends so the polling loop in fetch() terminates
    // on its own instead of crashing when the browser goes away.
    let isScrollDone = false;
    const scrollThenStop = async () => {
      try {
        await autoScroll(page);
      } finally {
        isScrollDone = true;
      }
    };

    console.log('here');
    const scraping = fetch(page, { shouldStop: () => isScrollDone });
    console.log('there');

    // Both promises get a handler synchronously, so neither can become an
    // unhandled rejection while the other is still running.
    await Promise.all([scraping, scrollThenStop()]);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

/**
 * Resolves after the given number of milliseconds.
 *
 * @param {number} ms - Time to wait.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Runs inside the page (it is passed to `page.$$eval`), so it must be
 * self-contained: it cannot reference anything from the Node.js scope.
 *
 * Converts each `<article>` element into a plain record. Every field falls
 * back to `null` when it cannot be read; `error` then names what was missing
 * (or carries the stringified exception). A falsy/detached element yields `0`,
 * as before.
 *
 * The reply/retweet/like counts are scraped from the article's raw HTML by
 * looking for text such as "12 Retweets" (plural form preferred, singular as
 * fallback).
 *
 * @param {Element[]} articles - The matched `<article>` elements.
 * @returns {Array<object|number>} One record (or `0`) per element.
 */
function extractTweets(articles) {
  // `.` in place of the first letter tolerates either capitalisation.
  // `[0-9,]*` tolerates thousands separators, which are stripped afterwards.
  const countPatterns = {
    replies: [/[0-9][0-9,]* .eplies/, /[0-9][0-9,]* .eply/],
    retweets: [/[0-9][0-9,]* .etweets/, /[0-9][0-9,]* .etweet/],
    likes: [/[0-9][0-9,]* .ikes/, /[0-9][0-9,]* .ike/],
  };

  const readCount = (html, patterns) => {
    for (const pattern of patterns) {
      const match = html.match(pattern);
      if (match) {
        return match[0].split(' ')[0].replace(/,/g, '');
      }
    }
    return null;
  };

  return articles.map((article) => {
    if (!article || !article.outerHTML) {
      return 0;
    }

    const tweet = {
      url: null,
      content: null,
      published: null,
      replies: null,
      retweets: null,
      likes: null,
      error: null,
    };

    try {
      // Anchors without an href return null from getAttribute; skip them
      // instead of letting `.includes` throw and discard the whole tweet.
      const statusLink = [...article.querySelectorAll('a')]
        .map((anchor) => anchor.getAttribute('href'))
        .find(
          (href) =>
            href !== null && href.includes('status') && !href.includes('photo'),
        );
      tweet.url = statusLink === undefined ? null : statusLink;

      tweet.content = article.textContent;

      const missing = [];

      const timeElement = article.querySelector('time');
      tweet.published =
        timeElement === null ? null : timeElement.getAttribute('datetime');
      if (tweet.published === null) {
        missing.push('published');
      }

      // Each count is read independently so one missing value no longer
      // prevents the others from being extracted.
      const html = article.outerHTML;
      for (const field of Object.keys(countPatterns)) {
        tweet[field] = readCount(html, countPatterns[field]);
        if (tweet[field] === null) {
          missing.push(field);
        }
      }

      if (missing.length > 0) {
        tweet.error = `missing fields: ${missing.join(', ')}`;
      }
    } catch (err) {
      tweet.error = String(err);
    }

    return tweet;
  });
}

/**
 * Repeatedly scrapes every `<article>` currently in the page and logs the
 * resulting records, until told to stop.
 *
 * Note: at module scope this name shadows the global `fetch`; it is kept for
 * compatibility with existing callers.
 *
 * @param {object} page - Puppeteer page to scrape.
 * @param {object} [options]
 * @param {number} [options.intervalMs=1000] - Pause between two scrapes.
 * @param {() => boolean} [options.shouldStop] - Polled before each scrape;
 *   returning true ends the loop. Defaults to "the page has been closed".
 * @returns {Promise<void>} Resolves once the loop has stopped.
 * @throws Rethrows any scraping error that happens while still running.
 */
async function fetch(
  page,
  { intervalMs = 1000, shouldStop = () => page.isClosed() } = {},
) {
  while (!shouldStop()) {
    let tweets;
    try {
      tweets = await page.$$eval('article', extractTweets);
    } catch (err) {
      // A failure caused by shutting down is expected; anything else is not.
      if (shouldStop() || page.isClosed()) {
        break;
      }
      throw err;
    }

    console.log(tweets);
    await delay(intervalMs);
  }
}

/**
 * Scrolls the page down in fixed steps until the distance scrolled reaches
 * the bottom of the document (re-measured on every step, so the target moves
 * if the document grows while scrolling).
 *
 * @param {object} page - Puppeteer page to scroll.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step.
 * @param {number} [options.intervalMs=400] - Delay between steps.
 * @param {?number} [options.maxScrolls=null] - Upper bound on the number of
 *   steps; `null` means no limit (the original behaviour).
 * @returns {Promise<void>} Resolves when scrolling has finished.
 */
async function autoScroll(
  page,
  { distance = 100, intervalMs = 400, maxScrolls = null } = {},
) {
  await page.evaluate(
    (settings) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        let scrollCount = 0;

        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, settings.distance);
          totalHeight += settings.distance;
          scrollCount += 1;

          const reachedBottom =
            totalHeight >= scrollHeight - window.innerHeight;
          const reachedLimit =
            settings.maxScrolls !== null && scrollCount >= settings.maxScrolls;

          if (reachedBottom || reachedLimit) {
            clearInterval(timer);
            resolve();
          }
        }, settings.intervalMs);
      }),
    { distance, intervalMs, maxScrolls },
  );
}