Puppeteer.js
Script to scrape tweets from the CNN account.
unknown
javascript
3 years ago
3.6 kB
49
Indexable
const puppeteer = require('puppeteer-extra');
/**
 * Entry point: launches the browser, opens the CNN Twitter timeline, starts
 * the tweet scraper and scrolls the page, then closes the browser.
 */
(async () => {
  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/brave',
    headless: false,
    defaultViewport: null,
    ignoreDefaultArgs: ["--disable-extensions"],
    args: ["--start-maximized", "--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://twitter.com/CNN', { waitUntil: 'networkidle2' });
    console.log('here');

    // Deliberately not awaited: the scraper has to run while autoScroll() is
    // scrolling. Any rejection (for example once the browser has been closed
    // underneath it) is logged here instead of surfacing as an unhandled
    // promise rejection.
    fetch(page).catch((err) => {
      console.error('tweet scraping stopped:', err);
    });

    console.log('there');
    await autoScroll(page);
  } finally {
    // Always release the browser process, even when navigation or scrolling
    // failed, and wait for the shutdown to complete.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Repeatedly scrapes every `article` element on the page and logs the
 * resulting array of tweet records, until the page reports itself closed.
 *
 * Each record has the shape
 * `{ url, content, published, replies, retweets, likes, error }`; a field is
 * `null` when the corresponding data could not be found. An entry is `0` when
 * the element itself is unusable (missing or without `outerHTML`).
 *
 * @param {object} page - Puppeteer page; `$$eval` and `isClosed` are used.
 * @param {number} [pollIntervalMs=500] - Pause between two scrapes, in ms.
 * @returns {Promise<void>} Resolves once `page.isClosed()` returns true.
 * @throws Re-throws any `$$eval` failure that happens while the page is open.
 */
async function fetch(page, pollIntervalMs = 500) {
  // This callback is handed to page.$$eval and executes in the page context,
  // so it must stay self-contained: helpers are nested inside it and nothing
  // from the surrounding Node scope is referenced.
  const extractTweets = (tweets) => {
    // Returns the leading number of the first pattern that matches, else null.
    // Patterns are tried in order, so the plural form keeps its priority.
    const firstCount = (html, patterns) => {
      for (const pattern of patterns) {
        const match = html.match(pattern);
        if (match) return match[0].split(' ')[0];
      }
      return null;
    };

    return tweets.map((tweet) => {
      if (!tweet || !tweet.outerHTML) return 0;

      // Fallback mechanism: every field stays null if it cannot be retrieved.
      const record = {
        url: null,
        content: null,
        published: null,
        replies: null,
        retweets: null,
        likes: null,
        error: null,
      };

      try {
        // Anchors without an href yield null, which must not be dereferenced.
        const hrefs = [...tweet.querySelectorAll('a')].map((anchor) =>
          anchor.getAttribute('href'),
        );
        record.url =
          hrefs.find(
            (href) => href && href.includes('status') && !href.includes('photo'),
          ) || null;

        record.content = tweet.textContent;

        const time = tweet.querySelector('time');
        record.published = time ? time.getAttribute('datetime') : null;

        // A counter that is absent from the markup is reported as null and no
        // longer prevents the remaining counters from being read.
        const html = tweet.outerHTML;
        record.retweets = firstCount(html, [/[0-9]+ .etweets/, /[0-9]+ .etweet/]);
        record.likes = firstCount(html, [/[0-9]+ .ikes/, /[0-9]+ .ike/]);
        record.replies = firstCount(html, [/[0-9]+ .eplies/, /[0-9]+ .eply/]);
      } catch (ex) {
        record.error = ex.toString();
      }

      return record;
    });
  };

  while (!page.isClosed()) {
    let stuff;
    try {
      stuff = await page.$$eval('article', extractTweets);
    } catch (err) {
      // A failure caused by the page going away mid-call ends the loop
      // quietly; anything else is a real error for the caller.
      if (page.isClosed()) break;
      throw err;
    }
    console.log(stuff);

    // Yield between scrapes instead of hammering the page back-to-back.
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
}
/**
 * Scrolls the page downwards in fixed steps from inside the page context.
 *
 * Every 400 ms the window is scrolled by 100 px; scrolling stops once the
 * accumulated distance reaches the body's scroll height minus the viewport
 * height (both read on each tick).
 *
 * @param {object} page - Object whose `evaluate` runs a function in the page.
 * @returns {Promise<void>} Settles when the in-page scrolling has finished.
 */
async function autoScroll(page) {
  // Self-contained on purpose: it is passed to page.evaluate and only touches
  // the page's own globals (document, window, timers).
  const scrollUntilBottom = () =>
    new Promise((done) => {
      const step = 100;
      let scrolled = 0;

      const ticker = setInterval(() => {
        // Height is sampled before scrolling, as the stop condition relies on
        // the value seen at the start of the tick.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, step);
        scrolled += step;

        if (scrolled >= pageHeight - window.innerHeight) {
          clearInterval(ticker);
          done();
        }
      }, 400);
    });

  await page.evaluate(scrollUntilBottom);
}
Editor is loading...