Puppeteer.js

Script to scrape tweets from the CNN Twitter account.
const puppeteer = require('puppeteer-extra');
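// puppeteer-extra is a drop-in wrapper around puppeteer whose main feature is
// plugin support. No plugin is registered in this script; as a sketch (assuming
// the puppeteer-extra-plugin-stealth package is installed), the stealth plugin
// is commonly added to reduce bot detection on Twitter/X:
//
//   const StealthPlugin = require('puppeteer-extra-plugin-stealth');
//   puppeteer.use(StealthPlugin());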

(async () => {

  const browser = await puppeteer.launch({
    // Point executablePath at a locally installed Chromium-based browser (Brave here).
    executablePath: '/usr/bin/brave',
    headless: false,
    defaultViewport: null,
    ignoreDefaultArgs: ["--disable-extensions"],
    args: ["--start-maximized", "--no-sandbox", "--disable-setuid-sandbox"],
  });
  const page = await browser.newPage();

  // Open the CNN profile and wait for the network to go (mostly) idle.
  await page.goto('https://twitter.com/CNN', { waitUntil: 'networkidle2' });

  // Scroll the feed so more tweets are lazily loaded, then scrape whatever
  // <article> elements are in the DOM and log the results.
  await autoScroll(page);
  const tweets = await scrapeTweets(page);
  console.log(tweets);

  await browser.close();
})();

async function scrapeTweets(page) {
  // Each rendered tweet is an <article> element; extract its fields inside the
  // page context and return one record per tweet.
  // const content = await page.$$eval('article div[lang]', (tweets) => tweets.map((tweet) => tweet.textContent));
  return page.$$eval('article', (tweets) =>
    tweets
      .map((tweet) => {
        try {
          if (
            !tweet ||
            !tweet.outerHTML ||
            tweet.querySelectorAll("time").length === 0
          )
            return null;

          // Fallback mechanism: leave a field null if its selector cannot retrieve it.
          let _content = null;
          let _published = null;
          let _replies = null;
          let _retweets = null;
          let _likes = null;
          let _error = null;
          let _url = [...tweet.querySelectorAll("a")]
            .map((e) => e.getAttribute("href"))
            .filter((e) => e && e.includes("status") && !e.includes("photo"))[0];

          try {
            // Pull the tweet fields out of the rendered markup. The counts are
            // matched against text like "12 Retweets" / "1 Retweet", falling
            // back from the plural to the singular pattern.
            _content = tweet.textContent;
            _published = tweet
              .querySelectorAll("time")[0]
              .getAttribute("datetime");

            if (tweet.outerHTML.match("[0-9]+ .etweets"))
              _retweets = tweet.outerHTML
                .match("[0-9]+ .etweets")[0]
                .split(" ")[0];
            else
              _retweets = tweet.outerHTML
                .match("[0-9]+ .etweet")[0]
                .split(" ")[0];
            if (tweet.outerHTML.match("[0-9]+ .ikes"))
              _likes = tweet.outerHTML.match("[0-9]+ .ikes")[0].split(" ")[0];
            else
              _likes = tweet.outerHTML.match("[0-9]+ .ike")[0].split(" ")[0];
            if (tweet.outerHTML.match("[0-9]+ .eplies"))
              _replies = tweet.outerHTML
                .match("[0-9]+ .eplies")[0]
                .split(" ")[0];
            else
              _replies = tweet.outerHTML
                .match("[0-9]+ .eply")[0]
                .split(" ")[0];
          } catch (ex) {
            _error = ex.toString();
          }

          // Populate the tweet record.
          return {
            url: _url,
            content: _content,
            published: _published,
            replies: _replies,
            retweets: _retweets,
            likes: _likes,
            error: _error,
          };
        } catch (e) {
          // This runs in the page context, so it logs to the browser console.
          console.log("scrape error");
          console.log(e);
          return null;
        }
      })
      .filter((t) => t !== null)
  );
}

async function autoScroll(page) {
  // Scroll the page in small steps inside the browser context until the bottom
  // of the document is reached, so lazily loaded tweets get rendered.
  await page.evaluate(async () => {
    await new Promise((resolve) => {
      let totalHeight = 0;
      const distance = 100;
      const timer = setInterval(() => {
        const scrollHeight = document.body.scrollHeight;
        window.scrollBy(0, distance);
        totalHeight += distance;

        if (totalHeight >= scrollHeight - window.innerHeight) {
          clearInterval(timer);
          resolve();
        }
      }, 400);
    });
  });
}
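
// Usage note: running this requires Node.js with the puppeteer and
// puppeteer-extra packages installed (puppeteer-extra wraps puppeteer), and
// executablePath must point at a Chromium-based browser available on the
// machine; '/usr/bin/brave' assumes Brave on Linux. Twitter/X markup changes
// frequently, so the 'article' selector and the count-label patterns may need
// adjusting.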