// Untitled
//
// avatar
// unknown
// plain_text
// 4 years ago
// 2.4 kB
// 11
// Indexable
const fs = require('fs')

const async = require("async")
const express = require('express')
//import timesLimit from 'async/timesLimit';
const puppeteer = require('puppeteer');

const app = express()
const port = 3000

// Shared browser handle: assigned once puppeteer.launch() resolves (bottom of
// this file) and read by scrapeSite() to open a new page per URL.
let browser
// Text of ./singlefile.js, read synchronously at startup; scrapeSite() injects
// it into each page through page.addScriptTag().
const singlefile = require("fs").readFileSync("./singlefile.js",{encoding:"utf-8"})


/**
 * Scrolls a page to the bottom in small steps so that content loaded on
 * scroll gets a chance to be requested.
 *
 * The callback runs inside the browser page, not in Node: it scrolls 100px
 * every 80ms and stops once the distance scrolled reaches the document's
 * current scroll height.
 *
 * @param {object} page - Page object returned by `browser.newPage()`.
 * @returns {Promise<void>} Resolves when the bottom has been reached.
 */
async function autoScroll(page) {
    await page.evaluate(() => new Promise((resolve) => {
        const distance = 100;
        let totalHeight = 0;
        const timer = setInterval(() => {
            // Re-read on every tick: the height can grow while scrolling.
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;

            if (totalHeight >= scrollHeight) {
                clearInterval(timer);
                resolve();
            }
        }, 80);
    }));
}

/**
 * Removes every `<script>...</script>` element from an HTML string.
 *
 * The replacement is repeated until the text stops changing, because deleting
 * one element can splice the surrounding text into a new `<script>` element
 * (e.g. `<scr<script></script>ipt>`).
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup without script elements.
 */
function stripScriptTags(html) {
    const scriptElement = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
    let previous;
    let current = html;
    do {
        previous = current;
        current = previous.replace(scriptElement, "");
    } while (current !== previous);
    return current;
}

/**
 * Captures one page as a single HTML file.
 *
 * Opens `url` in a new tab of the module-level `browser`, scrolls to the
 * bottom, injects the module-level `singlefile` script, asks it for the page
 * data, strips script elements from the result and writes it to `<n>.html`.
 * The tab is closed whether or not the capture succeeds.
 *
 * @param {string} url - Address of the page to capture.
 * @param {number|string} n - Base name of the output file (`${n}.html`).
 * @returns {Promise<void>} Resolves once the file is written and the tab is closed.
 * @throws {Error} When navigation, the in-page capture or the file write fails.
 */
async function scrapeSite(url, n) {
    const page = await browser.newPage();
    try {
        await page.setBypassCSP(true);
        // Await the navigation itself so a failed load rejects here instead of
        // surfacing later as an unhandled rejection.
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        await autoScroll(page);

        // Best effort: give requests triggered by scrolling time to settle.
        // NOTE(review): with no further navigation this appears to end by
        // timing out, which is why the rejection is deliberately ignored.
        await page.waitForNavigation({ waitUntil: 'networkidle2' })
            .catch(() => {});

        await page.addScriptTag({ content: singlefile });

        // Runs inside the page: `singlefile` here is the global defined by the
        // injected script, not the Node-side string of the same name.
        const data = await page.evaluate(async () => {
            await new Promise((resolve) => setTimeout(resolve, 1000));
            return singlefile.getPageData();
        });

        if (!data || typeof data.content !== 'string') {
            throw new Error(`singlefile returned no page content for ${url}`);
        }

        await fs.promises.writeFile(`${n}.html`, stripScriptTags(data.content));
    } finally {
        // Always release the tab, including when a step above throws.
        await page.close();
    }
}

/**
 * Scrapes four listing pages of icanhas.cheezburger.com, at most two at a time.
 *
 * Iteration `n` (0-based) is saved under the 1-based number `n + 1`; the first
 * page is the site root and later ones live under `/page/<number>`.
 * A failed page is reported to `async.timesLimit` through `next(err)`, so the
 * run ends with a logged error instead of stalling with a callback that is
 * never invoked.
 *
 * @returns {void}
 */
function startScraping() {
    const baseUrl = "https://icanhas.cheezburger.com";

    async.timesLimit(4, 2, (n, next) => {
        const pageNumber = n + 1;
        const url = n === 0 ? baseUrl : `${baseUrl}/page/${pageNumber}`;

        // Two-argument then(): success signals completion without passing a
        // value in the error slot, failure hands the error to async.
        scrapeSite(url, pageNumber)
            .then(() => next(), next);
    }, (err) => {
        if (err) {
            console.error("scraping failed", err);
            return;
        }
        console.log("done");
    });
}
// Entry point: launch a visible (non-headless) browser without the sandbox,
// publish it through the module-level `browser`, start the Express listener
// and then kick off the scrape. Any failure along the chain is logged and
// recorded in the exit code instead of becoming an unhandled rejection.
puppeteer.launch({ args: ['--no-sandbox'], headless: false })
    .then((launched) => {
        browser = launched;

        app.listen(port, () => {
            console.log(`Example app listening at http://localhost:${port}`);
        });
    })
    .then(startScraping)
    .catch((err) => {
        console.error("failed to start the scraper", err);
        process.exitCode = 1;
    });








// Editor is loading...