Untitled
unknown
plain_text
4 years ago
2.4 kB
16
Indexable
// HTTP server framework; the app created below is started once the browser is up.
const express = require('express')
// `async` supplies timesLimit, which startScraping uses to cap concurrent scrapes.
const async = require("async")
const app = express()
// TCP port the Express app listens on.
const port = 3000
const puppeteer = require('puppeteer');
// Shared Puppeteer browser instance; assigned when puppeteer.launch() resolves
// and read by scrapeSite() to open new tabs.
let browser
// Source text of the SingleFile script, read synchronously at startup and
// injected into every scraped page via page.addScriptTag().
const singlefile = require("fs").readFileSync("./singlefile.js",{encoding:"utf-8"})
/**
 * Loads `url` in a new browser tab, scrolls it to the bottom, serialises the
 * page with the injected SingleFile script and writes the result, with all
 * <script> elements removed, to `<n>.html`.
 *
 * Relies on the module-level `browser` (a launched Puppeteer browser) and
 * `singlefile` (source text of the script injected into the page).
 *
 * @param {string} url - Address of the page to capture.
 * @param {number|string} n - Used as the base name of the output file.
 * @returns {Promise<void>} Resolves once the file is written and the tab is closed.
 * @throws Rejects if navigation, script injection, serialisation or the file
 *   write fails; the tab is closed in every case.
 */
async function scrapeSite(url, n) {
  const SCROLL_STEP_PX = 100;
  const SCROLL_INTERVAL_MS = 80;
  const SERIALIZE_DELAY_MS = 1000;

  /**
   * Scrolls the page down step by step until the scrolled distance reaches
   * the document body's scroll height (presumably to trigger lazily loaded
   * content; the height is re-read on every tick so a growing page is followed).
   */
  async function autoScroll(page) {
    await page.evaluate(
      (step, interval) =>
        new Promise((resolve) => {
          let scrolled = 0;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, step);
            scrolled += step;
            if (scrolled >= scrollHeight) {
              clearInterval(timer);
              resolve();
            }
          }, interval);
        }),
      SCROLL_STEP_PX,
      SCROLL_INTERVAL_MS
    );
  }

  /**
   * Removes every <script>...</script> element from `html`. The replacement is
   * repeated until the text stops changing, because deleting one element can
   * splice the surrounding text into a new match.
   */
  function removeScriptTags(html) {
    const scriptPattern = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
    let current = html;
    let previous;
    do {
      previous = current;
      current = previous.replace(scriptPattern, "");
    } while (current !== previous);
    return current;
  }

  const page = await browser.newPage();
  try {
    await page.setBypassCSP(true);
    // Await the navigation itself so failures surface here instead of as an
    // unhandled rejection racing against a separate waitForNavigation().
    await page.goto(url, { waitUntil: "domcontentloaded" });
    await autoScroll(page);
    // Best effort: no navigation is triggered at this point, so this usually
    // ends by timing out, which is deliberately ignored.
    // NOTE(review): this looks intended as "wait for the network to go quiet";
    // confirm and consider a dedicated network-idle wait if the Puppeteer
    // version in use offers one.
    await page
      .waitForNavigation({ waitUntil: "networkidle2" })
      .catch(() => {});
    await page.addScriptTag({ content: singlefile });
    // Inside the page, `singlefile` is the global expected to be defined by the
    // script injected above, not the Node-side string of the same name.
    const data = await page.evaluate(
      (delay) =>
        new Promise((resolve, reject) => {
          setTimeout(() => {
            // Forward rejections too, otherwise a failing getPageData()
            // would leave this promise pending forever.
            singlefile.getPageData().then(resolve, reject);
          }, delay);
        }),
      SERIALIZE_DELAY_MS
    );
    require("fs").writeFileSync(`${n}.html`, removeScriptTags(data.content));
  } finally {
    // Always release the tab, even when a step above failed.
    await page.close();
  }
}
/**
 * Scrapes the first four listing pages, at most two at a time, by calling
 * scrapeSite() for each one. Page 1 is the site root; page k (k > 1) lives
 * under `/page/k`. Each page is written to `<k>.html` by scrapeSite().
 *
 * A page that fails is logged and skipped so the remaining pages are still
 * scraped and the run always finishes. "done" is logged once every page has
 * been attempted.
 */
function startScraping() {
  const PAGE_COUNT = 4;
  const MAX_CONCURRENT = 2;
  const BASE_URL = "https://icanhas.cheezburger.com";

  async.timesLimit(
    PAGE_COUNT,
    MAX_CONCURRENT,
    (index, next) => {
      const pageNumber = index + 1;
      const url = index === 0 ? BASE_URL : `${BASE_URL}/page/${pageNumber}`;
      scrapeSite(url, pageNumber).then(
        () => next(),
        (err) => {
          // Without this handler a rejected scrape would never call `next`,
          // leaving timesLimit waiting forever.
          console.error(`Failed to scrape ${url}:`, err);
          next();
        }
      );
    },
    () => {
      console.log("done");
    }
  );
}
// Bootstrap: launch a non-headless browser with the sandbox disabled, publish
// it through the module-level `browser`, start the Express app, then begin
// scraping. Scraping starts right after listen() is called; it does not wait
// for the listen callback.
puppeteer
  .launch({ args: ['--no-sandbox'], headless: false })
  .then((launchedBrowser) => {
    browser = launchedBrowser;
    app.listen(port, () => {
      console.log(`Example app listening at http://localhost:${port}`);
    });
  })
  .then(startScraping)
  .catch((err) => {
    // Report launch/startup failures instead of leaving an unhandled rejection.
    console.error('Failed to start the scraper:', err);
    process.exitCode = 1;
  });
Editor is loading...