Untitled
unknown
plain_text
4 years ago
2.4 kB
11
Indexable
const express = require('express');
const async = require('async');
const fs = require('fs');
const puppeteer = require('puppeteer');

const app = express();
const port = 3000;

// Site to archive and how much of it to fetch.
const BASE_URL = 'https://icanhas.cheezburger.com';
const PAGE_COUNT = 4;
const CONCURRENCY = 2;

// Auto-scroll tuning: pixels per step and milliseconds between steps.
const SCROLL_STEP_PX = 100;
const SCROLL_INTERVAL_MS = 80;

// Delay (ms) inside the page before asking SingleFile for the page data.
const CAPTURE_DELAY_MS = 1000;

// Matches complete <script>...</script> elements (case-insensitive, global).
const SCRIPT_REGEX = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;

// Source text of the SingleFile bundle, injected into every scraped page.
// Named differently from the in-page `singlefile` global it defines, so the
// two are not confused inside `page.evaluate` callbacks.
const singlefileSource = fs.readFileSync('./singlefile.js', { encoding: 'utf-8' });

// Shared browser instance; assigned once `puppeteer.launch` resolves.
let browser;

/**
 * Removes every `<script>` element from an HTML string.
 *
 * Replacement is repeated until the string stops changing, because removing
 * one element can leave text behind that forms a new match.
 *
 * @param {string} html - HTML markup to clean.
 * @returns {string} The markup with all matched script elements removed.
 */
function stripScripts(html) {
  let previous;
  let current = html;
  do {
    previous = current;
    // String.prototype.replace resets lastIndex for global regexes, so
    // sharing SCRIPT_REGEX between calls is safe.
    current = previous.replace(SCRIPT_REGEX, '');
  } while (current !== previous);
  return current;
}

/**
 * Scrolls the page down in fixed steps until the scrolled distance reaches
 * the document body's scroll height.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @returns {Promise<void>} Resolves when the in-page scroll loop finishes.
 */
async function autoScroll(page) {
  await page.evaluate(
    (distance, interval) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          // Re-read every tick: the height can grow while scrolling.
          const { scrollHeight } = document.body;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, interval);
      }),
    SCROLL_STEP_PX,
    SCROLL_INTERVAL_MS,
  );
}

/**
 * Loads `url` in a new tab, scrolls through it, captures it with SingleFile,
 * strips `<script>` elements and writes the result to `<n>.html` in the
 * current working directory.
 *
 * @param {string} url - Address of the page to capture.
 * @param {number|string} n - Base name of the output file.
 * @returns {Promise<void>} Resolves once the file is written and the tab is
 *   closed; rejects if navigation, capture or the write fails.
 */
async function scrapeSite(url, n) {
  const page = await browser.newPage();
  try {
    await page.setBypassCSP(true);
    // Await the navigation itself so a failed load rejects here instead of
    // surfacing as an unhandled rejection from a floating promise.
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await autoScroll(page);

    // Best-effort wait for the network to settle after scrolling.
    // NOTE(review): if scrolling triggers no navigation this presumably runs
    // until Puppeteer's navigation timeout — confirm whether that is intended.
    try {
      await page.waitForNavigation({ waitUntil: 'networkidle2' });
    } catch (err) {
      // A timeout is the expected outcome; anything else is worth reporting.
      if (err.name !== 'TimeoutError') {
        console.warn(`Waiting for network idle on ${url} failed:`, err);
      }
    }

    await page.addScriptTag({ content: singlefileSource });

    // `singlefile` below is the global created in the page by the injected
    // script, not a Node-side variable.
    const data = await page.evaluate(
      (delay) =>
        new Promise((resolve, reject) => {
          setTimeout(() => {
            singlefile.getPageData().then(resolve, reject);
          }, delay);
        }),
      CAPTURE_DELAY_MS,
    );

    await fs.promises.writeFile(`${n}.html`, stripScripts(data.content));
  } finally {
    // Always release the tab, even when a step above throws.
    await page.close();
  }
}

/**
 * Scrapes the first PAGE_COUNT listing pages of BASE_URL, at most
 * CONCURRENCY at a time, and logs "done" when all of them succeed.
 * A failure is forwarded to `async.timesLimit` and logged by the final
 * callback.
 */
function startScraping() {
  async.timesLimit(
    PAGE_COUNT,
    CONCURRENCY,
    (n, next) => {
      // The first page lives at the base URL; later ones under /page/<k>.
      const url = n === 0 ? BASE_URL : `${BASE_URL}/page/${n + 1}`;
      // Forward failures to `next` so timesLimit can finish instead of hanging.
      scrapeSite(url, n + 1).then(() => next(), next);
    },
    (err) => {
      if (err) {
        console.error('Scraping failed:', err);
        return;
      }
      console.log('done');
    },
  );
}

puppeteer
  .launch({ args: ['--no-sandbox'], headless: false })
  .then((launched) => {
    browser = launched;
    app.listen(port, () => {
      console.log(`Example app listening at http://localhost:${port}`);
    });
  })
  .then(startScraping)
  .catch((err) => {
    console.error('Failed to start scraper:', err);
    process.exitCode = 1;
  });
Editor is loading...