Created
February 5, 2019 10:17
-
-
Save shukla2112/c3bda3c41a93662d582158dfd03f34e4 to your computer and use it in GitHub Desktop.
Puppeteer with the resident (locally installed) Google Chrome - works on sites with heavy bot blocking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Browser-side script: replaces two `navigator` properties with fixed values
// by installing custom getters on the `navigator` object.

// Overwrite the `languages` property to use a custom getter.
Object.defineProperty(navigator, "languages", {
  get: function() {
    return ["en-US", "en"];
  },
});

// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
  get: function() {
    // This just needs to have `length > 0`, but we could mock the plugins too.
    return [1, 2, 3, 4, 5];
  },
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// const puppeteer = require('puppeteer'); | |
const puppeteer = require('puppeteer-extra') | |
const pluginStealth = require("puppeteer-extra-plugin-stealth") | |
puppeteer.use(pluginStealth()) | |
// Enable stealth plugin
// puppeteer.use(require('puppeteer-extra-plugin-stealth')())
//
// Read the input file line by line - each line is a URL.
// Visit the URL, capture the response and store it in the desired format.
// Capture a screenshot.
// Save the result to the local MongoDB.
// Print the status-code summary once the whole file has been processed.
//
// Techniques used to avoid bot detection:
//   - Use the stealth plugin
//   - Load the plugins
//   - Use the resident (locally installed) browser
//   - Do not use puppeteer's default launch arguments
//   - Use headless: false
//   - Enable JavaScript
//   - Add extra HTTP headers
// Database setup: connect to the local `simple-crawler` MongoDB database and
// register the `Result` model used to store one document per crawled URL.
const mongoose = require("mongoose");

// Make mongoose hand back native promises.
mongoose.Promise = global.Promise;

const MONGO_URI = "mongodb://localhost:27017/simple-crawler";
mongoose.connect(MONGO_URI, { useNewUrlParser: true });

// Every field is declared as a String.
const resSchema = new mongoose.Schema({
  url: String,
  html: String,
  img: String,
  statusCode: String,
  fileName: String,
});

const Result = mongoose.model("Result", resSchema);
/**
 * Launch the locally installed Chrome, open `url` in an incognito context,
 * take a screenshot and collect the page's HTML and HTTP status.
 *
 * Navigation errors are logged and swallowed: in that case the returned
 * object only carries `url`. Errors raised while launching or preparing the
 * browser still propagate to the caller, but the browser is closed first.
 *
 * @param {string} fileline - Base name of the screenshot, written to
 *   `images/<fileline>.png`.
 * @param {string} url - Address to navigate to.
 * @returns {Promise<Object>} `{ url }`, plus `html`, `img` (base64-encoded
 *   PNG) and `statusCode` when the page was loaded successfully.
 */
async function crawlAndScreenshot(fileline, url) {
  const fs = require('fs');

  const args = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-infobars',
    '--window-position=0,0',
    // Spelling fixed: the original flags read "certifcate".
    '--ignore-certificate-errors',
    '--ignore-certificate-errors-spki-list',
    // NOTE(review): the double quotes are part of the argument value (no shell
    // strips them) - confirm Chrome really picks up the intended profile dir.
    '--user-data-dir="/Users/nikunjshukla/Library/Application Support/Google/Chrome/"',
  ];
  // User agents that were tried as extra launch arguments:
  //   '--user-agent="Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"'  (worked for vitacost)
  //   '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"'

  const options = {
    args,
    headless: false,
    ignoreHTTPSErrors: true,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    handleSIGINT: true,
    ignoreDefaultArgs: true,
  };

  const obj = { url };
  const browser = await puppeteer.launch(options);
  try {
    const context = await browser.createIncognitoBrowserContext();
    const preloadFile = fs.readFileSync('./preload.js', 'utf8');
    const page = await context.newPage();
    await page.setJavaScriptEnabled(true);
    // Inject the preload script into every document the page creates.
    await page.evaluateOnNewDocument(preloadFile);
    await page.setExtraHTTPHeaders({
      'Accept': '*/*',
      'Accept-Language': 'en-US',
    });

    try {
      const response = await page.goto(url, { waitUntil: 'networkidle2' });
      // Give the page one more second to settle after the network goes quiet.
      await sleep(1000);

      const userAgent = await page.evaluate('navigator.userAgent');
      console.log('Useragent used - ', userAgent);

      const html = await page.content();
      // `goto` can resolve with null (no main-resource response); keep going
      // without a status instead of throwing away the html and screenshot.
      const statusCode = response ? response.status() : undefined;

      const imagePath = 'images/' + fileline + '.png';
      const imgData = await page.screenshot({ path: imagePath });

      obj.html = html;
      // The screenshot comes back as binary data; encode it as base64 text so
      // it survives being stored in a string field.
      obj.img = Buffer.from(imgData).toString('base64');
      obj.statusCode = statusCode;

      if (statusCode === 403) {
        await sleep(60000); // blocked: back off for 60 seconds
      }
    } catch (err) {
      console.log('Error loading page:', err);
    }

    await context.close();
  } finally {
    // Always shut Chrome down, even when setup or cleanup above failed.
    await browser.close();
  }
  return obj;
}
/**
 * Persist one crawl result as a `Result` document.
 *
 * The returned promise settles only after the save attempt has finished, so
 * `await saveData(...)` really waits for the write. A failed save is logged
 * (best effort) and does not reject.
 *
 * @param {Object} data - Fields for the new `Result` document.
 * @returns {Promise<void>}
 */
async function saveData(data) {
  const myData = new Result(data);
  try {
    await myData.save();
    console.log("data saved to database");
  } catch (err) {
    console.log("Unable to save to database", err);
  }
}
/**
 * Script entry point: crawl every URL listed in the file given on the command
 * line (`node scrape.js <filepath>`), store each result, then print a count of
 * results per status code and exit.
 *
 * URLs are processed one at a time with a pause before each request.
 * Exits with code 1 when the argument count is wrong, 0 on completion.
 *
 * @returns {Promise<void>}
 */
async function processFile() {
  const args = process.argv;
  if (args.length !== 3) {
    console.log("usage - `node scrape.js <filepath>`");
    process.exit(1);
  }

  const fs = require('fs');
  const inputFile = args[2];
  const fileContents = fs.readFileSync(inputFile, { encoding: 'utf-8' });

  // One URL per line. Trimming handles CRLF files, and dropping empty lines
  // replaces the old unconditional pop(), which lost the last URL whenever the
  // file did not end with a newline.
  const lines = fileContents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line !== '');

  // Screenshot names are "<input file name><running index>".
  const fileNameArr = inputFile.split("/");
  const baseName = fileNameArr[fileNameArr.length - 1];

  const summary = {};
  for (const [c, line] of lines.entries()) {
    await sleep(3000); // pause 3 seconds between requests
    console.log(line);

    const obj = await crawlAndScreenshot(baseName + c, line);
    console.log("Finished retrieving page : " + obj.url);
    console.log("Status = " + obj.statusCode);

    const retVal = {
      url: obj.url,
      html: obj.html,
      img: obj.img,
      statusCode: obj.statusCode,
      fileName: inputFile,
    };
    await saveData(retVal);

    // Tally results per status code (failed loads are counted under "undefined").
    summary[retVal.statusCode] = (summary[retVal.statusCode] || 0) + 1;
  }

  console.log(summary);
  process.exit(0);
}

processFile().catch((err) => {
  // Anything that escapes the crawl loop (e.g. Chrome failing to launch).
  console.log('Crawl aborted:', err);
  process.exit(1);
});
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment