Last active
December 3, 2019 07:09
-
-
Save AditiKhullar/9456987 to your computer and use it in GitHub Desktop.
Scrapes the text from a given URL into an output file. Load the PhantomJS module with "module load phantomjs", then run the scraper with "phantomjs text-scraper.js https://www.coursera.org/course/ml output.txt".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * PhantomJS text scraper.
 *
 * Usage: phantomjs text-scraper.js <url> <output file>
 *
 * Opens <url>, waits briefly so the page can finish rendering, then writes
 * the URL, the document title, and the body's visible text to <output file>.
 * Exits with status 0 on success and 1 on a usage error, a failed page load,
 * or a failure while extracting/writing the text.
 */
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');

// Time to wait after the load callback fires before reading the DOM, so
// that client-side scripts get a chance to populate the page. Pages that
// render slowly may need a larger value.
var RENDER_DELAY_MS = 1000;

// Forward console output from the page context to the PhantomJS console.
page.onConsoleMessage = function (msg) {
  console.log(msg);
};

/**
 * Loads `url` and writes its title and visible text to `outfile`.
 *
 * @param {string} url - Address of the page to scrape.
 * @param {string} outfile - Path of the file to (over)write with the result.
 */
function scrape(url, outfile) {
  page.open(url, function (status) {
    if (status !== 'success') {
      console.log("Error!");
      phantom.exit(1);
      return;
    }

    setTimeout(function () {
      var exitCode = 0;
      try {
        // Runs inside the page's own context; only the returned string
        // crosses back into this script.
        var text = page.evaluate(function () {
          return document.title + '\n' + document.body.innerText;
        });
        fs.write(outfile, url + '\n' + text, 'w');
      } catch (err) {
        console.log('Error writing ' + outfile + ': ' + err);
        exitCode = 1;
      }
      // Exit from the same callback that writes the file, so the process
      // can never terminate before the write has been attempted.
      phantom.exit(exitCode);
    }, RENDER_DELAY_MS);
  });
}

// phantom.exit() is not guaranteed to halt the running script immediately,
// so the scrape is placed in an explicit else branch rather than relying on
// the exit call to stop execution.
if (system.args.length !== 3) {
  console.log('Usage: phantomjs text-scraper.js <url> <output file>');
  phantom.exit(1);
} else {
  scrape(system.args[1], system.args[2]);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On running this example, the output is simply
https://www.coursera.org/course/ml Coursera.org loading
Is any other configuration required to run this? I am using PhantomJS v1.9.7.