Based on the fantastic node-crawler plugin (https://github.com/sylvinus/node-crawler).

Requires: npm install crawler

Usage: node simple-crawler.js URL EXTENSIONS
Example: node simple-crawler.js http://www.omgubuntu.co.uk/ jpg,png

Finds links on every page and subpage under URL and generates a URL list you can copy to a file and download with wget.
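For example, one possible end-to-end workflow (the urls.txt file name and the wget invocation are just one way to do it, not something the script itself produces):

node simple-crawler.js http://www.omgubuntu.co.uk/ jpg,png > urls.txt
wget -i urls.txt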
var Crawler = require("crawler").Crawler; | |
//console.argv=['node', 'app.js', url, ext] | |
if (process.argv.length < 4) { | |
console.log('Usage: ' + process.argv[0] + ' ' + process.argv[1] + ' URL EXT'); | |
console.log('Example: ' + process.argv[0] + ' ' + process.argv[1] + ' http://www.omgubuntu.co.uk/ jpg,png'); | |
process.exit(1); | |
} | |
var url = process.argv[2], | |
ext = process.argv[3]; | |
//Converting extension link to regex | |
// png,jpg ==> /\.png$|\.jpg$/ | |
var regexExt = new RegExp('\\.' + ext.split(',').join('$|\\.') + '$'); | |
//urls just found | |
var found = []; | |
var c = new Crawler({ | |
"maxConnections": 4, | |
// This will be called for each crawled page | |
"callback": function (error, result, $) { | |
// $ is a jQuery instance scoped to the server-side DOM of the page | |
if ($) { | |
$("a").each(function (index, a) { | |
if (regexExt.test(a.href)) { | |
if (found.indexOf(a.href) < 0) { | |
console.log(a.href); | |
found.push(a.href); | |
} | |
} | |
//Dont go outside of base url | |
if (a.href.indexOf(url) === 0) { | |
c.queue(a.href); | |
} | |
}); | |
} | |
} | |
}); | |
c.queue(url); |
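As a quick standalone sketch (not part of the original gist), this is how the extension-to-regex conversion above behaves; the sample input 'png,jpg' is just illustrative:

var ext = 'png,jpg';
var regexExt = new RegExp('\\.' + ext.split(',').join('$|\\.') + '$');
console.log(regexExt);                    // /\.png$|\.jpg$/
console.log(regexExt.test('photo.png'));  // true
console.log(regexExt.test('index.html')); // false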