Created
March 20, 2013 09:35
-
-
Save IceCreamYou/5203447 to your computer and use it in GitHub Desktop.
This script scrapes the Inc. 5000 data and turns it into a tab-separated values (TSV) string which can be easily imported into Excel or a database. To run it, go to http://www.inc.com/inc5000/list/2012 and execute the code in your browser's JavaScript console. Be nice! This hits the Inc. site 50 times in quick succession. I don't know who technical…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Get an array containing the text nodes within a DOM node.
 *
 * Modified from http://stackoverflow.com/a/4399718/843621
 *
 * If `node` is itself a text node, it is the only candidate considered.
 * Otherwise its child nodes are scanned in document order; nested elements
 * are only descended into when `recurse` is truthy.
 *
 * @param node Any DOM node.
 * @param [includeWhitespaceNodes=false] Whether to include whitespace-only nodes.
 * @param [recurse=false] Whether to get all nodes (true) or only the immediate child nodes (false).
 * @return An array containing TextNodes.
 */
function getTextNodesIn(node, includeWhitespaceNodes, recurse) {
  var TEXT_NODE = 3; // Same value as Node.TEXT_NODE.
  var textNodes = [];
  var whitespace = /^\s*$/;

  function collect(current, isRoot) {
    if (current.nodeType === TEXT_NODE) {
      if (includeWhitespaceNodes || !whitespace.test(current.nodeValue)) {
        textNodes.push(current);
      }
      return;
    }
    // The starting node's children are always inspected; anything deeper is
    // only visited when the caller asked for a recursive search.
    if (!isRoot && !recurse) {
      return;
    }
    var children = current.childNodes;
    for (var i = 0, len = children.length; i < len; ++i) {
      collect(children[i], false);
    }
  }

  collect(node, true);
  return textNodes;
}
/**
 * Get the data on the current page in TSV format.
 *
 * Reads every row of `#fulltable` except the first one and emits one line per
 * row: cell values separated by tabs (no trailing tab), terminated by "\n".
 * The second column is expanded into two fields (company name and URL), and
 * the fifth column uses only the cell's first direct text node so the colored
 * block markup is skipped.
 *
 * Values are read as text (not HTML), so entities such as `&amp;` come out
 * decoded, and runs of whitespace are collapsed so a stray tab or newline in
 * the markup cannot break the TSV structure.
 *
 * @return A string with one TSV line per table row; '' if there are no rows.
 */
function getPageCSV() {
  // Collapse internal whitespace (including tabs/newlines) and trim the ends.
  function clean(value) {
    return String(value).replace(/\s+/g, ' ').trim();
  }

  var lines = [];
  // Walk through each row
  $('#fulltable tbody tr:not(:first)').each(function() {
    var fields = [];
    // Walk through each column
    $(this).find('td').each(function(i) {
      var $cell = $(this);
      var value;
      if (i === 1) {
        // In the second column, get both the company name and URL.
        // If the link is missing, keep the URL field empty so columns stay aligned.
        var link = $cell.find('a')[0];
        value = link
          ? clean(link.textContent) + "\t" + link.href
          : clean($cell.text()) + "\t";
      }
      else if (i === 4) {
        // In the fifth column, get just the category, not the fancy colored block thing
        var categoryNodes = getTextNodesIn(this);
        value = clean(categoryNodes.length ? categoryNodes[0].nodeValue : $cell.text());
      }
      else {
        // Usually just grab the value
        value = clean($cell.text());
      }
      fields.push(value);
    });
    lines.push(fields.join("\t") + "\n");
  });
  return lines.join('');
}
// "s" holds the whole TSV at the end of this script.
// It starts out as the header row: nine tab-separated column names and a newline.
var s = "Rank\tCompany Name\tURL\t3-year % growth\tRevenue (millions)\tIndustry\t# of Employees\tCity\tState\n";
/**
 * Scrapes all the data.
 *
 * This recursively reads the data from the current page, then loads in the next one.
 * After this function runs, the s variable will hold the entire TSV.
 *
 * If loading the next page fails, scraping stops and whatever has been
 * collected so far is printed, instead of re-reading the unchanged page and
 * requesting the same link again.
 *
 * @param [max] The maximum number of pages to retrieve. If not specified, gets them all.
 * @param [curr] Used internally to keep track of what page we're on.
 */
function getNextPage(max, curr) {
  // On the first run, assume we're on the first page
  if (typeof curr === 'undefined') curr = 1;
  // Get the data
  s += getPageCSV();

  var $next = $('.next');
  var hasMorePages = $next.length && (typeof max === 'undefined' || curr < max);

  // If there is a next page and we haven't read too many pages, load the next one
  if (hasMorePages) {
    // Copy the next page over the current one. There are a couple advantages of load():
    // most importantly, we get just the DOM we care about without running other scripts on the page.
    $('#maincolumn_inner').load($next[0].href + ' #maincolumn_inner', function(responseText, textStatus) {
      // load() invokes this callback on failure too, leaving the old DOM in place.
      if (/^(error|timeout|abort|parsererror)$/.test(textStatus)) {
        console.error('Failed to load page ' + (curr + 1) + ' (' + textStatus + '); stopping. Partial result follows.');
        console.log(s);
        return;
      }
      // Recurse!
      getNextPage(max, curr + 1);
    });
  }
  // When we're done, go back to the first page so we can easily run again if we want.
  else {
    $('#maincolumn_inner').load('http://www.inc.com/inc5000/list/2012 #maincolumn_inner', function() {
      // Print out the result in the console so we can copy it into a document
      console.log(s);
    });
  }
}
// Run the script: start scraping at the page currently shown, with no page limit.
getNextPage();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment