IceCreamYou · March 20, 2013 09:35
diff --git a/Inc 500 TSV b/Inc 500 TSV
 /**
  * Get an array containing the text nodes within a DOM node.
  *
  * Modified from http://stackoverflow.com/a/4399718/843621
  *
  * @param node Any DOM node. If it is itself a text node, it is the only candidate.
  * @param [includeWhitespaceNodes=false] Whether to include whitespace-only nodes.
  * @param [recurse=false] Whether to get all descendant text nodes (true) or only
  *   the text nodes that are immediate children of node (false).
  * @return An array containing TextNodes, in depth-first (document) order.
  */
 function getTextNodesIn(node, includeWhitespaceNodes, recurse) {
   var textNodes = [], whitespace = /^\s*$/;
   /**
    * Collects text nodes starting at "current".
    *
    * @param current The node being inspected.
    * @param descend Whether the children of "current" should be visited.
    */
   function collect(current, descend) {
     // nodeType 3 is a text node
     if (current.nodeType === 3) {
       if (includeWhitespaceNodes || !whitespace.test(current.nodeValue)) {
         textNodes.push(current);
       }
     }
     else if (descend) {
       for (var i = 0, len = current.childNodes.length; i < len; ++i) {
         // Below the first level we only keep descending when asked to recurse.
         collect(current.childNodes[i], !!recurse);
       }
     }
   }
   // The children of the root are always visited; "recurse" only controls
   // whether we go deeper than that.
   collect(node, true);
   return textNodes;
 }

 /**
  * Get the data on the current page in TSV format.
  *
  * Despite the name, fields are separated by tabs and rows end with a newline.
  * Every row of "#fulltable tbody" except the first one is read.
  *
  * @return A string with one line per table row. The second cell of each row
  *   expands into two fields (link text and link URL), so a row yields one more
  *   field than it has cells. No trailing tab is emitted.
  */
 function getPageCSV() {
   var lines = [];
   // Walk through each row
   $('#fulltable tbody tr:not(:first)').each(function() {
     var fields = [];
     // Walk through each column
     $(this).find('td').each(function(i) {
       var $cell = $(this), $link, labels;
       // In the second column, get both the company name and URL
       if (i === 1) {
         $link = $cell.find('a');
         if ($link[0]) {
           fields.push($link.html(), $link[0].href);
         }
         // No link in the cell: keep the column count stable with an empty URL
         else {
           fields.push($cell.html(), '');
         }
       }
       // In the fifth column, get just the category, not the fancy colored block thing
       else if (i === 4) {
         labels = getTextNodesIn(this);
         fields.push(labels.length ? labels[0].nodeValue : '');
       }
       // Usually just grab the value
       // NOTE(review): html() returns markup, so entities such as &amp; stay encoded - confirm this is wanted
       else {
         fields.push($cell.html());
       }
     });
     // Joining (instead of appending a tab after every field) avoids a stray empty last column
     lines.push(fields.join("\t") + "\n");
   });
   return lines.join('');
 }

 // Accumulator for the whole TSV document. It starts out as the header row and
 // the scraped rows are appended to it while the script runs.
 var s = [
   "Rank",
   "Company Name",
   "URL",
   "3-year % growth",
   "Revenue (millions)",
   "Industry",
   "# of Employees",
   "City",
   "State"
 ].join("\t") + "\n";

 /**
  * Scrapes all the data.
  *
  * This recursively reads the data from the current page, then loads in the next one.
  * After the last page has been read, the s variable holds the entire TSV and is
  * printed to the console.
  *
  * If loading the next page fails, scraping stops and whatever has been collected
  * so far is printed. Without this check the old page would stay in the DOM and
  * be scraped (and re-requested) over and over.
  *
  * @param [max] The maximum number of pages to retrieve. If not specified, gets them all.
  * @param [curr] Used internally to keep track of what page we're on.
  */
 function getNextPage(max, curr) {
   // On the first run, assume we're on the first page
   if (typeof curr === 'undefined') curr = 1;
   // Get the data
   s += getPageCSV();
   var $next = $('.next');
   // If there is a next page and we haven't read too many pages, load the next one
   if ($next.length && (typeof max === 'undefined' || curr < max)) {
     // Copy the next page over the current one. There are a couple advantages of load():
     // most importantly, we get just the DOM we care about without running other scripts on the page.
     $('#maincolumn_inner').load($next[0].href + ' #maincolumn_inner', function(responseText, textStatus) {
       // load() calls this callback on failure too, so check the status before going on
       if (/^(error|timeout|abort|parsererror)$/.test(textStatus)) {
         console.error('Could not load page ' + (curr + 1) + ' (' + textStatus + '). Stopping; partial TSV follows.');
         console.log(s);
         return;
       }
       // Recurse!
       getNextPage(max, curr + 1);
     });
   }
   // When we're done, go back to the first page so we can easily run again if we want.
   else {
     $('#maincolumn_inner').load('http://www.inc.com/inc5000/list/2012 #maincolumn_inner', function() {
       // Print out the result in the console so we can copy it into a document
       console.log(s);
     });
   }
 }

 // Run the script: start scraping from the page that is currently displayed, with
 // no page limit. getNextPage() prints the accumulated TSV to the console from
 // its final load() callback.
 getNextPage();
	/**
	 * Get an array containing the text nodes within a DOM node.
	 *
	 * Modified from http://stackoverflow.com/a/4399718/843621
	 *
	 * @param node Any DOM node. If it is itself a text node, it is the only candidate.
	 * @param [includeWhitespaceNodes=false] Whether to include whitespace-only nodes.
	 * @param [recurse=false] Whether to get all descendant text nodes (true) or only
	 *   the text nodes that are immediate children of node (false).
	 * @return An array containing TextNodes, in depth-first (document) order.
	 */
	function getTextNodesIn(node, includeWhitespaceNodes, recurse) {
		var textNodes = [], whitespace = /^\s*$/;
		/**
		 * Collects text nodes starting at "current".
		 *
		 * @param current The node being inspected.
		 * @param descend Whether the children of "current" should be visited.
		 */
		function collect(current, descend) {
			// nodeType 3 is a text node
			if (current.nodeType === 3) {
				if (includeWhitespaceNodes || !whitespace.test(current.nodeValue)) {
					textNodes.push(current);
				}
			}
			else if (descend) {
				for (var i = 0, len = current.childNodes.length; i < len; ++i) {
					// Below the first level we only keep descending when asked to recurse.
					collect(current.childNodes[i], !!recurse);
				}
			}
		}
		// The children of the root are always visited; "recurse" only controls
		// whether we go deeper than that.
		collect(node, true);
		return textNodes;
	}

	/**
	 * Get the data on the current page in TSV format.
	 *
	 * Despite the name, fields are separated by tabs and rows end with a newline.
	 * Every row of "#fulltable tbody" except the first one is read.
	 *
	 * @return A string with one line per table row. The second cell of each row
	 *   expands into two fields (link text and link URL), so a row yields one more
	 *   field than it has cells. No trailing tab is emitted.
	 */
	function getPageCSV() {
		var lines = [];
		// Walk through each row
		$('#fulltable tbody tr:not(:first)').each(function() {
			var fields = [];
			// Walk through each column
			$(this).find('td').each(function(i) {
				var $cell = $(this), $link, labels;
				// In the second column, get both the company name and URL
				if (i === 1) {
					$link = $cell.find('a');
					if ($link[0]) {
						fields.push($link.html(), $link[0].href);
					}
					// No link in the cell: keep the column count stable with an empty URL
					else {
						fields.push($cell.html(), '');
					}
				}
				// In the fifth column, get just the category, not the fancy colored block thing
				else if (i === 4) {
					labels = getTextNodesIn(this);
					fields.push(labels.length ? labels[0].nodeValue : '');
				}
				// Usually just grab the value
				// NOTE(review): html() returns markup, so entities such as &amp; stay encoded - confirm this is wanted
				else {
					fields.push($cell.html());
				}
			});
			// Joining (instead of appending a tab after every field) avoids a stray empty last column
			lines.push(fields.join("\t") + "\n");
		});
		return lines.join('');
	}

	// Accumulator for the whole TSV document. It starts out as the header row and
	// the scraped rows are appended to it while the script runs.
	var s = [
		"Rank",
		"Company Name",
		"URL",
		"3-year % growth",
		"Revenue (millions)",
		"Industry",
		"# of Employees",
		"City",
		"State"
	].join("\t") + "\n";

	/**
	 * Scrapes all the data.
	 *
	 * This recursively reads the data from the current page, then loads in the next one.
	 * After the last page has been read, the s variable holds the entire TSV and is
	 * printed to the console.
	 *
	 * If loading the next page fails, scraping stops and whatever has been collected
	 * so far is printed. Without this check the old page would stay in the DOM and
	 * be scraped (and re-requested) over and over.
	 *
	 * @param [max] The maximum number of pages to retrieve. If not specified, gets them all.
	 * @param [curr] Used internally to keep track of what page we're on.
	 */
	function getNextPage(max, curr) {
		// On the first run, assume we're on the first page
		if (typeof curr === 'undefined') curr = 1;
		// Get the data
		s += getPageCSV();
		var $next = $('.next');
		// If there is a next page and we haven't read too many pages, load the next one
		if ($next.length && (typeof max === 'undefined' || curr < max)) {
			// Copy the next page over the current one. There are a couple advantages of load():
			// most importantly, we get just the DOM we care about without running other scripts on the page.
			$('#maincolumn_inner').load($next[0].href + ' #maincolumn_inner', function(responseText, textStatus) {
				// load() calls this callback on failure too, so check the status before going on
				if (/^(error|timeout|abort|parsererror)$/.test(textStatus)) {
					console.error('Could not load page ' + (curr + 1) + ' (' + textStatus + '). Stopping; partial TSV follows.');
					console.log(s);
					return;
				}
				// Recurse!
				getNextPage(max, curr + 1);
			});
		}
		// When we're done, go back to the first page so we can easily run again if we want.
		else {
			$('#maincolumn_inner').load('http://www.inc.com/inc5000/list/2012 #maincolumn_inner', function() {
				// Print out the result in the console so we can copy it into a document
				console.log(s);
			});
		}
	}

	// Run the script: start scraping from the page that is currently displayed, with
	// no page limit. getNextPage() prints the accumulated TSV to the console from
	// its final load() callback.
	getNextPage();