Skip to content

Instantly share code, notes, and snippets.

@stefan-girlich
Last active August 29, 2015 14:19
Show Gist options
  • Save stefan-girlich/fb18777d736f8a86f84c to your computer and use it in GitHub Desktop.
Save stefan-girlich/fb18777d736f8a86f84c to your computer and use it in GitHub Desktop.
Crawls the official "Sanft & Sorgfältig" website and saves the available MP3 download URLs both as a JSON file list with metadata and as a plain-text URL list.
var Crawler = require('crawler');
var url = require('url');
var fs = require('fs');
// Number of archive pages to crawl; the crawler callback also uses it to decide
// when all page results have arrived.
var NUM_PAGES = 10;
// Output file that receives the collected episode meta data as JSON.
var OUTPUT_JSON_FILENAME = 'file_index.json';
// Output file that receives the download URLs, one per line.
var OUTPUT_TXT_FILENAME = 'file_index.txt';
// Scraped episodes, keyed by the index of the page they were found on.
var episodesByPage = {};
// Build the page URLs, create the crawler and queue all pages at once.
// `urls` must be assigned before any callback runs, because the callback maps
// a fetched URI back to its page index through this array.
var urls = buildSourceUrls();
var c = createCrawler();
c.queue(urls);
/**
 * Builds the list of archive page URLs to crawl.
 *
 * @param {number} [numPages] Number of pages to generate URLs for. Falls back
 *     to the module-level NUM_PAGES when omitted.
 * @returns {string[]} One URL per page, for page indices 0 .. numPages - 1.
 */
function buildSourceUrls(numPages) {
    // Resolve the default lazily so callers that pass a count do not depend on NUM_PAGES.
    var pageCount = numPages === undefined ? NUM_PAGES : numPages;
    var result = [];
    for (var i = 0; i < pageCount; i++) {
        result.push('http://www.radioeins.de/archiv/podcast/zwei_alte_hasen.htm/page=' + i + '.html');
    }
    return result;
}
/**
 * Creates the crawler whose callback scrapes the episode meta data of each
 * fetched page into the module-level `episodesByPage` map. Once every page has
 * either produced a result or failed, the result files are written through
 * `saveResult` and the process is terminated.
 *
 * @returns {Crawler} The configured crawler; the caller queues `urls` on it.
 */
function createCrawler() {
    // Pages that could not be fetched or could not be matched to a queued URL.
    // They count as handled so that a single bad page cannot stall the run.
    var failedPages = 0;

    // Writes the result and exits once all NUM_PAGES pages have been handled.
    function finishIfDone() {
        if (Object.keys(episodesByPage).length + failedPages < NUM_PAGES) {
            return;
        }
        // Give failed pages an empty list so the saved map is dense (0 .. NUM_PAGES - 1).
        for (var i = 0; i < NUM_PAGES; i++) {
            if (episodesByPage[i] === undefined) {
                episodesByPage[i] = [];
            }
        }
        saveResult(episodesByPage);
        process.exit();
    }

    return new Crawler({
        maxConnections: 10,
        callback: function (error, result, $) {
            // On a failed request `result` and `$` may be missing, so guard every access.
            var pageIndex = result && result.options ? urls.indexOf(result.options.uri) : -1;
            if (error || !$ || pageIndex === -1) {
                console.error('Skipping page that could not be crawled: ' + (error || 'unexpected response'));
                failedPages++;
                finishIfDone();
                return;
            }

            var episodes = [];
            $('.containerContent').each(function (index, el) {
                var container = $(el);
                episodes.push({
                    title: container.find('h2').text(),
                    // The date element is looked up from the container's parent.
                    date: container.parent().find('.manualteaserDateTime').text(),
                    description: container.find('.manualteaserShortText').text(),
                    url: container.find('a.download').attr('href')
                });
            });
            episodesByPage[pageIndex] = episodes;
            finishIfDone();
        }
    });
}
/**
 * Flattens the per-page episode lists in ascending page order and writes them
 * to disk: the full meta data as JSON to OUTPUT_JSON_FILENAME and the download
 * URLs, one per line, to OUTPUT_TXT_FILENAME.
 *
 * @param {Object} episodesByPageData Episode lists keyed by page index. Each
 *     episode is an object whose `url` property holds the download URL.
 */
function saveResult(episodesByPageData) {
    var resultList = [];
    var resultFilenameList = '';
    // Walk the page indices that actually exist instead of assuming dense
    // keys 0 .. n-1, so a missing page cannot cause a crash.
    var pageKeys = Object.keys(episodesByPageData).sort(function (a, b) {
        return a - b;
    });
    pageKeys.forEach(function (pageKey) {
        var episodes = episodesByPageData[pageKey] || [];
        episodes.forEach(function (episode) {
            resultList.push(episode);
            // Entries without a download link would otherwise add the text
            // "undefined" to the URL list.
            if (episode.url) {
                resultFilenameList += episode.url + '\n';
            }
        });
    });
    fs.writeFileSync(OUTPUT_JSON_FILENAME, JSON.stringify(resultList));
    fs.writeFileSync(OUTPUT_TXT_FILENAME, resultFilenameList);
}
{
"name": "SanftUndSorgfaeltigDownloader",
"version": "0.1.0",
"description": "Crawls the official \"Sanft & Sorgfältig\" website and saves the available MP3 downloads as a JSON file list with meta data and plain text URL list",
"main": "node_sanft_und_sorgfaeltig_downloader.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Stefan Girlich",
"license": "ISC",
"dependencies": {
"crawler": "^0.4.3"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment