Last active
August 29, 2015 14:19
-
-
Save stefan-girlich/fb18777d736f8a86f84c to your computer and use it in GitHub Desktop.
Crawls the official "Sanft & Sorgfältig" website and saves the available MP3 download URLs both as a JSON file list with metadata and as a plain-text URL list.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies: the third-party crawler plus Node's built-in url and fs modules.
var Crawler = require('crawler');
var url = require('url');
var fs = require('fs');

// Number of archive pages to request (page indices 0 .. NUM_PAGES - 1).
var NUM_PAGES = 10;

// Output files, written relative to the current working directory.
var OUTPUT_JSON_FILENAME = 'file_index.json';
var OUTPUT_TXT_FILENAME = 'file_index.txt';

// Episode metadata collected so far, keyed by page index.
var episodesByPage = {};

// Build the page URLs, create the crawler and hand it every URL at once.
var urls = buildSourceUrls();
var c = createCrawler();
c.queue(urls);
/**
 * Builds the list of archive page URLs to crawl.
 *
 * @param {number} [numPages] Number of pages to generate URLs for. When
 *     omitted, the module-level NUM_PAGES is used.
 * @returns {string[]} One URL per page index, starting at index 0.
 */
function buildSourceUrls(numPages) {
  // Only read the global when the caller did not supply a count.
  var pageCount = numPages === undefined ? NUM_PAGES : numPages;
  var result = [];
  for (var i = 0; i < pageCount; i++) {
    result.push('http://www.radioeins.de/archiv/podcast/zwei_alte_hasen.htm/page=' + i + '.html');
  }
  return result;
}
/**
 * Creates the crawler that scrapes episode metadata from each archive page.
 *
 * For every finished request the callback stores the page's episodes in the
 * module-level `episodesByPage` (keyed by the page's position in `urls`).
 * Once NUM_PAGES requests have finished - successfully or not - the collected
 * data is passed to `saveResult` and the process exits.
 *
 * @returns {Crawler} A crawler configured with up to 10 parallel connections.
 */
function createCrawler() {
  // Count finished callbacks instead of stored pages, so that a failed
  // request still moves the crawl towards completion instead of hanging.
  var finishedRequests = 0;
  var failedRequests = 0;

  // Reads the metadata of every '.containerContent' element on a page.
  function extractEpisodes($) {
    var episodes = [];
    $('.containerContent').each(function (index, node) {
      var container = $(node);
      episodes.push({
        title: container.find('h2').text(),
        // The date element is looked up from the container's parent.
        date: container.parent().find('.manualteaserDateTime').text(),
        description: container.find('.manualteaserShortText').text(),
        url: container.find('a.download').attr('href')
      });
    });
    return episodes;
  }

  // Marks one request as done; writes the result after the last one.
  function finishRequest() {
    finishedRequests += 1;
    if (finishedRequests < NUM_PAGES) {
      return;
    }
    // Give failed pages an empty list so every page index is present.
    for (var page = 0; page < NUM_PAGES; page++) {
      if (!episodesByPage[page]) {
        episodesByPage[page] = [];
      }
    }
    saveResult(episodesByPage);
    process.exit(failedRequests > 0 ? 1 : 0);
  }

  return new Crawler({
    maxConnections: 10,
    callback: function (error, result, $) {
      // NOTE(review): on failure the crawler presumably passes only `error`,
      // so `result` and `$` must not be touched in that case - confirm
      // against the crawler version in use.
      if (error || !result || typeof $ !== 'function') {
        failedRequests += 1;
        console.error('Failed to crawl a page: ' +
            (error ? (error.message || error) : 'no parsed document available'));
      } else {
        var pageIndex = urls.indexOf(result.options.uri);
        if (pageIndex === -1) {
          // An unknown URI cannot be mapped to a page slot; skip it rather
          // than storing it under key -1.
          failedRequests += 1;
          console.error('Ignoring response for unexpected URI: ' + result.options.uri);
        } else {
          episodesByPage[pageIndex] = extractEpisodes($);
        }
      }
      finishRequest();
    }
  });
}
/**
 * Flattens the per-page episode lists and writes them to disk twice: as a
 * JSON array of metadata objects (OUTPUT_JSON_FILENAME) and as a plain text
 * list with one download URL per line (OUTPUT_TXT_FILENAME).
 *
 * @param {Object<string, Array<{title: string, date: string,
 *     description: string, url: (string|undefined)}>>} episodesByPageData
 *     Episode lists keyed by page index.
 * @returns {void}
 */
function saveResult(episodesByPageData) {
  // Walk the keys that actually exist, in numeric order, instead of assuming
  // they are exactly 0..n-1; a missing page must not abort the export.
  var pageKeys = Object.keys(episodesByPageData).sort(function (a, b) {
    return a - b;
  });

  var resultList = [];
  var urlLines = [];
  pageKeys.forEach(function (pageKey) {
    var episodes = episodesByPageData[pageKey] || [];
    episodes.forEach(function (episode) {
      resultList.push(episode);
      // Episodes without a download link stay in the JSON output but must
      // not produce an "undefined" line in the URL list.
      if (episode.url) {
        urlLines.push(episode.url);
      }
    });
  });

  // Every URL is terminated by a newline, matching the original file format.
  var resultFilenameList = urlLines.length > 0 ? urlLines.join('\n') + '\n' : '';

  fs.writeFileSync(OUTPUT_JSON_FILENAME, JSON.stringify(resultList));
  fs.writeFileSync(OUTPUT_TXT_FILENAME, resultFilenameList);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
"name": "SanftUndSorgfaeltigDownloader",
"version": "0.1.0",
"description": "Crawls the official \"Sanft & Sorgfältig\" website and saves the available MP3 downloads as a JSON file list with meta data and plain text URL list",
"main": "node_sanft_und_sorgfaeltig_downloader.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Stefan Girlich",
"license": "ISC",
"dependencies": {
"crawler": "^0.4.3"
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment