Created
February 16, 2011 13:19
-
-
Save fuba/829350 to your computer and use it in GitHub Desktop.
Scrape Google realtime search using phantomjs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: phantomjs realtime.js keyword
//
// Scrapes Google realtime search results for `keyword` and prints every
// newly seen result to the console as one JSON object per line
// ({screen_name, url, text}).

// XPath expressions used to locate result items and their fields.
// `item` is evaluated against the document; the others are relative to an item.
var xpathes = {
    item: '//li[@class="g s"]',
    screen_name: './/div/a[@class="l"]/text()',
    url: './/div//a[./span]',
    text: './/div[./a[@class="l"]]'
};

// URLs already printed (href -> 1), so each result is reported only once
// across repeated extract() passes.
var log = {};

// Print one result item as JSON unless its URL was already printed.
// Items without a matching link or text node are skipped: dereferencing
// urls[0] / texts[0] on an empty match would throw and, because the throw
// happens before extract() re-arms its timer, stop the polling loop.
function reportItem(item) {
    var screen_name = $X(xpathes.screen_name, item, String);
    var urls = $X(xpathes.url, item);
    var texts = $X(xpathes.text, item);
    if (!urls || urls.length === 0 || !texts || texts.length === 0) {
        return;
    }
    var href = urls[0].href;
    if (log[href]) {
        return;
    }
    console.log(
        JSON.stringify({
            screen_name: screen_name,
            url: href,
            text: texts[0].innerHTML
        })
    );
    log[href] = 1;
}

// Scan the current document for result items, report the new ones, and
// schedule the next scan in 20 seconds.
function extract() {
    var items = $X(xpathes.item, document) || [];
    for (var i = 0; i < items.length; i++) {
        reportItem(items[i]);
    }
    window.setTimeout(extract, 20000);
}

if (phantom.state.length === 0) {
    // First pass: no state recorded yet, so build the search URL and open it.
    if (phantom.args.length === 0) {
        console.log('Usage: phantomjs realtime.js keyword');
        phantom.exit();
    } else {
        var keyword = phantom.args[0];
        // The keyword is appended without encodeURIComponent(): per the
        // original author's note, phantomjs escapes the URL itself.
        var url = 'http://www.google.co.jp/search?q=' +
            keyword +
            '&hl=ja&safe=off&tbs=mbl:1';
        // NOTE(review): presumably phantomjs re-runs this script once the page
        // has loaded, with phantom.state preserved — confirm against the
        // PhantomJS version in use.
        phantom.state = 'realtime';
        phantom.open(url);
    }
} else if (phantom.loadStatus === 'success') {
    // Later pass: state is set and the page loaded, so start scraping.
    extract();
}
// $X is from https://gist.github.com/3238
// Extended version of $X: evaluates an XPath expression and converts the
// result to a plain JavaScript value.
//
// Call forms:
//   $X(exp);
//   $X(exp, context);
//   $X(exp, type);
//   $X(exp, context, type);
//
// exp     - XPath expression string.
// context - node the expression is evaluated against; defaults to `document`.
// type    - wanted result type: String, Number, Boolean or Array (ordered
//           snapshot of nodes). When omitted, the result's own type decides:
//           string / number / boolean values are returned as such, node sets
//           as an array of nodes (order not guaranteed), anything else as null.
//
// Throws TypeError when `type` is given but is none of the above.
function $X (exp, context, type /* want type */) {
    // $X(exp, type): a function in the second slot is the wanted type.
    if (typeof context == "function") {
        type = context;
        context = null;
    }
    if (!context) context = document;

    // Compile and resolve namespaces on the document that owns the context
    // node, so the expression and its resolver agree.
    var doc = context.ownerDocument || context;
    var compiled = doc.createExpression(exp, function (prefix) {
        var resolver = doc.createNSResolver(context);
        // The resolver exposes lookupNamespaceURI(); a callable resolver is
        // still honoured for engines that return one.
        var uri = (typeof resolver == "function")
            ? resolver(prefix)
            : resolver.lookupNamespaceURI(prefix);
        if (uri) return uri;
        // Unknown prefix: fall back to the XHTML namespace in XHTML documents.
        return (doc.contentType == "application/xhtml+xml") ? "http://www.w3.org/1999/xhtml" : "";
    });

    switch (type) {
        case String:  return compiled.evaluate(context, XPathResult.STRING_TYPE, null).stringValue;
        case Number:  return compiled.evaluate(context, XPathResult.NUMBER_TYPE, null).numberValue;
        case Boolean: return compiled.evaluate(context, XPathResult.BOOLEAN_TYPE, null).booleanValue;
        case Array:
            var snapshot = compiled.evaluate(context, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
            var ordered = [];
            for (var i = 0, len = snapshot.snapshotLength; i < len; i++) {
                ordered.push(snapshot.snapshotItem(i));
            }
            return ordered;
        case undefined:
            var result = compiled.evaluate(context, XPathResult.ANY_TYPE, null);
            switch (result.resultType) {
                case XPathResult.STRING_TYPE : return result.stringValue;
                case XPathResult.NUMBER_TYPE : return result.numberValue;
                case XPathResult.BOOLEAN_TYPE: return result.booleanValue;
                case XPathResult.UNORDERED_NODE_ITERATOR_TYPE:
                    // Document order is not guaranteed for this result type.
                    var nodes = [];
                    var node = null;
                    while ((node = result.iterateNext())) nodes.push(node);
                    return nodes;
            }
            return null;
        default: throw new TypeError("$X: specified type is not valid type.");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment