Google search scraper to list all results likely to be MediaWiki installations
#!/usr/bin/env ruby
# Google search scraper to list all results likely to be MediaWiki installations
# CC-0, ArchiveTeam/WikiTeam, 2013
require 'rubygems'
require 'mechanize'
require 'uri'
require 'cgi'
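# mechanize is the only gem that needs installing (gem install mechanize);
# uri and cgi ship with Ruby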

domains = []
a = Mechanize.new { |agent|
  agent.user_agent_alias = 'Linux Konqueror'
}
prng = Random.new
search_result = a.get('http://www.google.it/') # webhp?num=30&complete=0&hl=it
search_form = search_result.form('f')
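# 'f' is the name attribute of Google's search form. The quoted names below
# are MediaWiki's principal authors, which stock installations credit verbatim
# (e.g. on Special:Version); -site:wikia.com and -wikimedia filter out the
# big known wiki farms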
search_form.q = '"Magnus Manske, Brion Vibber, Lee Daniel Crocker" -site:wikia.com -wikimedia'
search_result = a.submit(search_form, search_form.buttons.first)

# FIXME: Continue clicking "Next" endlessly; need to exit at some point
while !search_result.nil?
  search_result.search("//h3/a").each do |link|
    # The result URLs are in h3 headers and passed through google.com/url?q=
    target = CGI.parse(link['href'])['/url?q'][0]
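    # The href is relative, e.g. '/url?q=http://example.org/wiki/Foo&sa=...'
    # (illustrative, not a real result); CGI.parse treats the whole href as a
    # query string, so the real target lands under the '/url?q' key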
    unless target.nil?
      # Take each result URI provided
      uri = URI.parse(target)
      # Try to extract the entry URL to MediaWiki: index.php if we're lucky, otherwise the article path.
      # We could try to be smart, open the URL and follow the link rel=EditURI, but it's too recent a feature.
      unless uri.query.nil?
        # If there are query parameters, perhaps we're lucky: just keep everything up to the end of the path
        # TODO: This looks silly
        entry = uri.scheme + '://' + uri.host + uri.path
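        # e.g. http://example.org/w/index.php?title=Foo becomes
        # http://example.org/w/index.php (illustrative URL)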
      else
        # But if there are none, the wiki is probably using short URLs or some other rewriting:
        # the last component must be the page title, so remove it
        entry = target.split("/")[0..-2].join("/")
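        # e.g. http://example.org/wiki/Main_Page becomes
        # http://example.org/wiki (illustrative URL)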
      end
      unless domains.include?(entry)
        domains << entry
        print '.'
      end
      # A human would probably click every now and then
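      # rand(0..3.0) < 1 holds about one time in three, so roughly a third
      # of the results get "clicked" through Google's redirect below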
      if prng.rand(0..3.0) < 1
        begin
          trash = a.get('http://google.com' + link['href'])
        rescue StandardError
          # Nothing to do; we don't care at all
        end
      end
    end
  end
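  # Pause 2.5 to 5 minutes between result pages to look less like a bot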
  sleep(prng.rand(150..300.0))
  begin
    # 'Avanti' is the "Next" link on google.it
    search_result = search_result.link_with(:text => 'Avanti').click
  rescue NoMethodError
    begin
      # i.e. "repeat the search with the omitted results included"
      search_result = search_result.link_with(:text => 'ripetere la ricerca includendo i risultati omessi').click
    rescue NoMethodError
      search_result = nil
    end
  rescue Mechanize::ResponseCodeError
    # Mechanize raises this on HTTP error statuses such as 503
    puts "We got a 503, party is over"
    search_result = nil
  end
end

# Print every entry URL found
domains.each do |domain|
  puts domain
end
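
The loop above notes that a smarter approach would open each result and follow its link rel=EditURI, which MediaWiki emits pointing at api.php?action=rsd. A minimal sketch of that follow-up check, assuming the same Mechanize agent; mediawiki? is a hypothetical helper, not part of the original script:

# Sketch: confirm that a candidate entry URL is a MediaWiki installation by
# looking for <link rel="EditURI"> in the page head. mediawiki? is a
# hypothetical helper name, not something the scraper above defines.
def mediawiki?(agent, entry)
  page = agent.get(entry)
  page.respond_to?(:search) && !page.search('//link[@rel="EditURI"]').empty?
rescue StandardError
  false
end

# Possible use once the scrape has finished:
# confirmed = domains.select { |entry| mediawiki?(a, entry) }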