# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment

require 'net/http'
require 'uri'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'

module Crawler

  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  #
  # Every value is serialized with Marshal into a file named after the MD5
  # hex digest of its key, inside the storage directory.
  #
  class FileStorage

    # path:: directory to keep the files in; Dir.tmpdir is used when nil.
    #
    # Raises ArgumentError when +path+ is given but is missing or is not
    # a directory.
    def initialize(path=nil)
      if path
        raise ArgumentError, "Storage path does not exist!" unless File.exist?(path)
        raise ArgumentError, "Storage path is not a directory!" unless File.directory?(path)
      end
      @store = Pathname.new(path || Dir.tmpdir)
    end

    # Serializes +value+ with Marshal and writes it to the file for +key+,
    # replacing any previous content.
    def set(key, value)
      # Marshal output is binary, so the file is opened in binary mode.
      File.open(path_for(key), 'wb') { |file| Marshal.dump(value, file) }
    end

    # Returns the object stored under +key+.
    #
    # Raises Errno::ENOENT when nothing is stored under +key+; check with
    # +exist?+ first.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects. Only point this
    # storage at a directory whose contents you trust; the default,
    # Dir.tmpdir, may be writable by other users of the machine.
    def get(key)
      File.open(path_for(key), 'rb') { |file| Marshal.load(file) }
    end

    # Returns true when a file for +key+ is present, false otherwise.
    def exist?(key)
      File.exist?(path_for(key))
    end

    private

    # Full path of the file that holds the value for +key+.
    def path_for(key)
      @store.join(encode(key))
    end

    # Maps an arbitrary String key to a fixed-length name that is safe to use
    # as a file name.
    def encode(key)
      Digest::MD5.hexdigest(key)
    end

  end

  # = Wrap a HTML page
  #
  # The +load+ method returns the representation of HTML page either from cache,
  # or from the network (and caches it)
  #
  # == Usage
  #
  #     require 'web_page'
  #     page = Crawler::WebPage.load('http://example.com')
  #     puts page.url
  #     puts page.body
  #     puts page.headers.inspect
  #
  class WebPage

    @cache = Crawler::FileStorage.new

    attr_reader :url, :body, :headers

    # Returns the WebPage for +url+, from the cache when it is there,
    # otherwise fetched with a GET request and then cached.
    #
    # An empty path is normalized to '/' before anything else happens, and the
    # normalized URL is the cache key for both the lookup and the store, so
    # 'http://example.com' and 'http://example.com/' share one entry.
    #
    # Raises ArgumentError when +url+ has no host, and whatever URI.parse or
    # Net::HTTP raise for malformed URLs or network failures.
    #
    # NOTE: the request is made without TLS and the response is cached
    # whatever its status code is.
    def self.load(url)
      uri = URI.parse(url)
      raise ArgumentError, "URL has no host: #{url}" unless uri.host
      uri.path = '/' if uri.path.empty?

      key = uri.to_s
      return cache.get(key) if cache.exist?(key)

      # Keep the query string: it is part of the cache key, so it has to be
      # part of the request as well.
      target = uri.query ? "#{uri.path}?#{uri.query}" : uri.path
      # The block form closes the connection once the response has been read.
      response = Net::HTTP.start(uri.host, uri.port) { |http| http.request_get(target) }

      webpage = WebPage.new(key, response.body, response.to_hash)
      cache.set(key, webpage)
      webpage
    end

    # The storage that +load+ reads pages from and writes pages to.
    def self.cache
      @cache
    end

    # url::     the normalized URL as a String
    # body::    the response body
    # headers:: the response headers as returned by Net::HTTPResponse#to_hash
    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
    end

    # Returns the thumbnail, built on first use and remembered afterwards.
    # While +create_thumbnail+ returns nil (as the stub below does), nothing
    # is remembered and it runs again on every call.
    def thumbnail
      @thumbnail ||= create_thumbnail
    end

    private

    # Placeholder for the real thumbnail generation; currently returns nil.
    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end

  end

end

if $0 == __FILE__

  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'

  include Crawler

  # The canned HTTP response lives after __END__ at the bottom of this file.
  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase

    context "When loading a URL, it" do

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        # exist? answers true/false, never nil, so assert_not_nil would always pass.
        assert WebPage.cache.exist?( 'http://example.com/' )
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached  = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url,     @cached.url
        assert_equal @webpage.body,    @cached.body
        assert_equal @webpage.headers, @cached.headers
      end

      should "serve a URL without trailing slash from the cache" do
        # This host is not registered with FakeWeb and real connections are
        # disabled, so anything other than a cache hit raises.
        cached = WebPage.new('http://cached.example.com/', 'from cache', {})
        WebPage.cache.set('http://cached.example.com/', cached)
        @webpage = WebPage.load('http://cached.example.com')
        assert_equal 'from cache', @webpage.body
      end

    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do

      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert_equal [1, 2, 3], @storage.get('abc123')[:array]
      end

    end

    # ---------------------------------------------------------------------------

  end

end

__END__
HTTP/1.1 200 OK
Server: Apache/2.2.3 (Red Hat)
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT
ETag: "b300b4-1b6-4059a80bfd280"
Accept-Ranges: bytes
Content-Type: text/html; charset=UTF-8
Connection: Keep-Alive
Date: Fri, 30 Oct 2009 09:20:03 GMT
Age: 2361
Content-Length: 438

<HTML>
<HEAD>
  <TITLE>Example Web Page</TITLE>
</HEAD>
<body>
<p>You have reached this web page by typing "example.com",
"example.net",
  or "example.org" into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
  2606</a>, Section 3.</p>
</BODY>
</HTML>