# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment

require 'net/http'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'

module Crawler

  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  #
  class FileStorage
    # Creates a storage rooted at +path+.
    #
    # path:: an existing directory, or +nil+ to use the system temp directory.
    #
    # Raises ArgumentError when +path+ is given but does not exist or is not
    # a directory (a plain file would otherwise only fail later, inside +set+).
    def initialize(path=nil)
      if path
        raise ArgumentError, "Storage path does not exist!" unless File.exist?(path)
        raise ArgumentError, "Storage path is not a directory!" unless File.directory?(path)
      end
      @store = Pathname.new(path || Dir::tmpdir)
    end

    # Serializes +value+ with Marshal and writes it to the file for +key+,
    # replacing any previous entry.
    def set(key, value)
      # Marshal output is binary data, so the file is opened in binary mode.
      File.open(entry_path(key), 'wb') { |file| file << Marshal.dump(value) }
    end

    # Reads and deserializes the entry stored under +key+.
    # Raises whatever File.open raises when there is no such entry.
    #
    # NOTE(review): Marshal.load must only see trusted data. With the default
    # temp-dir location other local users may be able to plant files here --
    # confirm this is acceptable before using it outside of an exercise.
    def get(key)
      Marshal.load(File.open(entry_path(key), 'rb') { |file| file.read })
    end

    # Returns true when an entry for +key+ is present on disk, false otherwise.
    def exist?(key)
      File.exist?(entry_path(key))
    end

    private

    # Location of the file backing +key+ inside the storage directory.
    def entry_path(key)
      @store.join(encode(key))
    end

    # Maps an arbitrary string key to a filesystem-safe file name.
    def encode(key)
      Digest::MD5.hexdigest(key)
    end
  end

  # = Wraps an HTML page
  #
  # The +load+ method returns the representation of HTML page either from cache,
  # or from the network (and caches it)
  #
  # == Usage
  # 
  #   require 'web_page'
  #   page = Crawler::WebPage.load('http://example.com')
  #   puts page.url
  #   puts page.body
  #   puts page.headers.inspect
  #
  class WebPage

    # Class-level storage shared by every call to +load+.
    @cache = Crawler::FileStorage.new

    # Returns the storage used to cache loaded pages.
    def self.cache; @cache; end

    # Returns the WebPage for +url+, from the cache when present, otherwise
    # fetched over the network with a GET request and then cached.
    #
    # url:: absolute URL as a String. An empty path is normalized to "/"
    #       before it is used as cache key, so "http://example.com" and
    #       "http://example.com/" share one cache entry.
    #
    # Raises whatever URI.parse, Net::HTTP or the cache storage raise.
    def self.load(url)
      uri      = URI.parse(url)
      uri.path = '/' if uri.path == ''
      key      = uri.to_s

      # Look up with the same (normalized) key that is used for writing below;
      # looking up the raw argument would miss URLs given without a path.
      return cache.get(key) if cache.exist?(key)

      http         = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true if uri.scheme == 'https'
      # Block form closes the connection once the request has finished.
      # request_uri keeps the query string, which the bare path would drop.
      response     = http.start { |client| client.request_get(uri.request_uri) }

      webpage = WebPage.new(key, response.body, response.to_hash)
      cache.set(key, webpage)
      webpage
    end

    attr_reader   :url, :body, :headers

    # url::     normalized URL of the page (String)
    # body::    response body
    # headers:: response headers as returned by Net::HTTPResponse#to_hash
    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
    end

    # Thumbnail of the page, built on first access via +create_thumbnail+.
    # While +create_thumbnail+ is an empty stub this is always +nil+.
    def thumbnail
      @thumbnail ||= create_thumbnail
    end

    private

    # Placeholder for the actual thumbnail generation.
    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end

  end

end


# Self-test: runs only when this file is executed directly.
# Network access is replaced by FakeWeb, which serves the canned HTTP
# response stored after __END__ for http://example.com.
if $0 == __FILE__

  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'

  include Crawler

  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  # Any request that is not registered above must fail instead of hitting the network.
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase

    context "When loading a URL, it" do

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        # Parentheses avoid the "ambiguous first argument" warning for a regexp literal.
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        # exist? answers true/false, so assert_not_nil could never fail here.
        assert WebPage.cache.exist?( 'http://example.com/' )
        @cached  = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached  = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url,     @cached.url
        assert_equal @webpage.body,    @cached.body
        assert_equal @webpage.headers, @cached.headers
      end
    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do
      # Each test gets a scratch directory next to this file, removed afterwards.
      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        # Expected value first, actual second, so failure messages read correctly.
        assert_equal [1, 2, 3], @storage.get('abc123')[:array]
      end
    end

    # ---------------------------------------------------------------------------

  end


end

__END__
HTTP/1.1 200 OK 
Server: Apache/2.2.3 (Red Hat) 
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT 
ETag: "b300b4-1b6-4059a80bfd280" 
Accept-Ranges: bytes 
Content-Type: text/html; charset=UTF-8 
Connection: Keep-Alive 
Date: Fri, 30 Oct 2009 09:20:03 GMT 
Age: 2361    
Content-Length: 438 

<HTML> 
<HEAD> 
  <TITLE>Example Web Page</TITLE> 
</HEAD> 
<body>   
<p>You have reached this web page by typing &quot;example.com&quot;, 
&quot;example.net&quot;, 
  or &quot;example.org&quot; into your web browser.</p> 
<p>These domain names are reserved for use in documentation and are not available 
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC 
  2606</a>, Section 3.</p> 
</BODY> 
</HTML>