class Browser

A Mechanize-based class that emulates a web browser, complete with a page cache and cookie persistence. Progress bars are enabled by default.

Attributes

agent[RW]
cache[RW]
delay[RW]
delay_jitter[RW]
use_cache[RW]

Public Class Methods

new(options={}) click to toggle source

Default options:

:delay => 1,                        # Sleep 1 second between gets
:delay_jitter => 0.2,               # Random deviation from delay
:use_cache => false,                # Gets are cached only when enabled (:cache and :cached are aliases)
:logs => false,                     # Don't log the detailed transfer info
:cookiefile => "cookies.txt",       # Save cookies to file
:cache_file => "browser-cache.db"   # File handed to the page cache
# File lib/epitools/browser.rb, line 39
# Build a browser: record the settings, optionally route sockets through a
# SOCKS proxy, then set up the agent and the cache.
#
# Options (legacy spellings in parentheses are still honoured):
#   :delay                     - seconds to sleep between gets (default 1)
#   :delay_jitter              - random deviation from the delay (default 0.2)
#   :use_cache (:cache, :cached) - truthy to enable caching (off when omitted)
#   :use_logs (:logs)          - truthy to log detailed transfer info (default false)
#   :cookie_file (:cookiefile) - cookie file path (default "cookies.txt")
#   :cache_file                - cache file path (default "browser-cache.db")
#   :proxy                     - "host:port" string assigned to TCPSocket's SOCKS settings
def initialize(options={})
  @last_get     = Time.at(0)
  @delay        = options[:delay]        || 1
  @delay_jitter = options[:delay_jitter] || 0.2
  @use_cache    = !!(options[:cache] || options[:cached] || options[:use_cache])
  # The documented names (:use_logs, :cookie_file) used to be silently ignored;
  # accept them alongside the keys the code originally read.
  @use_logs     = options[:use_logs]    || options[:logs]       || false
  @cookie_file  = options[:cookie_file] || options[:cookiefile] || "cookies.txt"
  @cache_file   = options[:cache_file]   || "browser-cache.db"

  # TODO: @progress, @user_agent, @logfile, @cache_file (default location: ~/.epitools?)

  if options[:proxy]
    host, port = options[:proxy].split(':')
    TCPSocket::socks_server = host
    TCPSocket::socks_port   = port.to_i
  end

  init_agent!
  init_cache!
end

Public Instance Methods

cache_put(page, url) click to toggle source
# File lib/epitools/browser.rb, line 112
# Write +page+ to the cache under +url+ (passing :overwrite => true).
#
# Nothing is written, and nil is returned, unless the cache reports the page
# as valid and its content type starts with "text/" or mentions "javascript".
def cache_put(page, url)
  return unless cache.valid_page?(page)
  return unless page.content_type =~ %r{(^text/|^application/javascript|javascript)}

  puts "  |_ writing to cache"
  cache.put(page, url, :overwrite=>true)
end
cacheable?(page) click to toggle source
# File lib/epitools/browser.rb, line 105
# Report whether +page+ has a content type beginning with "text" or
# "application".
#
# Returns true on a match and nil otherwise (never false).
def cacheable?(page)
  mime = page.content_type
  # Regexp#=== is what case/when uses: it yields false, not an error, for nil.
  true if %r{^(text|application)} === mime
end
get(url, options={}) click to toggle source

Retrieve a URL and return a Mechanize::Page instance (which acts a bit like a Nokogiri::HTML::Document instance).

Options:

:cached => true/false   | check cache before getting page
# File lib/epitools/browser.rb, line 129
  # Retrieve a URL and return the page: either the cached copy or whatever
  # agent.get returns.
  #
  # Options:
  #   :cached => true/false | check the cache before getting the page; when
  #                           omitted, the browser-wide @use_cache applies.
  #
  # If caching is requested but this browser has no cache object, the page is
  # fetched without caching instead of failing on a nil cache.
  #
  # The errors listed in the rescue clause trigger delay(5) and another
  # attempt, up to 4 attempts in total; after the last one the method returns
  # nil. A "Name or service not known" lookup failure is re-raised at once.
  def get(url, options={})
    # TODO: Have a base-URL option
    #
    # if relative?(url)
    #   url = URI.join("http://base-url/", url).to_s
    # end

    # A per-call :cached setting wins over the browser-wide default.
    use_cache = options[:cached].nil? ? @use_cache : options[:cached]

    # cache can be nil (init_cache! only builds one when @use_cache is set),
    # so a per-call :cached => true must not dereference it.
    use_cache &&= !cache.nil?

    cached_already = use_cache && cache.include?(url)

    puts
    puts "[ GET #{url} (using cache: #{!!use_cache}) ]"

    # No need to wait before a request that the cache should answer.
    delay unless cached_already
    max_retries = 4
    retries = 0

    begin
      if use_cache && (page = cache.get(url))
        puts "  |_ cached (#{page.content_type})"
      else
        page = agent.get(url)
        @last_get = Time.now
        cache_put(page, url) if use_cache
      end

      puts
    rescue Net::HTTPBadResponse, Errno::ECONNRESET, SocketError, Timeout::Error, SOCKSError => e
      # A hostname that does not resolve will not start resolving on a retry.
      raise if e.message == "getaddrinfo: Name or service not known"

      retries += 1
      return if retries >= max_retries

      puts "  |_ ERROR: #{e.inspect} -- retrying"
      delay(5)
      retry

    # Disabled HTTP status handling, kept for reference:
    #
    # rescue Mechanize::ResponseCodeError => e
    #
    #   case e.response_code
    #     when "401" #=> Net::HTTPUnauthorized
    #       p e
    #       login!
    #       page = get(url)
    #       puts
    #     when "404"
    #       p e
    #       raise e
    #     when "503"
    #       puts "  |_ ERROR: #{e.inspect} -- retrying"
    #       delay(5)
    #       retry
    #   else
    #     raise e
    #   end
    end

    page
  end
init_agent!() click to toggle source
# File lib/epitools/browser.rb, line 60
# Create the Mechanize agent, store it in @agent, and then load cookies.
#
# The agent keeps a history of 10 entries, uses the "Windows Chrome"
# user-agent alias, and logs to "mechanize.log" when @use_logs is set.
# Returns the result of load_cookies!.
def init_agent!
  @agent = Mechanize.new do |mech|
    # ["Mechanize", "Mac Mozilla", "Linux Mozilla", "Windows IE 6", "iPhone", "Linux Konqueror", "Windows IE 7", "Mac FireFox", "Mac Safari", "Windows Mozilla"]
    mech.max_history = 10
    mech.user_agent_alias = "Windows Chrome"

    if @use_logs
      mech.log = Logger.new("mechanize.log")
    end
  end

  load_cookies!
end
init_cache!() click to toggle source
# File lib/epitools/browser.rb, line 82
# Build the page cache from @cache_file and the agent, storing it in @cache.
#
# Does nothing (and returns nil) when @use_cache is not set.
def init_cache!
  # TODO: Rescue "couldn't load" exception and disable caching
  return unless @use_cache

  @cache = Cache.new(@cache_file, agent)
end
load_cookies!() click to toggle source
# File lib/epitools/browser.rb, line 87
# Load cookies from @cookie_file into the agent's cookie jar.
#
# Returns true when the file exists and was passed to the jar, false when
# there is no such file.
def load_cookies!
  # File.exists? was deprecated and then removed in Ruby 3.2; File.exist? is
  # the spelling that works on every Ruby version.
  return false unless File.exist?(@cookie_file)

  agent.cookie_jar.load(@cookie_file)
  true
end
relative?(url) click to toggle source
# File lib/epitools/browser.rb, line 101
# Report whether +url+ lacks an "http://" or "https://" prefix.
#
# Returns true when no such prefix is found, false otherwise.
def relative?(url)
  scheme_prefix = url[ %r{^https?://} ]
  scheme_prefix ? false : true
end
save_cookies!() click to toggle source
# File lib/epitools/browser.rb, line 96
# Write the agent's cookie jar to @cookie_file via save_as.
#
# Always returns true.
def save_cookies!
  jar = agent.cookie_jar
  jar.save_as(@cookie_file)
  true
end