class HttpSpell::Spider

Attributes

done[R]
todo[R]

Public Class Methods

new(starting_point, whitelist: nil, blacklist: [], tracing: false) click to toggle source
# File lib/httpspell/spider.rb, line 11
# Sets up a spider whose crawl begins at +starting_point+.
#
# @param starting_point [String] URL to start from; it is parsed with
#   Addressable::URI and becomes the first (and only) entry of +todo+.
# @param whitelist [Array, nil] stored in @whitelist. When nil, a single
#   pattern is used that matches anything beginning with the literal
#   starting point. (Presumably an array of Regexp, like the default.)
# @param blacklist [Array] stored unchanged in @blacklist.
# @param tracing [Boolean] stored in @tracing; +start+ prints backtraces
#   alongside its warnings when this is truthy.
def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
  @todo = [Addressable::URI.parse(starting_point)]
  @done = []
  # The starting point is literal text, not a pattern: escape it so that
  # characters such as "." and "?" in the URL are not interpreted as regex
  # metacharacters (an unescaped "?" makes the preceding character optional
  # and can stop the pattern from matching the starting URL itself).
  @whitelist = whitelist || [/^#{Regexp.escape(starting_point.to_s)}/]
  @blacklist = blacklist
  @tracing = tracing
end

Public Instance Methods

start() { |u, d| ... } click to toggle source
# File lib/httpspell/spider.rb, line 20
# Works through the +todo+ queue until it is empty.
#
# Each URL is popped from the end of +todo+ and handed to +links+. Whatever
# +links+ yields is forwarded to the caller's block, if one was given. On
# success the URL is appended to +done+, and every link returned by +links+
# that is neither done nor already queued is added to +todo+.
#
# @yield [u, d] the two values yielded by +links+ for the current URL
# @return [Boolean] true unless handling at least one URL raised a
#   StandardError; errors raised by the caller's block are reported but do
#   not affect the result.
def start
  success = true

  while todo.any?
    url = todo.pop

    begin
      extracted = links(url) do |u, d|
        yield u, d if block_given?
      rescue StandardError => e
        # A failing callback must not abort the crawl of this URL.
        warn "Callback error for #{url}: #{e}"
        warn e.backtrace if @tracing
      end

      done.append(url)
      # Array#- only removes elements found in the other arrays; it keeps
      # repeats inside +extracted+ itself. Without uniq, a page that links to
      # the same URL twice would queue it twice and have it processed twice.
      todo.concat((extracted - done - todo).uniq)
    rescue StandardError => e
      # The URL is deliberately not added to +done+ when it fails.
      warn "Skipping #{url} because of #{e.message}"
      warn e.backtrace if @tracing
      success = false
    end
  end

  success
end

Private Instance Methods

http_get(uri) click to toggle source

See https://twin.github.io/improving-open-uri/ for background.

# File lib/httpspell/spider.rb, line 87
# Opens +uri+ with automatic redirect handling switched off and follows
# redirects by hand instead.
#
# Every OpenURI::HTTPRedirect is answered by opening the redirect's target,
# for at most 10 attempts in total; the last redirect error is re-raised once
# the attempts are used up.
#
# @param uri [#open] object responding to +open(redirect: false)+
# @return whatever the final +open+ call returns
# @raise [OpenURI::HTTPRedirect] when the 10th attempt is still a redirect
def http_get(uri)
  attempts_left = 10
  location = uri

  begin
    location.open(redirect: false)
  rescue OpenURI::HTTPRedirect => e
    attempts_left -= 1
    # Out of attempts: surface the redirect error to the caller.
    raise unless attempts_left.positive?

    location = e.uri
    retry
  end
end