class Arachnid2::Watir

Constants

DEFAULT_AGENT
DEFAULT_ORIENTATION

Public Class Methods

new(url) click to toggle source
# File lib/arachnid2/watir.rb, line 8
# Remembers the URL this instance was built for, plus the value that
# Adomain[] returns for that URL.
#
# @param url [Object] stored as-is in @url; presumably a URL string — confirm
#   against callers
def initialize(url)
  @url = url
  @domain = Adomain[url]
end

Public Instance Methods

crawl(opts) click to toggle source
# File lib/arachnid2/watir.rb, line 13
# Works through @global_queue, requesting each URL via #make_request until
# the queue is empty or #time_to_stop? says to halt.
#
# Every dequeued URL is recorded in @global_visited before it is requested.
# The browser and the headless display (when present) are torn down on the
# way out, whether the crawl finished normally or raised.
#
# @param opts [Object] handed straight to #preflight
# @yield forwarded untouched to #make_request for every requested URL
def crawl(opts, &block)
  preflight(opts)
  watir_preflight
  @already_retried = false

  until @global_queue.empty?
    q = @global_queue.shift

    # The stop check runs after the shift, so the URL dequeued on the
    # stopping iteration is neither recorded as visited nor requested.
    break if time_to_stop?

    @global_visited.insert(q)

    # Forward the caller's block explicitly: a bare `Proc.new` no longer
    # captures the method's block and raises ArgumentError on Ruby 3.0+.
    make_request(q, &block)
  end # until @global_queue.empty?
ensure
  # Best-effort teardown: a failure while closing must not mask the
  # exception (if any) that ended the crawl.
  @browser.close if @browser rescue nil
  @headless.destroy if @headless rescue nil
end

Private Instance Methods

behead() click to toggle source
# File lib/arachnid2/watir.rb, line 124
# Creates a Headless instance, keeps it in @headless, and starts it.
#
# @return [Object] whatever Headless#start returns
def behead
  display = Headless.new
  @headless = display
  display.start
end
browser() click to toggle source
# File lib/arachnid2/watir.rb, line 102
# Returns the memoized browser, building it on first use.
#
# On the first call this starts the headless display when @make_headless is
# set, creates the browser via #create_browser, and applies #set_timeout.
#
# @return [Object] the value stored in @browser
def browser
  return @browser if @browser

  behead if @make_headless
  @browser = create_browser
  set_timeout

  @browser
end
create_browser() click to toggle source
# File lib/arachnid2/watir.rb, line 114
# Builds a new ::Watir::Browser around #driver, passing @proxy along as the
# `proxy:` option only when @proxy is set.
#
# @return [::Watir::Browser]
def create_browser
  if @proxy
    ::Watir::Browser.new(driver, proxy: @proxy)
  else
    ::Watir::Browser.new(driver)
  end
end
driver() click to toggle source
# File lib/arachnid2/watir.rb, line 129
# Returns the memoized driver, building it through Webdriver::UserAgent on
# first use.
#
# Settings are read from @options and fall back to the DEFAULT_* constants:
# the "Accept-Language" and "User-Agent" entries under :headers, plus the
# top-level :agent and :orientation entries.
#
# @return [Object] the value returned by Webdriver::UserAgent.driver
def driver
  @driver ||= begin
    accept_language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE
    ua_string       = @options.dig(:headers, "User-Agent")      || DEFAULT_USER_AGENT
    device          = @options.dig(:agent)                      || DEFAULT_AGENT
    device_layout   = @options.dig(:orientation)                || DEFAULT_ORIENTATION

    Webdriver::UserAgent.driver(
      browser: browser_type,
      agent: device,
      orientation: device_layout,
      accept_language_string: accept_language,
      user_agent_string: ua_string
    )
  end
end
make_request(q) click to toggle source
# File lib/arachnid2/watir.rb, line 34
# Browses a single queue entry and feeds the links it produced to #vacuum.
#
# Error handling:
# * Selenium's NoSuchWindowError and Net::ReadTimeout are logged and ignored.
# * Any other StandardError is re-raised when #raise_before_retry? says so;
#   otherwise it is logged (with the first five backtrace lines) and
#   #reset_for_retry is called.
#   NOTE(review): the log message says "retrying once", but no second request
#   for `q` is issued in this method — confirm where the retry happens.
#
# @param q [Object] the queue entry handed to #browse_links (presumably a URL)
# @yield forwarded untouched to #browse_links
# @return [Object, nil] the result of #vacuum, or nil when there were no links
#   or the error was ignored
def make_request(q, &block)
  # Forward the caller's block explicitly: a bare `Proc.new` no longer
  # captures the method's block and raises ArgumentError on Ruby 3.0+.
  links = browse_links(q, &block)
  return unless links

  vacuum(links, browser.url)
rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
  msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
        "is ignoring an error: " \
        "#{e.class} - #{e.message}"
  puts msg
rescue => e
  raise if raise_before_retry?(e.class)

  msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
        "is retrying once after an error: " \
        "#{e.class} - #{e.message}"
  puts msg
  # Array() keeps the logging from failing if the backtrace is missing.
  Array(e.backtrace).first(5).each { |line| puts "\t#{line}" }
  puts "..."
  reset_for_retry
end
navigate(url) click to toggle source
raise_before_retry?(klass) click to toggle source
# File lib/arachnid2/watir.rb, line 87
# Decides whether an error should be re-raised instead of triggering a retry.
#
# That is the case when a retry has already been used (@already_retried) or
# when the error class name mentions "Selenium" or "Watir".
#
# @param klass [Class, #to_s] the class of the rescued error
# @return [Object] the truthy @already_retried value, otherwise true or false
def raise_before_retry?(klass)
  return @already_retried if @already_retried

  class_name = "#{klass}"
  %w[Selenium Watir].any? { |library| class_name.include?(library) }
end
reset_for_retry() click to toggle source
# File lib/arachnid2/watir.rb, line 93
# Tears down the current browser session so the next #browser / #driver call
# builds a fresh one, and records that the single retry has been used.
#
# Each teardown step is best-effort: an error while closing one resource is
# swallowed so the remaining ones are still released.
#
# @return [true]
def reset_for_retry
  @browser.close if @browser rescue nil
  @headless.destroy if @headless rescue nil
  # Guard on @driver itself. The previous guard tested @headless, so the
  # driver was never quit in non-headless runs.
  @driver.quit if @driver rescue nil
  @driver = nil
  @browser = nil
  @already_retried = true
end
set_timeout() click to toggle source
# File lib/arachnid2/watir.rb, line 120
# Applies #timeout as the page-load timeout of the driver behind @browser.
#
# @return [Object] the timeout value that was assigned
def set_timeout
  page_timeouts = @browser.driver.manage.timeouts
  page_timeouts.page_load = timeout
end
time_to_stop?() click to toggle source
# File lib/arachnid2/watir.rb, line 81
# Tells the crawl loop whether it should halt.
#
# The checks run in this order and stop at the first hit: the number of
# visited URLs has reached crawl_options[:max_urls], the current time is past
# crawl_options[:time_limit], or #memory_danger? reports a problem.
#
# @return [Object] true for the first two checks, otherwise whatever
#   #memory_danger? returns
def time_to_stop?
  return true if @global_visited.size >= crawl_options[:max_urls]
  return true if Time.now > crawl_options[:time_limit]

  memory_danger?
end
watir_preflight() click to toggle source
# File lib/arachnid2/watir.rb, line 148
# Watir-specific setup run before crawling: copies the proxy settings via
# #watir_proxy_options, then stores @options[:headless] in @make_headless.
#
# @return [Object] the value of @options[:headless]
def watir_preflight
  watir_proxy_options

  headless_requested = @options[:headless]
  @make_headless = headless_requested
end
watir_proxy_options() click to toggle source
# File lib/arachnid2/watir.rb, line 153
# Resets crawl_options[:proxy] to an empty hash, then copies the :http and
# :ssl entries from @options[:proxy] into it when they are present.
#
# @return [Object, nil] the :ssl proxy value when one was copied, else nil
def watir_proxy_options
  crawl_options[:proxy] = {}

  http_proxy = @options.dig(:proxy, :http)
  crawl_options[:proxy][:http] = http_proxy if http_proxy

  ssl_proxy = @options.dig(:proxy, :ssl)
  crawl_options[:proxy][:ssl] = ssl_proxy if ssl_proxy
end