class Arachnid2::Watir
Constants
- DEFAULT_AGENT
- DEFAULT_ORIENTATION
Public Class Methods
new(url)
click to toggle source
# File lib/arachnid2/watir.rb, line 8
#
# Builds a crawler for a single starting URL.
#
# Stores the URL in @url and the result of looking that URL up through
# Adomain[] in @domain.
#
# @param url [String] the URL the crawl starts from
def initialize(url)
  @url = url
  @domain = Adomain[url]
end
Public Instance Methods
crawl(opts)
click to toggle source
# File lib/arachnid2/watir.rb, line 13
#
# Works through the global queue, handing each queued URL to #make_request
# together with the caller's block, until the queue is empty or
# #time_to_stop? reports true. The browser and the headless display (when
# present) are always closed on the way out, even if an error propagates.
#
# @param opts [Hash] crawl options, passed to #preflight
# @yield [browser] forwarded to #make_request for every visited URL
# @raise [ArgumentError] if a URL is about to be requested and no block
#   was supplied
def crawl(opts, &block)
  preflight(opts)
  watir_preflight
  @already_retried = false

  until @global_queue.empty?
    q = @global_queue.shift

    break if time_to_stop?

    @global_visited.insert(q)

    # The block used to be captured with a block-less `Proc.new`, which
    # raises on Ruby >= 3.0 even when a block was given. Capture it explicitly
    # and keep the "no block" ArgumentError at the same point in the loop.
    raise ArgumentError, "Arachnid2::Watir#crawl requires a block" unless block

    make_request(q, &block)
  end
ensure
  # Best-effort cleanup: a failure while closing must not mask the result
  # (or the original error) of the crawl itself.
  @browser.close if @browser rescue nil
  @headless.destroy if @headless rescue nil
end
Private Instance Methods
behead()
click to toggle source
# File lib/arachnid2/watir.rb, line 124
#
# Creates a Headless instance, remembers it in @headless and starts it.
#
# @return the value returned by Headless#start
def behead
  display = Headless.new
  @headless = display
  display.start
end
browse_links(url) { |browser| ... }
click to toggle source
# File lib/arachnid2/watir.rb, line 56
#
# Navigates to +url+, yields the browser to the caller's block and then
# passes the current page (URL and body HTML) to #process.
#
# @param url [String] the page to open
# @yield [browser] the browser object, after navigation succeeded
# @return the result of #process, or nil when navigation failed or the
#   page has no body element
def browse_links(url)
  return nil unless navigate(url)

  yield browser

  return nil unless browser.body.exists?

  process(browser.url, browser.body.html)
end
browser()
click to toggle source
# File lib/arachnid2/watir.rb, line 102
#
# Lazily builds the browser. On first use it starts the headless display
# (only when @make_headless is set), creates the browser and applies the
# page-load timeout; afterwards the cached instance is returned.
#
# @return the memoised browser held in @browser
def browser
  return @browser if @browser

  behead if @make_headless
  @browser = create_browser
  set_timeout

  @browser
end
create_browser()
click to toggle source
# File lib/arachnid2/watir.rb, line 114
#
# Instantiates a ::Watir::Browser around #driver, passing @proxy along
# as the +proxy:+ option when one is set.
#
# @return [::Watir::Browser] a new browser instance
def create_browser
  if @proxy
    ::Watir::Browser.new(driver, proxy: @proxy)
  else
    ::Watir::Browser.new(driver)
  end
end
driver()
click to toggle source
# File lib/arachnid2/watir.rb, line 129
#
# Lazily builds the driver through Webdriver::UserAgent.driver and caches
# it in @driver. Language and user-agent strings come from the request
# headers in @options, agent and orientation from the top-level options;
# each falls back to its DEFAULT_* constant when missing.
#
# @return the memoised driver
def driver
  @driver ||= begin
    language    = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE
    user_agent  = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT
    agent       = @options[:agent] || DEFAULT_AGENT
    orientation = @options[:orientation] || DEFAULT_ORIENTATION

    Webdriver::UserAgent.driver(
      browser: browser_type,
      agent: agent,
      orientation: orientation,
      accept_language_string: language,
      user_agent_string: user_agent
    )
  end
end
make_request(q)
click to toggle source
# File lib/arachnid2/watir.rb, line 34
#
# Visits one queued URL: browses it via #browse_links (forwarding the
# caller's block) and hands any returned links to #vacuum.
#
# Error handling:
# * NoSuchWindowError and Net::ReadTimeout are logged and ignored.
# * Any other StandardError is re-raised when #raise_before_retry? says so;
#   otherwise it is logged with the first backtrace lines and the browser
#   state is torn down through #reset_for_retry. This method does not
#   request +q+ again itself.
#
# @param q [String] the URL to visit
# @yield [browser] forwarded to #browse_links
def make_request(q, &block)
  # The block is captured explicitly: a block-less `Proc.new` raises on
  # Ruby >= 3.0, and that error would be swallowed by the generic rescue
  # below, so no page would ever be processed.
  links = browse_links(q, &block)
  return unless links

  vacuum(links, browser.url)
rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
  msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
        "is ignoring an error: " \
        "#{e.class} - #{e.message}"
  puts msg
rescue => e
  raise if raise_before_retry?(e.class)

  msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
        "is retrying once after an error: " \
        "#{e.class} - #{e.message}"
  puts msg
  # Array() keeps this safe should the backtrace ever be nil.
  Array(e.backtrace).first(5).each { |l| puts "\t#{l}" }
  puts "..."
  reset_for_retry
end
raise_before_retry?(klass)
click to toggle source
# File lib/arachnid2/watir.rb, line 87
#
# Decides whether an error should be re-raised instead of triggering a
# retry: true once a retry has already happened, or when the error class
# name mentions "Selenium" or "Watir".
#
# @param klass [Class, #to_s] the class of the rescued error
# @return [Boolean] truthy when the error must be re-raised
def raise_before_retry?(klass)
  return @already_retried if @already_retried

  class_name = klass.to_s
  %w[Selenium Watir].any? { |marker| class_name.include?(marker) }
end
reset_for_retry()
click to toggle source
# File lib/arachnid2/watir.rb, line 93
#
# Tears down the browser, the headless display and the driver, clears the
# cached @driver / @browser so they are rebuilt on next use, and records
# that the single allowed retry has been used.
#
# Every teardown step is best-effort: a failure while closing one resource
# must not prevent the others from being released.
#
# @return [true]
def reset_for_retry
  @browser.close if @browser rescue nil
  @headless.destroy if @headless rescue nil
  # Guard on @driver itself. The guard used to test @headless, so the
  # driver was never explicitly quit in non-headless mode.
  @driver.quit if @driver rescue nil
  @driver = nil
  @browser = nil
  @already_retried = true
end
set_timeout()
click to toggle source
# File lib/arachnid2/watir.rb, line 120
#
# Applies the value of #timeout as the page-load timeout of the browser's
# underlying driver.
def set_timeout
  timeouts = @browser.driver.manage.timeouts
  timeouts.page_load = timeout
end
time_to_stop?()
click to toggle source
# File lib/arachnid2/watir.rb, line 81
#
# Reports whether the crawl should end: the number of visited URLs has
# reached crawl_options[:max_urls], the current time is past
# crawl_options[:time_limit], or #memory_danger? is truthy.
#
# @return [Boolean] truthy when crawling must stop
def time_to_stop?
  return true if @global_visited.size >= crawl_options[:max_urls]
  return true if Time.now > crawl_options[:time_limit]

  memory_danger?
end
watir_preflight()
click to toggle source
# File lib/arachnid2/watir.rb, line 148
#
# Watir-specific setup run before crawling: copies the proxy settings via
# #watir_proxy_options and records the :headless option in @make_headless.
def watir_preflight
  watir_proxy_options

  headless_requested = @options[:headless]
  @make_headless = headless_requested
end
watir_proxy_options()
click to toggle source
# File lib/arachnid2/watir.rb, line 153
#
# Resets crawl_options[:proxy] to an empty hash and copies the :http and
# :ssl proxy settings from @options into it, skipping any that are absent.
def watir_proxy_options
  crawl_options[:proxy] = {}

  %i[http ssl].each do |scheme|
    next unless @options.dig(:proxy, scheme)

    crawl_options[:proxy][scheme] = @options[:proxy][scheme]
  end
end