class Arachnid2::Typhoeus

Public Class Methods

new(url) click to toggle source
# File lib/arachnid2/typhoeus.rb, line 6
# Sets up a crawler for +url+.
#
# Stores the URL, the result of looking it up through +Adomain+
# (presumably its domain -- confirm against the Adomain gem), and an empty
# list for cached data.
#
# @param url [String] the URL the crawl starts from
def initialize(url)
  @domain = Adomain[url]
  @url = url
  @cached_data = []
end

Public Instance Methods

crawl(opts = {}) click to toggle source
# File lib/arachnid2/typhoeus.rb, line 12
# Crawls the URLs in +@global_queue+, handing each usable response to the
# caller's block.
#
# Every pass of the outer loop shifts up to +max_concurrency+ URLs off the
# queue. Each URL is either served from the cache or turned into a Typhoeus
# request queued on +@hydra+; the hydra is then run. The cookie tempfile is
# closed and unlinked on the way out, including when an error is raised.
#
# The block is captured explicitly: a bare +Proc.new+ (which used to adopt
# the method's block implicitly) is deprecated in Ruby 2.7 and raises
# ArgumentError on Ruby 3.0+.
#
# @param opts [Hash] crawl options, passed to +preflight+ and +use_cache+
# @yield [response] forwarded to +use_cache+ and +after_request+, which
#   invoke it for responses that get used
# @return [nil]
def crawl(opts = {}, &block)
  preflight(opts)
  typhoeus_preflight

  until @global_queue.empty?
    max_concurrency.times do
      q = @global_queue.shift

      # The queue can run dry part-way through a batch; stop filling this
      # batch instead of visiting and requesting a nil URL.
      break if q.nil?
      break if time_to_stop?

      @global_visited.insert(q)

      # NOTE(review): a cache hit ends the whole crawl rather than skipping
      # to the next URL. Kept as-is; confirm whether `next` was intended.
      found_in_cache = use_cache(q, opts, &block)
      return if found_in_cache

      request = ::Typhoeus::Request.new(q, request_options)
      requestable = after_request(request, &block)
      @hydra.queue(request) if requestable
    end

    @hydra.run
  end
ensure
  @cookie_file.close! if @cookie_file
end

Private Instance Methods

after_request(request) click to toggle source
# File lib/arachnid2/typhoeus.rb, line 38
# Registers an +on_complete+ handler on +request+. The handler passes the
# finished response to +use_response+ and, when that reports the response
# as used, stores it via +put_cached_data+ keyed by the effective URL and
# +@options+.
#
# @param request [#on_complete] the request to instrument (a
#   ::Typhoeus::Request when called from +crawl+)
# @yield [response] forwarded to +use_response+
# @return [true] always, which the caller treats as "queue this request"
def after_request(request, &block)
  request.on_complete do |response|
    cacheable = use_response(response, &block)

    # This handler fires after after_request has already returned, so a
    # `return` here would raise LocalJumpError; `next` only leaves the block.
    next unless cacheable

    put_cached_data(response.effective_url, @options, response)
  end

  true
end
followlocation() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 85
# Whether requests should follow redirects. Memoized in +@followlocation+.
#
# Only an explicit +false+ in +@options[:followlocation]+ disables it; any
# other value, including a missing key, yields +true+.
#
# The result is a real boolean on every call. Previously the first call
# returned +nil+ when the option was +false+, because the trailing
# modifier-+unless+ was the method's last expression.
#
# @return [Boolean]
def followlocation
  return @followlocation unless @followlocation.nil?

  @followlocation = @options[:followlocation] != false
end
max_concurrency() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 77
# Number of URLs handled per batch, memoized in +@max_concurrency+.
#
# The value of +@options[:max_concurrency]+ is stringified and parsed with
# +to_i+; anything that does not parse to a positive integer becomes 1.
#
# @return [Integer] always at least 1
def max_concurrency
  @max_concurrency ||= begin
    requested = @options[:max_concurrency].to_s.to_i
    requested.positive? ? requested : 1
  end
end
request_options() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 92
# Builds the options hash passed to each ::Typhoeus::Request and stores it
# in +@request_options+.
#
# A single cookie Tempfile is created on first use and reused afterwards;
# its path serves as both the cookie file and the cookie jar. The entries of
# +crawl_options[:proxy]+ are merged on top of the base options. Default
# Accept-Language and User-Agent headers are added only when missing.
#
# Note: when +@options[:headers]+ is present, the defaults are written into
# that same hash object.
#
# @return [Hash] the request options
def request_options
  @cookie_file ||= Tempfile.new('cookies')
  cookie_path = @cookie_file.path

  base_options = {
    timeout: timeout,
    followlocation: followlocation,
    cookiefile: cookie_path,
    cookiejar: cookie_path,
    headers: @options[:headers]
  }
  @request_options = base_options.merge(crawl_options[:proxy])

  headers = (@request_options[:headers] ||= {})
  headers['Accept-Language'] ||= DEFAULT_LANGUAGE
  headers['User-Agent'] ||= DEFAULT_USER_AGENT

  @request_options
end
time_to_stop?() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 66
# Decides whether the crawl should stop taking new URLs.
#
# The checks run in order and stop at the first that holds: the visited set
# has reached +crawl_options[:max_urls]+, the current time is past
# +crawl_options[:time_limit]+, or +memory_danger?+ reports a problem.
#
# @return [Boolean] truthy when the crawl should stop; when neither limit
#   is hit, this is whatever +memory_danger?+ returns
def time_to_stop?
  return true if @global_visited.size >= crawl_options[:max_urls]
  return true if Time.now > crawl_options[:time_limit]

  memory_danger?
end
typhoeus_preflight() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 72
# Typhoeus-specific setup run before crawling: creates the hydra in
# +@hydra+ with the configured concurrency limit, then fills in the proxy
# settings via +typhoeus_proxy_options+.
#
# @return [Object] whatever +typhoeus_proxy_options+ returns
def typhoeus_preflight
  @hydra = ::Typhoeus::Hydra.new(max_concurrency: max_concurrency)

  typhoeus_proxy_options
end
typhoeus_proxy_options() click to toggle source
# File lib/arachnid2/typhoeus.rb, line 110
# Resets +crawl_options[:proxy]+ to a fresh hash and fills it from
# +@options[:proxy]+:
#
# * +:proxy+ becomes "ip:port" when an +:ip+ is given
# * +:proxyuserpwd+ becomes "username:password" when a +:username+ is given
#
# @return [String, nil] the proxyuserpwd string when a username is
#   configured, otherwise nil
def typhoeus_proxy_options
  settings = {}
  crawl_options[:proxy] = settings

  if @options.dig(:proxy, :ip)
    settings[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}"
  end

  if @options.dig(:proxy, :username)
    settings[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}"
  end
end
use_cache(url, options) click to toggle source
# File lib/arachnid2/typhoeus.rb, line 59
# Looks +url+ up via +load_data+ and, on a hit, runs the cached data
# through +use_response+ as if it were a live response.
#
# The block is captured explicitly: a bare +Proc.new+ without a block is
# deprecated in Ruby 2.7 and raises ArgumentError on Ruby 3.0+.
#
# @param url [String] the URL to look up
# @param options [Hash] crawl options, passed to +load_data+
# @yield [response] forwarded to +use_response+ on a cache hit
# @return [Object, nil] the cached data, or a falsy value on a miss
def use_cache(url, options, &block)
  data = load_data(url, options)
  use_response(data, &block) if data

  data
end
use_response(response) { |response| ... } click to toggle source
# File lib/arachnid2/typhoeus.rb, line 49
# Runs a response through +process+. When that returns links, yields the
# response to the caller's block and then hands the links to +vacuum+
# together with the response's effective URL.
#
# @param response [#effective_url, #body] a live or cached response
# @yield [response] called only when +process+ returned links
# @return [true, nil] true when the response was used, nil otherwise
def use_response(response)
  found_links = process(response.effective_url, response.body)

  if found_links
    yield response

    vacuum(found_links, response.effective_url)
    true
  end
end