class GruCrawler

Constants

VERSION

Public Class Methods

new(rules)
# File lib/grucrawler.rb, line 12
# Builds a crawler around the given rules object.
#
# Stores the rules and their options, creates the URL queue (named after
# the rules' class), and finally passes this crawler to the rules'
# +on_init+ hook.
#
# rules - object that responds to +options+ and +on_init+.
def initialize(rules)
  @crawler = rules
  @options = @crawler.options

  # :domain_wait falls back to 20 when the option is missing or nil.
  wait = @options[:domain_wait] || 20
  queue_name = @crawler.class.name
  visit_once = @options[:visit_urls_only_once]
  @queue = GruCrawler::Queue.new(queue_name, visit_once, wait)

  @crawler.on_init(self)
end

Public Instance Methods

add_from_queue()
# File lib/grucrawler.rb, line 32
# Takes the next URL from the queue and schedules a request for it on
# the hydra.
#
# The request's completion callback forwards the response to
# +on_response+. The queue is told the URL has started before the
# request is handed to the hydra.
#
# Returns false when the queue yields no URL, true once a request has
# been queued.
def add_from_queue
  next_url = @queue.next_url
  return false unless next_url

  request = Typhoeus::Request.new(
    next_url,
    followlocation: @options[:follow_redirects],
    accept_encoding: 'gzip'
  )
  @queue.started(next_url)

  request.on_complete { |response| on_response(response) }

  @crawler.debug("#{Time.now} started URL #{next_url}")
  @hydra.queue(request)

  true
end
add_url(url)
# File lib/grucrawler.rb, line 28
# Adds a URL to the crawl by pushing it onto the crawler's queue.
#
# url - the URL to enqueue.
#
# Returns whatever the queue's +push+ returns.
def add_url(url)
  @queue.push(url)
end
crawl_more()
# File lib/grucrawler.rb, line 77
# Keeps calling +add_from_queue+ for as long as the queue's +count+ is
# below the configured concurrency.
#
# Stops early as soon as +add_from_queue+ returns a falsy value.
#
# Returns nil.
def crawl_more
  while @queue.count < @concurrency
    added = add_from_queue
    return unless added
  end
end
on_response(response)
# File lib/grucrawler.rb, line 53
# Handles a completed request.
#
# Marks the URL as finished on the queue and calls +crawl_more+ before
# doing any page processing. Responses whose body length exceeds
# :max_page_size (default 1_000_000_000, measured with String#length)
# are logged and skipped. Otherwise the body is parsed with Nokogiri,
# passed to the rules' +on_page_received+, its links are queued, and
# +crawl_more+ is called again.
#
# response - the completed response; must respond to +request+ (with a
#            +url+) and +body+.
def on_response(response)
  url = response.request.url
  @crawler.debug("#{Time.now} ended URL #{url}")
  @queue.finished(url)

  crawl_more

  body = response.body
  size_limit = @options[:max_page_size] || 1_000_000_000
  if body.length > size_limit
    @crawler.debug("URL response size too big: #{body.length} from #{url}")
    return
  end

  document = Nokogiri::HTML(body)

  # An error raised by the page handler is reported through log_error
  # and does not stop link queueing below.
  begin
    @crawler.on_page_received(response, document)
  rescue StandardError => e
    @crawler.log_error(response, e)
  end

  queue_links(response, document)

  crawl_more
end
reset()
# File lib/grucrawler.rb, line 49
# Resets the crawler's queue by delegating to the queue's own +reset+.
#
# Returns whatever the queue's +reset+ returns.
def reset
  @queue.reset
end
run()
# File lib/grucrawler.rb, line 21
# Starts the crawl.
#
# Creates a fresh Typhoeus hydra, reads the :concurrency option
# (falling back to 5 when it is missing or nil), seeds the hydra through
# +crawl_more+, and then runs the hydra.
#
# Returns whatever the hydra's +run+ returns.
def run
  @hydra = Typhoeus::Hydra.new
  @concurrency = @options[:concurrency] || 5
  crawl_more
  @hydra.run
end