class GruCrawler
Constants
- VERSION
Public Class Methods
new(rules)
click to toggle source
# File lib/grucrawler.rb, line 12
# Builds a crawler driven by +rules+.
#
# +rules+ must respond to +options+ (a hash-like object read with symbol
# keys) and +on_init+. The URL queue is named after the class of +rules+,
# and is given the :visit_urls_only_once option plus the :domain_wait
# option, which falls back to 20 when unset.
#
# @param rules [Object] the crawl rules / callback object
def initialize(rules)
  @crawler = rules
  @options = @crawler.options

  wait = @options[:domain_wait] || 20
  queue_name = @crawler.class.name
  @queue = GruCrawler::Queue.new(queue_name, @options[:visit_urls_only_once], wait)

  # Let the rules object see the crawler (e.g. to seed URLs) once the queue exists.
  @crawler.on_init(self)
end
Public Instance Methods
add_from_queue()
click to toggle source
# File lib/grucrawler.rb, line 32
# Takes the next URL from the queue and schedules a request for it on the
# hydra created by #run.
#
# @return [Boolean] false when the queue had no URL to offer, true once a
#   request has been scheduled
def add_from_queue
  url = @queue.next_url
  return false unless url

  request = Typhoeus::Request.new(
    url,
    followlocation: @options[:follow_redirects],
    accept_encoding: 'gzip'
  )

  # The URL is marked as started before the request is handed to the hydra.
  @queue.started(url)
  request.on_complete { |response| on_response(response) }

  @crawler.debug("#{Time.now} started URL #{url}")
  @hydra.queue(request)
  true
end
add_url(url)
click to toggle source
# File lib/grucrawler.rb, line 28
# Pushes +url+ onto the crawl queue.
#
# @param url [String] the URL to enqueue
# @return whatever the queue's +push+ returns; #queue_links treats a truthy
#   value as "the URL was added"
def add_url(url)
  @queue.push(url)
end
crawl_more()
click to toggle source
# File lib/grucrawler.rb, line 77
# Keeps scheduling queued URLs while the queue's count is below the
# configured concurrency, stopping early as soon as #add_from_queue reports
# that nothing could be scheduled.
def crawl_more
  while @queue.count < @concurrency
    scheduled = add_from_queue
    break unless scheduled
  end
end
on_response(response)
click to toggle source
# File lib/grucrawler.rb, line 53
# Completion callback for a single request.
#
# Marks the URL as finished, tops up the request pool, and (unless the page
# is larger than the :max_page_size option, default 1_000_000_000) parses
# the body, hands it to the rules object, queues the links found on the
# page and tops up the pool again.
#
# A +nil+ body is treated as an empty page instead of raising
# NoMethodError. Errors raised by +on_page_received+ are passed to the
# rules object's +log_error+ and do not abort link queuing.
#
# @param response [#request, #body] the completed response
# @return [void]
def on_response(response)
  url = response.request.url
  @crawler.debug("#{Time.now} ended URL #{url}")
  @queue.finished(url)

  # crawl_more runs before the size check, so it also runs when the
  # oversized-page branch below returns early.
  crawl_more

  body = response.body || ''
  size_limit = @options[:max_page_size] || 1_000_000_000
  if body.length > size_limit
    @crawler.debug("URL response size too big: #{body.length} from #{url}")
    return
  end

  nokogiri = Nokogiri::HTML(body)
  begin
    @crawler.on_page_received(response, nokogiri)
  rescue StandardError => e
    @crawler.log_error(response, e)
  end

  queue_links(response, nokogiri)
  crawl_more
end
queue_links(response, nokogiri)
click to toggle source
# File lib/grucrawler.rb, line 83
# Queues every link on a parsed page that the rules object wants followed.
#
# Each <a> element's href is resolved against the response's effective URL;
# anchors without an href, and hrefs that fail to resolve, are skipped.
#
# @param response [#effective_url] the response the page came from
# @param nokogiri [#css] the parsed document
def queue_links(response, nokogiri)
  nokogiri.css('a').each do |anchor|
    href = anchor['href']
    next unless href

    target =
      begin
        URI.join(response.effective_url, href).to_s
      rescue StandardError
        # Any failure while resolving the href just skips this link.
        next
      end

    next unless @crawler.follow_link(target, response, nokogiri)

    @crawler.debug("#{Time.now} queued #{target}") if add_url(target)
  end
end
reset()
click to toggle source
# File lib/grucrawler.rb, line 49
# Delegates to the queue's +reset+ and returns its result.
def reset
  @queue.reset
end
run()
click to toggle source
# File lib/grucrawler.rb, line 21
# Starts the crawl: creates a fresh hydra, reads the :concurrency option
# (5 when unset), schedules the first batch of requests and then runs the
# hydra.
#
# @return whatever the hydra's +run+ returns
def run
  @hydra = Typhoeus::Hydra.new
  @concurrency = @options[:concurrency] || 5

  crawl_more
  @hydra.run
end