class GruCrawler::Queue
Constants
- DOMAIN_VISITS_KEY
- QUEUE_KEY
- VISITED_ALREADY_KEY
Public Class Methods
new(namespace, visit_once, domain_wait)
click to toggle source
# File lib/grucrawler/queue.rb, line 10
# Sets up the queue state and opens a Redis connection with default options.
#
# @param namespace [String] key namespace; ':' is appended and the result is
#   stored as the prefix for Redis keys
# @param visit_once [Object] stored as @visit_once (truthy/falsy flag)
# @param domain_wait [Numeric] stored as @domain_wait
def initialize(namespace, visit_once, domain_wait)
  @redis = Redis.new
  @rns = namespace + ':'

  # Caller-supplied settings.
  @visit_once = visit_once
  @domain_wait = domain_wait

  # In-process bookkeeping, all starting out empty.
  @concurrent_requests = 0
  @tmp_block = {}
  @domains_throttle = Hash.new(0.0)
end
Public Instance Methods
can_visit_now(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 45
# Decides whether +url+ may be fetched at this moment.
#
# A URL present in @tmp_block is never eligible. Otherwise the URL is
# eligible once strictly more than @domain_wait seconds separate the current
# wall-clock time from the last recorded visit to its domain.
#
# @param url [String]
# @return [Boolean]
def can_visit_now(url)
  if @tmp_block[url]
    false
  else
    elapsed = Time.now.to_f - last_visit_to_domain(url)
    elapsed > @domain_wait
  end
end
count()
click to toggle source
# File lib/grucrawler/queue.rb, line 67
# Reports the current value of the concurrent-request counter.
#
# @return [Integer] the value held in @concurrent_requests
def count
  @concurrent_requests
end
domain(url)
click to toggle source
TODO: use PublicSuffix to determine the domain instead of a regular expression.
# File lib/grucrawler/queue.rb, line 106
# Derives a domain key from +url+: the last two dot-separated labels of the
# host, lower-cased, with a leading "www." removed first.
#
# This is a heuristic only. Hosts under multi-label public suffixes (for
# example "example.co.uk") collapse to the suffix ("co.uk").
#
# @param url [String] URL to inspect
# @return [String, nil] e.g. "example.com"; nil when the URL cannot be
#   parsed, has no host, or the host has fewer than two labels
def domain(url)
  host = begin
    URI.parse(url).host
  rescue URI::InvalidURIError
    nil
  end
  return nil if host.nil?

  host = host.downcase
  host = host[4..-1] if host.start_with?('www.')

  # String#[] with a Regexp returns the matched text (or nil), so callers get
  # a plain String rather than a MatchData. Hyphens are accepted inside labels
  # so "my-site.com" is not truncated to "site.com".
  host[/[\w-]+\.[\w-]+\z/]
end
finished(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 60
# Records that the request for +url+ is over.
#
# The URL is dropped from the in-flight table, marked as visited, removed
# from the queue, and the concurrent-request counter is decremented.
# A nil +url+ only affects the in-flight table and the counter: neither
# Redis-backed step is attempted, so nil never reaches the store.
#
# @param url [String, nil]
# @return [Integer] the decremented counter
def finished(url)
  @tmp_block.delete(url)

  if url
    set_visited_already(url)
    remove_url_from_queue(url)
  end

  @concurrent_requests -= 1
end
last_visit_to_domain(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 76
# Looks up the stored timestamp of the last visit to the domain of +url+.
#
# The value is read from the Redis hash at DOMAIN_VISITS_KEY (namespaced) and
# converted with #to_f, so a missing entry yields 0.0.
#
# @param url [String]
# @return [Float]
def last_visit_to_domain(url)
  visits_key = @rns + DOMAIN_VISITS_KEY
  stored = @redis.hget(visits_key, domain(url))
  stored.to_f
end
next_url()
click to toggle source
# File lib/grucrawler/queue.rb, line 26
# Picks a URL to crawl next and marks it as in flight.
#
# Up to 100 random draws are made from the queue. A draw is skipped when the
# URL was already visited or cannot be visited right now. The first eligible
# URL is recorded in @tmp_block and returned.
#
# Nothing is recorded in @tmp_block when no URL is returned, and an empty
# queue (nil draw) ends the search immediately instead of being polled for
# every remaining attempt.
#
# @return [String, nil] the chosen URL, or nil when none is available
def next_url
  100.times do
    url = random_url_from_queue
    return nil if url.nil? # queue is empty; further draws cannot succeed

    next if visited_already(url) || !can_visit_now(url)

    @tmp_block[url] = true
    return url
  end

  nil
end
push(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 89
# Adds +url+ to the queue set.
#
# Depending on the redis-rb version, a single-member SADD replies with either
# a Boolean or an Integer count; both shapes are accepted here so that the
# result does not silently become false on Boolean-returning clients.
#
# @param url [String]
# @return [Boolean] true when the URL was newly added, false when it was
#   already in the set
def push(url)
  added = @redis.sadd(@rns + QUEUE_KEY, url)
  added == true || added == 1
end
random_url_from_queue()
click to toggle source
# File lib/grucrawler/queue.rb, line 85
# Asks Redis (SRANDMEMBER) for one member of the namespaced queue set.
# The member is not removed by this call.
#
# @return [Object] whatever the Redis client returns for SRANDMEMBER
def random_url_from_queue
  queue_key = @rns + QUEUE_KEY
  @redis.srandmember(queue_key)
end
remove_url_from_queue(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 81
# Removes +url+ from the namespaced queue set (SREM).
#
# @param url [String]
# @return [Object] whatever the Redis client returns for SREM
def remove_url_from_queue(url)
  queue_key = @rns + QUEUE_KEY
  @redis.srem(queue_key, url)
end
reset()
click to toggle source
# File lib/grucrawler/queue.rb, line 20
# Deletes the three namespaced Redis keys used by this queue: the per-domain
# visit hash, the queue set and the visited-already set, in that order.
#
# @return [Object] the reply of the last DEL call
def reset
  replies = [DOMAIN_VISITS_KEY, QUEUE_KEY, VISITED_ALREADY_KEY].map do |suffix|
    @redis.del(@rns + suffix)
  end
  replies.last
end
set_last_visit_to_domain(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 71
# Stores the current wall-clock time (Float seconds) as the last visit to the
# domain of +url+, in the namespaced Redis hash at DOMAIN_VISITS_KEY.
#
# @param url [String]
# @return [Object] whatever the Redis client returns for HSET
def set_last_visit_to_domain(url)
  now = Time.now.to_f
  visits_key = @rns + DOMAIN_VISITS_KEY
  @redis.hset(visits_key, domain(url), now)
end
set_visited_already(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 99
# Adds +url+ to the namespaced visited-already set, but only when the queue
# was configured with a truthy visit_once flag; otherwise does nothing.
#
# @param url [String]
# @return [Object, nil] the SADD reply, or nil when @visit_once is falsy
def set_visited_already(url)
  @redis.sadd(@rns + VISITED_ALREADY_KEY, url) if @visit_once
end
started(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 54
# Records that a request for +url+ has begun: stamps the visit time for the
# URL's domain, then bumps the concurrent-request counter.
#
# @param url [String]
# @return [Integer] the incremented counter
def started(url)
  set_last_visit_to_domain(url)
  @concurrent_requests = @concurrent_requests + 1
end
visited_already(url)
click to toggle source
# File lib/grucrawler/queue.rb, line 94
# Tells whether +url+ is in the namespaced visited-already set.
# Always false when the queue was configured with a falsy visit_once flag;
# Redis is not consulted in that case.
#
# @param url [String]
# @return [Object] false, or the SISMEMBER reply from the Redis client
def visited_already(url)
  if @visit_once
    @redis.sismember(@rns + VISITED_ALREADY_KEY, url)
  else
    false
  end
end