class GruCrawler::Queue

Constants

DOMAIN_VISITS_KEY
QUEUE_KEY
VISITED_ALREADY_KEY

Public Class Methods

new(namespace, visit_once, domain_wait) click to toggle source
# File lib/grucrawler/queue.rb, line 10
# Sets up a queue whose Redis keys are all prefixed with "<namespace>:".
#
# namespace::   prefix for every Redis key this queue touches
# visit_once::  when truthy, finished URLs are recorded and never handed
#               out again (see set_visited_already / visited_already)
# domain_wait:: threshold compared against the seconds elapsed since the
#               last visit to a URL's domain (see can_visit_now)
def initialize(namespace, visit_once, domain_wait)
  # Connection and key prefix.
  @redis = Redis.new
  @rns = namespace + ':'

  # Crawl policy supplied by the caller.
  @visit_once = visit_once

  # In-process bookkeeping; none of this is stored in Redis.
  @concurrent_requests = 0           # started-but-not-finished requests
  @tmp_block = {}                    # URLs handed out and still in flight
  @domains_throttle = Hash.new(0.0)  # not read by any method visible here

  @domain_wait = domain_wait
end

Public Instance Methods

can_visit_now(url) click to toggle source
# File lib/grucrawler/queue.rb, line 45
# Tells whether +url+ may be fetched right now.
#
# A URL that is currently marked in @tmp_block is never eligible.
# Otherwise the URL is eligible once more than @domain_wait seconds have
# elapsed since the timestamp returned by last_visit_to_domain.
def can_visit_now(url)
  if @tmp_block[url]
    false
  else
    previous_visit = last_visit_to_domain(url)
    elapsed = Time.now.to_f - previous_visit
    elapsed > @domain_wait
  end
end
count() click to toggle source
# File lib/grucrawler/queue.rb, line 67
# Returns the in-process counter of requests in flight.
#
# The counter is incremented by +started+ and decremented by +finished+,
# so this is NOT the number of URLs stored in the Redis queue.
def count
  @concurrent_requests
end
domain(url) click to toggle source

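TODO: use the PublicSuffix list to determine the registrable domain instead of the two-label regular-expression heuristic.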

# File lib/grucrawler/queue.rb, line 106
# Reduces +url+ to the last two dot-separated labels of its host, e.g.
# "http://www.blog.example.com/x" => "example.com".
#
# The host is lower-cased and a leading "www." is dropped first. Labels
# may contain word characters and hyphens, so "my-site.com" is kept whole
# rather than being truncated to "site.com".
#
# This is a heuristic: it does not know about multi-label public suffixes
# such as "co.uk".
#
# Returns a String, or nil when the URL cannot be parsed, has no host, or
# the host has fewer than two labels (e.g. "localhost").
def domain(url)
  host = URI.parse(url).host
  return nil if host.nil?

  host = host.downcase.sub(/\Awww\./, '')

  # String#[] with a Regexp yields the matched text itself (or nil), so
  # callers get a plain String rather than a MatchData object.
  host[/[\w-]+\.[\w-]+\z/]
rescue URI::InvalidURIError
  nil
end
finished(url) click to toggle source
# File lib/grucrawler/queue.rb, line 60
# Book-keeping for a request that has completed.
#
# Lifts the in-flight mark for +url+, hands it to set_visited_already,
# removes it from the queue (skipped when +url+ is nil or false) and
# lowers the in-flight counter by one.
#
# Returns the counter value after the decrement.
def finished(url)
  @tmp_block.delete(url)
  set_visited_already(url)

  if url
    remove_url_from_queue(url)
  end

  @concurrent_requests = @concurrent_requests - 1
end
last_visit_to_domain(url) click to toggle source
# File lib/grucrawler/queue.rb, line 76
# Looks up the value stored for the domain of +url+ in the namespaced
# DOMAIN_VISITS_KEY hash and returns it as a Float.
#
# A missing entry (nil reply) converts to 0.0.
def last_visit_to_domain(url)
  visits_key = @rns + DOMAIN_VISITS_KEY
  stored = @redis.hget(visits_key, domain(url))
  stored.to_f
end
next_url() click to toggle source
# File lib/grucrawler/queue.rb, line 26
# Picks a URL that may be crawled right now and marks it as in flight.
#
# Up to 100 random members are drawn from the queue; the first one that
# has not been visited already and passes can_visit_now is recorded in
# @tmp_block and returned.
#
# Returns nil when the queue is empty or when none of the sampled URLs is
# eligible. In those cases nothing is written to @tmp_block (in
# particular, no nil key is ever inserted).
def next_url
  100.times do
    candidate = random_url_from_queue

    # A nil draw means the queue is empty; further draws cannot succeed.
    return nil if candidate.nil?

    next if visited_already(candidate) || !can_visit_now(candidate)

    @tmp_block[candidate] = true
    return candidate
  end

  nil
end
push(url) click to toggle source
# File lib/grucrawler/queue.rb, line 89
# Adds +url+ to the namespaced queue set.
#
# Returns true when the URL was newly added and false otherwise.
#
# The SADD reply is accepted both as a Boolean and as an Integer count:
# some redis-rb versions convert the single-member reply to true/false,
# in which case comparing against 1 alone would always yield false.
def push(url)
  reply = @redis.sadd(@rns + QUEUE_KEY, url)
  reply == true || reply == 1
end
random_url_from_queue() click to toggle source
# File lib/grucrawler/queue.rb, line 85
# Asks Redis for one random member of the namespaced queue set and
# returns the reply unchanged. The member is not removed by this call.
def random_url_from_queue
  queue_key = @rns + QUEUE_KEY
  @redis.srandmember(queue_key)
end
remove_url_from_queue(url) click to toggle source
# File lib/grucrawler/queue.rb, line 81
# Removes +url+ from the namespaced queue set and returns the Redis reply
# unchanged.
def remove_url_from_queue(url)
  queue_key = @rns + QUEUE_KEY
  @redis.srem(queue_key, url)
end
reset() click to toggle source
# File lib/grucrawler/queue.rb, line 20
# Deletes the three namespaced Redis keys used by this queue: the
# per-domain visit hash, the queue set and the visited-already set,
# in that order.
#
# Returns the reply of the last delete.
def reset
  replies = [DOMAIN_VISITS_KEY, QUEUE_KEY, VISITED_ALREADY_KEY].map do |suffix|
    @redis.del(@rns + suffix)
  end

  replies.last
end
set_last_visit_to_domain(url) click to toggle source
# File lib/grucrawler/queue.rb, line 71
# Stores the current wall-clock time (Float seconds) under the domain of
# +url+ in the namespaced DOMAIN_VISITS_KEY hash.
#
# Returns the Redis reply unchanged.
def set_last_visit_to_domain(url)
  now = Time.now.to_f
  visits_key = @rns + DOMAIN_VISITS_KEY
  field = domain(url)
  @redis.hset(visits_key, field, now)
end
set_visited_already(url) click to toggle source
# File lib/grucrawler/queue.rb, line 99
# Records +url+ in the namespaced visited-already set, but only when the
# queue was created with a truthy visit_once flag.
#
# Returns the Redis reply, or nil when visit_once is off.
def set_visited_already(url)
  @redis.sadd(@rns + VISITED_ALREADY_KEY, url) if @visit_once
end
started(url) click to toggle source
# File lib/grucrawler/queue.rb, line 54
# Book-keeping for a request that is about to begin: stamps the domain of
# +url+ with the current time and raises the in-flight counter by one.
#
# Returns the counter value after the increment.
def started(url)
  set_last_visit_to_domain(url)
  @concurrent_requests = @concurrent_requests + 1
end
visited_already(url) click to toggle source
# File lib/grucrawler/queue.rb, line 94
# Tells whether +url+ is a member of the namespaced visited-already set.
#
# Always false (without consulting Redis) when the queue was created with
# a falsy visit_once flag; otherwise the Redis reply is returned as is.
def visited_already(url)
  if @visit_once
    @redis.sismember(@rns + VISITED_ALREADY_KEY, url)
  else
    false
  end
end