class Spider::VisitQueue

Constants

IterationExit

Attributes

robot_txt[RW]
visit_count[RW]

Public Class Methods

new(robots = nil, agent = nil, finish = nil) click to toggle source
# File lib/queue.rb, line 16
# Builds an empty visit queue.
#
# robots - robots.txt data handed to ExclusionParser.new; when nil/false no
#          exclusion parser is created and no robots filtering is applied.
# agent  - second argument forwarded to ExclusionParser.new (presumably the
#          crawler's user-agent string; confirm against ExclusionParser).
# finish - optional callable invoked once at the end of visit_each.
def initialize(robots = nil, agent = nil, finish = nil)
  @pending = []
  @visit_count = 0
  @finalize = finish

  # Only build the exclusion parser when robots data was actually supplied.
  if robots
    @robot_txt = ExclusionParser.new(robots, agent)
  end

  # Start with a fresh, empty "already visited" filter.
  clear_visited
end

Public Instance Methods

clear_visited() click to toggle source
# File lib/queue.rb, line 64
# Discards the record of visited URLs by installing a brand-new filter.
#
# capacity            - first argument passed to Bloomer.new (the expected
#                       number of entries, per the Bloomer gem's docs).
#                       Defaults to 10_000, the previously hard-coded value.
# false_positive_rate - second argument passed to Bloomer.new (the target
#                       false-positive probability). Defaults to 0.001.
#
# Callers that pass no arguments get exactly the same filter as before; larger
# crawls can now size the filter to match the number of URLs they expect.
#
# Returns the newly created filter.
def clear_visited(capacity = 10_000, false_positive_rate = 0.001)
  @visited = Bloomer.new(capacity, false_positive_rate)
end
empty?() click to toggle source
# File lib/queue.rb, line 56
# Reports whether there are no URLs left waiting to be visited.
#
# Returns true when the pending list is empty, false otherwise.
def empty?
  @pending.empty?
end
mark(urls) click to toggle source
# File lib/queue.rb, line 47
# Records one or more URLs as already visited without visiting them.
#
# urls - a single URL or an Array of URLs; a non-Array value is treated as a
#        one-element list.
#
# Returns the Array of URLs that was iterated.
def mark(urls)
  batch = urls.is_a?(Array) ? urls : [urls]
  batch.each do |url|
    @visited.add(url)
  end
end
push_back(urls) click to toggle source
# File lib/queue.rb, line 43
# Queues URLs at the back of the line.
#
# Each URL accepted by add_url is inserted at index 0 of the pending list.
# visit_each pops from the opposite end, so these URLs are visited after
# everything already pending.
#
# urls - a single URL or an Array of URLs.
def push_back(urls)
  add_url(urls) do |url|
    @pending.unshift(url)
  end
end
push_front(urls) click to toggle source
# File lib/queue.rb, line 39
# Queues URLs at the front of the line.
#
# Each URL accepted by add_url is appended to the end of the pending list,
# which is the end visit_each pops from, so these URLs are visited before
# anything already pending.
#
# urls - a single URL or an Array of URLs.
def push_front(urls)
  add_url(urls) do |url|
    @pending.push(url)
  end
end
size() click to toggle source
# File lib/queue.rb, line 52
# Number of URLs currently waiting to be visited.
#
# Returns an Integer count of the pending list.
def size
  @pending.length
end
stop() click to toggle source
# File lib/queue.rb, line 60
# Aborts the current iteration by raising IterationExit.
#
# visit_each rescues IterationExit, so calling this from inside the block
# given to visit_each ends the loop early; the finalizer (if any) still runs.
#
# Raises IterationExit always.
def stop
  raise IterationExit
end
url_okay(url) click to toggle source
# File lib/queue.rb, line 68
# Decides whether a URL may be visited.
#
# A URL is rejected when the visited filter already contains it, or when an
# exclusion parser is configured and reports the URL as excluded.
#
# url - the URL to check.
#
# Returns true when the URL may be visited, false otherwise.
def url_okay(url)
  already_visited = @visited.include?(url)
  return false if already_visited

  # Without an exclusion parser nothing is forbidden.
  robots_forbid = @robot_txt && @robot_txt.excluded?(url)
  !robots_forbid
end
visit_each() { |clone| ... } click to toggle source
# File lib/queue.rb, line 24
# Drains the pending list, yielding each acceptable URL to the caller.
#
# URLs are taken from the end of the pending list. A URL that url_okay
# rejects is dropped silently. For each accepted URL a clone is yielded (when
# a block is given), then the URL is recorded as visited and the visit
# counter is incremented.
#
# Raising IterationExit from the block (see #stop) ends the loop early; the
# URL being processed at that moment is neither recorded nor counted. After
# the loop ends, normally or via IterationExit, the finalizer passed to the
# constructor is called if one was given.
def visit_each
  begin
    until @pending.empty?
      candidate = @pending.pop
      if url_okay(candidate)
        # Hand out a copy so the block cannot alter the URL we record below.
        yield candidate.clone if block_given?
        @visited.add(candidate)
        @visit_count += 1
      end
    end
  rescue IterationExit
    # Deliberate early exit requested by the caller; nothing to clean up.
  end

  return unless @finalize

  @finalize.call
end

Private Instance Methods

add_url(urls) { |url| ... } click to toggle source
# File lib/queue.rb, line 76
# Yields every URL from +urls+ that is neither already visited nor already
# pending, so the caller's block can decide where to insert it.
#
# urls - a single URL or an Array of URLs; nil entries are ignored.
#
# The argument is never modified: nil entries are dropped from a copy rather
# than with compact!, so the caller's array keeps its contents and frozen
# arrays are accepted.
#
# Yields each URL that passes both checks, in the order given. The pending
# check is made per URL, so a duplicate inside one batch is skipped once the
# block has inserted the first occurrence into the pending list.
#
# Returns the Array of non-nil candidate URLs that was examined.
def add_url(urls)
  candidates = urls.is_a?(Array) ? urls.compact : [urls].compact

  candidates.each do |url|
    next if @visited.include?(url) || @pending.include?(url)

    yield url
  end
end