class Spider::VisitQueue
Constants
- IterationExit
Attributes
robot_txt[RW]
visit_count[RW]
Public Class Methods
new(robots = nil, agent = nil, finish = nil)
click to toggle source
# File lib/queue.rb, line 16
# Sets up an empty visit queue.
#
# robots - when truthy, handed together with +agent+ to ExclusionParser.new;
#          the resulting parser is kept in @robot_txt. When falsy, no parser
#          is created.
# agent  - second argument forwarded to ExclusionParser.new.
# finish - stored as @finalize; visit_each invokes it via #call once
#          iteration ends (if it is set).
def initialize(robots = nil, agent = nil, finish = nil)
  if robots
    @robot_txt = ExclusionParser.new(robots, agent)
  end

  @pending = []
  @visit_count = 0
  @finalize = finish
  clear_visited
end
Public Instance Methods
clear_visited()
click to toggle source
# File lib/queue.rb, line 64
# Forgets every URL recorded as visited by replacing @visited with a
# brand-new Bloomer instance.
#
# capacity            - first argument given to Bloomer.new (default 10_000).
#                       NOTE(review): presumably the expected number of
#                       entries the filter is sized for - confirm against the
#                       Bloomer gem.
# false_positive_rate - second argument given to Bloomer.new (default 0.001).
#                       NOTE(review): presumably the target false-positive
#                       probability - confirm against the Bloomer gem.
#
# Calling it with no arguments behaves exactly as before; the parameters
# only exist so larger crawls can size the filter differently.
#
# Returns the new filter object.
def clear_visited(capacity = 10_000, false_positive_rate = 0.001)
  @visited = Bloomer.new(capacity, false_positive_rate)
end
empty?()
click to toggle source
# File lib/queue.rb, line 56
# Reports whether the pending list currently holds no URLs.
#
# Returns true when @pending has zero entries, false otherwise.
def empty?
  @pending.size.zero?
end
mark(urls)
click to toggle source
# File lib/queue.rb, line 47
# Records one URL or an Array of URLs as already visited, without
# touching the pending list or the visit counter.
#
# urls - a single value, or an Array of values; each is passed to
#        @visited.add.
#
# Returns the Array that was iterated (the argument itself when an Array
# was given, otherwise a one-element Array wrapping it).
def mark(urls)
  batch = urls.is_a?(Array) ? urls : [urls]
  batch.each do |entry|
    @visited.add(entry)
  end
end
push_back(urls)
click to toggle source
# File lib/queue.rb, line 43
# Queues URLs at the far end of the line: entries are inserted at index 0
# of @pending, while visit_each removes entries with Array#pop, so these
# are reached only after everything already queued.
#
# urls - a single URL or an Array of URLs; filtering is delegated to add_url.
#
# Returns whatever add_url returns.
def push_back(urls)
  add_url(urls) do |candidate|
    @pending.insert(0, candidate)
  end
end
push_front(urls)
click to toggle source
# File lib/queue.rb, line 39
# Queues URLs at the head of the line: entries are appended to the end of
# @pending, which is the end visit_each removes from with Array#pop, so
# the last URL pushed here is the next one visited.
#
# urls - a single URL or an Array of URLs; filtering is delegated to add_url.
#
# Returns whatever add_url returns.
def push_front(urls)
  add_url(urls) do |candidate|
    @pending << candidate
  end
end
size()
click to toggle source
# File lib/queue.rb, line 52
# Number of URLs currently waiting in the pending list.
#
# Returns an Integer.
def size
  @pending.length
end
stop()
click to toggle source
# File lib/queue.rb, line 60
# Aborts the traversal in progress by raising IterationExit.
#
# visit_each rescues IterationExit around its loop, so calling this from
# inside the block given to visit_each ends the loop early.
#
# Raises IterationExit - always.
def stop
  raise(IterationExit)
end
url_okay(url)
click to toggle source
# File lib/queue.rb, line 68
# Decides whether a URL may be visited.
#
# url - the candidate URL.
#
# Returns false when @visited already includes the URL, or when a
# @robot_txt object is present and its excluded? call is truthy for the
# URL; returns true otherwise. The exclusion check is not consulted for
# URLs that are already marked visited.
def url_okay(url)
  if @visited.include?(url)
    false
  else
    !(@robot_txt && @robot_txt.excluded?(url))
  end
end
visit_each() { |clone| ... }
click to toggle source
# File lib/queue.rb, line 24
# Drains the pending list, handing each acceptable URL to the caller.
#
# URLs are removed from the end of @pending one at a time. A URL that
# url_okay rejects is dropped silently. For every accepted URL a clone is
# yielded (when a block was given), after which the URL is added to
# @visited and @visit_count is incremented.
#
# An IterationExit raised while the loop runs (see stop) ends the loop
# early; the URL being processed at that moment has already been removed
# from @pending and is neither marked visited nor counted.
#
# Once the loop finishes or is stopped, @finalize is called if it is set.
# Exceptions other than IterationExit propagate and skip that call.
#
# Yields a clone of each accepted URL.
# Returns the result of @finalize.call, or nil when @finalize is not set.
def visit_each
  begin
    until @pending.empty?
      candidate = @pending.pop
      if url_okay(candidate)
        yield candidate.clone if block_given?
        @visited.add(candidate)
        @visit_count += 1
      end
    end
  rescue IterationExit
    # Deliberate early exit requested through stop; nothing to clean up here.
  end

  if @finalize
    @finalize.call
  end
end
Private Instance Methods
add_url(urls) { |url| ... }
click to toggle source
# File lib/queue.rb, line 76
# Shared filter behind push_back and push_front: yields every URL that is
# worth queueing and leaves the actual insertion to the caller's block.
#
# urls - a single URL or an Array of URLs. nil entries are ignored.
#        The argument is never modified: a compacted copy is iterated, so
#        callers may reuse (or freeze) the Array they pass in.
#
# A URL is yielded only if it is neither in @visited nor already present
# in @pending. Because the check runs per element, a URL repeated inside
# one batch is yielded once provided the block inserts it into @pending.
#
# Yields each URL that passed the checks.
# Returns the Array of non-nil candidates that was examined.
def add_url(urls)
  # Non-destructive compact: compact! would strip nils out of the caller's
  # own Array and raise FrozenError on a frozen one.
  candidates = (urls.is_a?(Array) ? urls : [urls]).compact

  candidates.each do |url|
    next if @visited.include?(url) || @pending.include?(url)

    yield url
  end
end