class Anemone::Core
Constants
- DEFAULT_OPTS
Attributes
opts
Hash of options for the crawl
Public Class Methods
Convenience method to start a new crawl
# File lib/anemone/core.rb, line 89
def self.crawl(urls, opts = {})
  self.new(urls, opts) do |core|
    yield core if block_given?
    core.run
  end
end
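A minimal usage sketch of the convenience method; the URL and option values here are illustrative only:

  Anemone::Core.crawl("http://www.example.com", :threads => 4, :verbose => true) do |anemone|
    anemone.on_every_page do |page|
      puts page.url
    end
  end

The block runs before core.run is called, so any handlers registered inside it are in place for the whole crawl.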
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block
# File lib/anemone/core.rb, line 72
def initialize(urls, opts = {})
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end
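As the listing shows, starting URLs may be given as Strings or URI objects, singly or in an Array, and an empty path is normalized to '/'. A sketch of direct construction (URLs are illustrative):

  urls = ["http://example.com", URI("http://example.org/docs")]
  core = Anemone::Core.new(urls) do |a|
    a.skip_links_like(/\.pdf$/i)
  end
  core.run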
Public Instance Methods
Add a block to be executed on the PageStore after the crawl is finished.
# File lib/anemone/core.rb, line 100
def after_crawl(&block)
  @after_crawl_blocks << block
  self
end
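A small sketch; per do_after_crawl_blocks below, the block is called with the crawl's PageStore:

  core.after_crawl do |pages|
    # `pages` is the PageStore built up during the crawl
    puts "crawl finished"
  end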
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
# File lib/anemone/core.rb, line 140
def focus_crawl(&block)
  @focus_crawl_block = block
  self
end
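An illustrative filter, assuming only that page.links returns URI objects as described above:

  core.focus_crawl do |page|
    # follow only links that stay on the same host as the current page
    page.links.select { |uri| uri.host == page.url.host }
  end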
Add a block to be executed on every Page as it is encountered during the crawl.
# File lib/anemone/core.rb, line 118
def on_every_page(&block)
  @on_every_page_blocks << block
  self
end
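Because the method returns self, registrations can be chained. A minimal sketch that records visited URLs:

  visited = []
  core.on_every_page { |page| visited << page.url.to_s }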
Add a block to be executed on Page objects with a URL matching one or more patterns.
# File lib/anemone/core.rb, line 127
def on_pages_like(*patterns, &block)
  if patterns
    patterns.each do |pattern|
      @on_pages_like_blocks[pattern] << block
    end
  end
  self
end
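Patterns are matched against page.url.to_s (see do_page_blocks below). An illustrative sketch with made-up path patterns:

  core.on_pages_like(/\/articles\//, /\.html$/) do |page|
    puts "matched page: #{page.url}"
  end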
Perform the crawl
# File lib/anemone/core.rb, line 148
def run
  process_options

  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  @urls.each{ |url| link_queue.enq(url) }

  loop do
    page = page_queue.deq
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      if page_queue.empty?
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end
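The options consulted here (:threads, :verbose, :discard_page_bodies) can be passed straight through the crawl convenience method. A sketch with an illustrative URL and option values:

  Anemone::Core.crawl("http://example.com", :threads => 2,
                                            :verbose => true,
                                            :discard_page_bodies => true)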
Add one or more Regex patterns for URLs which should not be followed.
# File lib/anemone/core.rb, line 109
def skip_links_like(*patterns)
  @skip_link_patterns.concat [patterns].flatten.compact
  self
end
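Patterns are matched against the link's path (see skip_link? below). An illustrative sketch:

  core.skip_links_like(/\.(jpg|png|gif)$/i, /\/private\//)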
Private Instance Methods
Returns true if we are obeying robots.txt and the link is granted access in it. Always returns true when we are not obeying robots.txt.
# File lib/anemone/core.rb, line 267
def allowed(link)
  @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
rescue
  false
end
Execute the after_crawl blocks.
# File lib/anemone/core.rb, line 220
def do_after_crawl_blocks
  @after_crawl_blocks.each { |block| block.call(@pages) }
end
Execute the on_every_page and on_pages_like blocks for the given page.
# File lib/anemone/core.rb, line 227
def do_page_blocks(page)
  @on_every_page_blocks.each do |block|
    block.call(page)
  end

  @on_pages_like_blocks.each do |pattern, blocks|
    blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
  end
end
Freeze the opts Hash so that no options can be modified once the crawl begins.
# File lib/anemone/core.rb, line 211
def freeze_options
  @opts.freeze
  @opts.each_key { |key| @opts[key].freeze }
  @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
end
Returns an Array of links to follow from the given page, based on whether each link has already been crawled and on the block given to focus_crawl().
# File lib/anemone/core.rb, line 242
def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
  links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
end
# File lib/anemone/core.rb, line 197
def process_options
  @opts = DEFAULT_OPTS.merge @opts
  @opts[:threads] = 1 if @opts[:delay] > 0
  storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
  @pages = PageStore.new(storage)
  @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

  freeze_options
end
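Note that any user-supplied options are merged over DEFAULT_OPTS, and a positive :delay forces the crawl down to a single thread regardless of :threads. An illustrative sketch (URL and values are made up):

  # :delay > 0 overrides :threads, so this crawl runs single-threaded
  Anemone::Core.crawl("http://example.com", :delay => 1, :threads => 10)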
Returns true if link should not be visited because its URL matches a skip_link pattern.
# File lib/anemone/core.rb, line 297
def skip_link?(link)
  @skip_link_patterns.any? { |pattern| link.path =~ pattern }
end
Returns true if link should not be visited because it has a query string and skip_query_strings is true.
# File lib/anemone/core.rb, line 289
def skip_query_string?(link)
  @opts[:skip_query_strings] && link.query
end
Returns true if we are over the page depth limit. This only applies when coming from a page and when the depth_limit option is set; otherwise it always returns false.
# File lib/anemone/core.rb, line 277
def too_deep?(from_page)
  if from_page && @opts[:depth_limit]
    from_page.depth >= @opts[:depth_limit]
  else
    false
  end
end
Returns true if link has not been visited already, is not excluded by a skip_link pattern, is not excluded by robots.txt, and is not deeper than the depth limit. Returns false otherwise.
# File lib/anemone/core.rb, line 254
def visit_link?(link, from_page = nil)
  !@pages.has_page?(link) &&
  !skip_link?(link) &&
  !skip_query_string?(link) &&
  allowed(link) &&
  !too_deep?(from_page)
end