class Anemone::Core

Constants

DEFAULT_OPTS

Attributes

opts [R]

Hash of options for the crawl

pages [R]

PageStore storing all Page objects encountered during the crawl

Public Class Methods

crawl(urls, opts = {}) { |core| ... }

Convenience method to start a new crawl

# File lib/anemone/core.rb, line 89
def self.crawl(urls, opts = {})
  self.new(urls, opts) do |core|
    yield core if block_given?
    core.run
  end
end
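For example, this convenience method can configure and start a crawl in one call. A minimal sketch (the URL and the printed output are illustrative):

require 'anemone'

Anemone::Core.crawl("http://www.example.com/", :threads => 2) do |core|
  core.on_every_page { |page| puts page.url }  # print each page as it is fetched
end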
new(urls, opts = {}) { |self| ... }

Initialize the crawl with the starting urls (a single URL or an Array of URLs) and an optional block

# File lib/anemone/core.rb, line 72
def initialize(urls, opts = {})
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end
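When the crawl should not start immediately, the crawler can be constructed first and run later. A sketch, with illustrative starting URLs:

core = Anemone::Core.new(["http://www.example.com/", "http://www.example.org/"])
core.on_every_page { |page| puts page.url }
core.run   # start the crawl explicitly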

Public Instance Methods

after_crawl(&block)

Add a block to be executed on the PageStore after the crawl is finished

# File lib/anemone/core.rb, line 100
def after_crawl(&block)
  @after_crawl_blocks << block
  self
end
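A sketch of an after_crawl block that reports how many pages were stored; it assumes the yielded PageStore responds to size, as it does in Anemone (core is an Anemone::Core instance, e.g. the one yielded by crawl):

core.after_crawl do |pages|
  puts "Crawled #{pages.size} pages"   # pages is the PageStore
end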
focus_crawl(&block)

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.

# File lib/anemone/core.rb, line 140
def focus_crawl(&block)
  @focus_crawl_block = block
  self
end
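For instance, a focus_crawl block can limit how many links are followed from each page. A sketch, assuming Page#links returns the URIs found on the page, as it does in Anemone:

core.focus_crawl do |page|
  page.links.first(5)   # follow at most five links per page; must return an Array of URIs
end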
on_every_page(&block)

Add a block to be executed on every Page as it is encountered during the crawl

# File lib/anemone/core.rb, line 118
def on_every_page(&block)
  @on_every_page_blocks << block
  self
end
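A sketch of an on_every_page block that logs each response; it assumes Page#code holds the HTTP status code, as it does in Anemone:

core.on_every_page do |page|
  puts "#{page.code} #{page.url}"
end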
on_pages_like(*patterns, &block)

Add a block to be executed on Page objects with a URL matching one or more patterns

# File lib/anemone/core.rb, line 127
def on_pages_like(*patterns, &block)
  if patterns
    patterns.each do |pattern|
      @on_pages_like_blocks[pattern] << block
    end
  end
  self
end
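Patterns are matched against the page URL as a string (see do_page_blocks below), so Regexp patterns are the usual choice. A sketch with illustrative URL paths:

core.on_pages_like(%r{/articles/\d+}, %r{/posts/\d+}) do |page|
  puts "Matched article or post: #{page.url}"
end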
run()

Perform the crawl

# File lib/anemone/core.rb, line 148
def run
  process_options

  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  @urls.each{ |url| link_queue.enq(url) }

  loop do
    page = page_queue.deq
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      if page_queue.empty?
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end
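The options consumed here are passed in at construction time. A sketch of a multi-threaded, verbose crawl that discards page bodies to reduce memory use (the URL is illustrative):

Anemone::Core.crawl("http://www.example.com/",
                    :threads => 4,
                    :verbose => true,
                    :discard_page_bodies => true)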

Private Instance Methods

allowed(link)

Returns true if we are obeying robots.txt and the link is granted access in it. Always returns true when we are not obeying robots.txt.

# File lib/anemone/core.rb, line 267
def allowed(link)
  @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
rescue
  false
end
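Robots.txt checking is enabled with the :obey_robots_txt option. A sketch:

# Links disallowed by the site's robots.txt will not be visited
Anemone::Core.crawl("http://www.example.com/", :obey_robots_txt => true)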
do_after_crawl_blocks()

Execute the after_crawl blocks

# File lib/anemone/core.rb, line 220
def do_after_crawl_blocks
  @after_crawl_blocks.each { |block| block.call(@pages) }
end
do_page_blocks(page)

Execute the on_every_page and on_pages_like blocks for page

# File lib/anemone/core.rb, line 227
def do_page_blocks(page)
  @on_every_page_blocks.each do |block|
    block.call(page)
  end

  @on_pages_like_blocks.each do |pattern, blocks|
    blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
  end
end
freeze_options()

Freeze the opts Hash so that no options can be modified once the crawl begins

# File lib/anemone/core.rb, line 211
def freeze_options
  @opts.freeze
  @opts.each_key { |key| @opts[key].freeze }
  @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
end
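As a consequence, modifying an option from inside a page block raises an error once the crawl has started (the exact exception class depends on the Ruby version). A sketch, where core is the Core instance yielded by crawl:

core.on_every_page do |page|
  core.opts[:delay] = 5   # raises: can't modify frozen Hash
end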
process_options()
# File lib/anemone/core.rb, line 197
def process_options
  @opts = DEFAULT_OPTS.merge @opts
  @opts[:threads] = 1 if @opts[:delay] > 0
  storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
  @pages = PageStore.new(storage)
  @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

  freeze_options
end
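The :storage option selects the backing store for the PageStore; when it is omitted, pages are kept in an in-memory Hash. A sketch, assuming the PStore adapter provided by Anemone::Storage (the file name is illustrative):

Anemone::Core.crawl("http://www.example.com/",
                    :storage => Anemone::Storage.PStore("crawl.pstore"))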
skip_query_string?(link)

Returns true if link should not be visited because it has a query string and skip_query_strings is true.

# File lib/anemone/core.rb, line 289
def skip_query_string?(link)
  @opts[:skip_query_strings] && link.query
end
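A sketch showing the option that enables this check:

# URLs such as http://www.example.com/list?page=2 will not be visited
Anemone::Core.crawl("http://www.example.com/", :skip_query_strings => true)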
too_deep?(from_page)

Returns true if we are over the page depth limit. This check only applies when coming from a page and when the depth_limit option is set; if either is missing, it always returns false.

# File lib/anemone/core.rb, line 277
def too_deep?(from_page)
  if from_page && @opts[:depth_limit]
    from_page.depth >= @opts[:depth_limit]
  else
    false
  end
end
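A sketch showing the option this check relies on; pages more than three links away from the starting URLs are not followed:

Anemone::Core.crawl("http://www.example.com/", :depth_limit => 3)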