class Spidr::Agent

Attributes

authorized[RW]

HTTP Authentication credentials

@return [AuthStore]

cookies[R]

Cached cookies

@return [CookieJar]

default_headers[R]

HTTP Headers to use for every request

@return [Hash{String => String}]

@since 0.6.0

delay[RW]

Delay in between fetching pages

@return [Integer]

failures[R]

List of unreachable URLs

@return [Set<URI::HTTP>]

history[R]

History containing visited URLs

@return [Set<URI::HTTP>]

host_header[RW]

HTTP `Host` header to use

@return [String]

host_headers[R]

HTTP `Host` Headers to use for specific hosts

@return [Hash{String,Regexp => String}]

levels[R]

The visited URLs and their depth within a site

@return [Hash{URI::HTTP => Integer}]

limit[R]

Maximum number of pages to visit.

@return [Integer]

max_depth[R]

Maximum depth

@return [Integer]

pending_urls[R]

Queue of URLs to visit

@return [Array<URI::HTTP>]

queue[R]

Queue of URLs to visit

@return [Array<URI::HTTP>]

referer[RW]

Referer to use

@return [String]

schemes[R]

List of acceptable URL schemes to follow

sessions[R]

The session cache

@return [SessionCache]

@since 0.6.0

strip_fragments[RW]

Specifies whether the Agent will strip URI fragments

strip_query[RW]

Specifies whether the Agent will strip URI queries

visited_urls[R]

History containing visited URLs

@return [Set<URI::HTTP>]

Public Class Methods

domain(name,**kwargs,&block) click to toggle source

Creates a new agent and spiders the entire domain.

@param [String] name

The top-level domain to spider.

@param [Hash{Symbol => Object}] kwargs

Additional keyword arguments. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@return [Agent]

The created agent object.

@see initialize

@since 0.7.0

# File lib/spidr/agent.rb, line 418
# Creates a new agent and spiders the entire domain.
#
# @param [String] name
#   The top-level domain to spider.
#
# @param [Hash{Symbol => Object}] kwargs
#   Additional keyword arguments, forwarded to `new`.
#
# @yield [agent]
#   The block, if given, is forwarded to `new`.
#
# @return [Agent]
#   The created agent object.
def self.domain(name,**kwargs,&block)
  # Anchor with \A and \z (whole string) instead of ^ and $ (per line), so
  # a multi-line value cannot match merely because one of its lines does.
  domain_pattern = /(\A|\.)#{Regexp.escape(name)}\z/

  agent = new(host: domain_pattern, **kwargs, &block)
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
  return agent
end
host(name,**kwargs,&block) click to toggle source

Creates a new agent and spiders the given host.

@param [String] name

The host-name to spider.

@param [Hash{Symbol => Object}] kwargs

Additional keyword arguments. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@return [Agent]

The created agent object.

@see initialize

# File lib/spidr/agent.rb, line 389
# Builds an agent restricted to the given host and starts it at the
# root path (`/`) of that host.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash{Symbol => Object}] kwargs
#   Extra keyword arguments, forwarded to `new`.
#
# @return [Agent]
#   The agent that was created.
def self.host(name,**kwargs,&block)
  new(host: name, **kwargs, &block).tap do |spider|
    spider.start_at(URI::HTTP.build(host: name, path: '/'))
  end
end
new( host_header: nil, host_headers: {}, default_headers: {}, user_agent: Spidr.user_agent, referer: nil, proxy: Spidr.proxy, open_timeout: Spidr.open_timeout, ssl_timeout: Spidr.ssl_timeout, read_timeout: Spidr.read_timeout, continue_timeout: Spidr.continue_timeout, keep_alive_timeout: Spidr.keep_alive_timeout, delay: 0, limit: nil, max_depth: nil, queue: nil, history: nil, strip_fragments: true, strip_query: false, schemes: self.class.default_schemes, host: nil, hosts: nil, ignore_hosts: nil, ports: nil, ignore_ports: nil, links: nil, ignore_links: nil, urls: nil, ignore_urls: nil, exts: nil, ignore_exts: nil, robots: Spidr.robots?) { |self| ... } click to toggle source

Creates a new Agent object.

@param [String, nil] host_header

The HTTP `Host` header to use with each request.

@param [Hash{String,Regexp => String}] host_headers

The HTTP `Host` headers to use for specific hosts.

@param [Hash{String => String}] default_headers

Default headers to set for every request.

@param [String, nil] user_agent

The `User-Agent` string to send with each request.

@param [String, nil] referer

The `Referer` URL to send with each request.

@param [Integer, nil] open_timeout

Optional open connection timeout.

@param [Integer, nil] read_timeout

Optional read timeout.

@param [Integer, nil] ssl_timeout

Optional SSL connection timeout.

@param [Integer, nil] continue_timeout

Optional continue timeout.

@param [Integer, nil] keep_alive_timeout

Optional `Keep-Alive` timeout.

@param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy

The proxy information to use.

@option proxy [String] :host

The host the proxy is running on.

@option proxy [Integer] :port (8080)

The port the proxy is running on.

@option proxy [String, nil] :user

The user to authenticate as with the proxy.

@option proxy [String, nil] :password

The password to authenticate with.

@param [Integer] delay

The number of seconds to pause between each request.

@param [Integer, nil] limit

The maximum number of pages to visit.

@param [Integer, nil] max_depth

The maximum link depth to follow.

@param [Set, Array, nil] queue

The initial queue of URLs to visit.

@param [Set, Array, nil] history

The initial list of visited URLs.

@param [Boolean] strip_fragments

Controls whether to strip the fragment components from the URLs.

@param [Boolean] strip_query

Controls whether to strip the query components from the URLs.

@param [Array<String>] schemes

The list of acceptable URI schemes to visit.
The `https` scheme will be ignored if `net/https` cannot be loaded.

@param [String] host

The host-name to visit.

@param [Array<String, Regexp, Proc>] hosts

The patterns which match the host-names to visit.

@param [Array<String, Regexp, Proc>] ignore_hosts

The patterns which match the host-names to not visit.

@param [Array<Integer, Regexp, Proc>] ports

The patterns which match the ports to visit.

@param [Array<Integer, Regexp, Proc>] ignore_ports

The patterns which match the ports to not visit.

@param [Array<String, Regexp, Proc>] links

The patterns which match the links to visit.

@param [Array<String, Regexp, Proc>] ignore_links

The patterns which match the links to not visit.

@param [Array<String, Regexp, Proc>] urls

The patterns which match the URLs to visit.

@param [Array<String, Regexp, Proc>] ignore_urls

The patterns which match the URLs to not visit.

@param [Array<String, Regexp, Proc>] exts

The patterns which match the URI path extensions to visit.

@param [Array<String, Regexp, Proc>] ignore_exts

The patterns which match the URI path extensions to not visit.

@param [Boolean] robots

Specifies whether `robots.txt` should be honored.

@yield [agent]

If a block is given, it will be passed the newly created agent
for further configuration.

@yieldparam [Agent] agent

The newly created agent.
# File lib/spidr/agent.rb, line 214
# Creates a new Agent object.
#
# Stores the header settings, builds the session cache, cookie jar and
# auth store, resets the crawl state, then sets up the sanitizers,
# filters, actions, events and (optionally) robots support before
# yielding the agent to the given block.
#
# @yield [agent]
#   If a block is given, it is passed the agent once setup is complete.
#
# @yieldparam [Agent] agent
#   The newly created agent.
def initialize(# header keyword arguments
               host_header:        nil,
               host_headers:       {},
               default_headers:    {},
               user_agent:         Spidr.user_agent,
               referer:            nil,
               # session cache keyword arguments
               proxy:              Spidr.proxy,
               open_timeout:       Spidr.open_timeout,
               ssl_timeout:        Spidr.ssl_timeout,
               read_timeout:       Spidr.read_timeout,
               continue_timeout:   Spidr.continue_timeout,
               keep_alive_timeout: Spidr.keep_alive_timeout,
               # spidering controls keyword arguments
               delay:     0,
               limit:     nil,
               max_depth: nil,
               # history keyword arguments
               queue:   nil,
               history: nil,
               # sanitizer keyword arguments
               strip_fragments: true,
               strip_query:     false,
               # filtering keyword arguments
               schemes:      self.class.default_schemes,
               host:         nil,
               hosts:        nil,
               ignore_hosts: nil,
               ports:        nil,
               ignore_ports: nil,
               links:        nil,
               ignore_links: nil,
               urls:         nil,
               ignore_urls:  nil,
               exts:         nil,
               ignore_exts:  nil,
               # robots keyword arguments
               robots:       Spidr.robots?)
  # request header settings
  @host_header  = host_header
  @host_headers = host_headers

  @default_headers = default_headers

  @user_agent = user_agent
  @referer    = referer

  # all connection-level options are handed to the session cache
  @sessions   = SessionCache.new(
    proxy:              proxy,
    open_timeout:       open_timeout,
    ssl_timeout:        ssl_timeout,
    read_timeout:       read_timeout,
    continue_timeout:   continue_timeout,
    keep_alive_timeout: keep_alive_timeout
  )
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  # start from an empty crawl state; the containers must exist before the
  # queue/history writers below are called, since those clear and refill them
  @running  = false
  @delay    = delay
  @history  = Set[]
  @failures = Set[]
  @queue    = []

  @limit     = limit
  # URLs without a recorded depth default to level 0
  @levels    = Hash.new(0)
  @max_depth = max_depth

  # go through the writers so every entry is coerced with URI()
  self.queue   = queue   if queue
  self.history = history if history

  initialize_sanitizers(
    strip_fragments: strip_fragments,
    strip_query:     strip_query
  )

  initialize_filters(
    schemes:      schemes,
    host:         host,
    hosts:        hosts,
    ignore_hosts: ignore_hosts,
    ports:        ports,
    ignore_ports: ignore_ports,
    links:        links,
    ignore_links: ignore_links,
    urls:         urls,
    ignore_urls:  ignore_urls,
    exts:         exts,
    ignore_exts:  ignore_exts
  )
  initialize_actions
  initialize_events

  # initialize_robots raises ArgumentError when the Robots constant is
  # not defined, so it is only called when robots support was requested
  initialize_robots if robots

  # the block runs last, once the agent is fully set up
  yield self if block_given?
end
site(url,**kwargs,&block) click to toggle source

Creates a new agent and spiders the web-site located at the given URL.

@param [URI::HTTP, String] url

The web-site to spider.

@param [Hash{Symbol => Object}] kwargs

Additional keyword arguments. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@return [Agent]

The created agent object.

@see initialize

# File lib/spidr/agent.rb, line 360
# Builds an agent restricted to the host of the given URL and starts
# spidering at that URL.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash{Symbol => Object}] kwargs
#   Extra keyword arguments, forwarded to `new`.
#
# @return [Agent]
#   The agent that was created.
def self.site(url,**kwargs,&block)
  site_uri = URI(url)

  new(host: site_uri.host, **kwargs, &block).tap do |spider|
    spider.start_at(site_uri)
  end
end
start_at(url,**kwargs,&block) click to toggle source

Creates a new agent and begins spidering at the given URL.

@param [URI::HTTP, String] url

The URL to start spidering at.

@param [Hash{Symbol => Object}] kwargs

Additional keyword arguments. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@return [Agent]

The created agent object.

@see initialize @see start_at

# File lib/spidr/agent.rb, line 333
# Builds an agent from the given keyword arguments and starts
# spidering at the given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Hash{Symbol => Object}] kwargs
#   Extra keyword arguments, forwarded to `new`.
#
# @return [Agent]
#   The agent that was created.
def self.start_at(url,**kwargs,&block)
  new(**kwargs,&block).tap { |spider| spider.start_at(url) }
end

Protected Class Methods

default_schemes() click to toggle source

Determines the default URI schemes to follow.

@return [Array<String>]

The default URI schemes to follow.

@since 0.6.2

# File lib/spidr/agent/filters.rb, line 429
# Determines the default URI schemes to follow.
#
# `http` is always included; `https` is added only when `net/https`
# can be required.
#
# @return [Array<String>]
#   The default URI schemes to follow.
def self.default_schemes
  supported = ['http']

  begin
    require 'net/https'
  rescue Gem::LoadError
    # gem activation problems are not swallowed
    raise
  rescue ::LoadError
    warn "Warning: cannot load 'net/https', https support disabled"
  else
    supported << 'https'
  end

  supported
end

Public Instance Methods

all_headers() { |headers| ... } click to toggle source

Pass the headers from every response the agent receives to a given block.

@yield [headers]

The block will be passed the headers of every response.

@yieldparam [Hash] headers

The headers from a response.
# File lib/spidr/agent/events.rb, line 70
# Pass the headers from every response the agent receives to a given block.
#
# The registered page callback does nothing when no block was given,
# matching the `block_given?` guard used by the other `every_*` helpers
# (an unguarded `yield` would raise LocalJumpError when the callback is
# eventually invoked).
#
# @yield [headers]
#   The block will be passed the headers of every response.
#
# @yieldparam [Hash] headers
#   The headers from a response.
def all_headers
  every_page do |page|
    yield page.headers if block_given?
  end
end
clear() click to toggle source

Clears the queue, history and failures of the agent.

# File lib/spidr/agent.rb, line 458
# Empties the queue, the history and the list of failures, in place.
#
# @return [Agent]
#   The agent itself.
def clear
  [@queue, @history, @failures].each(&:clear)
  self
end
continue!(&block) click to toggle source

Continue spidering.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

The page to be visited.
# File lib/spidr/agent/actions.rb, line 42
# Un-pauses the agent and resumes spidering.
#
# @yield [page]
#   The block, if given, is forwarded to `run`.
#
# @return
#   Whatever `run` returns.
def continue!(&block)
  @paused = false
  run(&block)
end
enqueue(url,level=0) click to toggle source

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.

@param [URI::HTTP, String] url

The URL to enqueue for visiting.

@return [Boolean]

Specifies whether the URL was enqueued, or ignored.
# File lib/spidr/agent.rb, line 658
# Enqueues a given URL for visiting, only if it is not already queued
# and `visit?` accepts it.
#
# The `every_url` callbacks, and the `every_url_like` callbacks whose
# pattern matches, are called with the URL before it is added.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting.
#
# @param [Integer] level
#   The depth recorded for the URL in `@levels`.
#
# @return [Boolean]
#   Specifies whether the URL was enqueued, or ignored.
def enqueue(url,level=0)
  url = sanitize_url(url)

  # nothing to do for URLs already in the queue or refused by visit?
  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern,callbacks|
      matched = case pattern
                when Regexp then link =~ pattern
                else             (pattern == link) || (pattern == url)
                end
      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused
    # a pause request must reach the caller
    raise
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action is ignored; the URL is still enqueued
  end

  @queue << url
  @levels[url] = level
  true
end
every_atom_doc() { |doc| ... } click to toggle source

Pass every Atom document that the agent parses to a given block.

@yield [doc]

The block will be passed every Atom document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 389
# Registers a page callback that yields the parsed document of every
# Atom page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every Atom document parsed.
def every_atom_doc
  every_page do |visited|
    next unless block_given? && visited.atom?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_atom_page() { |page| ... } click to toggle source

Pass every Atom feed that the agent visits to a given block.

@yield [feed]

The block will be passed every Atom feed visited.

@yieldparam [Page] feed

A visited page.
# File lib/spidr/agent/events.rb, line 453
# Registers a page callback that yields only pages for which `atom?`
# is true.
#
# @yield [feed]
#   The block will be passed every Atom feed visited.
def every_atom_page
  every_page do |visited|
    next unless block_given? && visited.atom?

    yield visited
  end
end
every_bad_request_page() { |page| ... } click to toggle source

Pass every Bad Request page that the agent visits to a given block.

@yield [page]

The block will be passed every Bad Request page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 142
# Registers a page callback that yields only pages for which
# `bad_request?` is true.
#
# @yield [page]
#   The block will be passed every Bad Request page visited.
def every_bad_request_page
  every_page do |visited|
    next unless block_given? && visited.bad_request?

    yield visited
  end
end
every_css_page() { |page| ... } click to toggle source

Pass every CSS page that the agent visits to a given block.

@yield [page]

The block will be passed every CSS page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 423
# Registers a page callback that yields only pages for which `css?`
# is true.
#
# @yield [page]
#   The block will be passed every CSS page visited.
def every_css_page
  every_page do |visited|
    next unless block_given? && visited.css?

    yield visited
  end
end
every_doc() { |doc| ... } click to toggle source

Pass every HTML or XML document that the agent parses to a given block.

@yield [doc]

The block will be passed every HTML or XML document parsed.

@yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc

A parsed HTML or XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html @see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html

# File lib/spidr/agent/events.rb, line 283
# Registers a page callback that yields the parsed document of every
# page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every HTML or XML document parsed.
def every_doc
  every_page do |visited|
    next unless block_given?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_failed_url(&block) click to toggle source

Pass each URL that could not be requested to the given block.

@yield [url]

The block will be passed every URL that could not be requested.

@yieldparam [URI::HTTP] url

A failed URL.
# File lib/spidr/agent/events.rb, line 28
# Pass each URL that could not be requested to the given block.
#
# A call without a block is ignored: storing `nil` in the callback list
# would leave an entry that cannot be called.
#
# @yield [url]
#   The block will be passed every URL that could not be requested.
#
# @yieldparam [URI::HTTP] url
#   A failed URL.
#
# @return [Agent]
#   The agent itself.
def every_failed_url(&block)
  @every_failed_url_blocks << block if block
  return self
end
every_forbidden_page() { |page| ... } click to toggle source

Pass every Forbidden page that the agent visits to a given block.

@yield [page]

The block will be passed every Forbidden page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 172
# Registers a page callback that yields only pages for which
# `forbidden?` is true.
#
# @yield [page]
#   The block will be passed every Forbidden page visited.
def every_forbidden_page
  every_page do |visited|
    next unless block_given? && visited.forbidden?

    yield visited
  end
end
every_html_doc() { |doc| ... } click to toggle source

Pass every HTML document that the agent parses to a given block.

@yield [doc]

The block will be passed every HTML document parsed.

@yieldparam [Nokogiri::HTML::Document] doc

A parsed HTML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html

# File lib/spidr/agent/events.rb, line 304
# Registers a page callback that yields the parsed document of every
# HTML page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every HTML document parsed.
def every_html_doc
  every_page do |visited|
    next unless block_given? && visited.html?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_html_page() { |page| ... } click to toggle source

Pass every HTML page that the agent visits to a given block.

@yield [page]

The block will be passed every HTML page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 233
# Registers a page callback that yields only pages for which `html?`
# is true.
#
# @yield [page]
#   The block will be passed every HTML page visited.
def every_html_page
  every_page do |visited|
    next unless block_given? && visited.html?

    yield visited
  end
end
every_internal_server_error_page() { |page| ... } click to toggle source

Pass every Internal Server Error page that the agent visits to a given block.

@yield [page]

The block will be passed every Internal Server Error page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 203
# Registers a page callback that yields only pages for which
# `had_internal_server_error?` is true.
#
# @yield [page]
#   The block will be passed every Internal Server Error page visited.
def every_internal_server_error_page
  every_page do |visited|
    next unless block_given? && visited.had_internal_server_error?

    yield visited
  end
end
every_javascript_page() { |page| ... } click to toggle source

Pass every JavaScript page that the agent visits to a given block.

@yield [page]

The block will be passed every JavaScript page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 408
# Registers a page callback that yields only pages for which
# `javascript?` is true.
#
# @yield [page]
#   The block will be passed every JavaScript page visited.
def every_javascript_page
  every_page do |visited|
    next unless block_given? && visited.javascript?

    yield visited
  end
end
every_missing_page() { |page| ... } click to toggle source

Pass every Missing page that the agent visits to a given block.

@yield [page]

The block will be passed every Missing page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 187
# Registers a page callback that yields only pages for which
# `missing?` is true.
#
# @yield [page]
#   The block will be passed every Missing page visited.
def every_missing_page
  every_page do |visited|
    next unless block_given? && visited.missing?

    yield visited
  end
end
every_ms_word_page() { |page| ... } click to toggle source

Pass every MS Word page that the agent visits to a given block.

@yield [page]

The block will be passed every MS Word page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 468
# Registers a page callback that yields only pages for which
# `ms_word?` is true.
#
# @yield [page]
#   The block will be passed every MS Word page visited.
def every_ms_word_page
  every_page do |visited|
    next unless block_given? && visited.ms_word?

    yield visited
  end
end
every_ok_page() { |page| ... } click to toggle source

Pass every OK page that the agent visits to a given block.

@yield [page]

The block will be passed every OK page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 97
# Registers a page callback that yields only pages for which `ok?`
# is true.
#
# @yield [page]
#   The block will be passed every OK page visited.
def every_ok_page
  every_page do |visited|
    next unless block_given? && visited.ok?

    yield visited
  end
end
every_page(&block) click to toggle source

Pass every page that the agent visits to a given block.

@yield [page]

The block will be passed every page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 83
# Pass every page that the agent visits to a given block.
#
# A call without a block is ignored: storing `nil` in the callback list
# would leave an entry that cannot be called.
#
# @yield [page]
#   The block will be passed every page visited.
#
# @yieldparam [Page] page
#   A visited page.
#
# @return [Agent]
#   The agent itself.
def every_page(&block)
  @every_page_blocks << block if block
  return self
end
every_pdf_page() { |page| ... } click to toggle source

Pass every PDF page that the agent visits to a given block.

@yield [page]

The block will be passed every PDF page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 483
# Registers a page callback that yields only pages for which `pdf?`
# is true.
#
# @yield [page]
#   The block will be passed every PDF page visited.
def every_pdf_page
  every_page do |visited|
    next unless block_given? && visited.pdf?

    yield visited
  end
end
every_redirect_page() { |page| ... } click to toggle source

Pass every Redirect page that the agent visits to a given block.

@yield [page]

The block will be passed every Redirect page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 112
# Registers a page callback that yields only pages for which
# `redirect?` is true.
#
# @yield [page]
#   The block will be passed every Redirect page visited.
def every_redirect_page
  every_page do |visited|
    next unless block_given? && visited.redirect?

    yield visited
  end
end
every_rss_doc() { |doc| ... } click to toggle source

Pass every RSS document that the agent parses to a given block.

@yield [doc]

The block will be passed every RSS document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 368
# Registers a page callback that yields the parsed document of every
# RSS page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every RSS document parsed.
def every_rss_doc
  every_page do |visited|
    next unless block_given? && visited.rss?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_rss_page() { |page| ... } click to toggle source

Pass every RSS feed that the agent visits to a given block.

@yield [feed]

The block will be passed every RSS feed visited.

@yieldparam [Page] feed

A visited page.
# File lib/spidr/agent/events.rb, line 438
# Registers a page callback that yields only pages for which `rss?`
# is true.
#
# @yield [feed]
#   The block will be passed every RSS feed visited.
def every_rss_page
  every_page do |visited|
    next unless block_given? && visited.rss?

    yield visited
  end
end
every_timedout_page() { |page| ... } click to toggle source

Pass every Timeout page that the agent visits to a given block.

@yield [page]

The block will be passed every Timeout page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 127
# Registers a page callback that yields only pages for which
# `timedout?` is true.
#
# @yield [page]
#   The block will be passed every Timeout page visited.
def every_timedout_page
  every_page do |visited|
    next unless block_given? && visited.timedout?

    yield visited
  end
end
every_txt_page() { |page| ... } click to toggle source

Pass every Plain Text page that the agent visits to a given block.

@yield [page]

The block will be passed every Plain Text page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 218
# Registers a page callback that yields only pages for which `txt?`
# is true.
#
# @yield [page]
#   The block will be passed every Plain Text page visited.
def every_txt_page
  every_page do |visited|
    next unless block_given? && visited.txt?

    yield visited
  end
end
every_unauthorized_page() { |page| ... } click to toggle source

Pass every Unauthorized page that the agent visits to a given block.

@yield [page]

The block will be passed every Unauthorized page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 157
# Registers a page callback that yields only pages for which
# `unauthorized?` is true.
#
# @yield [page]
#   The block will be passed every Unauthorized page visited.
def every_unauthorized_page
  every_page do |visited|
    next unless block_given? && visited.unauthorized?

    yield visited
  end
end
every_url(&block) click to toggle source

Pass each URL from each page visited to the given block.

@yield [url]

The block will be passed every URL from every page visited.

@yieldparam [URI::HTTP] url

Each URL from each page visited.
# File lib/spidr/agent/events.rb, line 14
# Pass each URL from each page visited to the given block.
#
# A call without a block is ignored: `enqueue` invokes `call` on every
# entry of `@every_url_blocks`, so a stored `nil` would raise
# NoMethodError there.
#
# @yield [url]
#   The block will be passed every URL from every page visited.
#
# @yieldparam [URI::HTTP] url
#   Each URL from each page visited.
#
# @return [Agent]
#   The agent itself.
def every_url(&block)
  @every_url_blocks << block if block
  return self
end
every_url_like(pattern,&block) click to toggle source

Pass every URL that the agent visits and that matches a given pattern to a given block.

@param [Regexp, String] pattern

The pattern to match URLs with.

@yield [url]

The block will be passed every URL that matches the given pattern.

@yieldparam [URI::HTTP] url

A matching URL.

@since 0.3.2

# File lib/spidr/agent/events.rb, line 48
# Pass every URL that the agent visits and that matches a given pattern
# to a given block.
#
# A call without a block is ignored: `enqueue` invokes `call` on every
# callback stored for a matching pattern, so a stored `nil` would raise
# NoMethodError there.
#
# @param [Regexp, String] pattern
#   The pattern to match URLs with.
#
# @yield [url]
#   The block will be passed every URL that matches the given pattern.
#
# @yieldparam [URI::HTTP] url
#   A matching URL.
#
# @return [Agent]
#   The agent itself.
def every_url_like(pattern,&block)
  @every_url_like_blocks[pattern] << block if block
  return self
end
every_xml_doc() { |doc| ... } click to toggle source

Pass every XML document that the agent parses to a given block.

@yield [doc]

The block will be passed every XML document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 325
# Registers a page callback that yields the parsed document of every
# XML page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every XML document parsed.
def every_xml_doc
  every_page do |visited|
    next unless block_given? && visited.xml?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_xml_page() { |page| ... } click to toggle source

Pass every XML page that the agent visits to a given block.

@yield [page]

The block will be passed every XML page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 248
# Registers a page callback that yields only pages for which `xml?`
# is true.
#
# @yield [page]
#   The block will be passed every XML page visited.
def every_xml_page
  every_page do |visited|
    next unless block_given? && visited.xml?

    yield visited
  end
end
every_xsl_doc() { |doc| ... } click to toggle source

Pass every XML Stylesheet (XSL) that the agent parses to a given block.

@yield [doc]

The block will be passed every XML Stylesheet (XSL) parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 347
# Registers a page callback that yields the parsed document of every
# XSL page. Pages whose `doc` is nil are skipped.
#
# @yield [doc]
#   The block will be passed every XML Stylesheet (XSL) parsed.
def every_xsl_doc
  every_page do |visited|
    next unless block_given? && visited.xsl?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_xsl_page() { |page| ... } click to toggle source

Pass every XML Stylesheet (XSL) page that the agent visits to a given block.

@yield [page]

The block will be passed every XML Stylesheet (XSL) page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 264
# Registers a page callback that yields only pages for which `xsl?`
# is true.
#
# @yield [page]
#   The block will be passed every XML Stylesheet (XSL) page visited.
def every_xsl_page
  every_page do |visited|
    next unless block_given? && visited.xsl?

    yield visited
  end
end
every_zip_page() { |page| ... } click to toggle source

Pass every ZIP page that the agent visits to a given block.

@yield [page]

The block will be passed every ZIP page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 498
# Registers a page callback that yields only pages for which `zip?`
# is true.
#
# @yield [page]
#   The block will be passed every ZIP page visited.
def every_zip_page
  every_page do |visited|
    next unless block_given? && visited.zip?

    yield visited
  end
end
failed?(url) click to toggle source

Determines whether a given URL could not be visited.

@param [URI::HTTP, String] url

The URL to check for failures.

@return [Boolean]

Specifies whether the given URL was unable to be visited.
# File lib/spidr/agent.rb, line 607
# Checks whether the given URL is in the list of failures.
#
# @param [URI::HTTP, String] url
#   The URL to check; it is coerced with `URI()` before the lookup.
#
# @return [Boolean]
#   Specifies whether the given URL was unable to be visited.
def failed?(url)
  uri = URI(url)
  @failures.include?(uri)
end
failures=(new_failures) click to toggle source

Sets the list of failed URLs.

@param [#each] new_failures

The new list of failed URLs.

@return [Set<URI::HTTP>]

The list of failed URLs.

@example

agent.failures = ['http://localhost/']
# File lib/spidr/agent.rb, line 588
# Sets the list of failed URLs.
#
# Every entry is coerced with `URI()` before the existing list is
# touched. This keeps `agent.failures = agent.failures` from emptying
# the list, and leaves the list unchanged if an entry is not a valid URI.
#
# @param [#each] new_failures
#   The new list of failed URLs.
#
# @return [Set<URI::HTTP>]
#   The list of failed URLs (the same object as before, refilled).
#
# @raise [URI::InvalidURIError]
#   If an entry cannot be parsed as a URI.
def failures=(new_failures)
  uris = []
  new_failures.each { |url| uris << URI(url) }

  @failures.clear
  uris.each { |uri| @failures << uri }

  return @failures
end
get_page(url) { |new_page| ... } click to toggle source

Requests and creates a new Page object from a given URL.

@param [URI::HTTP] url

The URL to request.

@yield [page]

If a block is given, it will be passed the page that represents the
response.

@yieldparam [Page] page

The page for the response.

@return [Page, nil]

The page for the response, or `nil` if the request failed.
# File lib/spidr/agent.rb, line 710
# Requests the given URL with GET and wraps the response in a Page.
#
# Cookies from the new page are stored in the cookie jar before the
# page is yielded and returned.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page for the response.
#
# @return [Page, nil]
#   The page for the response; if `prepare_request` never yields, its
#   own return value is returned instead.
def get_page(url)
  url = URI(url)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    page     = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
history=(new_history) click to toggle source

Sets the history of URLs that were previously visited.

@param [#each] new_history

A list of URLs to populate the history with.

@return [Set<URI::HTTP>]

The history of the agent.

@example

agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
# File lib/spidr/agent.rb, line 531
# Sets the history of URLs that were previously visited.
#
# Every entry is coerced with `URI()` before the existing history is
# touched. This keeps `agent.history = agent.history` from emptying the
# history, and leaves it unchanged if an entry is not a valid URI.
#
# @param [#each] new_history
#   A list of URLs to populate the history with.
#
# @return [Set<URI::HTTP>]
#   The history of the agent (the same object as before, refilled).
#
# @raise [URI::InvalidURIError]
#   If an entry cannot be parsed as a URI.
def history=(new_history)
  uris = []
  new_history.each { |url| uris << URI(url) }

  @history.clear
  uris.each { |uri| @history << uri }

  return @history
end
ignore_exts() click to toggle source

Specifies the patterns that match URI path extensions to not visit.

@return [Array<String, Regexp, Proc>]

The URI path extension patterns to not visit.
# File lib/spidr/agent/filters.rb, line 330
# Returns the reject list of the URI path extension rules.
#
# @return [Array<String, Regexp, Proc>]
#   The URI path extension patterns to not visit.
def ignore_exts
  rules = @ext_rules
  rules.reject
end
ignore_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to reject or accept.
# File lib/spidr/agent/filters.rb, line 346
# Adds a rule to {#ignore_exts}: the pattern when one is given,
# otherwise the block. Does nothing when neither is given.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with.
#
# @yield [ext]
#   If a block is given, it will be used to filter URI path extensions.
#
# @return [Agent]
#   The agent itself.
def ignore_exts_like(pattern=nil,&block)
  rule = pattern || block
  ignore_exts << rule if rule

  self
end
ignore_hosts() click to toggle source

Specifies the patterns that match host-names to not visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to not visit.
# File lib/spidr/agent/filters.rb, line 62
# Returns the reject list of the host-name rules.
#
# @return [Array<String, Regexp, Proc>]
#   The host-name patterns to not visit.
def ignore_hosts
  rules = @host_rules
  rules.reject
end
ignore_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to reject or accept.
# File lib/spidr/agent/filters.rb, line 78
# Adds a rule to {#ignore_hosts}: the pattern when one is given,
# otherwise the block. Does nothing when neither is given.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with.
#
# @yield [host]
#   If a block is given, it will be used to filter host-names.
#
# @return [Agent]
#   The agent itself.
def ignore_hosts_like(pattern=nil,&block)
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end
ignore_ports() click to toggle source

Specifies the patterns that match ports to not visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to not visit.
# File lib/spidr/agent/filters.rb, line 126
# Returns the reject list of the port rules.
#
# @return [Array<Integer, Regexp, Proc>]
#   The port patterns to not visit.
def ignore_ports
  rules = @port_rules
  rules.reject
end
ignore_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to reject or accept.
# File lib/spidr/agent/filters.rb, line 142
# Adds a rule to {#ignore_ports}: the pattern when one is given,
# otherwise the block. Does nothing when neither is given.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with.
#
# @yield [port]
#   If a block is given, it will be used to filter ports.
#
# @return [Agent]
#   The agent itself.
def ignore_ports_like(pattern=nil,&block)
  rule = pattern || block
  ignore_ports << rule if rule

  self
end
ignore_urls() click to toggle source

Specifies the patterns that match URLs to not visit.

@return [Array<String, Regexp, Proc>]

The URL patterns to not visit.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 264
# Returns the reject list of the URL rules.
#
# @return [Array<String, Regexp, Proc>]
#   The URL patterns to not visit.
def ignore_urls
  rules = @url_rules
  rules.reject
end
ignore_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_urls}.

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to reject or accept.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 282
# Adds a rule to {#ignore_urls}: the pattern when one is given,
# otherwise the block. Does nothing when neither is given.
#
# @param [String, Regexp] pattern
#   The pattern to match URLs with.
#
# @yield [url]
#   If a block is given, it will be used to filter URLs.
#
# @return [Agent]
#   The agent itself.
def ignore_urls_like(pattern=nil,&block)
  rule = pattern || block
  ignore_urls << rule if rule

  self
end
initialize_robots() click to toggle source

Initializes the robots filter.

# File lib/spidr/agent/robots.rb, line 13
# Initializes the robots filter with the agent's `User-Agent` string.
#
# @raise [ArgumentError]
#   If the `Robots` constant is not defined.
def initialize_robots
  if Object.const_defined?(:Robots)
    @robots = Robots.new(@user_agent)
  else
    raise(ArgumentError,":robots option given but unable to require 'robots' gem")
  end
end
pause!() click to toggle source

Pauses the agent, causing spidering to temporarily stop.

@raise [Paused]

Indicates to the agent, that it should pause spidering.
# File lib/spidr/agent/actions.rb, line 63
# Marks the agent as paused, then raises to interrupt spidering.
#
# @raise [Actions::Paused]
#   Always raised, after the paused flag has been set.
def pause!
  @paused = true
  raise Actions::Paused
end
pause=(state) click to toggle source

Sets the pause state of the agent.

@param [Boolean] state

The new pause state of the agent.
# File lib/spidr/agent/actions.rb, line 53
# Sets the pause state of the agent.
#
# The value is stored as given; note that `paused?` only reports true
# when the stored value equals `true`.
#
# @param [Boolean] state
#   The new pause state of the agent.
def pause=(state)
  @paused = state
end
paused?() click to toggle source

Determines whether the agent is paused.

@return [Boolean]

Specifies whether the agent is paused.
# File lib/spidr/agent/actions.rb, line 74
# Determines whether the agent is paused.
#
# @return [Boolean]
#   `true` only when the stored pause state equals `true`.
def paused?
  state = @paused
  state == true
end
post_page(url,post_data='') { |new_page| ... } click to toggle source

Posts supplied form data and creates a new Page object from a given URL.

@param [URI::HTTP] url

The URL to request.

@param [String] post_data

Form option data.

@yield [page]

If a block is given, it will be passed the page that represents the
response.

@yieldparam [Page] page

The page for the response.

@return [Page, nil]

The page for the response, or `nil` if the request failed.

@since 0.2.2

# File lib/spidr/agent.rb, line 745
# Sends the given form data to the URL with POST and wraps the
# response in a Page.
#
# Cookies from the new page are stored in the cookie jar before the
# page is yielded and returned.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page for the response.
#
# @return [Page, nil]
#   The page for the response; if `prepare_request` never yields, its
#   own return value is returned instead.
def post_page(url,post_data='')
  url = URI(url)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    page     = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
proxy() click to toggle source

The proxy information the agent uses.

@return [Proxy]

The proxy information.

@see SessionCache#proxy

@since 0.2.2

# File lib/spidr/agent.rb, line 434
# Delegates to the session cache, which holds the proxy settings.
def proxy
  @sessions.proxy
end
proxy=(new_proxy) click to toggle source

Sets the proxy information that the agent uses.

@param [Proxy, Hash, URI::HTTP, String, nil] new_proxy

The new proxy information.

@return [Proxy]

The new proxy information.

@see SessionCache#proxy=

@since 0.2.2

# File lib/spidr/agent.rb, line 451
# Hands +new_proxy+ to the session cache's proxy writer.
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end
queue=(new_queue) click to toggle source

Sets the queue of URLs to visit.

@param [#each] new_queue

The new list of URLs to visit.

@return [Array<URI::HTTP>]

The list of URLs to visit.

@example

agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
# File lib/spidr/agent.rb, line 625
# Replaces the contents of the queue with the given URLs.
#
# Every entry is coerced with URI() before the existing queue is touched,
# so that:
# * passing the agent's own queue (agent.queue = agent.queue) does not
#   wipe it, and
# * an entry that URI() rejects leaves the previous queue intact instead
#   of a half-replaced one.
#
# @param [#each] new_queue the new list of URLs to visit.
# @return [Array<URI::HTTP>] the (same) queue object, now holding the new URLs.
# @raise [URI::InvalidURIError] if an entry cannot be parsed as a URI.
def queue=(new_queue)
  urls = []
  new_queue.each { |url| urls << URI(url) }

  # replace in place: the queue object keeps its identity
  @queue.replace(urls)
end
queued?(url) click to toggle source

Determines whether a given URL has been enqueued.

@param [URI::HTTP] url

The URL to search for in the queue.

@return [Boolean]

Specifies whether the given URL has been queued for visiting.
# File lib/spidr/agent.rb, line 644
# Membership test against the queue. +url+ is compared as given; no
# URI() coercion is applied here.
def queued?(url)
  @queue.include?(url)
end
robot_allowed?(url) click to toggle source

Determines whether a URL is allowed by the robot policy.

@param [URI::HTTP, String] url

The URL to check.

@return [Boolean]

Specifies whether a URL is allowed by the robot policy.
# File lib/spidr/agent/robots.rb, line 30
# Asks the robots policy whether +url+ may be fetched.
# Without a robots policy every URL is allowed.
def robot_allowed?(url)
  return true unless @robots

  @robots.allowed?(url)
end
run(&block) click to toggle source

Start spidering until the queue becomes empty or the agent is paused.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

A page which has been visited.
# File lib/spidr/agent.rb, line 492
# Spiders until the queue is empty, the agent is paused, or the page
# limit is reached.
#
# The running flag is reset and the session cache is cleared on every way
# out of the loop: normal completion, a pause requested through
# Actions::Paused, or an unexpected exception. Without this, pausing via
# an Actions::Paused exception left the agent reporting itself as running.
#
# @yield [page] every page visited, when a block is given.
# @return [Agent] the agent itself.
def run(&block)
  @running = true

  until @queue.empty? || paused? || limit_reached?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      # stop spidering; cleanup happens in the ensure clause
      break
    rescue Actions::Action
      # any other action only aborts the current page
    end
  end

  self
ensure
  @running = false
  @sessions.clear
end
running?() click to toggle source

Determines if the agent is running.

@return [Boolean]

Specifies whether the agent is running or stopped.
# File lib/spidr/agent.rb, line 515
# True only when the running flag is exactly +true+ (it is set by #run).
def running?
  @running == true
end
sanitize_url(url) click to toggle source

Sanitizes a URL based on filtering options.

@param [URI::HTTP, URI::HTTPS, String] url

The URL to be sanitized

@return [URI::HTTP, URI::HTTPS]

The new sanitized URL.

@since 0.2.2

# File lib/spidr/agent/sanitizers.rb, line 25
# Sanitizes a URL according to the strip_fragments / strip_query options.
#
# The work is done on a copy: URI() hands back its argument unchanged when
# it already is a URI object, so stripping in place would silently modify
# the caller's URL.
#
# @param [URI::HTTP, URI::HTTPS, String] url the URL to sanitize.
# @return [URI::HTTP, URI::HTTPS] a new, sanitized URL.
def sanitize_url(url)
  url = URI(url).dup

  url.fragment = nil if @strip_fragments
  url.query    = nil if @strip_query

  url
end
schemes=(new_schemes) click to toggle source

Sets the list of acceptable URL schemes to visit.

@param [Array] new_schemes

The new schemes to visit.

@example

agent.schemes = ['http']
# File lib/spidr/agent/filters.rb, line 20
# Stores the acceptable URL schemes, each converted to a String.
def schemes=(new_schemes)
  @schemes = new_schemes.map { |scheme| scheme.to_s }
end
skip_page!() click to toggle source

Causes the agent to skip the page being visited.

@raise [SkipPage]

Indicates to the agent that the current page should be skipped.
# File lib/spidr/agent/actions.rb, line 95
# Raises Actions::SkipPage; no agent state is changed here.
def skip_page!
  raise(Actions::SkipPage)
end
start_at(url,&block) click to toggle source

Start spidering at a given URL.

@param [URI::HTTP, String] url

The URL to start spidering at.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

A page which has been visited.
# File lib/spidr/agent.rb, line 477
# Enqueues +url+ and then starts spidering, forwarding the optional block
# to #run. Returns whatever #run returns.
def start_at(url,&block)
  enqueue(url)

  run(&block)
end
to_hash() click to toggle source

Converts the agent into a Hash.

@return [Hash]

The agent represented as a Hash containing the `history` and
the `queue` of the agent.
# File lib/spidr/agent.rb, line 819
# Exposes the history and queue under the :history and :queue keys.
# The objects themselves are returned, not copies.
def to_hash
  {history: @history, queue: @queue}
end
urls_like(pattern,&block) click to toggle source

@see every_url_like

# File lib/spidr/agent/events.rb, line 56
# Alias-style wrapper: forwards the pattern and block to #every_url_like.
def urls_like(pattern,&block)
  every_url_like(pattern,&block)
end
visit_exts() click to toggle source

Specifies the patterns that match the URI path extensions to visit.

@return [Array<String, Regexp, Proc>]

The URI path extensions patterns to visit.
# File lib/spidr/agent/filters.rb, line 298
# Returns the accept list of the extension rules.
def visit_exts
  @ext_rules.accept
end
visit_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to accept or reject.
# File lib/spidr/agent/filters.rb, line 314
# Registers a rule describing URI path extensions to visit.
#
# The pattern wins when both a pattern and a block are supplied; when
# neither is given nothing is registered.
#
# @return [Agent] the agent itself, so calls can be chained.
def visit_exts_like(pattern=nil,&block)
  rule = pattern || block
  visit_exts << rule if rule

  self
end
visit_hosts() click to toggle source

Specifies the patterns that match host-names to visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to visit.
# File lib/spidr/agent/filters.rb, line 30
# Returns the accept list of the host rules.
def visit_hosts
  @host_rules.accept
end
visit_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to accept or reject.
# File lib/spidr/agent/filters.rb, line 46
# Registers a rule describing host-names to visit.
#
# The pattern wins when both a pattern and a block are supplied; when
# neither is given nothing is registered.
#
# @return [Agent] the agent itself, so calls can be chained.
def visit_hosts_like(pattern=nil,&block)
  rule = pattern || block
  visit_hosts << rule if rule

  self
end
visit_page(url) { |page| ... } click to toggle source

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

@param [URI::HTTP, String] url

The URL to visit.

@yield [page]

If a block is given, it will be passed the page which was visited.

@yieldparam [Page] page

The page which was visited.

@return [Page, nil]

The page that was visited. If `nil` is returned, either the request
for the page failed, or the page was skipped.
# File lib/spidr/agent.rb, line 776
# Fetches +url+, records it in the history, runs the registered page and
# link callbacks, and enqueues the links found on the page.
#
# Returns nil from inside the request block when a page callback (or the
# given block) raises Actions::SkipPage.
def visit_page(url)
  # apply the strip_fragments / strip_query options before requesting
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      # propagate the pause request to the caller
      raise(action)
    rescue Actions::SkipPage
      # skip link extraction for this page entirely
      return nil
    rescue Actions::Action
      # any other action raised by a callback is ignored here
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        # propagate the pause request to the caller
        raise(action)
      rescue Actions::SkipLink
        # do not enqueue this particular link
        next
      rescue Actions::Action
        # any other action raised by a callback is ignored here
      end

      # only go deeper while the level recorded for this page is below
      # the maximum depth (no maximum when @max_depth is nil)
      if (@max_depth.nil? || @max_depth > @levels[url])
        enqueue(next_url,@levels[url] + 1)
      end
    end
  end
end
visit_ports() click to toggle source

Specifies the patterns that match the ports to visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to visit.
# File lib/spidr/agent/filters.rb, line 94
# Returns the accept list of the port rules.
def visit_ports
  @port_rules.accept
end
visit_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to accept or reject.
# File lib/spidr/agent/filters.rb, line 110
# Registers a rule describing ports to visit.
#
# The pattern wins when both a pattern and a block are supplied; when
# neither is given nothing is registered.
#
# @return [Agent] the agent itself, so calls can be chained.
def visit_ports_like(pattern=nil,&block)
  rule = pattern || block
  visit_ports << rule if rule

  self
end
visit_urls() click to toggle source

Specifies the patterns that match the URLs to visit.

@return [Array<String, Regexp, Proc>]

The link patterns to visit.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 228
# Returns the accept list of the URL rules.
def visit_urls
  @url_rules.accept
end
visit_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_urls}.

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to accept or reject.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 246
# Registers a rule describing URLs to visit.
#
# The pattern wins when both a pattern and a block are supplied; when
# neither is given nothing is registered.
#
# @return [Agent] the agent itself, so calls can be chained.
def visit_urls_like(pattern=nil,&block)
  rule = pattern || block
  visit_urls << rule if rule

  self
end
visited?(url) click to toggle source

Determines whether a URL was visited or not.

@param [URI::HTTP, String] url

The URL to search for.

@return [Boolean]

Specifies whether a URL was visited.
# File lib/spidr/agent.rb, line 572
# Looks +url+ up in the history after coercing it with URI(), so both
# Strings and URI objects can be passed.
def visited?(url)
  @history.include?(URI(url))
end
visited_hosts() click to toggle source

Specifies all hosts that were visited.

@return [Array<String>]

The hosts which have been visited.
# File lib/spidr/agent.rb, line 559
# Collects the host of every visited URL, without duplicates.
def visited_hosts
  visited_urls.map { |url| url.host }.uniq
end

Protected Instance Methods

dequeue() click to toggle source

Dequeues a URL that will later be visited.

@return [URI::HTTP]

The URL that was at the front of the queue.
# File lib/spidr/agent.rb, line 922
# Removes and returns the first entry of the queue (nil when it is empty,
# per Array#shift).
def dequeue
  @queue.shift
end
failed(url) click to toggle source

Adds a given URL to the failures list.

@param [URI::HTTP] url

The URL to add to the failures list.
# File lib/spidr/agent.rb, line 963
# Records +url+ as a failure and notifies every failed-URL callback, in
# registration order. Always returns true.
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end
initialize_actions() click to toggle source
# File lib/spidr/agent/actions.rb, line 101
# Starts the agent in the un-paused state.
def initialize_actions
  @paused = false
end
initialize_events() click to toggle source
# File lib/spidr/agent/events.rb, line 525
# Creates the empty callback registries used by the event hooks.
def initialize_events
  @every_url_blocks        = []
  @every_failed_url_blocks = []
  # auto-creates an empty Array for a key on first access
  @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }

  @every_page_blocks = []
  @every_link_blocks = []
end
initialize_filters(schemes: self.class.default_schemes, host: nil, hosts: nil, ignore_hosts: nil, ports: nil, ignore_ports: nil, links: nil, ignore_links: nil, urls: nil, ignore_urls: nil, exts: nil, ignore_exts: nil) click to toggle source

Initializes filtering rules.

@param [Array<String>] schemes

The list of acceptable URI schemes to visit.
The `https` scheme will be ignored if `net/https` cannot be loaded.

@param [String] host

The host-name to visit.

@param [Array<String, Regexp, Proc>] hosts

The patterns which match the host-names to visit.

@param [Array<String, Regexp, Proc>] ignore_hosts

The patterns which match the host-names to not visit.

@param [Array<Integer, Regexp, Proc>] ports

The patterns which match the ports to visit.

@param [Array<Integer, Regexp, Proc>] ignore_ports

The patterns which match the ports to not visit.

@param [Array<String, Regexp, Proc>] links

The patterns which match the links to visit.

@param [Array<String, Regexp, Proc>] ignore_links

The patterns which match the links to not visit.

@param [Array<String, Regexp, Proc>] urls

The patterns which match the URLs to visit.

@param [Array<String, Regexp, Proc>] ignore_urls

The patterns which match the URLs to not visit.

@param [Array<String, Regexp, Proc>] exts

The patterns which match the URI path extensions to visit.

@param [Array<String, Regexp, Proc>] ignore_exts

The patterns which match the URI path extensions to not visit.
# File lib/spidr/agent/filters.rb, line 398
# Builds the scheme list and one Rules object (accept/reject lists) per
# filter category: hosts, ports, links, URLs and path extensions.
#
# +schemes+ defaults to the class-level default_schemes and is stored as
# Strings. A single +host+, when given, is added to the host accept rules
# after the Rules objects exist.
def initialize_filters(schemes:      self.class.default_schemes,
                       host:         nil,
                       hosts:        nil,
                       ignore_hosts: nil,
                       ports:        nil,
                       ignore_ports: nil,
                       links:        nil,
                       ignore_links: nil,
                       urls:         nil,
                       ignore_urls:  nil,
                       exts:         nil,
                       ignore_exts:  nil)
  @schemes = schemes.map(&:to_s)

  @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
  @port_rules = Rules.new(accept: ports, reject: ignore_ports)
  @link_rules = Rules.new(accept: links, reject: ignore_links)
  @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
  @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)

  # must run after @host_rules is set: visit_hosts_like appends to it
  visit_hosts_like(host) if host
end
initialize_sanitizers(strip_fragments: true, strip_query: false) click to toggle source

Initializes the Sanitizer rules.

@param [Boolean] strip_fragments

Specifies whether or not to strip the fragment component from URLs.

@param [Boolean] strip_query

Specifies whether or not to strip the query component from URLs.

@since 0.2.2

# File lib/spidr/agent/sanitizers.rb, line 47
# Stores the two stripping options read by #sanitize_url.
# Fragments are stripped by default; queries are kept by default.
def initialize_sanitizers(strip_fragments: true, strip_query: false)
  @strip_fragments = strip_fragments
  @strip_query     = strip_query
end
limit_reached?() click to toggle source

Determines if the maximum limit has been reached.

@return [Boolean]

@since 0.6.0

# File lib/spidr/agent.rb, line 933
# Determines if the maximum number of pages has been visited.
#
# Always answers with a strict boolean, like the other predicates here:
# with no limit configured the answer is false rather than nil.
#
# @return [Boolean]
def limit_reached?
  return false unless @limit

  @history.length >= @limit
end
prepare_request(url) { |sessions, path, headers| ... } click to toggle source

Normalizes the request path and grabs a session to handle page get and post requests.

@param [URI::HTTP] url

The URL to request.

@yield [session, path, headers]

A block whose purpose is to make a page request.

@yieldparam [Net::HTTP] session

An HTTP session object.

@yieldparam [String] path

Normalized URL string.

@yieldparam [Hash] headers

A Hash of request header options.

@since 0.2.2

# File lib/spidr/agent.rb, line 885
# Works out the request path and headers for +url+, waits for the
# configured delay, and yields a session together with them.
#
# Returns the block's value. If one of the listed network/protocol errors
# is raised, the session for +url+ is killed, the URL is recorded as
# failed, and nil is returned.
def prepare_request(url,&block)
  # an empty path means the site root
  request_path = url.path.empty? ? '/' : url.path

  # the query string travels with the path
  request_path += "?#{url.query}" if url.query

  request_headers = prepare_request_headers(url)

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], request_path, request_headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse,
         Zlib::Error

    @sessions.kill!(url)

    failed(url)
    nil
  end
end
prepare_request_headers(url) click to toggle source

Prepares request headers for the given URL.

@param [URI::HTTP] url

The URL to prepare the request headers for.

@return [Hash{String => String}]

The prepared headers.

@since 0.6.0

# File lib/spidr/agent.rb, line 836
# Assembles the request headers for +url+.
#
# Starts from a copy of the default headers, then layers on: the first
# host-specific Host header whose key matches the URL's host, the global
# Host header as a fallback, User-Agent, Referer, Basic authorization and
# cookies; each only when a value is available.
def prepare_request_headers(url)
  headers = @default_headers.dup

  # the first matching entry wins
  host_rule = @host_headers.find { |name,_header| url.host.match(name) }
  headers['Host'] = host_rule.last if host_rule

  headers['Host']     ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer']    = @referer if @referer

  credentials = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{credentials}" if credentials

  cookie_header = @cookies.for_host(url.host)
  headers['Cookie'] = cookie_header if cookie_header

  headers
end
visit?(url) click to toggle source

Determines if a given URL should be visited.

@param [URI::HTTP] url

The URL in question.

@return [Boolean]

Specifies whether the given URL should be visited.
# File lib/spidr/agent.rb, line 946
# Decides whether +url+ should be visited.
#
# A URL that was already visited is rejected up front. Otherwise the
# scheme, host, port, link, URL, extension and robots checks run in that
# order, stopping at the first one that fails.
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path) &&
    robot_allowed?(url.to_s)
end
visit_ext?(path) click to toggle source

Determines if a given URI path extension should be visited.

@param [String] path

The path that contains the extension.

@return [Boolean]

Specifies whether the given URI path extension should be visited.
# File lib/spidr/agent/filters.rb, line 525
# Checks the extension of +path+ against the extension rules.
# The leading "." is dropped before matching; for a path without an
# extension File.extname gives "" and the slice evaluates to nil, which
# is passed to the rules as-is.
def visit_ext?(path)
  @ext_rules.accept?(File.extname(path)[1..-1])
end
visit_host?(host) click to toggle source

Determines if a given host-name should be visited.

@param [String] host

The host-name.

@return [Boolean]

Specifies whether the given host-name should be visited.
# File lib/spidr/agent/filters.rb, line 471
# Delegates the decision for +host+ to the host rules.
def visit_host?(host)
  @host_rules.accept?(host)
end
visit_port?(port) click to toggle source

Determines if a given port should be visited.

@param [Integer] port

The port number.

@return [Boolean]

Specifies whether the given port should be visited.
# File lib/spidr/agent/filters.rb, line 484
# Delegates the decision for +port+ to the port rules.
def visit_port?(port)
  @port_rules.accept?(port)
end
visit_scheme?(scheme) click to toggle source

Determines if a given URI scheme should be visited.

@param [String] scheme

The URI scheme.

@return [Boolean]

Specifies whether the given scheme should be visited.
# File lib/spidr/agent/filters.rb, line 454
# Checks +scheme+ against the list of acceptable schemes.
# A missing scheme (nil) is always accepted.
def visit_scheme?(scheme)
  return true unless scheme

  @schemes.include?(scheme)
end
visit_url?(link) click to toggle source

Determines if a given URL should be visited.

@param [URI::HTTP, URI::HTTPS] link

The URL.

@return [Boolean]

Specifies whether the given URL should be visited.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 512
# Delegates the decision for +link+ to the URL rules.
def visit_url?(link)
  @url_rules.accept?(link)
end