class Anemone::Page

Attributes

body[R]

The raw HTTP response body of the page

code[RW]

Integer response code of the page

data[RW]

OpenStruct for user-stored data

depth[RW]

Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.

error[R]

Exception object, if one was raised during HTTP#fetch_page

headers[R]

Headers of the HTTP response

redirect_to[R]

URL of the page this one redirected to, if any

referer[RW]

URL of the page that brought us to this page

response_time[RW]

Response time of the request for this page in milliseconds

url[R]

The URL of the page

visited[RW]

Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!

Public Class Methods

from_hash(hash) click to toggle source
# File lib/anemone/page.rb, line 198
# Rebuilds a page from a string-keyed hash laid out like the one #to_hash
# returns.
#
# 'headers' and 'data' are read as Marshal dumps, 'links' as a list of URL
# strings, and 'code', 'depth' and 'response_time' are coerced with #to_i.
# A nil or empty 'redirect_to' results in a nil @redirect_to.
#
# NOTE(review): Marshal.load is unsafe on untrusted input -- confirm that the
# hashes given here only ever come from this library's own storage.
def self.from_hash(hash)
  page = new(URI(hash['url']))

  page.instance_variable_set('@headers', Marshal.load(hash['headers']))
  page.instance_variable_set('@data', Marshal.load(hash['data']))
  page.instance_variable_set('@body', hash['body'])
  page.instance_variable_set('@links', hash['links'].map { |href| URI(href) })
  page.instance_variable_set('@code', hash['code'].to_i)
  page.instance_variable_set('@visited', hash['visited'])
  page.instance_variable_set('@depth', hash['depth'].to_i)
  page.instance_variable_set('@referer', hash['referer'])

  # Only a present, non-empty value is turned into a URI.
  redirect = hash['redirect_to']
  redirect_uri = URI(redirect) if redirect && !redirect.empty?
  page.instance_variable_set('@redirect_to', redirect_uri)

  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched', hash['fetched'])

  page
end
new(url, params = {}) click to toggle source

Create a new page

# File lib/anemone/page.rb, line 36
# Builds a page for +url+.
#
# Optional +params+ keys read here: :code, :headers, :aka, :referer, :depth,
# :redirect_to, :response_time, :body and :error.
def initialize(url, params = {})
  @url = url
  @data = OpenStruct.new

  # Response status; a page counts as fetched once a code is present.
  @code = params[:code]
  @fetched = !@code.nil?
  @error = params[:error]
  @response_time = params[:response_time]

  # Note: when a :headers hash is supplied it is used as-is, so the
  # content-type default below is written into the caller's hash.
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']

  # Crawl bookkeeping.
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0

  # The redirect target is resolved while @body is still unset; keep this
  # ordering, since #to_absolute consults #base, which reads the document.
  @redirect_to = to_absolute(params[:redirect_to])
  @body = params[:body]
end

Public Instance Methods

base() click to toggle source

Base URI taken from the base element in the HTML document's head (see www.w3.org/TR/html4/struct/links.html#edef-BASE)

# File lib/anemone/page.rb, line 138
# Returns the URI from the document's <base href>, or nil when there is no
# document, no usable href, or the href is empty. The value is cached in @base
# once one has been found.
def base
  unless @base
    @base = if doc
      href = doc.search('//head/base/@href')
      begin
        URI(href.to_s) unless href.nil?
      rescue StandardError
        # An href that cannot be turned into a URI is treated as absent.
        nil
      end
    end
  end

  # An empty URI (e.g. from a document without a <base>) counts as no base.
  @base && @base.to_s.empty? ? nil : @base
end
content_type() click to toggle source

The content-type returned by the HTTP request for this page

# File lib/anemone/page.rb, line 106
# Returns the first value stored under the 'content-type' header.
# (#initialize defaults that header to [''], so a page built without one
# reports an empty string.)
def content_type
  values = headers['content-type']
  values.first
end
cookies() click to toggle source

Array of cookies received with this page as WEBrick::Cookie objects.

# File lib/anemone/page.rb, line 99
# Parses the 'Set-Cookie' header with WEBrick::Cookie.parse_set_cookies.
# Any StandardError raised along the way (including a missing header) results
# in an empty array.
#
# NOTE(review): #initialize uses the lowercase key 'content-type'; confirm
# that 'Set-Cookie' matches the casing under which headers are stored.
def cookies
  WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie'])
rescue StandardError
  []
end
discard_doc!() click to toggle source

Delete the Nokogiri document and response body to conserve memory

# File lib/anemone/page.rb, line 83
# Drops the parsed document and the raw body to free memory. Always returns
# nil.
def discard_doc!
  # Touch #links first, so the page's links are extracted while the document
  # is still available.
  links
  @body = nil
  @doc = nil
end
doc() click to toggle source

Nokogiri document for the HTML body

# File lib/anemone/page.rb, line 75
# Returns the cached Nokogiri document, parsing @body on first use. Yields nil
# when there is no body, the page is not HTML, or anything raises while
# checking or parsing.
def doc
  @doc || begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    nil
  end
end
fetched?() click to toggle source

Was the page successfully fetched? true if a response code was recorded for the page, false otherwise.

# File lib/anemone/page.rb, line 92
# Reports the stored @fetched flag. #initialize sets it to true when a :code
# param was supplied; .from_hash restores it from the hash's 'fetched' entry.
def fetched?
  @fetched
end
html?() click to toggle source

Returns true if the page is an HTML document, returns false otherwise.

# File lib/anemone/page.rb, line 114
# Returns true when the content type starts with text/html or
# application/xhtml+xml (parameters such as a charset may follow), false
# otherwise.
def html?
  # The '+' has to be escaped: left bare it is a regex quantifier, so the
  # literal "application/xhtml+xml" would never match. \A anchors the match to
  # the start of the whole value rather than the start of any line.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end
in_domain?(uri) click to toggle source

Returns true if uri is in the same domain as the page, returns false otherwise

# File lib/anemone/page.rb, line 171
# Returns true when +uri+ has exactly the same host as this page's URL, false
# otherwise.
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end
marshal_dump() click to toggle source
# File lib/anemone/page.rb, line 175
# Returns the page state as a flat array for Marshal. The element order has to
# stay in step with #marshal_load.
def marshal_dump
  %w[
    @url @headers @data @body @links @code
    @visited @depth @referer @redirect_to @response_time @fetched
  ].map { |ivar| instance_variable_get(ivar) }
end
marshal_load(ary) click to toggle source
# File lib/anemone/page.rb, line 179
# Restores the page state from the array produced by #marshal_dump; elements
# are taken positionally, and missing trailing elements leave nil. Returns
# +ary+.
def marshal_load(ary)
  *fields = ary
  ivars = %w[
    @url @headers @data @body @links @code
    @visited @depth @referer @redirect_to @response_time @fetched
  ]
  ivars.each_with_index { |ivar, index| instance_variable_set(ivar, fields[index]) }
  ary
end
not_found?() click to toggle source

Returns true if the page was not found (returned 404 code), returns false otherwise.

# File lib/anemone/page.rb, line 130
# Returns true when the response code is 404, false otherwise.
def not_found?
  @code == 404
end
redirect?() click to toggle source

Returns true if the page is an HTTP redirect, returns false otherwise.

# File lib/anemone/page.rb, line 122
# Returns true when the response code is in the range 300-308, false
# otherwise (including when no code is set).
def redirect?
  # 308 (Permanent Redirect) is included alongside 300-307.
  (300..308).include?(@code)
end
to_absolute(link) click to toggle source

Converts relative URL link into an absolute URL based on the location of the page

# File lib/anemone/page.rb, line 153
# Converts +link+ (a String or anything responding to #to_s) into an absolute
# URI, resolved against the document's <base> when one is present and against
# the page URL otherwise.
#
# Returns nil for a nil link. A trailing simple anchor (#name) is removed, the
# link is unescaped and re-escaped so it is percent-encoded exactly once, and
# an empty path is normalised to '/'.
#
# Raises whatever URI() raises for a link that cannot be parsed.
def to_absolute(link)
  return nil if link.nil?

  # URI.encode / URI.decode were removed in Ruby 3.0; the parser object offers
  # the same escaping rules. Prefer the RFC 2396 parser when the constant
  # exists, falling back to the default parser otherwise.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # remove anchor
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
  normalized = parser.escape(parser.unescape(without_anchor))

  relative = URI(normalized)
  absolute = (base || @url).merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end
to_hash() click to toggle source
# File lib/anemone/page.rb, line 183
# Serialises the page into a string-keyed hash. Headers and user data are
# stored as Marshal dumps; the URL, links, referer and redirect target are
# stored as strings (a nil referer or redirect becomes '').
def to_hash
  record = {}
  record['url'] = @url.to_s
  record['headers'] = Marshal.dump(@headers)
  record['data'] = Marshal.dump(@data)
  record['body'] = @body
  record['links'] = links.map { |link| link.to_s }
  record['code'] = @code
  record['visited'] = @visited
  record['depth'] = @depth
  record['referer'] = @referer.to_s
  record['redirect_to'] = @redirect_to.to_s
  record['response_time'] = @response_time
  record['fetched'] = @fetched
  record
end