class Anemone::Page
Attributes
The raw HTTP response body of the page
Integer response code of the page
OpenStruct for user-stored data
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
Exception object, if one was raised during HTTP#fetch_page
Headers of the HTTP response
URL of the page this one redirected to, if any
URL of the page that brought us to this page
Response time of the request for this page in milliseconds
The URL of the page
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
Public Class Methods
# File lib/anemone/page.rb, line 198
#
# Rebuild a page from a Hash that uses the string keys written by #to_hash
# ('url', 'headers', 'data', 'body', 'links', 'code', 'visited', 'depth',
# 'referer', 'redirect_to', 'response_time', 'fetched').
#
# 'code', 'depth' and 'response_time' are coerced with #to_i; 'links' and a
# non-empty 'redirect_to' are turned back into URI objects; 'referer' is
# stored exactly as found in the hash.
#
# If the hash has no 'links' entry, @links is left nil so that #links parses
# the body lazily instead of this method raising NoMethodError.
#
# SECURITY: 'headers' and 'data' are passed to Marshal.load, which can
# instantiate arbitrary objects. Only call this with hashes read back from
# storage you trust.
def self.from_hash(hash)
  page = self.new(URI(hash['url']))
  stored_links = hash['links']
  redirect = hash['redirect_to']

  {'@headers' => Marshal.load(hash['headers']),
   '@data' => Marshal.load(hash['data']),
   '@body' => hash['body'],
   '@links' => stored_links && stored_links.map { |link| URI(link) },
   '@code' => hash['code'].to_i,
   '@visited' => hash['visited'],
   '@depth' => hash['depth'].to_i,
   '@referer' => hash['referer'],
   '@redirect_to' => (redirect && !redirect.empty?) ? URI(redirect) : nil,
   '@response_time' => hash['response_time'].to_i,
   '@fetched' => hash['fetched']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end

  page
end
Create a new page
# File lib/anemone/page.rb, line 36
#
# Create a new page for +url+.
#
# Recognised +params+ keys: :code, :headers, :aka, :referer, :depth,
# :redirect_to, :response_time, :body and :error. The page counts as fetched
# when a :code was supplied.
def initialize(url, params = {})
  @url  = url
  @data = OpenStruct.new
  @code = params[:code]

  # Guarantee a 'content-type' entry so #content_type can always call #first.
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']

  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth   = params[:depth] || 0

  # Resolved before @body is assigned, so the document is not parsed here.
  @redirect_to   = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body          = params[:body]
  @error         = params[:error]
  @fetched       = !params[:code].nil?
end
Public Instance Methods
Base URI from the HTML doc head element (see http://www.w3.org/TR/html4/struct/links.html#edef-BASE)
# File lib/anemone/page.rb, line 138
#
# Base URI taken from the document's <head><base href="..."> element.
#
# Returns nil when there is no parsed document, when the href cannot be
# turned into a URI, or when the resulting URI is empty. Only a truthy result
# is cached in @base; a nil result is looked up again on the next call.
def base
  unless @base
    @base =
      if doc
        href = doc.search('//head/base/@href')
        begin
          href.nil? ? nil : URI(href.to_s)
        rescue StandardError
          # An unparsable href is treated as "no base".
          nil
        end
      end
  end

  (@base && @base.to_s.empty?) ? nil : @base
end
The content-type returned by the HTTP request for this page
# File lib/anemone/page.rb, line 106
#
# The first value stored under the 'content-type' key of the page headers.
def content_type
  values = headers['content-type']
  values.first
end
Delete the Nokogiri document and response body to conserve memory
# File lib/anemone/page.rb, line 83
#
# Drop the parsed document and the raw response body to conserve memory.
# Returns nil.
def discard_doc!
  # Force parsing of page links before we trash the document.
  links

  @body = nil
  @doc = nil
end
Nokogiri document for the HTML body
# File lib/anemone/page.rb, line 75
#
# Nokogiri document for the HTML body, memoized in @doc.
#
# Returns nil when there is no body, when the page is not HTML, or when
# checking/parsing raises a StandardError.
def doc
  return @doc if @doc
  return nil unless @body && html?

  @doc = Nokogiri::HTML(@body)
rescue StandardError
  nil
end
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
# File lib/anemone/page.rb, line 92
#
# Returns the stored @fetched flag.
def fetched?
  @fetched
end
Returns true if the page is a HTML document, returns false otherwise.
# File lib/anemone/page.rb, line 114
#
# Returns true if the page's content type is text/html or
# application/xhtml+xml (optionally followed by parameters such as a
# charset), false otherwise.
#
# The '+' in "xhtml+xml" is escaped so it matches a literal plus sign rather
# than acting as a regexp quantifier, which previously prevented real XHTML
# content types from matching. The match is anchored to the start of the
# string and is case-insensitive, since media type names are not case
# sensitive.
def html?
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b}i)
end
Returns true if uri is in the same domain as the page, returns false otherwise.
# File lib/anemone/page.rb, line 171
#
# Returns true if +uri+ has the same host as this page's URL, false
# otherwise.
#
# Host names are compared case-insensitively, so a link written as
# "http://Example.com/" is recognised as belonging to "example.com".
# If either host is nil (e.g. a mailto: link) the plain equality result is
# returned.
def in_domain?(uri)
  theirs = uri.host
  ours = @url.host
  return theirs == ours if theirs.nil? || ours.nil?

  theirs.casecmp(ours).zero?
end
Array of distinct A tag HREFs from the page
# File lib/anemone/page.rb, line 57
#
# Array of distinct, absolute, in-domain A tag HREFs from the page.
#
# The result is cached in @links. When there is no parsed document the
# result is an empty array. Anchors with a nil/empty href, or whose href
# cannot be made absolute, are skipped.
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.search("//a[@href]").each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    begin
      target = to_absolute(href)
    rescue StandardError
      # Skip hrefs that cannot be converted to an absolute URL.
      next
    end

    @links << target if in_domain?(target)
  end

  @links.uniq!
  @links
end
# File lib/anemone/page.rb, line 175
#
# State handed to Marshal: a fixed-order array of this page's instance
# variables. #marshal_load reads the same order back.
def marshal_dump
  [
    @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched
  ]
end
# File lib/anemone/page.rb, line 179
#
# Restore instance variables from +ary+, which lists them in the order
# produced by #marshal_dump.
def marshal_load(ary)
  @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched = ary
end
Returns true if the page was not found (returned 404 code), returns false otherwise.
# File lib/anemone/page.rb, line 130
#
# Returns true if the page's response code is 404, false otherwise.
def not_found?
  @code == 404
end
Returns true if the page is a HTTP redirect, returns false otherwise.
# File lib/anemone/page.rb, line 122
#
# Returns true if the page's response code is an HTTP redirection status
# (300 through 308), false otherwise.
#
# The upper bound includes 308 (Permanent Redirect), which the earlier
# 300..307 range left out.
def redirect?
  (300..308).include?(@code)
end
Converts relative URL link into an absolute URL based on the location of the page
# File lib/anemone/page.rb, line 153
#
# Converts the relative URL +link+ into an absolute URL based on the
# location of the page (or on the document's <base> href when #base returns
# one).
#
# Returns nil when +link+ is nil, otherwise a URI whose path is at least '/'.
# URI parsing errors are not rescued here and propagate to the caller.
#
# URI.encode / URI.decode were removed in Ruby 3.0, so the escaping is done
# through the RFC 2396 parser's #escape / #unescape, which is what those
# methods delegated to.
def to_absolute(link)
  return nil if link.nil?

  # URI::DEFAULT_PARSER offers escape/unescape on the Rubies that ship them;
  # fall back to an explicit RFC 2396 parser if it ever does not.
  parser = URI::DEFAULT_PARSER
  parser = URI::RFC2396_Parser.new unless parser.respond_to?(:escape) && parser.respond_to?(:unescape)

  # remove anchor
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')

  # Unescape then escape so already-encoded links are not double-encoded.
  link = parser.escape(parser.unescape(without_anchor))

  relative = URI(link)
  absolute = base ? base.merge(relative) : @url.merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end
# File lib/anemone/page.rb, line 183
#
# Serialize the page into a Hash with string keys.
#
# Headers and user data are Marshal-dumped; the URL, links, referer and
# redirect target are stored as strings (a nil referer or redirect target
# becomes an empty string). Calling this forces #links to be computed.
def to_hash
  serialized = {}
  serialized['url'] = @url.to_s
  serialized['headers'] = Marshal.dump(@headers)
  serialized['data'] = Marshal.dump(@data)
  serialized['body'] = @body
  serialized['links'] = links.map { |link| link.to_s }
  serialized['code'] = @code
  serialized['visited'] = @visited
  serialized['depth'] = @depth
  serialized['referer'] = @referer.to_s
  serialized['redirect_to'] = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched'] = @fetched
  serialized
end