class Medusa::Page
Attributes
The raw HTTP response body of the page
Integer response code of the page
OpenStruct for user-stored data
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
Exception object, if one was raised during HTTP#fetch_page
Headers of the HTTP response
URL of the page this one redirected to, if any
URL of the page that brought us to this page
Response time of the request for this page in milliseconds
The URL of the page
Public Class Methods
# File lib/medusa/page.rb, line 207
#
# Rebuild a page from a flat hash with the keys written by #to_hash.
#
# A page is created for hash['url'] and every other stored field is then
# written directly into the matching instance variable.
#
# NOTE(review): 'headers' and 'data' are restored with Marshal.load, which
# must only be given trusted input -- confirm the hash always originates from
# storage this application wrote itself.
def self.from_hash(hash)
  restored = new(URI(hash['url']))

  restored.instance_variable_set(:@headers, Marshal.load(hash['headers']))
  restored.instance_variable_set(:@data, Marshal.load(hash['data']))
  restored.instance_variable_set(:@body, hash['body'])
  restored.instance_variable_set(:@links, hash['links'].map { |stored| URI(stored) })
  restored.instance_variable_set(:@code, hash['code'].to_i)
  restored.instance_variable_set(:@visited, hash['visited'])
  restored.instance_variable_set(:@depth, hash['depth'].to_i)
  restored.instance_variable_set(:@referer, hash['referer'])

  # A missing or empty redirect target is restored as nil rather than as an
  # empty URI.
  target =
    if !!hash['redirect_to'] && !hash['redirect_to'].empty?
      URI(hash['redirect_to'])
    else
      nil
    end
  restored.instance_variable_set(:@redirect_to, target)

  restored.instance_variable_set(:@response_time, hash['response_time'].to_i)
  restored.instance_variable_set(:@fetched, hash['fetched'])

  restored
end
Create a new page
# File lib/medusa/page.rb, line 36
#
# Create a new page.
#
# url    - the URL of the page, stored unchanged in @url
# params - optional Hash; the keys read are :code, :headers, :aka, :referer,
#          :depth, :redirect_to, :response_time, :body and :error
#
# When params[:headers] is given, that same Hash object is kept and gains an
# empty 'content-type' entry if it has none.
def initialize(url, params = {})
  @url  = url
  @data = OpenStruct.new

  # Nothing has been parsed or loaded yet.
  @links = @body = @doc = @base = nil

  @code    = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ''
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth   = params[:depth] || 0

  # The redirect target is resolved while @body is still nil; the body is
  # only assigned afterwards, so keep these statements in this order.
  @redirect_to   = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body          = params[:body]
  @error         = params[:error]

  # A page counts as fetched exactly when a response code was supplied.
  @fetched = !@code.nil?
end
Public Instance Methods
Base URI from the HTML doc head element. See http://www.w3.org/TR/html4/struct/links.html#edef-BASE
# File lib/medusa/page.rb, line 144
#
# Base URI taken from the href attribute of the <base> element in the
# document head.
#
# The value is computed at most once per truthy result and kept in @base.
# Returns nil when there is no parsed document, when the href cannot be
# turned into a URI, or when the resulting URI is empty.
def base
  @base ||=
    if doc
      href = doc.search('//head/base/@href')
      begin
        URI(href.to_s) unless href.nil?
      rescue StandardError
        # An unparsable href is treated as "no base".
        nil
      end
    end

  # An empty URI (e.g. no <base> element found) is reported as nil.
  @base unless @base && @base.to_s.empty?
end
The content-type returned by the HTTP request for this page
# File lib/medusa/page.rb, line 112
#
# The value stored under the 'content-type' key of this page's headers.
def content_type
  headers['content-type']
end
Delete the Nokogiri document and response body to conserve memory
# File lib/medusa/page.rb, line 89
#
# Drop the parsed document and the response body to conserve memory.
#
# The links are extracted first, because they can no longer be computed once
# the document is gone. Returns nil.
def discard_doc!
  links # populate the link list while the document still exists
  @body = nil
  @doc = nil
end
Nokogiri document for the HTML body
# File lib/medusa/page.rb, line 81
#
# Nokogiri document for the HTML body.
#
# Returns the cached document when one exists. Otherwise the body is parsed,
# but only if there is a body and the page is HTML. Returns nil when nothing
# is parsed or when parsing raises.
def doc
  return @doc if @doc
  return nil unless @body && html?

  @doc = Nokogiri::HTML(@body)
rescue StandardError
  # Any failure while checking or parsing is reported as "no document".
  nil
end
Was the page successfully fetched? Returns true if the page was fetched with no error, false otherwise.
# File lib/medusa/page.rb, line 98
#
# Whether this page was fetched. Returns the stored @fetched flag unchanged.
def fetched?
  @fetched
end
Returns true if the page is an HTML document, returns false otherwise.
# File lib/medusa/page.rb, line 120
#
# Returns true if the content type marks the page as an HTML document,
# i.e. it starts with "text/html" or "application/xhtml+xml" followed by a
# word boundary; returns false otherwise (including when there is no
# content type).
def html?
  # The "+" must be escaped: unescaped it is a quantifier on the preceding
  # "l", so the literal string "application/xhtml+xml" would never match.
  !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end
Returns true if uri is in the same domain as the page, returns false otherwise.
# File lib/medusa/page.rb, line 180
#
# Returns true if the host of +uri+ equals the host of this page's URL,
# returns false otherwise.
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end
Array of distinct A tag HREFs from the page
# File lib/medusa/page.rb, line 62
#
# Array of distinct, absolute link targets taken from the A tags of the page.
#
# The result is cached in @links. An empty array is returned when there is no
# parsed document. An anchor is skipped when it has a data-method other than
# 'get', when its href is empty, when the href cannot be made absolute, or
# when the absolute URL is not in this page's domain.
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.search("//a[@href]").each do |anchor|
    http_method = anchor['data-method']
    next if http_method && http_method != 'get'

    href = anchor['href']
    next if href.nil? || href.empty?

    begin
      target = to_absolute(href)
    rescue StandardError
      # Hrefs that cannot be resolved are silently ignored.
      next
    end

    @links << target if in_domain?(target)
  end

  @links.uniq!
  @links
end
# File lib/medusa/page.rb, line 184
#
# State handed to Marshal: the values of twelve instance variables, in a
# fixed order that #marshal_load relies on.
def marshal_dump
  %i[
    @url @headers @data @body @links @code
    @visited @depth @referer @redirect_to @response_time @fetched
  ].map { |ivar| instance_variable_get(ivar) }
end
# File lib/medusa/page.rb, line 188
#
# Restore the instance variables from the array produced by #marshal_dump.
# Elements are assigned positionally, in the same order they were dumped.
def marshal_load(ary)
  @url,
  @headers,
  @data,
  @body,
  @links,
  @code,
  @visited,
  @depth,
  @referer,
  @redirect_to,
  @response_time,
  @fetched = ary
end
Returns true if the page was not found (returned 404 code), returns false otherwise.
# File lib/medusa/page.rb, line 136
#
# Returns true if the stored response code is 404, returns false otherwise.
def not_found?
  @code == 404
end
Returns true if the page is an HTTP redirect, returns false otherwise.
# File lib/medusa/page.rb, line 128
#
# Returns true if the stored response code is an HTTP redirect status
# (300 through 308), returns false otherwise, including when no code is set.
def redirect?
  # The upper bound is 308 so that "308 Permanent Redirect" is recognised
  # alongside the older 300-307 statuses.
  (300..308).include?(@code)
end
Converts relative URL link into an absolute URL based on the location of the page
# File lib/medusa/page.rb, line 159
#
# Converts a (possibly relative) link into an absolute URL based on the
# location of the page.
#
# link - the link to resolve; nil is passed straight through as nil
#
# The link is merged onto #base when the page has one, otherwise onto the
# page URL. An empty path on the result is replaced with '/'.
def to_absolute(link)
  return nil if link.nil?

  # Strip the anchor: everything from '#' to the end of the line.
  cleaned = link.to_s.gsub(/#.*$/, '')

  # Only on Rubies older than 2.5 is the link decoded and re-encoded.
  legacy_ruby = Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
  cleaned = URI.encode(URI.decode(cleaned)) if legacy_ruby

  relative = URI(cleaned)
  absolute = (base || @url).merge(relative)
  absolute.path = '/' if absolute.path.empty?
  absolute
end
# File lib/medusa/page.rb, line 192
#
# Flatten the page into a Hash with String keys.
#
# 'headers' and 'data' are serialised with Marshal.dump; 'url', 'referer',
# 'redirect_to' and every link are converted with to_s. The links are
# obtained through #links rather than read from @links directly.
def to_hash
  {
    'url'           => @url.to_s,
    'headers'       => Marshal.dump(@headers),
    'data'          => Marshal.dump(@data),
    'body'          => @body,
    'links'         => links.map { |link| link.to_s },
    'code'          => @code,
    'visited'       => @visited,
    'depth'         => @depth,
    'referer'       => @referer.to_s,
    'redirect_to'   => @redirect_to.to_s,
    'response_time' => @response_time,
    'fetched'       => @fetched
  }
end