class WebPageParser::BaseParser
Attributes
retrieve_session[RW]
url[R]
Public Class Methods
new(options = { })
click to toggle source
Takes a hash of options. The :url option passes the page URL, the :page option passes the raw HTML page content for parsing, and the :guid option passes a guid for the page.
# File lib/web-page-parser/base_parser.rb, line 19
# Builds a parser from an options hash.
#
# The :url, :page and :guid entries of +options+ are copied into the
# @url, @page and @guid instance variables. Each is read with
# +options[key]+, so a missing entry yields whatever +options+ returns
# for an absent key (nil for a plain Hash).
def initialize(options = {})
  @url, @page, @guid = options[:url], options[:page], options[:guid]
end
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/base_parser.rb, line 43
# Returns @content when it is set; otherwise returns a new empty Array.
# Nothing in this method assigns @content.
def content
  return @content if @content

  []
end
date()
click to toggle source
# File lib/web-page-parser/base_parser.rb, line 47
# Placeholder: the method has no logic of its own and always returns nil.
# NOTE(review): presumably meant to be overridden by concrete parsers - confirm.
def date
  nil
end
guid()
click to toggle source
# File lib/web-page-parser/base_parser.rb, line 53
# Returns the guid for this page.
#
# An already-set @guid is returned as is. Otherwise, when +url+ is set,
# the result of +guid_from_url+ is stored in @guid. The method returns
# @guid either way, which is nil when there is neither a guid nor a url.
def guid
  # Only consult url / guid_from_url while no guid has been stored yet.
  @guid = guid_from_url if !@guid && url
  @guid
end
guid_from_url()
click to toggle source
# File lib/web-page-parser/base_parser.rb, line 50
# Placeholder: the method has no logic of its own and always returns nil.
# +guid+ calls it when no guid is stored and a url is present.
# NOTE(review): presumably meant to be overridden by concrete parsers - confirm.
def guid_from_url
  nil
end
hash()
click to toggle source
Returns an MD5 hex digest (a String) representing the textual content of this web page, computed from the title followed by the joined content.
# File lib/web-page-parser/base_parser.rb, line 60
# Returns the MD5 hex digest, as a String, of the page's textual content.
#
# The digest is fed +title.to_s+ first and then the elements of
# +content+ joined with an empty separator.
#
# NOTE(review): a method named +hash+ replaces Object#hash, which Ruby
# expects to return an Integer; using instances as Hash keys may be
# affected - confirm before relying on that.
def hash
  Digest::MD5.new
             .update(title.to_s)
             .update(content.join('').to_s)
             .hexdigest
end
page()
click to toggle source
Returns the page contents, retrieving them from the server if necessary.
# File lib/web-page-parser/base_parser.rb, line 26
# Returns the page contents.
#
# A truthy @page is returned directly. Otherwise the result of
# +retrieve_page+ is stored in @page and returned.
def page
  return @page if @page

  @page = retrieve_page
end
retrieve_page(rurl = nil)
click to toggle source
Requests the page from the server and returns the raw contents. Uses rurl when given, otherwise the parser's url; returns nil when neither is set.
# File lib/web-page-parser/base_parser.rb, line 31
# Fetches a page through the class-level retrieve_session.
#
# rurl:: address to fetch; when nil or false, +url+ is used instead.
#
# Returns nil when there is no address to fetch. Otherwise returns
# whatever the session's +get+ returns for the address.
def retrieve_page(rurl = nil)
  target = rurl || url
  return unless target

  # Let the receiver rewrite the address when it exposes a filter_url hook.
  target = filter_url(target) if respond_to?(:filter_url)

  # The session lives on the class and is created on first use.
  klass = self.class
  klass.retrieve_session ||= WebPageParser::HTTP::Session.new
  klass.retrieve_session.get(target)
end
title()
click to toggle source
# File lib/web-page-parser/base_parser.rb, line 39
# Returns @title, which is nil until something assigns it.
# Nothing in this method assigns @title.
def title
  @title
end