class OpmlJanitor::Parser
The Parser
class takes in the contents of an OPML XML document and can filter and save the results
Public Class Methods
from_filehandle
allows the OPML XML document to be read from a given filehandle and returns an initialized Parser
instance
# File lib/opml_janitor.rb, line 44
# Builds a Parser from an already-open, readable object.
#
# filehandle:: any object responding to +read+; whatever +read+ returns is
#              handed to Parser.new unchanged. The handle is not closed here.
#
# Returns the newly created Parser.
def self.from_filehandle(filehandle)
  Parser.new(filehandle.read)
end
initialize takes the contents of an OPML XML document and a flag for debug messages (default false)
# File lib/opml_janitor.rb, line 16
# Sets up a parser around the text of an OPML XML document.
#
# contents:: the OPML XML markup; kept in @xml and parsed into @opml with
#            Nokogiri::XML.parse.
# debug::    flag stored in @debug (default false).
#
# The worker thread count starts at 1 and the timeout at 20.
def initialize(contents, debug = false)
  # Tunables first, then the document itself.
  @debug = debug
  @threads = 1
  @timeout = 20

  @xml = contents
  @opml = Nokogiri::XML.parse(@xml)
end
Public Instance Methods
debug= sets the debug flag
# File lib/opml_janitor.rb, line 25
# Stores the debug flag in @debug.
#
# debug:: the new flag value; it is stored as given.
def debug=(debug)
  @debug = debug
end
threads= sets the number of threads for running the validation process
# File lib/opml_janitor.rb, line 37
# Stores the number of threads to use for the validation process in @threads.
#
# threads:: the new thread count; it is stored as given, without validation.
def threads=(threads)
  @threads = threads
end
timeout= sets the timeout for downloading and processing each feed; the default is 20 seconds.
# File lib/opml_janitor.rb, line 31
# Stores the per-feed timeout in @timeout.
#
# timeout:: the new timeout value; it is stored as given, without validation.
def timeout=(timeout)
  @timeout = timeout
end
to_xml
outputs the current OPML XML structure as a String containing all the XML markup
# File lib/opml_janitor.rb, line 94
# Serializes the current state of the parsed document.
#
# Returns whatever @opml.to_xml returns for the document held in @opml.
def to_xml
  @opml.to_xml
end
validate! takes one optional argument, since,
specifying a Time object. since is used to check whether any posts have been published after that time, thus detecting “stale” blogs/RSS feeds.
# File lib/opml_janitor.rb, line 63
# Validates every leaf outline queued by filter! and unlinks each node whose
# validation result is anything other than "PASS".
#
# since:: optional value forwarded unchanged to validate_callback
#         (default nil).
#
# Returns the array of joined worker threads.
def validate!(since = nil)
  # This threading methodology is highly expensive for simple blocks, but a
  # life-saver for IO-bound blocks.
  @work_queue = Queue.new
  data = @opml.css("body").children

  # The boss walks the outline tree and queues the leaves while the workers
  # are already consuming them.
  boss = Thread.new { filter!(data) }

  # Always run at least one worker so queued feeds cannot be silently skipped.
  worker_count = [@threads.to_i, 1].max
  workers = Array.new(worker_count) do
    Thread.new do
      while (work = @work_queue.pop)
        outline = work[:outline]
        val = validate_callback(outline, since)
        if @debug
          # Result starts at column 80; longer URLs get a single space.
          puts "#{outline[:xml_url].to_s.ljust(79)} #{val}"
        end
        work[:node].unlink unless val == "PASS"
      end
    end
  end

  begin
    boss.join
  ensure
    # One stop sentinel per worker, queued only after the boss is done so a
    # sentinel can never overtake real work. Doing it in ensure releases the
    # workers even when filter! raised.
    worker_count.times { @work_queue.push(false) }
  end
  workers.map(&:join)
end
Private Instance Methods
filter! recurses down the OPML body, looking for outline tags, and pushes each leaf node onto a work queue
# File lib/opml_janitor.rb, line 103
# Walks +data+ and queues every childless 'outline' node for validation.
#
# data:: an enumerable of nodes; each must respond to +name+ and +children+.
#
# Nodes whose name is not 'outline' are ignored. An outline with children is
# descended into recursively. A childless outline is pushed onto @work_queue
# as { :outline => <Outline hash>, :node => <node> }.
#
# Returns +data+ (the result of +each+).
def filter!(data)
  data.each do |node|
    next unless node.name == 'outline'

    if node.children.empty?
      # The Outline hash is only needed for leaves, so build it only here.
      @work_queue.push({ :outline => Outline.new(node).to_hash, :node => node })
    else
      filter!(node.children)
    end
  end
end
validate_callback
tries to download a feed and verify that it has been updated after the given since
time
# File lib/opml_janitor.rb, line 120
# Downloads one feed and classifies it.
#
# feed::  Hash describing the outline; only feed[:xml_url] is read.
# since:: optional time. When given, the feed only passes if its newest item
#         is later than +since+ (default nil: no staleness check).
#
# Returns a String:
# "PASS"::   the feed parsed and, if +since+ was given, has a newer item.
# "STALE"::  the feed parsed but has no item newer than +since+.
# "NOFEED":: RSS::Parser.parse returned nothing for the document.
# Otherwise a label for the error that stopped the check ("EOFError",
# "HTTPError", "RSSError", "Timedout", "SocketError", "Redirect Loop",
# "Connection Refused" or "Unexpected error: <message>").
#
# The URL comes from the OPML document, so it is parsed with URI.parse and
# opened through the URI object instead of Kernel#open, which would treat a
# leading "|" as a command to run and a plain path as a local file to read.
def validate_callback(feed, since = nil)
  val = "FAIL"
  begin
    Timeout.timeout(@timeout) do
      uri = URI.parse(feed[:xml_url].to_s)
      # Only URI classes that open-uri extends respond to a public #open.
      unless uri.respond_to?(:open)
        raise ArgumentError, "unsupported feed URL #{feed[:xml_url].inspect}"
      end

      uri.open do |rss|
        parsed = RSS::Parser.parse(rss)
        if parsed
          last_updated = latest_item_time(parsed.items)
          val = (!since || last_updated > since) ? "PASS" : "STALE"
        else
          val = "NOFEED"
        end
      end
    end
  rescue EOFError
    val = "EOFError"
  rescue OpenURI::HTTPError
    val = "HTTPError"
  rescue RSS::Error
    val = "RSSError"
  rescue Timeout::Error
    val = "Timedout"
  rescue SocketError
    val = "SocketError"
  rescue RuntimeError
    val = "Redirect Loop"
  rescue Errno::ECONNREFUSED
    val = "Connection Refused"
  rescue StandardError => e
    # StandardError rather than Exception: interrupts, exits and
    # out-of-memory conditions must not be reported as a feed failure.
    val = "Unexpected error: #{e}"
  end
  val
end

# Returns the most recent timestamp found on +items+, or Time.at(0) when no
# item carries one. Reads item.updated.content when the item responds to
# +updated+, otherwise item.date when it responds to +date+.
def latest_item_time(items)
  stamps = items.map do |item|
    if item.respond_to?(:updated)
      item.updated && item.updated.content
    elsif item.respond_to?(:date)
      item.date
    end
  end
  ([Time.at(0)] + stamps.compact).max
end