class Feedbag

Constants

CONTENT_TYPES

Public Class Methods

feed?(url) click to toggle source
# File lib/feedbag.rb, line 42
# Class-level shortcut: builds a fresh instance and asks it whether +url+
# points directly at a feed.
def self.feed?(url)
  finder = new
  finder.feed?(url)
end
find(url, args = {}) click to toggle source
# File lib/feedbag.rb, line 46
# Class-level shortcut: builds a fresh instance and runs feed discovery on
# +url+.
#
# url  - address of a feed or of a page to scan for feeds.
# args - options hash, forwarded unchanged to the instance method.
#
# Returns whatever the instance-level find returns.
def self.find(url, args = {})
  # Forward the caller's hash as-is; writing `args = {}` at the call site
  # would reassign the local and silently drop every option.
  new.find(url, args)
end
new() click to toggle source
# File lib/feedbag.rb, line 50
# Starts every instance with an empty list of recorded feed URLs.
def initialize
  # add_feed appends to this list; find returns it.
  @feeds = Array.new
end

Public Instance Methods

_is_http_valid(uri, orig_url) click to toggle source

Not used yet.

# File lib/feedbag.rb, line 201
# Reports whether an HTTP GET of +uri+ is answered with a success response.
#
# uri      - URI object handed to Net::HTTP.get_response.
# orig_url - kept for interface compatibility; it is not consulted, so a nil
#            or malformed value can no longer make the check raise.
#
# Returns true when the response is a Net::HTTPSuccess, false for any other
# response. Errors raised by Net::HTTP itself are not rescued here.
def _is_http_valid(uri, orig_url)
  Net::HTTP.get_response(uri).is_a?(Net::HTTPSuccess)
end
add_feed(feed_url, orig_url, base_uri = nil) click to toggle source
# File lib/feedbag.rb, line 176
# Normalizes a candidate feed URL and records it in @feeds if it is new.
#
# feed_url - candidate URL as found; may carry a "feed:" prefix, surrounding
#            whitespace, or be relative.
# orig_url - URL of the page the candidate came from; used to resolve a
#            candidate that is still relative after base_uri handling. May be
#            nil when the candidate is already absolute.
# base_uri - optional base URL; when given, the candidate is resolved
#            against it first.
#
# Returns @feeds, whether or not a new entry was added. A candidate that
# cannot be parsed or resolved is reported on $stderr and skipped; it never
# terminates the process.
def add_feed(feed_url, orig_url, base_uri = nil)
  url = feed_url.sub(/^feed:/, '').strip

  begin
    # Resolve the cleaned-up candidate (not the raw feed_url) so a "feed:"
    # prefix or stray whitespace never reaches URI#merge.
    url = URI.parse(base_uri).merge(url).to_s if base_uri

    unless URI.parse(url).absolute?
      url = URI.parse(orig_url).merge(url).to_s
    end
  rescue StandardError
    $stderr.puts "Error with `#{url}'"
    return @feeds
  end

  @feeds.push(url) unless @feeds.include?(url)
  @feeds
end
feed?(url) click to toggle source
# File lib/feedbag.rb, line 54
# Tells whether +url+ is itself a feed: true only when discovery on the
# normalized URL yields exactly one result and that result is the URL itself.
def feed?(url)
  # TODO: use LWR::Simple.normalize some time
  parsed = URI.parse(url)

  # Rebuild the address from scheme (http when absent), host, path and query.
  normalized = "#{parsed.scheme || 'http'}://#{parsed.host}#{parsed.path}"
  normalized << "?#{parsed.query}" if parsed.query

  # hack: treat feed:// addresses as plain http://
  normalized.sub!(/^feed:\/\//, 'http://')

  found = Feedbag.find(normalized)
  found.size == 1 && found.first == normalized
end
find(url, args = {}) click to toggle source
# File lib/feedbag.rb, line 71
# Discovers feed URLs for +url+.
#
# url  - address of a feed or of an HTML page to scan. A URL without a scheme
#        is prefixed with http://; a feed:// URL is rewritten to http:// and
#        handed straight to add_feed without fetching anything.
# args - accepted but not read anywhere in this method.
#
# Steps:
#   1. If the optional feed_validator library loads and reports the URL as
#      valid, the URL is passed to add_feed and that call's result is returned.
#   2. Otherwise the URL is fetched; if its content type is listed in
#      CONTENT_TYPES the URL itself is recorded.
#   3. Otherwise the body is parsed as HTML and candidates are collected from
#      atom:link elements, <link> elements and <a> anchors.
#
# Once the fetch stage is reached the method always returns @feeds: errors
# are written to $stderr, and the `return` inside `ensure` discards any
# exception that is still pending.
def find(url, args = {})
  url_uri = URI.parse(url)
  url = nil
  if url_uri.scheme.nil?
    # No scheme given: assume plain HTTP.
    url = "http://#{url_uri.to_s}"
  elsif url_uri.scheme == "feed"
    # feed:// URLs are recorded as http:// without being fetched or validated.
    return self.add_feed(url_uri.to_s.sub(/^feed:\/\//, 'http://'), nil)
  else
    url = url_uri.to_s
  end
  #url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}"

  # Optional fast path: use the feed_validator library when it is available.
  begin
    require "feed_validator"
    v = W3C::FeedValidator.new
    v.validate_url(url)
    return self.add_feed(url, nil) if v.valid?
  rescue LoadError
    # feed_validator could not be loaded; fall through to fetching the URL.
  rescue REXML::ParseException
    # usually indicates timeout
    # TODO: actually find out timeout. use Terminator?
    # $stderr.puts "Feed looked like feed but might not have passed validation or timed out"
  rescue => ex
    # Any other validation failure is reported, then discovery continues below.
    $stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
  end

  begin
    # NOTE(review): :allow_redirections is not a stock open-uri option —
    # presumably supplied by an extension loaded elsewhere; confirm.
    # `html` is assigned here but not read again in this method.
    html = URI.open(url, :allow_redirections => :all) do |f|
      content_type = f.content_type.downcase
      if content_type == "application/octet-stream" # open failed
        # Fall back to the raw Content-Type header with any ";..." parameters removed.
        # NOTE(review): if that header is absent this would call gsub on nil — confirm.
        content_type = f.meta["content-type"].gsub(/;.*$/, '')
      end
      # The URL itself serves a feed content type: record it and stop scanning.
      if CONTENT_TYPES.include?(content_type)
        return self.add_feed(url, nil)
      end

      doc = Nokogiri::HTML(f.read)

      # Remember the document's <base href>, if any, so add_feed can resolve
      # hrefs against it.
      if doc.at("base") and doc.at("base")["href"]
        @base_uri = doc.at("base")["href"]
      else
        @base_uri = nil
      end

      # first with links: atom:link elements with rel="self" and a feed content type
      (doc/"atom:link").each do |l|
        # NOTE(review): present? is not core Ruby — presumably provided by
        # ActiveSupport or defined elsewhere in the project; confirm.
        next unless l["rel"] && l["href"].present?
        if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and l["rel"].downcase == "self"
          self.add_feed(l["href"], url, @base_uri)
        end
      end

      # <link rel="alternate"> / <link rel="service.feed"> whose type is a feed content type.
      doc.xpath("//link[@rel='alternate' or @rel='service.feed'][@href][@type]").each do |l|
        if CONTENT_TYPES.include?(l['type'].downcase.strip)
          self.add_feed(l["href"], url, @base_uri)
        end
      end

      # <link rel="alternate" type="application/json"> is accepted only when
      # its href also looks like a feed.
      doc.xpath("//link[@rel='alternate' and @type='application/json'][@href]").each do |e|
        self.add_feed(e['href'], url, @base_uri) if self.looks_like_feed?(e['href'])
      end

      # First anchor pass: hrefs that look like feeds and either contain a
      # slash or match the host (the host is interpolated into the regex
      # unescaped).
      (doc/"a").each do |a|
        next unless a["href"]
        if self.looks_like_feed?(a["href"]) and (a["href"] =~ /\// or a["href"] =~ /#{url_uri.host}/)
          self.add_feed(a["href"], url, @base_uri)
        end
      end

      # Second anchor pass: every href that looks like a feed. add_feed skips
      # URLs already recorded, so this only appends what the first pass left out.
      (doc/"a").each do |a|
        next unless a["href"]
        if self.looks_like_feed?(a["href"])
          self.add_feed(a["href"], url, @base_uri)
        end
      end

      # Added support for feeds like http://tabtimes.com/tbfeed/mashable/full.xml
      # (the "." in the pattern is unescaped, so it matches any character before "xml").
      if url.match(/.xml$/) and doc.root and doc.root["xml:base"] and doc.root["xml:base"].strip == url.strip
        self.add_feed(url, nil)
      end
    end
  rescue Timeout::Error => err
    $stderr.puts "Timeout error occurred with `#{url}: #{err}'"
  rescue OpenURI::HTTPError => the_error
    $stderr.puts "Error occurred with `#{url}': #{the_error}"
  rescue SocketError => err
    $stderr.puts "Socket error occurred with: `#{url}': #{err}"
  rescue => ex
    $stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
  ensure
    # Overrides any value returned inside the block above and discards any
    # exception still pending at this point.
    return @feeds
  end

end
looks_like_feed?(url) click to toggle source
# File lib/feedbag.rb, line 168
# Heuristic check on the text of +url+ alone (nothing is fetched). It matches,
# case-insensitively, a .rdf/.xml/.rss ending (optionally followed by a query
# string and a ":suffix"), a feed=rss or feed=atom fragment anywhere, or a
# URL ending in "atom" or "feed" with an optional trailing slash.
#
# Returns true or false.
def looks_like_feed?(url)
  feed_pattern = /(\.(rdf|xml|rss)(\?([\w'\-%]?(=[\w'\-%.]*)?(&|#)?)+)?(:[\w'\-%]+)?$|feed=(rss|atom)|(atom|feed)\/?$)/i
  !(url =~ feed_pattern).nil?
end