class GozapRss::ChoutiRss

Constants

ENCODING

Attributes

items[R]

Public Class Methods

new(uri) click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 66
# Builds a reader for the feed located at +uri+.
#
# Stores the address in @url, starts with an empty @items list and a @ttl of
# 120, fetches the raw feed body through get_feed_content and then parses it
# right away with parse_rss.
#
# uri - the feed address handed to get_feed_content.
def initialize(uri)
  @url, @items, @ttl = uri, [], 120
  @content = get_feed_content(uri)
  parse_rss
end

Private Instance Methods

create_from(arg) click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 78
# Prints which kind of input +arg+ looks like and returns nil.
#
# * "IO"     - +arg+ responds to read, readline, nil? and eof?
# * "string" - +arg+ responds to to_str (stringio is required on this path)
# * "self"   - +arg+ is a kind of Source
#
# NOTE(review): Source is not defined in this method; it must be resolvable
# where this code runs, otherwise the third check raises NameError.
#
# Raises RuntimeError when +arg+ matches none of the above.
def create_from(arg)
  io_like = [:read, :readline, :nil?, :eof?].all? { |probe| arg.respond_to?(probe) }
  return puts("IO") if io_like

  if arg.respond_to?(:to_str)
    require 'stringio'
    return puts("string")
  end

  return puts("self") if arg.kind_of?(Source)

  message = "#{arg.class} is not a valid input stream.  It must walk \n" +
            "like either a String, an IO, or a Source."
  raise message
end
from_iso_8859_15(str) click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 95
# Decodes ISO-8859-15 (Latin-9) bytes into a UTF-8 string.
#
# Every byte of +str+ is treated as one character. The eight byte values that
# differ from ISO-8859-1 are mapped to their Latin-9 code points; every other
# byte keeps its numeric value as the code point.
#
# str - String whose bytes are read with unpack('C*').
#
# Returns the string produced by packing the code points with 'U*'.
def from_iso_8859_15(str)
  # Byte => Unicode code point, only for the slots Latin-9 redefines.
  latin9_overrides = {
    0xA4 => 0x20AC,
    0xA6 => 0x0160,
    0xA8 => 0x0161,
    0xB4 => 0x017D,
    0xB8 => 0x017E,
    0xBC => 0x0152,
    0xBD => 0x0153,
    0xBE => 0x0178
  }

  code_points = str.unpack('C*').map { |byte| latin9_overrides.fetch(byte, byte) }
  code_points.pack('U*')
end
get_feed_content(uri) click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 186
# Downloads the body found at +uri+, making at most three attempts.
#
# Each failed attempt is reported through logger_exception and logger.info
# before the next try. Only StandardError is treated as a retryable failure,
# so Interrupt, SystemExit and similar process-level exceptions propagate to
# the caller instead of being swallowed and retried.
#
# uri - the address passed to open.
#
# Returns the body that was read, or "" when every attempt failed.
def get_feed_content(uri)
  retry_times = 3
  content = ""
  begin
    # NOTE(review): this relies on open-uri having patched Kernel#open to
    # accept URLs; Ruby 3.0+ removed that behaviour in favour of URI.open -
    # confirm the supported Ruby version before upgrading.
    open(uri, "User-Agent" => "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
         :read_timeout => 60, :redirect => true) do |f|
      content = f.read
    end
  rescue StandardError => e
    logger_exception e
    logger.info "retry #{retry_times} #{uri}"

    # Bounded retry: the counter goes 3 -> 2 -> 1 -> 0, i.e. three attempts.
    retry if (retry_times -= 1) > 0
  end
  content
end
parse_rss() click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 166
# Parses @content as a feed and fills @title, @description, @ttl and @items.
#
# Does nothing when @content is nil or empty. Otherwise the content is first
# normalised by transe_code and then handed to RSS::Parser.parse with
# validation disabled (second argument false). A channel ttl greater than
# zero is multiplied by 60 before being stored in @ttl. Each feed item is
# wrapped in a ChoutiRssItem and appended to @items.
#
# Any StandardError raised along the way is reported through logger_exception
# and not re-raised; Interrupt, SystemExit and other non-StandardError
# exceptions propagate to the caller.
def parse_rss
  return if @content.nil? || @content.empty?

  begin
    transe_code
    rss = RSS::Parser.parse(@content, false)
    channel = rss.channel
    @title = channel.title.to_s.html_format
    @description = channel.description.to_s.html_format
    @ttl = channel.ttl.to_i * 60 if channel.respond_to?(:ttl) && channel.ttl.to_i > 0

    rss.items.each do |item|
      rss_item = ChoutiRssItem.new(item)
      @items << rss_item if rss_item
    end
  rescue StandardError => e
    logger_exception e
  end
end
to_iso_8859_15(content) click to toggle source

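Converts a UTF-8 string to ISO-8859-15 (Latin-9) bytes; characters that have no Latin-9 equivalent are written as numeric character references (&#nnnn;).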

# File lib/gozap_rss/chouti_rss.rb, line 118
# Encodes a UTF-8 string as ISO-8859-15 (Latin-9) bytes.
#
# * The eight characters Latin-9 adds over ISO-8859-1 are written as their
#   single Latin-9 byte.
# * The eight ISO-8859-1 characters that Latin-9 dropped to make room for
#   them, and every code point above 0xFF, are written as a numeric
#   character reference (&#nnnn;).
# * Every other code point up to 0xFF is written as the byte of the same value.
#
# content - String read with unpack('U*').
#
# Returns the string produced by packing the collected bytes with 'C*'.
def to_iso_8859_15(content)
  # Unicode code point => Latin-9 byte, for the characters Latin-9 added.
  latin9_bytes = {
    0x20AC => 0xA4,
    0x0160 => 0xA6,
    0x0161 => 0xA8,
    0x017D => 0xB4,
    0x017E => 0xB8,
    0x0152 => 0xBC,
    0x0153 => 0xBD,
    0x0178 => 0xBE
  }

  bytes = []
  content.unpack('U*').each do |code_point|
    if (latin9_byte = latin9_bytes[code_point])
      bytes << latin9_byte
    elsif code_point > 0xFF || latin9_bytes.value?(code_point)
      # The byte slots listed as values above are occupied by the added
      # characters, so the Latin-1 characters that used to live there have no
      # byte of their own. Append the entity's bytes (not the entity String
      # itself), because the array is packed with 'C*' below.
      bytes.concat "&\##{code_point};".unpack('C*')
    else
      bytes << code_point
    end
  end
  bytes.pack('C*')
end
transe_code() click to toggle source
# File lib/gozap_rss/chouti_rss.rb, line 156
# Normalises @content in place before it is parsed.
#
# @content is passed through from_iso_8859_15, matched against ENCODING, and
# then passed through to_iso_8859_15. When capture group 1 of the match
# starts with "GB" (case-insensitive), @content is re-labelled as GBK,
# transcoded to UTF-8, and every encoding="..." attribute in it is rewritten
# to encoding='UTF-8'.
#
# NOTE(review): ENCODING is defined outside this method; presumably it
# captures the declared encoding name in group 1 - confirm.
#
# Returns nil when no GB* encoding was matched, otherwise the rewritten content.
def transe_code
  @content = from_iso_8859_15(@content)
  declared = ENCODING.match(@content)
  @content = to_iso_8859_15(@content)
  return unless declared && declared[1].upcase.start_with?("GB")

  @content = @content.force_encoding("GBK").encode("UTF-8")
  @content = @content.gsub(/\bencoding\s*=\s*["'](.*?)['"]/um, "encoding='UTF-8'")
end