module Analects::Encoding

Constants

BIG5
GB

Public Instance Methods

from_big5(str)
# File lib/analects/encoding.rb, line 17
# Recodes +str+ to UTF-8, treating its bytes as being in the encoding held
# by the BIG5 constant.
#
# @param str [String] string whose bytes are in the BIG5 encoding
# @return whatever +recode+ returns for (BIG5, str)
def from_big5(str)
  source_encoding = BIG5
  recode(source_encoding, str)
end
from_gb(str)
# File lib/analects/encoding.rb, line 13
# Recodes +str+ to UTF-8, treating its bytes as being in the encoding held
# by the GB constant.
#
# @param str [String] string whose bytes are in the GB encoding
# @return whatever +recode+ returns for (GB, str)
def from_gb(str)
  source_encoding = GB
  recode(source_encoding, str)
end
ratings(str)

Crude heuristic for guessing which encoding +str+ is in: returns [encoding, score] pairs sorted from the highest score to the lowest.

# File lib/analects/encoding.rb, line 33
# Crude way to guess which encoding +str+ is in.
#
# Every candidate encoding reported by +valid_cjk+ is used to recode +str+.
# Each resulting codepoint earns <tt>6 - idx</tt> points for every entry of
# Analects::Models::Zi.codepoint_ranges (at position +idx+) that contains
# it, so ranges listed earlier weigh more.
#
# NOTE(review): this previously called +all_valid_cjk+, which the module
# does not appear to define; +valid_cjk+ is assumed to be the intended
# helper -- confirm.
#
# @param str [String] bytes of unknown encoding
# @return [Array<Array>] [encoding, score] pairs, highest score first; a
#   string that yields no codepoints scores 0 rather than nil
def ratings(str)
  # Loop-invariant: look the ranges up once instead of once per codepoint.
  ranges = Analects::Models::Zi.codepoint_ranges

  scored = valid_cjk(str).map do |enc|
    weights = recode(enc, str).codepoints.map do |point|
      ranges.each_with_index.map do |range, idx|
        range.include?(point) ? 6 - idx : 0
      end.inject(0, :+)
    end
    # Seed with 0 so empty input sums to 0 instead of nil.
    [enc, weights.inject(0, :+)]
  end

  scored.sort_by(&:last).reverse
end
recode(enc, str)
# File lib/analects/encoding.rb, line 9
# Reinterprets the bytes of +str+ as being in encoding +enc+ and transcodes
# them to UTF-8.
#
# Operates on a copy: String#force_encoding changes the receiver in place,
# which would silently retag the caller's string and fail on frozen input.
#
# @param enc [Encoding, String] encoding the bytes of +str+ are really in
# @param str [String] input bytes; left unmodified
# @return [String] a new UTF-8 string
# @raise [Encoding::InvalidByteSequenceError] if the bytes are not valid in
#   +enc+
# @raise [Encoding::UndefinedConversionError] if a character has no UTF-8
#   mapping
def recode(enc, str)
  str.dup.force_encoding(enc).encode('UTF-8')
end
valid_cjk(str)
# File lib/analects/encoding.rb, line 21
# Lists the candidate encodings under which +str+ can be recoded.
#
# Tries GB first, then BIG5, and keeps each one for which +recode+ raises
# neither of the two conversion errors handled below.
#
# @param str [String] bytes of unknown encoding
# @return [Array] the subset of [GB, BIG5], in that order, that recoded
#   cleanly
def valid_cjk(str)
  [GB, BIG5].select do |candidate|
    begin
      recode(candidate, str)
      true
    rescue ::Encoding::UndefinedConversionError, ::Encoding::InvalidByteSequenceError
      false
    end
  end
end