class ChupaText::UTF8Converter

Constants

UTF_16BE_BOM
UTF_16LE_BOM
UTF_32BE_BOM
UTF_32LE_BOM
UTF_8_BOM

Public Class Methods

new(string, max_size: nil) click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 19
# Remembers the text to convert and an optional size limit.
#
# string::   the String that #convert will turn into UTF-8.
# max_size:: optional limit, compared against byte sizes when the result
#            is truncated; +nil+ (the default) disables truncation.
def initialize(string, max_size: nil)
  @string, @max_size = string, max_size
end

Public Instance Methods

convert() click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 24
# Converts the wrapped string to UTF-8, honouring the optional byte limit.
#
# The strategy depends on the encoding tag of the input:
#
# * UTF-8: a leading UTF-8 BOM is stripped and the rest is returned.
#   A UTF-8-tagged string that starts with a UTF-16/UTF-32 BOM cannot be
#   valid UTF-8, so it is transcoded from the encoding the BOM announces
#   instead of being returned with only the BOM bytes cut off.
# * ASCII-8BIT (binary): ASCII-only data is returned as is (its encoding
#   tag is left unchanged). Otherwise a BOM, if present, selects the source
#   encoding; failing that the encoding is guessed; failing that the bytes
#   are forced to UTF-8, invalid sequences are dropped and control
#   characters are removed.
# * Any other encoding: the string is transcoded from its own encoding.
#
# Bytes that are invalid or have no UTF-8 equivalent are dropped while
# transcoding.
#
# Returns the converted String, at most +max_size+ bytes long when a
# limit was given.
def convert
  replace_options = {invalid: :replace, undef: :replace, replace: ""}
  bom_size, bom_encoding = detect_bom

  case @string.encoding
  when Encoding::UTF_8
    return truncate(@string) unless bom_encoding

    if bom_encoding == Encoding::UTF_8
      return truncate(@string.byteslice(bom_size,
                                        @string.bytesize - bom_size))
    end
    # A UTF-16/32 BOM on a UTF-8-tagged string: the tag is wrong, so fall
    # through and transcode from the encoding announced by the BOM.
  when Encoding::ASCII_8BIT
    return truncate(@string) if @string.ascii_only?
  else
    return truncate(@string.encode(Encoding::UTF_8, **replace_options))
  end

  if bom_encoding
    string_without_bom = @string.byteslice(bom_size,
                                           @string.bytesize - bom_size)
    return truncate(string_without_bom.encode(Encoding::UTF_8,
                                              bom_encoding,
                                              **replace_options))
  end

  guessed_encoding = guess_encoding
  if guessed_encoding
    return truncate(@string.encode(Encoding::UTF_8,
                                   guessed_encoding,
                                   **replace_options))
  end

  # Last resort: treat the bytes as UTF-8 and keep only what is usable.
  # The slice is taken before scrubbing so the limit applies to raw bytes.
  utf8_string = @max_size ? @string.byteslice(0, @max_size) : @string.dup
  utf8_string.force_encoding(Encoding::UTF_8)
  utf8_string.scrub!("")
  # gsub! returns nil when nothing matched, so return the string explicitly.
  utf8_string.gsub!(/\p{Control}+/, "")
  utf8_string
end

Private Instance Methods

detect_bom() click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 84
# Looks for a Unicode byte order mark at the start of the wrapped string.
#
# Signatures are tried in a fixed order: the two 4-byte UTF-32 marks, then
# the 3-byte UTF-8 mark, then the two 2-byte UTF-16 marks. Each is compared
# with a binary copy of the leading bytes of the same length.
#
# Returns a two-element Array of the mark's size in bytes and the Encoding
# it announces, or +nil+ when no signature matches.
def detect_bom
  signatures = [
    [4, UTF_32BE_BOM, Encoding::UTF_32BE],
    [4, UTF_32LE_BOM, Encoding::UTF_32LE],
    [3, UTF_8_BOM, Encoding::UTF_8],
    [2, UTF_16BE_BOM, Encoding::UTF_16BE],
    [2, UTF_16LE_BOM, Encoding::UTF_16LE],
  ]
  signatures.each do |size, bom, encoding|
    leading_bytes = @string.byteslice(0, size).b
    # `===` keeps the matching semantics of a `case`/`when` on the constant.
    return size, encoding if bom === leading_bytes
  end

  nil
end
guess_encoding() click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 107
# Guesses the encoding of the wrapped string by trial.
#
# Each candidate encoding is applied, in order, to a private copy of the
# string; the first one under which the bytes form a valid sequence wins.
# Probing a copy leaves the wrapped string untouched: frozen input works
# and its encoding tag is never changed, not even temporarily.
#
# Returns the first matching Encoding (UTF-8, EUC-JP or Windows-31J), or
# +nil+ when the bytes are valid in none of them.
def guess_encoding
  candidates = [
    Encoding::UTF_8,
    Encoding::EUC_JP,
    Encoding::Windows_31J,
  ]
  probe = @string.dup
  candidates.find do |candidate|
    probe.force_encoding(candidate).valid_encoding?
  end
end
truncate(string) click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 125
# Limits +string+ to the configured maximum size in bytes.
#
# The given string itself is returned when no limit is set or when it
# already fits. Otherwise a new string holding the first +@max_size+ bytes
# is returned, with invalid byte sequences removed (for example a character
# cut in half by the byte-wise slice).
def truncate(string)
  limit = @max_size
  return string if !limit || string.bytesize <= limit

  head = string.byteslice(0, limit)
  head.scrub!("")
  head
end