class ChupaText::UTF8Converter
Constants
- UTF_16BE_BOM
- UTF_16LE_BOM
- UTF_32BE_BOM
- UTF_32LE_BOM
- UTF_8_BOM
Public Class Methods
new(string, max_size: nil)
click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 19
# Remembers the input and the optional size cap for a later #convert call.
#
# @param string [String] the text to convert.
# @param max_size [Integer, nil] upper bound, in bytes, applied to the
#   converted result; nil disables truncation.
def initialize(string, max_size: nil)
  @string, @max_size = string, max_size
end
Public Instance Methods
convert()
click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 24
# Produces a UTF-8 rendering of the wrapped string, limited to the
# configured maximum size when one was given.
#
# Strategy, in order:
# 1. UTF-8 tagged input: drop a leading byte order mark, then truncate.
# 2. Input tagged with any encoding other than UTF-8 or ASCII-8BIT:
#    transcode to UTF-8, dropping characters that cannot be converted.
# 3. ASCII-8BIT input that is pure ASCII: truncate and return as is.
# 4. ASCII-8BIT input with a byte order mark: strip the mark and transcode
#    from the encoding it announces.
# 5. ASCII-8BIT input whose bytes are valid in a guessed encoding:
#    transcode from that encoding.
# 6. Otherwise: cut to the size limit, treat the bytes as UTF-8, drop
#    invalid sequences and remove control characters.
#
# @return [String] the converted text.
def convert
  # Shared transcoding policy: silently drop anything unconvertible.
  drop_unconvertible = {invalid: :replace, undef: :replace, replace: ""}
  source_encoding = @string.encoding

  if source_encoding == Encoding::UTF_8
    skip, = detect_bom
    body = skip ? @string.byteslice(skip, @string.bytesize - skip) : @string
    return truncate(body)
  end

  unless source_encoding == Encoding::ASCII_8BIT
    return truncate(@string.encode(Encoding::UTF_8, **drop_unconvertible))
  end

  # From here on the input is binary-tagged and its real encoding is unknown.
  return truncate(@string) if @string.ascii_only?

  skip, announced = detect_bom
  if announced
    payload = @string.byteslice(skip, @string.bytesize - skip)
    return truncate(payload.encode(Encoding::UTF_8,
                                   announced,
                                   **drop_unconvertible))
  end

  guessed = guess_encoding
  if guessed
    return truncate(@string.encode(Encoding::UTF_8,
                                   guessed,
                                   **drop_unconvertible))
  end

  # Last resort: the size limit is applied to the raw bytes before cleaning.
  salvaged = @max_size ? @string.byteslice(0, @max_size) : @string.dup
  salvaged.force_encoding(Encoding::UTF_8)
  salvaged.scrub!("")
  salvaged.gsub!(/\p{Control}+/, "")
  salvaged
end
Private Instance Methods
detect_bom()
click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 84
# Checks whether the string starts with one of the known byte order marks.
#
# The leading bytes are compared as binary data against the BOM constants,
# four-byte marks first, then the three-byte mark, then two-byte marks.
# NOTE(review): the constant values are defined outside this view; the
# longest-first order presumably exists because a UTF-32LE mark begins with
# the same bytes as a UTF-16LE mark -- confirm against the constants.
#
# @return [Array(Integer, Encoding), nil] the mark's length in bytes and the
#   encoding it announces, or nil when no mark is present.
def detect_bom
  head = @string.byteslice(0, 4).b
  return 4, Encoding::UTF_32BE if UTF_32BE_BOM == head
  return 4, Encoding::UTF_32LE if UTF_32LE_BOM == head

  return 3, Encoding::UTF_8 if UTF_8_BOM == head.byteslice(0, 3)

  first_two = head.byteslice(0, 2)
  return 2, Encoding::UTF_16BE if UTF_16BE_BOM == first_two
  return 2, Encoding::UTF_16LE if UTF_16LE_BOM == first_two

  nil
end
guess_encoding()
click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 107
# Guesses the encoding of the wrapped bytes by trying a fixed list of
# candidates and picking the first one under which the bytes are valid.
#
# The candidates are tried in order: UTF-8, EUC-JP, Windows-31J.
#
# Probing is done on a private copy so that the wrapped string -- which
# belongs to the caller -- is never re-tagged, not even temporarily. This
# also lets frozen input be inspected without raising FrozenError.
#
# @return [Encoding, nil] the first candidate encoding in which the bytes
#   form a valid string, or nil when none of them fits.
def guess_encoding
  probe = @string.dup
  candidates = [
    Encoding::UTF_8,
    Encoding::EUC_JP,
    Encoding::Windows_31J,
  ]
  candidates.find do |candidate|
    probe.force_encoding(candidate).valid_encoding?
  end
end
truncate(string)
click to toggle source
# File lib/chupa-text/utf8-converter.rb, line 125
# Limits +string+ to the configured maximum number of bytes.
#
# When no limit is set, or the string already fits, the very same object is
# returned. Otherwise the leading bytes are copied out and any byte sequence
# left invalid by the cut (for example a character split in the middle) is
# removed.
#
# @param string [String] the text to limit.
# @return [String] +string+ itself, or a shortened copy.
def truncate(string)
  return string unless @max_size && string.bytesize > @max_size

  string.byteslice(0, @max_size).tap { |head| head.scrub!("") }
end