module Rucc::UTF
Public Class Methods
count_leading_ones(c)
click to toggle source
@param [Integer] c @return [Integer]
# File lib/rucc/utf.rb, line 116
# Counts how many consecutive 1 bits appear at the top of the low eight bits
# of +c+, scanning from bit 7 down to bit 0.
#
# @param [Integer] c
# @return [Integer] a value in 0..8; 8 means bits 7..0 are all set
def count_leading_ones(c)
  # Index of the highest cleared bit, or nil when no bit in 7..0 is clear.
  first_clear = 7.downto(0).find { |bit| (c & (1 << bit)) == 0 }
  first_clear ? 7 - first_clear : 8
end
read_rune(s)
click to toggle source
@param [<Integer>] s @return [<Integer, <Integer>>]
# File lib/rucc/utf.rb, line 82
# Decodes one rune from the front of the byte array +s+.
#
# The number of leading 1 bits in the first byte (count_leading_ones) gives
# the sequence length: 0 means the byte is returned as-is, 2..4 select a
# multi-byte sequence, and any other length is rejected.
#
# @param [<Integer>] s bytes to decode
# @return [<Integer, <Integer>>] the decoded rune and the remaining bytes
# @raise [RuntimeError] when the sequence is longer than +s+, when a trailing
#   byte is not of the form 10xxxxxx, or when the length is not 2, 3 or 4
def read_rune(s)
  len = count_leading_ones(s[0])
  return s[0], s[1..-1] if len == 0
  raise "invalid UTF-8 sequence" if len > s.size # error("invalid UTF-8 sequence");

  # Every byte after the first must carry the 10xxxxxx marker.
  trailing = s[1...len]
  trailing.each do |byte|
    unless (byte & 0xC0) == 0x80
      raise "invalid UTF-8 continuation byte" # error("invalid UTF-8 continuation byte");
    end
  end

  # Payload bits kept from the first byte, by sequence length.
  lead_mask =
    case len
    when 2 then 0x1F
    when 3 then 0xF
    when 4 then 0x7
    else raise "invalid UTF-8 sequence" # error("invalid UTF-8 sequence");
    end

  # Each trailing byte contributes its low six bits below what came before.
  rune = trailing.reduce(s[0] & lead_mask) { |acc, byte| (acc << 6) | (byte & 0x3F) }
  return rune, s[len..-1]
end
to_utf16(str)
click to toggle source
@param [String] str @return [String]
# File lib/rucc/utf.rb, line 39
# Re-encodes +str+ as a sequence of 16-bit units.
#
# The bytes of +str+ are decoded one rune at a time with read_rune. A rune
# below 0x10000 is emitted as a single unit; a larger rune is emitted as two
# units (a surrogate pair). Every unit is appended through write16.
#
# @param [String] str
# @return [String] the buffer the units were appended to
def to_utf16(str)
  out = ""
  rest = str.bytes
  until rest.empty?
    rune, rest = read_rune(rest)
    units =
      if rune < 0x10000
        [rune]
      else
        # 0xD7C0 is 0xD800 - (0x10000 >> 10): high surrogate base with the
        # 0x10000 offset already folded in.
        [(rune >> 10) + 0xD7C0, (rune & 0x3FF) + 0xDC00]
      end
    units.each { |unit| write16(out, unit) }
  end
  out
end
to_utf32(str)
click to toggle source
@param [String] str @return [String]
# File lib/rucc/utf.rb, line 56
# Re-encodes +str+ as a sequence of 32-bit units.
#
# The bytes of +str+ are decoded one rune at a time with read_rune and each
# rune is appended to the output buffer through write32.
#
# @param [String] str
# @return [String] the buffer the units were appended to
def to_utf32(str)
  out = ""
  rest = str.bytes
  until rest.empty?
    rune, rest = read_rune(rest)
    write32(out, rune)
  end
  out
end
write16(b, rune)
click to toggle source
@param(return) [String] b @param [Integer] rune
# File lib/rucc/utf.rb, line 68
# Appends the low 16 bits of +rune+ to +b+ as two raw bytes, low byte first.
#
# String#<< with an Integer appends a *code point* in the receiver's
# encoding, so on a UTF-8 buffer a byte value of 0x80 or more would be
# written as a multi-byte UTF-8 sequence instead of one byte. To keep the
# output byte-exact, the buffer is retagged as binary and the two bytes are
# appended as a packed string.
#
# @param(return) [String] b byte buffer; mutated in place and retagged as
#   Encoding::BINARY
# @param [Integer] rune value to write; only bits 0..15 are used
# @return [String] +b+
def write16(b, rune)
  b.force_encoding(Encoding::BINARY)
  # "v" packs a 16-bit unsigned integer in little-endian byte order.
  b << [rune & 0xFFFF].pack("v")
end
write32(b, rune)
click to toggle source
@param(return) [String] b @param [Integer] rune
# File lib/rucc/utf.rb, line 75
# Appends +rune+ to +b+ as two 16-bit units via write16: first the low
# 16 bits, then whatever remains after shifting those bits out.
#
# @param(return) [String] b buffer to append to
# @param [Integer] rune
def write32(b, rune)
  low_half = rune & 0xFFFF
  high_half = rune >> 16
  write16(b, low_half)
  write16(b, high_half)
end
write_utf8(b, rune)
click to toggle source
@param(return) [String] b @param [Integer] rune
# File lib/rucc/utf.rb, line 6
# Appends +rune+ to +b+ as a single character.
#
# An Integer appended to a String is treated as a code point and encoded in
# the string's own encoding, so the result is UTF-8 when +b+ is a UTF-8
# string (Ruby's default for string literals). No manual encoding is needed.
#
# @param(return) [String] b buffer to append to
# @param [Integer] rune code point to append
def write_utf8(b, rune)
  b.concat(rune)

  # Manual encoder kept for reference; superseded by the append above.
  #
  # if rune < 0x80
  #   b << rune
  #   return
  # end
  # if rune < 0x800
  #   b << (0xC0 | (rune >> 6))
  #   b << (0x80 | (rune & 0x3F))
  #   return
  # end
  # if rune < 0x10000
  #   b << (0xE0 | (rune >> 12))
  #   b << (0x80 | ((rune >> 6) & 0x3F))
  #   b << (0x80 | (rune & 0x3F))
  #   return
  # end
  # if rune < 0x200000
  #   b << (0xF0 | (rune >> 18))
  #   b << (0x80 | ((rune >> 12) & 0x3F))
  #   b << (0x80 | ((rune >> 6) & 0x3F))
  #   b << (0x80 | (rune & 0x3F))
  #   return
  # end
  # raise "invalid UCS character: \\U#{format("%08d", rune)}" # error("invalid UCS character: \\U%08x", rune);
end