class MultibyteCharsExtrasTest

The default Multibyte Chars proxy has more features than the normal string implementation. Tests for the implementation of these features should run on all Ruby versions and shouldn't be tested through the proxy methods.

Public Instance Methods

test_capitalize_should_be_unicode_aware() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 513
def test_capitalize_should_be_unicode_aware
  # Input => expected result. Only the very first character of the string
  # ends up upper-case; everything after it (including the second word) is
  # lower-cased. The empty string must pass through unchanged.
  expectations = {
    "аБвг аБвг" => "Абвг абвг",
    "аБвг АБВГ" => "Абвг абвг",
    "АБВГ АБВГ" => "Абвг абвг",
    "" => ""
  }

  expectations.each_pair do |input, capitalized|
    assert_equal capitalized, chars(input).capitalize
  end
end
test_class_is_not_forwarded() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 718
def test_class_is_not_forwarded
  # Asking the proxy for its class must answer with the proxy class itself.
  # The expected value is passed first so that a failure message labels the
  # expected and actual values correctly.
  assert_equal ActiveSupport::Multibyte::Chars, BYTE_STRING.dup.mb_chars.class
end
test_composition_exclusion_is_set_up_properly() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 568
def test_composition_exclusion_is_set_up_properly
  # DEVANAGARI LETTER QA written as U+0915 followed by U+093C. Normalizing it
  # breaks when composition exclusion isn't used correctly, so normalize(:c)
  # is expected to hand the sequence back untouched.
  devanagari_qa = [0x915, 0x93c].pack("U*")
  normalized = chars(devanagari_qa).normalize(:c)

  assert_equal devanagari_qa, normalized
end
test_downcase_should_be_unicode_aware() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 503
def test_downcase_should_be_unicode_aware
  # Cyrillic and ASCII capitals are lower-cased; the embedded NUL byte is kept.
  mixed_case = chars("аБвгд\0F")
  assert_equal "абвгд\0f", mixed_case.downcase

  # This input is expected to come back unchanged.
  caseless = chars("こにちわ")
  assert_equal "こにちわ", caseless.downcase
end
test_limit_should_keep_under_the_specified_byte_limit() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 561
def test_limit_should_keep_under_the_specified_byte_limit
  # +limit+ takes a budget in bytes, so both the range of limits tried and
  # the size check are expressed in bytes. Comparing character counts
  # instead would let a result that is too long in bytes go unnoticed.
  example = chars(UNICODE_STRING)
  (1..UNICODE_STRING.bytesize).each do |limit|
    assert example.limit(limit).to_s.bytesize <= limit
  end
end
test_limit_should_not_break_on_blank_strings() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 531
def test_limit_should_not_break_on_blank_strings
  # Limiting an empty string must give back an equal value, whether the
  # limit is zero or positive.
  blank = chars("")

  [0, 1].each do |byte_limit|
    assert_equal blank, blank.limit(byte_limit)
  end
end
test_limit_should_work_on_a_multibyte_string() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 537
def test_limit_should_work_on_a_multibyte_string
  subject = chars(UNICODE_STRING)
  full_size = UNICODE_STRING.bytesize

  # A limit equal to the full byte size leaves the string intact.
  assert_equal UNICODE_STRING, subject.limit(full_size)

  # [byte limit, expected result]. A limit that falls inside a character
  # drops that whole character (limits 1 and 8), and a limit larger than the
  # string returns everything (limit 50).
  [
    [0, ""],
    [1, ""],
    [3, "こ"],
    [6, "こに"],
    [8, "こに"],
    [9, "こにち"],
    [50, "こにちわ"]
  ].each do |byte_limit, expected|
    assert_equal expected, subject.limit(byte_limit)
  end
end
test_limit_should_work_on_an_ascii_string() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 551
def test_limit_should_work_on_an_ascii_string
  subject = chars(ASCII_STRING)

  # Limiting to the string's own length returns it whole.
  assert_equal ASCII_STRING, subject.limit(ASCII_STRING.length)

  # [limit, expected prefix]. The final entry uses a limit well beyond the
  # end of the string.
  [
    [0, ""],
    [1, "o"],
    [2, "oh"],
    [4, "ohay"],
    [50, "ohayo"]
  ].each do |byte_limit, expected|
    assert_equal expected, subject.limit(byte_limit)
  end
end
test_normalization_C_pri_29() click to toggle source

Test for Public Review Issue #29: a bad explanation of composition might lead to a bad implementation. See www.unicode.org/review/pr-29.html

# File activesupport/test/multibyte_chars_test.rb, line 576
def test_normalization_C_pri_29
  # Code point sequences that normalize(:c) is expected to leave exactly as
  # they are.
  sequences = [
    [0x0B47, 0x0300, 0x0B3E],
    [0x1100, 0x0300, 0x1161]
  ]

  sequences.each do |codepoints|
    text = codepoints.pack("U*")
    assert_equal_codepoints text, chars(text).normalize(:c)
  end
end
test_normalization_shouldnt_strip_null_bytes() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 585
def test_normalization_shouldnt_strip_null_bytes
  with_nul = "Test\0test"

  # Every normalization form must keep the embedded NUL byte ...
  [:kc, :c, :d, :kd].each do |form|
    assert_equal with_nul, chars(with_nul).normalize(form)
  end

  # ... and so must the decompose / compose steps when called directly.
  assert_equal with_nul, chars(with_nul).decompose
  assert_equal with_nul, chars(with_nul).compose
end
test_should_compute_grapheme_length() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 611
def test_should_compute_grapheme_length
  # Each entry is [input, expected grapheme count]. An input is either a
  # literal string or a list of character class names that
  # string_from_classes turns into a string. The GB* labels name the rule
  # each group of entries exercises.
  cases = [
    ["", 0],
    ["abc", 3],
    ["こにちわ", 4],
    [[0x0924, 0x094D, 0x0930].pack("U*"), 2],
    # GB3
    [%w(cr lf), 1],
    # GB4
    [%w(cr n), 2],
    [%w(lf n), 2],
    [%w(control n), 2],
    [%w(cr extend), 2],
    [%w(lf extend), 2],
    [%w(control extend), 2],
    # GB5
    [%w(n cr), 2],
    [%w(n lf), 2],
    [%w(n control), 2],
    [%w(extend cr), 2],
    [%w(extend lf), 2],
    [%w(extend control), 2],
    # GB6
    [%w(l l), 1],
    [%w(l v), 1],
    [%w(l lv), 1],
    [%w(l lvt), 1],
    # GB7
    [%w(lv v), 1],
    [%w(lv t), 1],
    [%w(v v), 1],
    [%w(v t), 1],
    # GB8
    [%w(lvt t), 1],
    [%w(t t), 1],
    # GB8a
    [%w(r r), 1],
    # GB9
    [%w(n extend), 1],
    # GB9a
    [%w(n spacingmark), 1],
    # GB10
    [%w(n n), 2],
    # Other
    [%w(n cr lf n), 3],
    [%w(n l v t), 2],
    [%w(cr extend n), 3],
  ]

  cases.each do |input, expected_length|
    # Class-name lists are expanded into a real string; plain strings are used as given.
    text = input.is_a?(Array) ? string_from_classes(input) : input
    # The original input is attached as the failure message to identify the case.
    assert_equal expected_length, chars(text).grapheme_length, input.inspect
  end
end
test_simple_normalization() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 596
def test_simple_normalization
  # NOTE: these literals are DECIMAL code points, not hexadecimal. They are
  # therefore precomposed characters (not "D" plus combining marks), which is
  # what the expected values below rely on.
  comp_str = [
    44,  # U+002C COMMA
    307, # U+0133 LATIN SMALL LIGATURE IJ
    328, # U+0148 LATIN SMALL LETTER N WITH CARON
    323  # U+0143 LATIN CAPITAL LETTER N WITH ACUTE
  ].pack("U*")

  # An empty string normalizes to an empty string.
  assert_equal_codepoints "", chars("").normalize
  # :kc -- the ligature is replaced by "i" (105) and "j" (106); 328 and 323 stay composed.
  assert_equal_codepoints [44, 105, 106, 328, 323].pack("U*"), chars(comp_str).normalize(:kc).to_s
  # :c -- the input is returned unchanged.
  assert_equal_codepoints [44, 307, 328, 323].pack("U*"), chars(comp_str).normalize(:c).to_s
  # :d -- 328 becomes "n" (110) + U+030C (780) and 323 becomes "N" (78) + U+0301 (769); the ligature is kept.
  assert_equal_codepoints [44, 307, 110, 780, 78, 769].pack("U*"), chars(comp_str).normalize(:d).to_s
  # :kd -- both the ligature and the accented letters are split apart.
  assert_equal_codepoints [44, 105, 106, 110, 780, 78, 769].pack("U*"), chars(comp_str).normalize(:kd).to_s
end
test_swapcase_should_be_unicode_aware() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 508
def test_swapcase_should_be_unicode_aware
  # Every cased letter flips, non-ASCII ones included; the NUL byte is kept.
  mixed_case = chars("АAÉü\0F")
  assert_equal "аaéÜ\0f", mixed_case.swapcase

  # This input is expected to come back unchanged.
  caseless = chars("こにちわ")
  assert_equal "こにちわ", caseless.swapcase
end
test_tidy_bytes_should_forcibly_tidy_bytes_if_specified() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 711
def test_tidy_bytes_should_forcibly_tidy_bytes_if_specified
  # Valid as both CP-1252 and UTF-8, but with different interpretations.
  ambiguous_bytes = "\xF0\xA5\xA4\xA4"

  # By default the result is not the character below.
  default_result = chars(ambiguous_bytes).tidy_bytes
  assert_not_equal "𥤤", default_result

  # Passing true requests forcible conversion to UTF-8.
  forced_result = chars(ambiguous_bytes).tidy_bytes(true)
  assert_equal "𥤤", forced_result
end
test_tidy_bytes_should_tidy_bytes() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 668
def test_tidy_bytes_should_tidy_bytes
  # Lone bytes paired with the text each one is expected to become.
  lone_bytes = {
    "\x21" => "!",   # Valid ASCII byte, low
    "\x41" => "A",   # Valid ASCII byte, mid
    "\x7E" => "~",   # Valid ASCII byte, high
    "\x80" => "€",   # Continuation byte, low (CP-1252)
    "\x94" => "”",   # Continuation byte, mid (CP-1252)
    "\x9F" => "Ÿ",   # Continuation byte, high (CP-1252)
    "\xC0" => "À",   # Overlong encoding, start of 2-byte sequence, but codepoint < 128
    "\xC1" => "Á",   # Overlong encoding, start of 2-byte sequence, but codepoint < 128
    "\xC2" => "Â",   # Start of 2-byte sequence, low
    "\xC8" => "È",   # Start of 2-byte sequence, mid
    "\xDF" => "ß",   # Start of 2-byte sequence, high
    "\xE0" => "à",   # Start of 3-byte sequence, low
    "\xE8" => "è",   # Start of 3-byte sequence, mid
    "\xEF" => "ï",   # Start of 3-byte sequence, high
    "\xF0" => "ð",   # Start of 4-byte sequence
    "\xF1" => "ñ",   # Unused byte
    "\xFF" => "ÿ",   # Restricted byte
    "\x00" => "\x00" # Null char
  }

  # [text before, text after] to wrap around each byte, so that it is also
  # tried next to ASCII and multibyte neighbours.
  neighbours = [["", "a"], ["", "á"], ["a", "a"], ["á", "á"], ["a", ""], ["á", ""]]

  lone_bytes.each_pair do |raw, tidy|
    # The byte on its own.
    assert_equal tidy, chars(raw).tidy_bytes.to_s

    # The byte repeated two and three times in a row.
    [2, 3].each do |copies|
      assert_equal tidy * copies, chars(raw * copies).tidy_bytes
    end

    # The byte surrounded by other text.
    neighbours.each do |before, after|
      assert_equal "#{before}#{tidy}#{after}", chars("#{before}#{raw}#{after}").tidy_bytes
    end
  end

  # A run of several bytes tidied together; the result must also unpack
  # with "U*" without raising.
  mixed_bytes = "\270\236\010\210\245"
  expected_text = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack("U*")
  assert_equal_codepoints expected_text, chars(mixed_bytes).tidy_bytes
  assert_nothing_raised { chars(mixed_bytes).tidy_bytes.to_s.unpack("U*") }

  # UTF-8 leading byte followed by too few continuation bytes
  truncated_sequence = "\xf0\xa5\xa4\x21"
  assert_equal_codepoints "\xc3\xb0\xc2\xa5\xc2\xa4\x21", chars(truncated_sequence).tidy_bytes
end
test_titleize_should_be_unicode_aware() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 522
def test_titleize_should_be_unicode_aware
  # Input => expected title-cased result, covering accented Latin and
  # Cyrillic text: each word keeps a capital first letter and is otherwise
  # lower-cased.
  expectations = {
    "ÉL QUE SE ENTERÓ" => "Él Que Se Enteró",
    "аБвг аБвг" => "Абвг Абвг"
  }

  expectations.each_pair do |input, titleized|
    assert_equal titleized, chars(input).titleize
  end
end
test_titleize_should_not_affect_characters_that_do_not_case_fold() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 527
def test_titleize_should_not_affect_characters_that_do_not_case_fold
  # Text with no case distinction must come back from titleize unchanged.
  caseless = chars("日本語")
  assert_equal "日本語", caseless.titleize
end
test_upcase_should_be_unicode_aware() click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 498
def test_upcase_should_be_unicode_aware
  # Cyrillic and ASCII small letters are upper-cased; the NUL byte is kept.
  mixed_case = chars("аБвгд\0f")
  assert_equal "АБВГД\0F", mixed_case.upcase

  # This input is expected to come back unchanged.
  caseless = chars("こにちわ")
  assert_equal "こにちわ", caseless.upcase
end

Private Instance Methods

string_from_classes(classes) click to toggle source
# File activesupport/test/multibyte_chars_test.rb, line 724
def string_from_classes(classes)
  # One representative code point for each character class described in
  # UAX #29, keyed by the class name.
  sample_codepoints = {
    l: 0x1100, v: 0x1160, t: 0x11A8, lv: 0xAC00, lvt: 0xAC01,
    cr: 0x000D, lf: 0x000A, extend: 0x094D, n: 0x64,
    spacingmark: 0x0903, r: 0x1F1E6, control: 0x0001
  }

  # Look up every requested class name, in order, and pack the resulting
  # code points into a single string.
  class_keys = classes.map(&:intern)
  sample_codepoints.values_at(*class_keys).pack("U*")
end