class Invoca::Utils::GuaranteedUTF8String

Constants

PRIVATE_CP1252_CHAR_PATTERN
PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE
PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE
REPLACE_CHARACTER

Chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, or JSON.

UTF_16_BE_BOM
UTF_16_LE_BOM
UTF_8_BOM

Attributes

to_s[R]
to_string[R]

Public Class Methods

new(string) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 12
# Stores the result of the class-level normalize_string for +string+ in @to_string.
def initialize(string)
  normalizer = self.class
  @to_string = normalizer.normalize_string(string)
end

Private Class Methods

cp1252_to_utf_8(string) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 120
# Reinterprets the bytes of +string+ as CP1252 and transcodes them to UTF-8, mutating
# +string+ in place. Bytes that are invalid or have no UTF-8 mapping are substituted
# with REPLACE_CHARACTER instead of raising.
#
# Returns the same (mutated) string object.
def cp1252_to_utf_8(string)
  string.force_encoding('CP1252')
  replacement_policy = { replace: REPLACE_CHARACTER, undef: :replace, invalid: :replace }
  string.encode!('UTF-8', **replacement_policy)
end
normalize_all_strings(value, **options) click to toggle source

Walks a JSON doc of hashes, arrays, and values and normalizes all strings found to UTF-8

# File lib/invoca/utils/guaranteed_utf8_string.rb, line 48
# Recursively rebuilds +value+, passing every String it contains through normalize_string.
#
# * Hash   - a new Hash is returned; both keys and values are processed recursively.
# * Array  - a new Array is returned with each element processed recursively.
# * String - handed to normalize_string together with +options+.
# * anything else is returned untouched.
#
# +options+ are forwarded unchanged to every normalize_string call.
def normalize_all_strings(value, **options)
  case value
  when String then normalize_string(value, **options)
  when Array  then value.map { |element| normalize_all_strings(element, **options) }
  when Hash
    normalized = {}
    value.each do |key, entry|
      normalized[normalize_all_strings(key, **options)] = normalize_all_strings(entry, **options)
    end
    normalized
  else
    value
  end
end
normalize_multibyte_cp1252(string, pattern, encoding) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 112
# Rewrites, in place, every character of +string+ that +pattern+ matches: the character's
# code point is treated as a single CP1252 byte, converted to the corresponding Unicode
# character, and re-encoded into +encoding+ (the multibyte encoding +string+ is tagged with).
#
# Code points whose byte has no CP1252 assignment (0x81, 0x8D, 0x8F, 0x90, 0x9D) cannot be
# transcoded; they are substituted with REPLACE_CHARACTER, matching cp1252_to_utf_8, instead
# of raising Encoding::UndefinedConversionError.
#
# Returns the mutated string, or nil when nothing matched (String#gsub! semantics).
def normalize_multibyte_cp1252(string, pattern, encoding)
  string.gsub!(pattern) do |char|
    char.ord.chr
        .force_encoding('CP1252')
        .encode('UTF-8', undef: :replace, replace: REPLACE_CHARACTER)
        .encode(encoding)
  end
end
normalize_newlines(string) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 116
# Converts every line ending in +string+ (CRLF, lone CR, or LF) to a single LF, in place.
#
# Returns the mutated string, or nil when the string contained no line endings at all
# (String#gsub! semantics).
def normalize_newlines(string)
  any_line_ending = / \r\n | \r | \n /x
  string.gsub!(any_line_ending, "\n")
end
normalize_string(orig_string, normalize_utf16: true, normalize_cp1252: true, normalize_newlines: true, remove_utf8_bom: true, replace_unicode_beyond_ffff: true) click to toggle source

normalizes a string to UTF-8

# File lib/invoca/utils/guaranteed_utf8_string.rb, line 25
# Normalizes +orig_string+ to UTF-8.
#
# The argument is accepted when it is a String, or when it responds to to_s through a
# method that is not the default one owned by Kernel. It is converted with to_s, duplicated
# (so the caller's object is never mutated), tagged as UTF-8, and handed to
# normalize_string_from_utf8 along with all option flags.
#
# Each keyword flag defaults to true and is forwarded as-is to normalize_string_from_utf8.
#
# Raises ArgumentError when +orig_string+ is neither a String nor an object with a
# non-Kernel to_s.
def normalize_string(orig_string,
                     normalize_utf16:              true,
                     normalize_cp1252:             true,
                     normalize_newlines:           true,
                     remove_utf8_bom:              true,
                     replace_unicode_beyond_ffff:  true)
  convertible = orig_string.is_a?(String) ||
                (orig_string.respond_to?(:to_s) && orig_string.method(:to_s).owner != Kernel)
  unless convertible
    raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
  end

  utf8_tagged = orig_string.to_s.dup.force_encoding('UTF-8')
  flags = {
    normalize_utf16:             normalize_utf16,
    normalize_cp1252:            normalize_cp1252,
    normalize_newlines:          normalize_newlines,
    remove_utf8_bom:             remove_utf8_bom,
    replace_unicode_beyond_ffff: replace_unicode_beyond_ffff
  }
  normalize_string_from_utf8(utf8_tagged, **flags)
end
normalize_string_from_utf8(string, normalize_utf16:, normalize_cp1252:, normalize_newlines:, remove_utf8_bom:, replace_unicode_beyond_ffff:) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 65
# Normalizes +string+ (already tagged as UTF-8 by the caller) in place and returns it.
#
# Steps, in order:
# 1. When +normalize_utf16+ is set, normalize_utf_16 is asked to detect UTF-16; if it
#    reports a hit the string is transcoded to UTF-8.
# 2. Otherwise, if the bytes are not valid in the string's encoding, they are converted
#    with cp1252_to_utf_8 when +normalize_cp1252+ is set; if it is not set an
#    ArgumentError is raised.
# 3. Newline normalization, UTF-8 BOM removal and replacement of code points beyond
#    U+FFFF each run only when their flag is set.
def normalize_string_from_utf8(string,
                               normalize_utf16:,
                               normalize_cp1252:,
                               normalize_newlines:,
                               remove_utf8_bom:,
                               replace_unicode_beyond_ffff:)
  utf16_detected = normalize_utf16 && normalize_utf_16(string, normalize_cp1252: normalize_cp1252)

  if utf16_detected
    string.encode!('UTF-8')
  elsif !string.valid_encoding?
    unless normalize_cp1252
      raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
    end

    cp1252_to_utf_8(string)
  end

  # The parenthesized forms below are method calls; the bare names are the keyword flags.
  normalize_newlines(string) if normalize_newlines
  remove_utf8_bom(string) if remove_utf8_bom
  replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff

  string
end
normalize_utf_16(string, normalize_cp1252:) click to toggle source

Returns truthy iff a UTF-16 BOM was found, in which case the string has been normalized but is still encoded as UTF-16; otherwise returns falsey and leaves the string as is.

# File lib/invoca/utils/guaranteed_utf8_string.rb, line 97
# Looks for a UTF-16 byte-order mark in the first two characters of +string+.
#
# When one is found the BOM is removed, the string is retagged (not transcoded) as
# UTF-16LE or UTF-16BE accordingly, and - if +normalize_cp1252+ is set -
# normalize_multibyte_cp1252 is run with the pattern for that byte order.
#
# Returns true when a BOM was found; returns nil and leaves +string+ untouched otherwise.
def normalize_utf_16(string, normalize_cp1252:)
  encoding, cp1252_pattern =
    case string[0, 2]
    when UTF_16_LE_BOM then ['UTF-16LE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE]
    when UTF_16_BE_BOM then ['UTF-16BE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE]
    end
  return unless encoding

  string.slice!(0, 2) # drop the BOM before the bytes are reinterpreted
  string.force_encoding(encoding)
  normalize_multibyte_cp1252(string, cp1252_pattern, encoding) if normalize_cp1252
  true
end
remove_utf8_bom(string) click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 130
# Strips a single UTF_8_BOM from the very start of +string+, in place.
#
# Returns the mutated string, or nil when the string did not begin with the BOM
# (String#sub! semantics).
def remove_utf8_bom(string)
  leading_bom = /\A #{UTF_8_BOM}/x
  string.sub!(leading_bom, '')
end
replace_unicode_beyond_ffff(string) click to toggle source

Note: MySQL can only store Unicode up to code point U+FFFF in the standard utf8mb3 storage type. There is an option to use utf8mb4, which is needed to hold the code points above that (including emoji), but we haven't enabled that on any columns yet since it would take a data migration and didn't seem that important.

# File lib/invoca/utils/guaranteed_utf8_string.rb, line 138
# Substitutes REPLACE_CHARACTER for every character of +string+ whose code point lies
# above U+FFFF, in place.
#
# Returns the mutated string, or nil when no such character was present
# (String#gsub! semantics).
def replace_unicode_beyond_ffff(string)
  outside_bmp = /[^\u0000-\uffff]/x
  string.gsub!(outside_bmp, REPLACE_CHARACTER)
end