class Invoca::Utils::GuaranteedUTF8String
Constants
- PRIVATE_CP1252_CHAR_PATTERN
- PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE
- PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE
- REPLACE_CHARACTER
chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON
- UTF_16_BE_BOM
- UTF_16_LE_BOM
- UTF_8_BOM
Attributes
to_s[R]
to_string[R]
Public Class Methods
new(string)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 12
# Wraps +string+ by running it through the class-level normalize_string and
# keeping the result in @to_string.
#
# @param string [Object] passed unchanged to self.class.normalize_string, which
#   decides what inputs are acceptable
def initialize(string)
  @to_string = self.class.normalize_string(string)
end
Private Class Methods
cp1252_to_utf_8(string)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 120
# Reinterprets the bytes of +string+ as CP1252 and transcodes them to UTF-8,
# mutating +string+ in place.
#
# Bytes that are invalid or have no UTF-8 mapping are replaced with
# REPLACE_CHARACTER instead of raising.
#
# @param string [String] the string to convert; its encoding tag and contents are changed
# @return [String] the same (mutated) string object
def cp1252_to_utf_8(string)
  # Only relabels the bytes; the actual conversion happens in encode! below.
  string.force_encoding('CP1252')
  string.encode!(
    'UTF-8',
    replace: REPLACE_CHARACTER,
    undef:   :replace,
    invalid: :replace
  )
end
normalize_all_strings(value, **options)
click to toggle source
Walks a JSON doc of hashes, arrays, and values and normalizes all strings found to UTF-8
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 48
# Recursively rebuilds +value+, passing every String it finds (including Hash
# keys) through normalize_string.
#
# Hashes and Arrays are copied, not mutated; any other non-String value is
# returned as is.
#
# @param value [Hash, Array, String, Object] the structure to walk
# @param options [Hash] keyword options forwarded unchanged to normalize_string
# @return [Object] a new Hash/Array with normalized strings, the normalized
#   String, or +value+ itself for any other type
def normalize_all_strings(value, **options)
  case value
  when Hash
    value.each_with_object({}) do |(k, v), result|
      result[normalize_all_strings(k, **options)] = normalize_all_strings(v, **options)
    end
  when Array
    value.map { |v| normalize_all_strings(v, **options) }
  when String
    normalize_string(value, **options)
  else
    value
  end
end
normalize_multibyte_cp1252(string, pattern, encoding)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 112
# Replaces, in place, every match of +pattern+ in +string+ with the character
# obtained by treating the match's code point as a single CP1252 byte.
#
# @param string [String] mutated in place
# @param pattern [Regexp] selects the characters to remap; presumably matches only
#   code points that fit in one byte, since Integer#chr is called without an encoding
# @param encoding [String] encoding name the replacement is converted to, so it
#   can be spliced back into +string+
# @return [String, nil] +string+ if at least one replacement was made, otherwise nil
#   (the String#gsub! contract)
def normalize_multibyte_cp1252(string, pattern, encoding)
  string.gsub!(pattern) do |c|
    # code point -> raw byte -> read as CP1252 -> UTF-8 -> target encoding
    c.ord.chr.force_encoding('CP1252').encode('UTF-8').encode(encoding)
  end
end
normalize_newlines(string)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 116
# Converts every CRLF and lone CR in +string+ to LF, mutating +string+ in place.
#
# @param string [String] mutated in place
# @return [String, nil] +string+ if the pattern matched anywhere (a plain LF
#   counts as a match), otherwise nil (the String#gsub! contract)
def normalize_newlines(string)
  # \r\n is listed first so a CRLF pair becomes one LF rather than two.
  string.gsub!(/ \r\n | \r | \n /x, "\n")
end
normalize_string(orig_string, normalize_utf16: true, normalize_cp1252: true, normalize_newlines: true, remove_utf8_bom: true, replace_unicode_beyond_ffff: true)
click to toggle source
Normalizes a string to UTF-8.
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 25
# Normalizes +orig_string+ to UTF-8 and returns the result as a new String.
#
# The input is converted with to_s and duplicated, so the caller's object is
# never mutated. The copy is tagged as UTF-8 and handed to
# normalize_string_from_utf8 together with all keyword flags.
#
# @param orig_string [String, #to_s] a String, or any object whose to_s is not
#   the default one inherited from Kernel
# @param normalize_utf16 [Boolean] forwarded to normalize_string_from_utf8
# @param normalize_cp1252 [Boolean] forwarded to normalize_string_from_utf8
# @param normalize_newlines [Boolean] forwarded to normalize_string_from_utf8
# @param remove_utf8_bom [Boolean] forwarded to normalize_string_from_utf8
# @param replace_unicode_beyond_ffff [Boolean] forwarded to normalize_string_from_utf8
# @return [Object] whatever normalize_string_from_utf8 returns
# @raise [ArgumentError] if +orig_string+ is not a String and only has Kernel's to_s
def normalize_string(orig_string,
                     normalize_utf16: true,
                     normalize_cp1252: true,
                     normalize_newlines: true,
                     remove_utf8_bom: true,
                     replace_unicode_beyond_ffff: true)
  string =
    if orig_string.is_a?(String) ||
       (orig_string.respond_to?(:to_s) && orig_string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
      orig_string.to_s.dup
    else
      raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
    end

  # Relabel only; any real transcoding is decided by normalize_string_from_utf8.
  string.force_encoding('UTF-8')

  normalize_string_from_utf8(string,
                             normalize_utf16:             normalize_utf16,
                             normalize_cp1252:            normalize_cp1252,
                             normalize_newlines:          normalize_newlines,
                             remove_utf8_bom:             remove_utf8_bom,
                             replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
end
normalize_string_from_utf8(string, normalize_utf16:, normalize_cp1252:, normalize_newlines:, remove_utf8_bom:, replace_unicode_beyond_ffff:)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 65
# Runs the normalization pipeline on +string+, mutating it in place.
#
# Steps, in order:
# 1. If +normalize_utf16+ is set, normalize_utf_16 checks for UTF-16; when it
#    reports UTF-16 was found, the string is transcoded to UTF-8.
# 2. Otherwise, if the string is not valid in its current encoding, it is
#    converted with cp1252_to_utf_8, or an ArgumentError is raised when
#    +normalize_cp1252+ is off.
# 3. Newline normalization, UTF-8 BOM removal and replacement of code points
#    beyond U+FFFF, each only if its flag is set.
#
# @param string [String] mutated in place; normalize_string tags it as UTF-8 before calling
# @param normalize_utf16 [Boolean] enable step 1
# @param normalize_cp1252 [Boolean] allow the CP1252 fallback (and CP1252 fix-ups inside UTF-16)
# @param normalize_newlines [Boolean] run normalize_newlines
# @param remove_utf8_bom [Boolean] run remove_utf8_bom
# @param replace_unicode_beyond_ffff [Boolean] run replace_unicode_beyond_ffff
# @return [String] the same (mutated) string object
# @raise [ArgumentError] if the string has an invalid encoding and +normalize_cp1252+ is false
def normalize_string_from_utf8(string, normalize_utf16:, normalize_cp1252:, normalize_newlines:, remove_utf8_bom:, replace_unicode_beyond_ffff:)
  found_utf_16 = normalize_utf_16(string, normalize_cp1252: normalize_cp1252) if normalize_utf16

  if found_utf_16
    string.encode!('UTF-8')
  elsif !string.valid_encoding?
    raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)' unless normalize_cp1252

    cp1252_to_utf_8(string)
  end

  # The keyword flags share their names with the helper methods: a parenthesized
  # call resolves to the method, the bare name to the boolean flag.
  normalize_newlines(string) if normalize_newlines
  remove_utf8_bom(string) if remove_utf8_bom
  replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff

  string
end
normalize_utf_16(string, normalize_cp1252:)
click to toggle source
Returns truthy iff UTF-16 was found, in which case it has been normalized but the string is still UTF-16; otherwise returns falsey and leaves the string as is.
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 97
# Detects UTF-16 input by comparing the first two characters of +string+ with
# UTF_16_LE_BOM and UTF_16_BE_BOM.
#
# On a match the BOM is removed, the string is relabelled as UTF-16LE/BE
# (it is NOT transcoded here), and, if +normalize_cp1252+ is set,
# normalize_multibyte_cp1252 is run with the pattern for that byte order.
# Without a match the string is left untouched.
#
# @param string [String] mutated in place when a BOM is found
# @param normalize_cp1252 [Boolean] whether to run normalize_multibyte_cp1252
# @return [true, nil] true if a UTF-16 BOM was found, nil otherwise
def normalize_utf_16(string, normalize_cp1252:)
  encoding, cp1252_pattern =
    case string[0, 2]
    when UTF_16_LE_BOM then ['UTF-16LE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE]
    when UTF_16_BE_BOM then ['UTF-16BE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE]
    end
  return unless encoding

  string.slice!(0, 2) # remove the BOM
  string.force_encoding(encoding)
  normalize_multibyte_cp1252(string, cp1252_pattern, encoding) if normalize_cp1252
  true
end
remove_utf8_bom(string)
click to toggle source
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 130
# Removes UTF_8_BOM from the very start of +string+, mutating it in place.
# An occurrence anywhere else in the string is left alone.
#
# @param string [String] mutated in place
# @return [String, nil] +string+ if a leading BOM was removed, otherwise nil
#   (the String#sub! contract)
def remove_utf8_bom(string)
  # /x makes the space after \A insignificant; it only separates the anchor
  # from the interpolated BOM.
  string.sub!(/\A #{UTF_8_BOM}/x, '')
end
replace_unicode_beyond_ffff(string)
click to toggle source
Note: MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4, which is needed to hold the code points above U+FFFF (including emoji), but we haven't enabled it on any columns yet since it would take a data migration and didn't seem that important.
# File lib/invoca/utils/guaranteed_utf8_string.rb, line 138
# Replaces every character whose code point lies above U+FFFF with
# REPLACE_CHARACTER, mutating +string+ in place.
#
# @param string [String] mutated in place
# @return [String, nil] +string+ if at least one character was replaced,
#   otherwise nil (the String#gsub! contract)
def replace_unicode_beyond_ffff(string)
  string.gsub!(/[^\u0000-\uffff]/x, REPLACE_CHARACTER)
end