class UTF8::Validator
Purpose¶ ↑
Validate UTF-8, primarily in Ruby environments other than 1.9.
Instances of this class are thread safe, and a single instance may be used safely by multiple concurrent threads, with one caveat:
The value of #{DEBUG} must not be changed by any thread.
Constants
- DEBUG
For use during development only.
Public Instance Methods
valid_encoding?(string, raise_on_error = false)
click to toggle source
Validate the supplied string for proper UTF-8 encoding.
Calling Sequence:
validator = UTF8::Validator.new -> validator
validator.valid_encoding?(string) -> true or false
validator.valid_encoding?(string, raise_on_error) -> true or exception
Parameters:
- string
-
the string to validate
- raise_on_error
-
a boolean flag to indicate requested failure behavior
When raise_on_error is true and a string fails validation, an error of type #{UTF8::ValidationError} is raised. The byte in error and the location of that byte are described in the error message.
# File lib/validation/validator.rb, line 94
#
# Validate the supplied string for proper UTF-8 encoding.
#
# The bytes of +string+ are fed one at a time through a small state machine.
# The 'start' state accepts ASCII bytes and dispatches on the lead byte of a
# multi-byte character; states 'a' through 'g' each accept one range of
# continuation bytes. The lead byte / continuation ranges accepted are:
#
#   0x00-0x7F                                   (1 byte)
#   0xC2-0xDF  0x80-0xBF                        (2 bytes)
#   0xE0       0xA0-0xBF  0x80-0xBF             (3 bytes)
#   0xE1-0xEC  0x80-0xBF  0x80-0xBF             (3 bytes)
#   0xED       0x80-0x9F  0x80-0xBF             (3 bytes)
#   0xEE-0xEF  0x80-0xBF  0x80-0xBF             (3 bytes)
#   0xF0       0x90-0xBF  0x80-0xBF  0x80-0xBF  (4 bytes)
#   0xF1-0xF3  0x80-0xBF  0x80-0xBF  0x80-0xBF  (4 bytes)
#   0xF4       0x80-0x8F  0x80-0xBF  0x80-0xBF  (4 bytes)
#
# Parameters:
# string::         the string to validate; its bytes are read with #each_byte.
# raise_on_error:: when true, a failed validation raises instead of
#                  returning false.
#
# Returns true when every byte is accepted and the input ends on a character
# boundary, false otherwise (when +raise_on_error+ is false).
#
# Raises ValidationError when +raise_on_error+ is true and validation fails.
# The message names the last byte examined and its index, in decimal and hex.
# For input that ends in the middle of a character, that is the final byte of
# the string.
def valid_encoding?(string, raise_on_error = false)
  valid = true
  index = -1
  state = :start
  next_byte_save = nil

  # String#each_byte yields integer bytes on every Ruby version, so no
  # RUBY_VERSION check or intermediate byte array is needed.
  string.each_byte do |next_byte|
    index += 1
    next_byte_save = next_byte
    if DEBUG
      nb_hex = sprintf("%x", next_byte)
      ni_hex = sprintf("%x", index)
      puts "Top: #{next_byte}(0x#{nb_hex}), index: #{index}(0x#{ni_hex})"
    end

    case state

    # State: 'start'
    # * accepts single byte (ASCII) characters
    # * selects the next state from the lead byte of a 2-4 byte character
    # * fails validation for every other byte
    when :start
      puts "state: start" if DEBUG
      puts "next_byte: #{next_byte}" if DEBUG
      case next_byte
      # ASCII: 0x00-0x7F, stay in 'start'
      when 0x00..0x7f
        puts "state: start 1" if DEBUG
        state = :start
      # Lead byte of two byte characters: 0xC2-0xDF
      when 0xc2..0xdf
        puts "state: start 2" if DEBUG
        state = :a
      # Lead byte of most three byte characters: 0xE1-0xEC, 0xEE-0xEF
      when 0xe1..0xec
        puts "state: start 3" if DEBUG
        state = :b
      when 0xee..0xef
        puts "state: start 4" if DEBUG
        state = :b
      # Lead byte 0xE0: the next byte is restricted to 0xA0-0xBF
      when 0xe0
        puts "state: start 5" if DEBUG
        state = :c
      # Lead byte 0xED: the next byte is restricted to 0x80-0x9F
      when 0xed
        puts "state: start 6" if DEBUG
        state = :d
      # Lead byte of most four byte characters: 0xF1-0xF3
      when 0xf1..0xf3
        puts "state: start 7" if DEBUG
        state = :e
      # Lead byte 0xF0: the next byte is restricted to 0x90-0xBF
      when 0xf0
        puts "state: start 8" if DEBUG
        state = :f
      # Lead byte 0xF4: the next byte is restricted to 0x80-0x8F
      when 0xf4
        puts "state: start 9" if DEBUG
        state = :g
      # Everything else (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF) is invalid here
      else
        valid = false
        break
      end

    # State 'a': last continuation byte of a 2, 3, or 4 byte character.
    # 0x80-0xBF returns to 'start'.
    when :a
      puts "state: a" if DEBUG
      if next_byte.between?(0x80, 0xbf)
        state = :start
      else
        valid = false
        break
      end

    # State 'b': second-to-last continuation byte. 0x80-0xBF moves to 'a'.
    when :b
      puts "state: b" if DEBUG
      if next_byte.between?(0x80, 0xbf)
        state = :a
      else
        valid = false
        break
      end

    # State 'c': first continuation byte after 0xE0. 0xA0-0xBF moves to 'a'.
    when :c
      puts "state: c" if DEBUG
      if next_byte.between?(0xa0, 0xbf)
        state = :a
      else
        valid = false
        break
      end

    # State 'd': first continuation byte after 0xED. 0x80-0x9F moves to 'a'.
    when :d
      puts "state: d" if DEBUG
      if next_byte.between?(0x80, 0x9f)
        state = :a
      else
        valid = false
        break
      end

    # State 'e': first continuation byte after 0xF1-0xF3. 0x80-0xBF moves
    # to 'b'.
    when :e
      puts "state: e" if DEBUG
      if next_byte.between?(0x80, 0xbf)
        state = :b
      else
        valid = false
        break
      end

    # State 'f': first continuation byte after 0xF0. 0x90-0xBF moves to 'b'.
    when :f
      puts "state: f" if DEBUG
      if next_byte.between?(0x90, 0xbf)
        state = :b
      else
        valid = false
        break
      end

    # State 'g': first continuation byte after 0xF4. 0x80-0x8F moves to 'b'.
    when :g
      puts "state: g" if DEBUG
      if next_byte.between?(0x80, 0x8f)
        state = :b
      else
        valid = false
        break
      end

    # Guards against a state value not handled above.
    else
      raise RuntimeError, "state: default"
    end
  end

  puts "State at end: #{state}" if DEBUG
  puts "Valid at end: #{valid}" if DEBUG

  # Catch truncation at end of string: every byte was acceptable, but the
  # input stopped before the current character was complete.
  if valid && state != :start
    puts "Resetting valid value" if DEBUG
    valid = false
  end

  if !valid && raise_on_error
    puts "Raising Error" if DEBUG
    # The hex forms are only needed here, so they are not built per byte.
    nb_hex = sprintf("%x", next_byte_save)
    ni_hex = sprintf("%x", index)
    raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
  end

  valid
end