module Sterilizer

Constants

VERSION

Public Instance Methods

default_encoding() click to toggle source
# File lib/sterilizer.rb, line 76
# Encoding that sterilized strings are converted to: Ruby's configured
# internal encoding when one is set, otherwise the name "UTF-8".
def default_encoding
  configured = Encoding.default_internal
  configured.nil? ? "UTF-8" : configured
end
encoding_is_default?() click to toggle source
# File lib/sterilizer.rb, line 39
# Tells whether the receiver is already tagged with the default encoding.
#
# default_encoding may hand back either an Encoding object or an encoding
# name such as "UTF-8". An Encoding never equals a String, so the default is
# resolved through Encoding.find (which accepts both) before comparing.
#
# Returns true or false.
def encoding_is_default?
  encoding == Encoding.find(default_encoding)
end
find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false) click to toggle source
# File lib/sterilizer.rb, line 51
# Looks for an encoding in which the receiver's bytes are valid.
#
# ignoring        - encodings to skip: an Array of Encoding objects and/or
#                   encoding names, or a single such value. The argument is
#                   never modified.
# guessed_already - when truthy, the guessing step is skipped and only the
#                   list of known encodings is scanned.
#
# On the first attempt the encoding reported by guess_encoding is tried; if the
# bytes are not valid in it, it is added to the skipped set. After that,
# Encoding.list is scanned in order and the first encoding that is not skipped
# and in which the bytes are valid is chosen.
#
# Returns an Encoding, or nil when no candidate qualifies.
# Raises ArgumentError if an entry of ignoring is not a known encoding name.
def find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false)
  # Work on a private, normalized copy: names become Encoding objects so they
  # can actually match the entries of Encoding.list, and the caller's array is
  # left untouched.
  skipped = Array(ignoring).map { |entry| Encoding.find(entry) }

  unless guessed_already
    guessed = guess_encoding
    return guessed if valid_when_forced?(guessed)

    skipped << guessed
  end

  Encoding.list.find do |candidate|
    !skipped.include?(candidate) && valid_when_forced?(candidate)
  end
end
force_encoding_with(encoding) click to toggle source
# File lib/sterilizer.rb, line 72
# Relabels the receiver's bytes as +encoding+ (this changes the receiver's
# encoding tag) and returns a copy transcoded to the default encoding, with
# invalid and undefined characters replaced.
def force_encoding_with(encoding)
  relabelled = force_encoding(encoding)
  relabelled.encode(default_encoding, :invalid => :replace, :undef => :replace)
end
guess_encoding(guesser = CharDet) click to toggle source

Uses an external detection library (CharDet by default) to guess the string's encoding, passing `:silent => true` to the detector.

# File lib/sterilizer.rb, line 68
# Asks +guesser+ which encoding the receiver is in and returns the matching
# Encoding object.
#
# guesser - object responding to detect(string, options) and returning a
#           Hash-like result whose "encoding" entry is an encoding name.
def guess_encoding(guesser = CharDet)
  detection = guesser.detect(self, :silent => true)
  detected_name = detection["encoding"]
  Encoding.find(detected_name)
end
sterilize!() click to toggle source
# File lib/sterilizer.rb, line 5
# Converts the receiver, in place, to the default encoding.
#
# Steps, in order:
# 1. If Encoding is not defined, the string is returned untouched.
# 2. If the bytes are valid in the current encoding, the string is transcoded
#    to the default encoding.
# 3. If the bytes are valid when read as the default encoding, the string is
#    simply relabelled.
# 4. Otherwise an encoding in which the bytes are valid is looked up, and the
#    string is transcoded from it, replacing anything that cannot be converted.
# 5. If any of the above raises a StandardError, the bytes are read as ASCII
#    and everything that cannot be converted is replaced.
#
# Returns the receiver.
def sterilize!
  return self unless defined?(Encoding)

  # The replacement options are passed as keywords, not as a braced positional
  # hash: Ruby 3 no longer treats a positional hash as keywords, so the braced
  # form would be counted as a third positional argument.
  return encode!(default_encoding, encoding, :undef => :replace, :invalid => :replace) if valid_encoding?

  # The bytes are fine as the default encoding; only the label was wrong.
  return force_encoding(default_encoding) if valid_when_forced?

  # From here on the bytes are invalid both in the current encoding and in the
  # default one, so a different source encoding is needed before transcoding.
  source_encoding =
    if encoding_is_default?
      find_a_valid_encoding
    else
      # find_a_valid_encoding expects a list of encodings to skip.
      find_a_valid_encoding([encoding])
    end

  # force_encoding_with returns a new string; copy it back so the receiver
  # itself ends up sterilized, as the bang in the method name promises.
  replace(force_encoding_with(source_encoding))
rescue
  # Last resort: read the bytes as ASCII and replace whatever does not convert.
  replace(force_encoding_with("ASCII"))
end
valid_and_default?() click to toggle source
# File lib/sterilizer.rb, line 43
# True only when the receiver's bytes are valid in its current encoding and
# that encoding is the default one.
def valid_and_default?
  return false unless valid_encoding?

  encoding_is_default?
end
valid_when_forced?(encoding = default_encoding) click to toggle source
# File lib/sterilizer.rb, line 47
# Checks, on a copy, whether the receiver's bytes would be valid if they were
# relabelled as +encoding+ (the default encoding when omitted). The receiver
# itself is left unchanged.
def valid_when_forced?(encoding = default_encoding)
  probe = dup
  probe.force_encoding(encoding)
  probe.valid_encoding?
end