class CharDet::CharDistributionAnalysis
Public Class Methods
new()
click to toggle source
# File lib/rchardet/chardistribution.rb, line 35
# Sets up an analyser with no language-specific tables and resets its
# counters via #reset.
def initialize
  # Mapping table to get frequency order from char order (get from GetOrder()).
  @charToFreqOrder = nil
  # Size of the table above.
  @tableSize = nil
  # A constant value which varies from language to language, used in
  # calculating confidence. See
  # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
  # for further detail.
  @typicalDistributionRatio = nil
  reset
end
Public Instance Methods
feed(aStr, aCharLen)
click to toggle source
# File lib/rchardet/chardistribution.rb, line 49
# Feeds one character of known byte length into the analyser.
#
# aStr     - the character, passed unchanged to #get_order.
# aCharLen - length of the character in bytes; anything other than 2 is ignored.
#
# Increments @totalChars for every character with a non-negative order, and
# @freqChars when that order is inside the table and maps to a frequency
# order below 512.
def feed(aStr, aCharLen)
  # We only care about 2-byte characters in our distribution analysis.
  return unless aCharLen == 2

  order = get_order(aStr)
  # A negative order means the character is outside the consideration range.
  return if order < 0

  @totalChars += 1
  # Orders beyond the table count towards the total but can never be frequent.
  return unless order < @tableSize

  @freqChars += 1 if @charToFreqOrder[order] < 512
end
get_confidence()
click to toggle source
# File lib/rchardet/chardistribution.rb, line 68
# Returns a confidence value based on the data fed so far.
#
# Returns SURE_NO when no character in the consideration range has been
# received. Otherwise computes r = frequent chars divided by (non-frequent
# chars * @typicalDistributionRatio) and returns r when it is below
# SURE_YES, and SURE_YES in every other case.
def get_confidence
  # If we didn't receive any character in our consideration range,
  # return a negative answer.
  return SURE_NO if @totalChars <= 0

  if @totalChars != @freqChars
    rare_chars = @totalChars - @freqChars
    # fdiv forces floating-point division, so an Integer ratio cannot
    # silently truncate the result (typically to 0).
    r = @freqChars.fdiv(rare_chars * @typicalDistributionRatio)
    return r if r < SURE_YES
  end

  # Normalize confidence (we don't want to be 100% sure).
  SURE_YES
end
get_order(aStr)
click to toggle source
# File lib/rchardet/chardistribution.rb, line 92
# Converts a character to its "order" number.
#
# We do not handle characters based on the original encoding string, but
# convert this encoding string to a number, here called order. This allows
# multiple encodings of a language to share one frequency table.
#
# This implementation ignores aStr and always returns -1.
# NOTE(review): presumably language-specific analysers override this - confirm.
def get_order(aStr)
  -1
end
got_enough_data()
click to toggle source
# File lib/rchardet/chardistribution.rb, line 86
# Returns true once more than ENOUGH_DATA_THRESHOLD characters have been
# counted in @totalChars.
#
# It is not necessary to receive all data to draw a conclusion. For charset
# detection, a certain amount of data is enough.
def got_enough_data
  @totalChars > ENOUGH_DATA_THRESHOLD
end
reset()
click to toggle source
# File lib/rchardet/chardistribution.rb, line 42
# Resets the analyser, clearing any accumulated state.
def reset
  # When this flag is true, detection is done and a conclusion has been made.
  @done = false
  # Total characters encountered.
  @totalChars = 0
  # The number of characters whose frequency order is less than 512.
  @freqChars = 0
end