class CharDet::SingleByteCharSetProber

NEGATIVE_CAT = 0

Public Class Methods

new(model, reversed=false, nameProber=nil)
Calls superclass method CharDet::CharSetProber::new
# File lib/rchardet/sbcharsetprober.rb, line 41
# Builds a prober around the given model and puts it into its initial state.
#
# model::      the model the prober reads its tables and settings from
# reversed::   true if every pair must be reversed in the model lookup
# nameProber:: optional auxiliary prober consulted for the name decision
def initialize(model, reversed=false, nameProber=nil)
  super()
  # Optional auxiliary prober for name decision
  @nameProber = nameProber
  # TRUE if we need to reverse every pair in the model lookup
  @reversed = reversed
  @model = model
  reset
end

Public Instance Methods

feed(aBuf)
# File lib/rchardet/sbcharsetprober.rb, line 66
# Feeds a buffer of bytes to the prober and updates its running statistics.
#
# Unless the model sets 'keepEnglishLetter', the buffer is first passed
# through filter_without_english_letters. Every byte is then translated to an
# order through the model's 'charToOrderMap':
#
# * orders below SYMBOL_CAT_ORDER increment @totalChar;
# * orders below SAMPLE_SIZE increment @freqChar and, when the previous order
#   was also below SAMPLE_SIZE, increment @totalSeqs and the sequence counter
#   selected by the model's 'precedenceMatrix'.
#
# While the prober is still in the EDetecting state and more than
# SB_ENOUGH_REL_THRESHOLD sequences have been counted, the confidence is
# compared with the shortcut thresholds and the state is switched to EFoundIt
# or ENotMe accordingly.
#
# aBuf:: the bytes to analyse (must respond to each_byte)
#
# Returns the prober state, as reported by get_state, after processing.
def feed(aBuf)
  aBuf = filter_without_english_letters(aBuf) unless @model['keepEnglishLetter']
  return get_state if aBuf.empty?

  # The model tables are loop-invariant, so look them up once per buffer.
  char_to_order = @model['charToOrderMap']
  precedence = @model['precedenceMatrix']

  aBuf.each_byte do |byte|
    # each_byte already yields the integer byte value; no String round trip
    # is needed to index the order map.
    order = char_to_order[byte]
    @totalChar += 1 if order < SYMBOL_CAT_ORDER
    if order < SAMPLE_SIZE
      @freqChar += 1
      if @lastOrder < SAMPLE_SIZE
        @totalSeqs += 1
        pair_index =
          if @reversed
            # reverse the order of the letters in the lookup
            (order * SAMPLE_SIZE) + @lastOrder
          else
            (@lastOrder * SAMPLE_SIZE) + order
          end
        @seqCounters[precedence[pair_index]] += 1
      end
    end
    @lastOrder = order
  end

  # Only attempt a shortcut decision once enough sequences have been seen.
  if get_state == EDetecting && @totalSeqs > SB_ENOUGH_REL_THRESHOLD
    cf = get_confidence
    if cf > POSITIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
      @state = EFoundIt
    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
      @state = ENotMe
    end
  end

  get_state
end
get_charset_name()
# File lib/rchardet/sbcharsetprober.rb, line 58
# Reports the charset name for this prober.
#
# When an auxiliary name prober was supplied, the answer is delegated to it;
# otherwise the model's own 'charsetName' entry is returned.
def get_charset_name
  return @model['charsetName'] unless @nameProber

  @nameProber.get_charset_name
end
get_confidence()
# File lib/rchardet/sbcharsetprober.rb, line 110
# Computes the current confidence value from the collected counters.
#
# Returns 0.01 while no sequences have been counted. Otherwise the share of
# POSITIVE_CAT sequences among all sequences is divided by the model's
# 'mTypicalPositiveRatio' and scaled by @freqChar / @totalChar; a result of
# 1.0 or more is reported as 0.99.
def get_confidence
  return 0.01 unless @totalSeqs > 0

  positive_share = @seqCounters[POSITIVE_CAT].to_f / @totalSeqs / @model['mTypicalPositiveRatio']
  confidence = positive_share * @freqChar / @totalChar
  confidence >= 1.0 ? 0.99 : confidence
end
reset()
Calls superclass method CharDet::CharSetProber#reset
# File lib/rchardet/sbcharsetprober.rb, line 49
# Puts the prober back into its initial state: runs the superclass reset,
# then zeroes every counter and forgets the previously seen character order.
def reset
  super()
  @totalChar = 0
  # characters that fall in our sampling range
  @freqChar = 0
  @totalSeqs = 0
  @seqCounters = Array.new(NUMBER_OF_SEQ_CAT, 0)
  # char order of last character
  @lastOrder = 255
end