class CharDet::HebrewProber

Public Class Methods

new() click to toggle source
Calls superclass method CharDet::CharSetProber::new
# File lib/rchardet/hebrewprober.rb, line 151
# Builds a prober with no model probers attached and cleared scoring state.
def initialize
  super()
  # Nothing is attached yet; set_model_probers assigns both of these.
  @logicalProber = @visualProber = nil
  reset
end

Public Instance Methods

feed(aBuf) click to toggle source
# File lib/rchardet/hebrewprober.rb, line 192
# Scans +aBuf+ character by character and accumulates final-letter evidence
# for the logical-versus-visual Hebrew decision.
#
# aBuf - the buffer to analyse; it is first passed through
#        filter_high_bit_only.
#
# Returns ENotMe when get_state reports ENotMe, otherwise EDetecting.
def feed(aBuf)
  # Final letter analysis for logical-visual decision.
  # Look for evidence that the received buffer is either logical Hebrew or
  # visual Hebrew.
  # The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
  #    the Non-Final form of that letter. Exceptions to this rule are described
  #    in is_non_final. This is an indication that the text is laid out
  #    backwards. +1 for visual score.
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual score and logical score are accumulated throughout the text and
  # are finally checked against each other in get_charset_name.
  # No checking for final letters in the middle of words is done since that case
  # is not an indication for either Logical or Visual text.
  #
  # We automatically filter out all 7-bit characters (replace them with spaces)
  # so the word boundary detection works properly. [MAP]

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would hand us
  # whole words and never a space, so none of the cases above could trigger.
  # NOTE(review): each_char yields characters in aBuf's own encoding; the
  # letter constants used by is_final / is_non_final must be comparable with
  # them - confirm against the constant definitions.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @beforePrev != ' '
        # The next-to-last char was not a space, so @prev is not a 1 letter word.
        if is_final(@prev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @finalCharLogicalScore += 1
        elsif is_non_final(@prev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @finalCharVisualScore += 1
        end
      end
    elsif @beforePrev == ' ' && is_final(@prev)
      # Not standing on a space.
      # case (3) [-2:space][-1:final letter][cur:not space]
      @finalCharVisualScore += 1
    end

    # Slide the two-character history window; it persists across calls.
    @beforePrev = @prev
    @prev = cur
  end

  # Forever detecting, till the end or until both model probers return ENotMe
  # (handled above).
  EDetecting
end
get_charset_name() click to toggle source
# File lib/rchardet/hebrewprober.rb, line 252
# Decides between the logical and the visual Hebrew charset name.
#
# The final-letter score gap is consulted first, then the gap between the
# two model probers' confidences, and finally the sign of the final-letter
# gap; with no evidence either way the logical name is returned.
def get_charset_name
  final_gap = @finalCharLogicalScore - @finalCharVisualScore

  # A dominant final-letter gap settles the question on its own.
  return LOGICAL_HEBREW_NAME if final_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Otherwise fall back on how far apart the model confidences are.
  model_gap = @logicalProber.get_confidence - @visualProber.get_confidence
  return LOGICAL_HEBREW_NAME if model_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_gap < -MIN_MODEL_DISTANCE

  # Last resort: any negative final-letter gap means visual; a positive gap
  # or a tie defaults to logical.
  final_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end
get_state() click to toggle source
# File lib/rchardet/hebrewprober.rb, line 281
# Reports ENotMe only once both model probers report ENotMe; while either
# of them is in any other state the result is EDetecting.
def get_state
  model_probers = [@logicalProber, @visualProber]
  # The logical prober is asked first; the visual one only if needed.
  return ENotMe if model_probers.all? { |prober| prober.get_state == ENotMe }

  EDetecting
end
is_final(c) click to toggle source
# File lib/rchardet/hebrewprober.rb, line 174
# True when +c+ equals one of the five final-form letter constants
# (Kaf, Mem, Nun, Pe, Tsadi); false otherwise.
def is_final(c)
  final_letters = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_letters.any? { |letter| letter == c }
end
is_non_final(c) click to toggle source
# File lib/rchardet/hebrewprober.rb, line 178
# True when +c+ equals one of the normal (non-final) forms of Kaf, Mem, Nun
# or Pe; false otherwise.
def is_non_final(c)
  # Normal Tsadi is left out on purpose: words like 'lechotet' (to chat)
  # carry an apostrophe after the tsadi, and once that apostrophe has been
  # turned into a space the Non-Final tsadi seems to end a word although it
  # does not in the original text.
  # Pe and Kaf occasionally misbehave in a related way - 'Pop', 'Winamp' and
  # 'Mubarak' legally end with a Non-Final Pe or Kaf - but such words are
  # rare enough that keeping these letters helps more than it hurts.
  non_final_letters = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_letters.any? { |letter| letter == c }
end
reset() click to toggle source
# File lib/rchardet/hebrewprober.rb, line 158
# Clears the final-letter tallies and the two-character history.
# The model probers are not touched here: they are owned by the group prober.
def reset
  @finalCharLogicalScore = @finalCharVisualScore = 0

  # The history of the last two characters seen starts out as spaces, which
  # simulates a word delimiter at the beginning of the data.
  @prev = ' '
  @beforePrev = ' '
end
set_model_probers(logicalProber, visualProber) click to toggle source
# File lib/rchardet/hebrewprober.rb, line 169
# Attaches the two model probers whose state and confidence this prober
# consults.
#
# logicalProber - stored as @logicalProber
# visualProber  - stored as @visualProber
#
# Returns visualProber.
def set_model_probers(logicalProber, visualProber)
  @logicalProber, @visualProber = logicalProber, visualProber
  visualProber
end