class CharDet::HebrewProber
Public Class Methods
new()
click to toggle source
Calls superclass method
CharDet::CharSetProber::new
# File lib/rchardet/hebrewprober.rb, line 151
# Creates a prober with no model probers attached and freshly reset
# final-letter state.
def initialize
  super()
  # The model probers start out unset; set_model_probers assigns them.
  @logicalProber = @visualProber = nil
  reset
end
Public Instance Methods
feed(aBuf)
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 192
# Scans one buffer for final-letter evidence of logical vs. visual Hebrew and
# accumulates it in @finalCharLogicalScore / @finalCharVisualScore.
#
# @param aBuf [String] the chunk of input to analyse
# @return ENotMe when get_state reports ENotMe, otherwise EDetecting
def feed(aBuf)
  # Final letter analysis for logical-visual decision.
  # Look for evidence that the received buffer is either logical Hebrew or
  # visual Hebrew.
  # The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
  #    the Non-Final form of that letter. Exceptions to this rule are mentioned
  #    in is_non_final. This is an indication that the text is laid out
  #    backwards. +1 for visual score.
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual score and logical score are accumulated throughout the text and
  # are finally checked against each other in get_charset_name.
  # No checking for final letters in the middle of words is done since that case
  # is not an indication for either Logical or Visual text.
  #
  # We automatically filter out all 7-bit characters (replace them with spaces)
  # so the word boundary detection works properly. [MAP]

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would yield whole
  # words and never a space, so none of the [-2][-1][cur] cases below could be
  # recognised.
  # NOTE(review): assumes each character of the filtered buffer compares equal
  # (same encoding) to the FINAL_*/NORMAL_* constants - confirm against their
  # definitions.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @beforePrev != ' '
        # The next-to-last char was not a space, so @prev does not belong to a
        # one-letter word.
        if is_final(@prev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @finalCharLogicalScore += 1
        elsif is_non_final(@prev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @finalCharVisualScore += 1
        end
      end
    elsif @beforePrev == ' ' && is_final(@prev)
      # Not standing on a space:
      # case (3) [-2:space][-1:final letter][cur:not space]
      @finalCharVisualScore += 1
    end
    @beforePrev = @prev
    @prev = cur
  end

  # Forever detecting, till the end or until both model probers return ENotMe
  # (handled above).
  EDetecting
end
get_charset_name()
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 252
# Decides between the logical and the visual Hebrew charset name.
#
# @return LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME
def get_charset_name
  # First choice: the final-letter score gap, when it is dominant enough.
  final_gap = @finalCharLogicalScore - @finalCharVisualScore
  return LOGICAL_HEBREW_NAME if final_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Second choice: the gap between the two model probers' confidences.
  model_gap = @logicalProber.get_confidence - @visualProber.get_confidence
  return LOGICAL_HEBREW_NAME if model_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_gap < -MIN_MODEL_DISTANCE

  # Last resort: any negative final-letter gap means Visual; a positive gap or
  # a tie defaults to Logical.
  final_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end
get_state()
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 281
# Reports ENotMe only once both model probers report ENotMe; otherwise the
# prober keeps detecting.
#
# @return ENotMe or EDetecting
def get_state
  # Remain active as long as any of the model probers is active.
  both_gave_up = [@logicalProber, @visualProber].all? do |prober|
    prober.get_state == ENotMe
  end
  both_gave_up ? ENotMe : EDetecting
end
is_final(c)
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 174
# Tells whether c is one of the five final-form letter constants.
#
# @param c the character to classify
# @return [Boolean] true when c equals FINAL_KAF, FINAL_MEM, FINAL_NUN,
#   FINAL_PE or FINAL_TSADI
def is_final(c)
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.any? { |letter| letter == c }
end
is_non_final(c)
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 178
# Tells whether c is one of the non-final letter forms used as evidence of
# visual layout.
#
# @param c the character to classify
# @return [Boolean] true when c equals NORMAL_KAF, NORMAL_MEM, NORMAL_NUN or
#   NORMAL_PE
def is_non_final(c)
  # The normal Tsadi is deliberately left out: words like 'lechotet' (to chat)
  # contain an apostrophe after the tsadi. That apostrophe is converted to a
  # space in FilterWithoutEnglishLetters, which makes the Non-Final tsadi
  # appear at the end of a word even though it is not there in the original
  # text.
  # Pe and Kaf rarely show a related weakness: words like 'Pop', 'Winamp' and
  # 'Mubarak' legally end with a Non-Final Pe or Kaf. Such words are quite
  # rare, so the benefit of keeping these letters outweighs the damage.
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.any? { |letter| letter == c }
end
reset()
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 158
# Clears both final-letter scores and the two-character look-behind window.
# The model probers are left untouched here; per the original note they are
# owned by the group prober.
def reset
  @finalCharLogicalScore = @finalCharVisualScore = 0

  # @prev and @beforePrev hold the last two characters seen in the previous
  # buffer. Starting both as a space simulates a word delimiter at the very
  # beginning of the data.
  @prev = ' '
  @beforePrev = ' '
end
set_model_probers(logicalProber, visualProber)
click to toggle source
# File lib/rchardet/hebrewprober.rb, line 169
# Stores the two model probers that get_state and get_charset_name consult.
#
# @param logicalProber the prober kept in @logicalProber
# @param visualProber the prober kept in @visualProber
# @return the visualProber argument
def set_model_probers(logicalProber, visualProber)
  @logicalProber, @visualProber = logicalProber, visualProber
  visualProber
end