class String
extend String
class
Public Class Methods
# File lib/hebrew.rb, line 143
# Tells whether +cp+ is one of the codes this library treats as nikkud in a
# Windows-1255 (CP1255) string.
#
# cp:: Integer code, as yielded by String#each_codepoint.
#
# Returns true for 192..204 and for 209 and 210, false otherwise.
def self.is_codepoint_nikkud_cp1255(cp)
  # Inline comparison kept on purpose: per the original author's note,
  # NIKKUD_CP1255.include?(cp) is cleaner, but much slower.
  (cp > 191 && cp < 205) || [209, 210].include?(cp)
end
# File lib/hebrew.rb, line 147
# Tells whether +cp+ is one of the Unicode codepoints this library treats as
# nikkud.
#
# cp:: Integer codepoint, as yielded by String#each_codepoint.
#
# Returns true for 0x05b0..0x05bc and for 0x05c1 and 0x05c2, false otherwise.
def self.is_codepoint_nikkud_utf8(cp)
  # Inline comparison kept on purpose: per the original author's note,
  # NIKKUD_UTF8.include?(cp) is cleaner, but much slower.
  (cp > 0x05af && cp < 0x05bd) || [0x05c1, 0x05c2].include?(cp)
end
Returns true if the first parameter is a final-form letter in the encoding given by the second parameter.
# File lib/hebrew.rb, line 162
# Tells whether +c+ is listed as a final letter for the given encoding.
#
# c::        the character (a String) to look up.
# encoding:: the Encoding whose list of finals should be consulted.
#
# Returns the result of the list lookup for UTF-8 and Windows-1255/CP1255,
# and false for any other encoding.
def self.is_final_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    # NOTE(review): the constant is spelled FIANLS_UTF8 here; its definition is
    # not visible from this method -- confirm the spelling matches.
    FIANLS_UTF8.include?(c)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma-separated list makes `when` test each value; `A || B` would
    # collapse to A before the comparison ever happens.
    FINALS_CP1255.include?(c)
  else
    false
  end
end
Returns true if the first parameter is a nikkud character in the encoding given by the second parameter.
# File lib/hebrew.rb, line 152
# Tells whether the first codepoint of +c+ is a nikkud character, using the
# codepoint predicate that belongs to +encoding+.
#
# c::        the character (a String) to classify; only its first codepoint
#            is examined.
# encoding:: the Encoding that selects which predicate to use.
#
# Returns the predicate's result for UTF-8 and Windows-1255/CP1255, and false
# for an empty +c+ or for any other encoding.
def self.is_nikkud_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    cp = c.codepoints.first
    # An empty string has no first codepoint; answer false rather than
    # handing nil to the numeric comparison.
    cp.nil? ? false : is_codepoint_nikkud_utf8(cp)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    cp = c.codepoints.first
    cp.nil? ? false : is_codepoint_nikkud_cp1255(cp)
  else
    # TODO: add Mac encoding?
    false
  end
end
Public Instance Methods
Returns true if the string contains any Hebrew character (short-circuits on the first match).
# File lib/hebrew.rb, line 77
# Tells whether the string contains at least one Hebrew codepoint, stopping at
# the first match.
#
# The codepoint predicate is chosen from the string's own encoding:
# is_hebrew_codepoint_utf8 for UTF-8, is_hebrew_codepoint_cp1255 for
# Windows-1255/CP1255.
#
# Returns true or false; always false for any other encoding.
def any_hebrew?
  case encoding
  when Encoding::UTF_8
    each_codepoint.any? { |cp| is_hebrew_codepoint_utf8(cp) }
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # Comma-separated so `when` compares against each value.
    each_codepoint.any? { |cp| is_hebrew_codepoint_cp1255(cp) }
  else
    false
  end
end
# File lib/hebrew.rb, line 116
# Tells whether the string contains at least one nikkud codepoint, stopping at
# the first match.
#
# The class-level predicate is chosen once from the string's encoding
# (String.is_codepoint_nikkud_utf8 or String.is_codepoint_nikkud_cp1255) and
# then applied to every codepoint.
#
# Returns true or false; always false for any other encoding.
def any_nikkud?
  predicate =
    case encoding
    when Encoding::UTF_8
      :is_codepoint_nikkud_utf8
    when Encoding::WINDOWS_1255, Encoding::CP1255
      :is_codepoint_nikkud_cp1255
    else
      # Answer directly: there is no class-level predicate to dispatch to for
      # other encodings (falsehood is an instance method taking no arguments).
      return false
    end

  each_codepoint.any? { |cp| String.public_send(predicate, cp) }
end
# File lib/hebrew.rb, line 90
# Always returns false, whatever the receiver.
def falsehood
  false
end
# File lib/hebrew.rb, line 129
# Tells whether +cp+ is a Hebrew code in a Windows-1255 (CP1255) string.
#
# cp:: Integer code, as yielded by String#each_codepoint.
#
# Returns true for the point codes 192..201, 203, 204, 209 and 210, and for
# the letter codes 224..250; false otherwise.
def is_hebrew_codepoint_cp1255(cp)
  # 224..250 (0xE0..0xFA) are the letters alef..tav in Windows-1255. Without
  # them, un-pointed Hebrew text would not count as Hebrew at all.
  # NOTE(review): 202 and the remaining codes in 205..216 are left out, as
  # before -- confirm whether they should count as Hebrew too.
  (cp > 191 && cp < 202) ||
    [203, 204, 209, 210].include?(cp) ||
    (cp >= 224 && cp <= 250)
end
# File lib/hebrew.rb, line 132
# Tells whether +cp+ lies within the inclusive range
# HEB_UTF8_START..HEB_UTF8_END.
#
# cp:: Integer codepoint, as yielded by String#each_codepoint.
#
# Returns true or false.
def is_hebrew_codepoint_utf8(cp)
  cp.between?(HEB_UTF8_START, HEB_UTF8_END)
end
Returns true if the parameter is a nikkud character.
# File lib/hebrew.rb, line 139
# Tells whether +c+ is a nikkud character, judged in the receiver's encoding.
#
# c:: the character (a String) to classify.
#
# Returns whatever the class-level is_nikkud_by_encoding returns for +c+ and
# this string's encoding.
def is_nikkud(c)
  # Delegate to the class method, passing the instance's own encoding.
  self.class.is_nikkud_by_encoding(c, encoding)
end
Adds matres lectionis (yods and vavs used as vowels) after the diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but it is useful for generating the most likely variants users may search for when typing input (almost no Hebrew users know how to produce diacritics on the keyboard).
# File lib/hebrew.rb, line 95
# Builds a copy of the string with vowel letters inserted after certain
# diacritics:
#
# * a HEB_UTF8_QUBBUTS codepoint is replaced by vav and shuruk;
# * a yod is appended after a HEB_UTF8_XIRIK codepoint;
# * a vav is appended after a HEB_UTF8_XOLAM codepoint, unless the preceding
#   character is already a vav.
#
# Returns the new String for UTF-8 receivers, and nil for every other
# encoding (not implemented for them for now).
def naive_full_nikkud
  return nil unless encoding == Encoding::UTF_8

  ret = ''.dup # mutable accumulator, appended to in place
  prev_char = nil
  each_char do |c|
    cp = c.codepoints[0]
    ret << (cp == HEB_UTF8_QUBBUTS ? 'וּ' : c) # replace Qubbuts with vav and shuruk
    ret << 'י' if cp == HEB_UTF8_XIRIK
    ret << 'ו' if cp == HEB_UTF8_XOLAM && prev_char != 'ו'
    prev_char = c
  end

  # Get rid of extraneous yods possibly added because we weren't looking ahead.
  ret.gsub("\u05b4יי", "\u05b4י").gsub("\u05b4י\u05bcי", "\u05b4\u05bcי")
end
# File lib/hebrew.rb, line 23
# Dispatches to the encoding-specific strip_hebrew_* method.
#
# Returns the result of strip_hebrew_utf8 for UTF-8 strings, of
# strip_hebrew_cp1255 for Windows-1255/CP1255 strings, and nil for any other
# encoding.
def strip_hebrew
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # Comma-separated so `when` compares against each value.
    strip_hebrew_cp1255
  end
end
# File lib/hebrew.rb, line 40
# Builds a Windows-1255 copy of the string without the codes that
# String.is_codepoint_nikkud_cp1255 or is_hebrew_codepoint_cp1255 accept.
#
# Every remaining code is re-encoded with Integer#chr(Encoding::CP1255).
#
# Returns a new String tagged as windows-1255.
def strip_hebrew_cp1255
  each_codepoint.each_with_object(''.force_encoding('windows-1255')) do |cp, target|
    next if self.class.is_codepoint_nikkud_cp1255(cp) || is_hebrew_codepoint_cp1255(cp)

    target << cp.chr(Encoding::CP1255) # is there a neater way?
  end
end
# File lib/hebrew.rb, line 31
# Builds a copy of the string without the codepoints that
# String.is_codepoint_nikkud_utf8 or is_hebrew_codepoint_utf8 accept.
#
# Every remaining codepoint is re-encoded with Integer#chr(Encoding::UTF_8).
#
# Returns a new String; the receiver is not modified.
def strip_hebrew_utf8
  each_codepoint.each_with_object(''.dup) do |cp, target|
    next if self.class.is_codepoint_nikkud_utf8(cp) || is_hebrew_codepoint_utf8(cp)

    target << cp.chr(Encoding::UTF_8)
  end
end
Returns the string stripped of any Hebrew nikkud characters.
# File lib/hebrew.rb, line 50
# Dispatches to the encoding-specific strip_nikkud_* method.
#
# Returns the result of strip_nikkud_utf8 for UTF-8 strings, of
# strip_nikkud_cp1255 for Windows-1255/CP1255 strings, and nil for any other
# encoding.
def strip_nikkud
  case encoding
  when Encoding::UTF_8
    strip_nikkud_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # Comma-separated so `when` compares against each value.
    strip_nikkud_cp1255
  end
end
# File lib/hebrew.rb, line 58
# Builds a Windows-1255 copy of the string without the codes that
# String.is_codepoint_nikkud_cp1255 accepts.
#
# Every remaining code is re-encoded with Integer#chr(Encoding::CP1255).
#
# Returns a new String tagged as windows-1255.
def strip_nikkud_cp1255
  each_codepoint.each_with_object(''.force_encoding('windows-1255')) do |cp, target|
    next if self.class.is_codepoint_nikkud_cp1255(cp)

    target << cp.chr(Encoding::CP1255) # is there a neater way?
  end
end
# File lib/hebrew.rb, line 67
# Builds a copy of the string without the codepoints that
# String.is_codepoint_nikkud_utf8 accepts.
#
# Every remaining codepoint is re-encoded with Integer#chr(Encoding::UTF_8).
#
# Returns a new String; the receiver is not modified.
def strip_nikkud_utf8
  each_codepoint.each_with_object(''.dup) do |cp, target|
    next if self.class.is_codepoint_nikkud_utf8(cp)

    target << cp.chr(Encoding::UTF_8)
  end
end