class CzechStemmer

Public Class Methods

normalize(word)
# File lib/czech-stemmer.rb, line 86
# Rewrites the tail of +word+ according to a fixed, ordered set of rules.
# Only the first rule that matches is applied:
#
# * trailing "čt" becomes "ck"; trailing "št" becomes "sk"
# * trailing "c" or "č" becomes "k"; trailing "z" or "ž" becomes "h"
# * an "e" in the second-to-last position is removed
# * a "ů" in the second-to-last position becomes "o" (only when the word
#   is longer than two characters)
#
# Returns a new string when a rule applies; otherwise returns +word+ itself.
def self.normalize(word)
  return word[0..-3] + "ck" if word.end_with?("čt")
  return word[0..-3] + "sk" if word.end_with?("št")
  return word[0..-2] + "k" if word.end_with?("c", "č")
  return word[0..-2] + "h" if word.end_with?("z", "ž")

  length = word.size
  # The remaining rules inspect the second-to-last character, so they need
  # at least two characters to work with.
  return word unless length > 1

  tail = word[-1, 1]
  case word[-2, 1]
  when "e"
    word[0..-3] + tail
  when "ů"
    length > 2 ? word[0..-3] + "o" + tail : word
  else
    word
  end
end
remove_case(word)
# File lib/czech-stemmer.rb, line 12
# Removes at most one case ending from the end of +word+.
#
# Endings are tried from longest (five characters) to shortest (one
# character). An ending of N characters is only removed when the word is
# longer than N + 2 characters. The first ending that matches is removed
# and the result is returned immediately; if nothing matches, +word+ is
# returned unchanged.
def self.remove_case(word)
  length = word.size

  # Each tier is [number of characters to cut, endings of that length].
  tiers = [
    [5, %w[atech]],
    [4, %w[ětem etem atům]],
    [3, %w[ech ich ích ého ěmi emi ému ěte ete ěti eti ího iho ími ímu imu
           ách ata aty ých ama ami ové ovi ými]],
    [2, %w[em es ém ím ům at ám os us ým mi ou]]
  ]

  tiers.each do |cut, endings|
    return word[0...-cut] if length > cut + 2 && word.end_with?(*endings)
  end

  # Final tier: a single trailing vowel.
  vowels = %w[a e i o u ů y á é í ý ě]
  return word[0..-2] if length > 3 && vowels.include?(word[-1, 1])

  word
end
remove_possessives(word)
# File lib/czech-stemmer.rb, line 76
# Drops a trailing two-character possessive ending ("ov", "in" or "ův")
# from +word+, but only when the word is longer than five characters.
# Returns +word+ unchanged otherwise.
def self.remove_possessives(word)
  possessive = word.size > 5 && word.end_with?("ov", "in", "ův")
  possessive ? word[0..-3] : word
end
stem(word)
# File lib/czech-stemmer.rb, line 3
# Produces the stem of +word+ by running three passes in order:
# remove_case, then remove_possessives, then normalize. The normalize pass
# is skipped when the first two passes leave an empty string.
def self.stem(word)
  stripped = CzechStemmer.remove_possessives(CzechStemmer.remove_case(word))
  return stripped unless stripped.size > 0

  CzechStemmer.normalize(stripped)
end