class Twkorean::TwitterKoreanText
Public Class Methods
new(normalization = true, stemming = true)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 4
#
# Collects every "*.jar" file from the "../jars" directory (relative to this
# source file), joins the paths with the platform path separator, and hands
# the result to Rjb::load together with ['-Xmx512M'] (presumably a JVM heap
# limit of 512 MB -- confirm against the Rjb documentation).
#
# @param normalization [Boolean] accepted for callers; not read by this method
# @param stemming [Boolean] accepted for callers; not read by this method
def initialize(normalization = true, stemming = true)
  jar_pattern = File.dirname(__FILE__)+"/../jars/*.jar"
  classpath = Dir.glob(jar_pattern).join(File::PATH_SEPARATOR)
  Rjb::load(classpath, ['-Xmx512M'])
end
Public Instance Methods
extract_phrases(tokens)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 34
#
# Passes +tokens+ to the processor's extractPhrases and returns the string
# form (toString) of every phrase it yields.
#
# Both boolean arguments to extractPhrases are fixed to true here; what they
# switch on is defined by the Java API (presumably spam and hashtag handling
# -- TODO confirm).
#
# @param tokens the token collection previously produced by the processor
# @return [Array] one toString result per extracted phrase
def extract_phrases(tokens)
  java_phrases = korean_processor.extractPhrases(tokens, true, true)
  phrase_items = java_phrases.toArray
  phrase_items.map { |phrase| phrase.toString }
end
normalize(text)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 9
#
# Runs +text+ through the processor's normalize call and returns the
# toString form of whatever that call produces.
#
# @param text the text to normalize
# @return the toString result of the normalized value
def normalize(text)
  normalized = korean_processor.normalize(text)
  normalized.toString
end
stem(tokens)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 28
#
# Legacy entry point kept for code written against versions earlier than
# 0.0.6. It does nothing of its own and simply forwards to
# tokens_to_token_list.
#
# @deprecated Call tokens_to_token_list directly instead.
# @param tokens the token collection to convert
# @return whatever tokens_to_token_list returns for +tokens+
def stem(tokens)
  tokens_to_token_list(tokens)
end
tokenize(text)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 13
#
# Tokenizes +text+ with the processor and returns the result unchanged
# (no conversion to Ruby strings or arrays happens here).
#
# @param text the text to tokenize
# @return the raw value returned by the processor's tokenize call
def tokenize(text)
  korean_processor.tokenize(text)
end
tokens_to_string_list(tokens)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 18
#
# Converts +tokens+ with the processor's tokensToJavaStringList and returns
# the toString form of every element of the resulting list.
#
# @param tokens the token collection previously produced by the processor
# @return [Array] one toString result per list element
def tokens_to_string_list(tokens)
  java_strings = korean_processor.tokensToJavaStringList(tokens)
  string_items = java_strings.toArray
  string_items.map { |entry| entry.toString }
end
tokens_to_token_list(tokens)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 23
#
# Converts +tokens+ with the processor's tokensToJavaKoreanTokenList, takes
# the toString form of every element, and feeds each of those strings
# through #parser.
#
# @param tokens the token collection previously produced by the processor
# @return [Array<Array>] one #parser result per list element
def tokens_to_token_list(tokens)
  java_tokens = korean_processor.tokensToJavaKoreanTokenList(tokens)
  token_items = java_tokens.toArray
  token_items.map { |token| parser(token.toString) }
end
Private Instance Methods
korean_processor()
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 44
#
# Returns the object obtained from Rjb::import for
# 'org.openkoreantext.processor.OpenKoreanTextProcessorJava'. The import is
# performed on first use and the result is cached in @korean_processor, so
# later calls reuse it.
def korean_processor
  return @korean_processor if @korean_processor

  @korean_processor = Rjb::import('org.openkoreantext.processor.OpenKoreanTextProcessorJava')
end
parser(text)
click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 40
#
# Splits a string shaped like "text(Tag: 1, 2)" or "text(Tag(inner): 1, 2)"
# into its parts.
#
# On a match the result is the whole matched string followed by five
# captures: the leading text, the alphabetic tag, the optional parenthesised
# part (nil when absent), and the two integers as strings. These presumably
# correspond to surface form, part of speech, stem, offset and length --
# confirm against the token toString format.
#
# @param text [String] the string to take apart
# @return [Array] match and captures, or an empty array when nothing matches
def parser(text)
  matched = text.match(/(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/)
  # nil.to_a is [], so a failed match yields an empty array rather than nil.
  matched.to_a
end