class Smalltext::Classifier

Your code goes here…

Public Class Methods

new() click to toggle source
# File lib/smalltext.rb, line 18
# Builds an empty, untrained classifier by setting every instance variable
# the other methods rely on to its starting value.
def initialize
  # English tokenizer used to split sentences into words.
  @tokenizer = Tokenizer::Tokenizer.new(:en)

  # Raw samples registered through #add_item.
  @training_data = []

  # Corpus bookkeeping: tokens to skip, vocabulary, category labels and
  # [tokens, category] documents.
  @ignore_words = ['?']
  @words = []
  @categories = []
  @documents = []

  # Bag-of-words rows and their matching output rows.
  @training = []
  @output = []

  # Trained model; #save_model treats an empty hash as "not trained".
  @synapse = {}
end

Public Instance Methods

add_item(category, sentence) click to toggle source
# File lib/smalltext.rb, line 34
# Registers one labelled training sentence.
#
# category - label the sentence belongs to.
# sentence - example text for that label.
#
# Returns the list of training samples collected so far.
def add_item(category, sentence)
  sample = { category: category, sentence: sentence }
  @training_data << sample
end
classify(sentence, show_details=false) click to toggle source
# File lib/smalltext.rb, line 55
# Classifies a sentence with the current model.
#
# sentence     - text to classify.
# show_details - forwarded unchanged to #think.
#
# Prints the sentence and its classification to stdout and returns an array
# of [category, score] pairs for every score above ERROR_THRESHOLD, ordered
# from the highest score to the lowest.
def classify(sentence, show_details=false)
  scores = think(sentence, show_details).to_a

  # Pair each surviving score with its position so the category can be
  # looked up after sorting.
  candidates = scores.each_with_index.map do |score, position|
    [position, score] if score > ERROR_THRESHOLD
  end.compact

  ranked = candidates.sort { |left, right| right[1] <=> left[1] }
  return_results = ranked.map { |position, score| [klasses[position], score] }

  puts "sentence: #{sentence}\nclassification: #{return_results}"
  puts
  return_results
end
load_model(synapse_file) click to toggle source
# File lib/smalltext.rb, line 83
# Restores a model from a file containing a Marshal dump of the model hash.
#
# synapse_file - path of the file to read.
#
# Replaces @synapse with the stored hash (casting both weight entries with
# Numo::NArray.cast), takes @words and @categories from it, and prints a
# confirmation line.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only load
# model files from a trusted source.
def load_model(synapse_file)
  @synapse = Marshal.load(File.binread(synapse_file))

  # Cast the two stored weight entries, input->hidden first.
  %i[synapse0 synapse1].each do |layer|
    @synapse[layer] = Numo::NArray.cast(@synapse[layer])
  end

  @words = @synapse[:words]
  @categories = @synapse[:klasses]

  puts "Model #{synapse_file} loaded. Model was created on #{@synapse[:datetime]}"
end
save_model(synapse_file) click to toggle source
# File lib/smalltext.rb, line 70
# Writes the trained model to disk as a Marshal dump.
#
# synapse_file - path of the file to create or overwrite.
#
# When @synapse is empty nothing is written and a hint to train first is
# printed instead. Returns nil in both cases.
def save_model(synapse_file)
  if @synapse.empty?
    puts "Model not trained. Use the 'Classifier#train' method to build a model."
  else
    # binwrite opens in binary mode and closes the file for us.
    File.binwrite(synapse_file, Marshal.dump(@synapse))
    puts "saved synapses to: #{synapse_file}"
  end
end
train(hidden_neurons=20, alpha=0.1, epochs=1000, dropout=false, dropout_percent=0.2) click to toggle source
# File lib/smalltext.rb, line 38
# Preprocesses the registered training data and trains the network on it.
#
# hidden_neurons  - forwarded to #neural_network.
# alpha           - forwarded to #neural_network.
# epochs          - forwarded to #neural_network.
# dropout         - forwarded to #neural_network.
# dropout_percent - forwarded to #neural_network.
#
# Prints a completion message with the elapsed time. Returns nil.
def train(hidden_neurons=20, alpha=0.1, epochs=1000, dropout=false, dropout_percent=0.2)
  preprocess

  # NArray[...] wraps the nested array in one extra leading dimension;
  # indexing with 0 takes that single slice back out.
  x_inp = Numo::NArray[training][0,true,true]
  y = Numo::NArray[output][0,true,true]

  # A monotonic clock is not affected by system clock adjustments.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # Arguments are positional; the previous `name=name` form only reassigned
  # each local to itself before passing it.
  neural_network(x_inp, y, hidden_neurons, alpha, epochs, dropout, dropout_percent)

  elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  puts
  puts
  puts "Model training complete."
  puts "Processing time: #{elapsed_time} seconds"
end

Private Instance Methods

bow(sentence, words, show_details=false) click to toggle source

Returns the bag-of-words array: a 1 for each word in the bag that occurs in the sentence, and a 0 otherwise.

# File lib/smalltext.rb, line 171
# Encodes a sentence as a bag-of-words vector over the given vocabulary.
#
# sentence     - text to encode; it is split by #clean_up_sentence.
# words        - vocabulary; position i of the result corresponds to words[i].
# show_details - when true, prints a line for every vocabulary match.
#
# Returns a flattened Numo::DFloat holding 1 where the vocabulary word
# occurs in the sentence and 0 elsewhere.
def bow(sentence, words, show_details=false)
  tokens = clean_up_sentence(sentence)
  bag = Array.new(words.size, 0)

  tokens.each do |token|
    words.each_with_index do |candidate, position|
      next unless candidate == token

      bag[position] = 1
      puts "found in bag: #{candidate}" if show_details
    end
  end

  Numo::DFloat[bag].flatten
end
clean_up_sentence(sentence) click to toggle source
# File lib/smalltext.rb, line 162
# Splits a sentence with @tokenizer and stems every resulting token.
#
# sentence - text to process.
#
# Returns the token array produced by the tokenizer, with each entry
# replaced in place by its stem.
def clean_up_sentence(sentence)
  @tokenizer.tokenize(sentence).map!(&:stem)
end
dtanh(y) click to toggle source

Derivative of the tanh activation, computed as 1 - tanh(y)**2 for the given y.

# File lib/smalltext.rb, line 341
# Computes 1 - tanh(y)**2.
#
# y - value (or Numo array) passed to Numo::NMath.tanh.
#
# Note that tanh is applied to y here, so y is treated as the input of the
# activation rather than as an already-activated output.
def dtanh(y)
  squashed = Numo::NMath.tanh(y)
  1.0 - squashed**2
end
klasses() click to toggle source
# File lib/smalltext.rb, line 154
# Reader for the category labels held in @categories.
def klasses
  @categories
end
neural_network(x_inp, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=false, dropout_percent=0.5) click to toggle source
# File lib/smalltext.rb, line 208
# Trains a network with one hidden layer by full-batch gradient descent and
# stores the resulting weights in @synapse.
#
# x_inp           - input matrix; assumed to be 2-D with one row per training
#                   sample (that is how #train builds it).
# y               - target matrix compared against the network output.
# hidden_neurons  - number of hidden units.
# alpha           - factor applied to every weight update.
# epochs          - the loop runs epochs + 1 iterations unless stopped early.
# dropout         - when true, a random mask is applied to the hidden layer
#                   on every iteration.
# dropout_percent - probability used to build that mask (each entry is kept
#                   with probability 1 - dropout_percent).
#
# Prints progress to stdout. Returns the hash assigned to @synapse, whose
# keys are :synapse0, :synapse1, :datetime, :words and :klasses.
#
# NOTE(review): the forward pass uses #sigmoid while the deltas are scaled
# with #dtanh of the layer outputs. That is not the sigmoid derivative
# (see #sigmoid_output_to_derivative). It is left unchanged here because
# switching it alters training results; confirm which one is intended.
def neural_network(x_inp, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=false, dropout_percent=0.5)
  puts "Training with #{hidden_neurons} neurons, alpha:#{alpha}, dropout:#{dropout} #{dropout_percent if dropout}"
  puts "Input matrix: #{x_inp.shape}    Output matrix: #{1}x#{@categories.size}"
  puts "Epochs set to #{epochs}. Every 100th iteration will be printed."
  puts

  last_mean_error = 1

  # Number of training rows; used to size the dropout mask.
  sample_count = x_inp.shape[0]

  # randomly initialize our weights with mean 0
  synapse_0 = 2*Numo::DFloat.new(x_inp[0,true].size, hidden_neurons).rand - 1
  synapse_1 = 2*Numo::DFloat.new(hidden_neurons, @categories.size).rand - 1

  prev_synapse_0_weight_update = synapse_0.new_zeros
  prev_synapse_1_weight_update = synapse_1.new_zeros

  # Counts how often each weight update changed sign between iterations.
  # These counters are maintained but not read anywhere in this method.
  synapse_0_direction_count = synapse_0.new_zeros
  synapse_1_direction_count = synapse_1.new_zeros

  (epochs + 1).times do |j|
    # Feed forward through layers 0, 1, and 2
    layer_0 = x_inp
    layer_1 = sigmoid(layer_0.dot synapse_0)

    if dropout
      # Build a 0/1 mask with one row per sample and scale the surviving
      # activations by 1 / keep probability. The mask multiplies layer_1;
      # it must not replace it, and it needs sample_count rows so that it
      # lines up with layer_1.
      keep_probability = 1 - dropout_percent
      b = Croupier::Distributions.binomial size: 1, success: keep_probability
      arr = Array.new(sample_count) { Array.new(hidden_neurons) { b.generate_number } }
      mask = Numo::DFloat[arr].reshape(sample_count, hidden_neurons)
      layer_1 *= mask * (1.0 / keep_probability)
    end

    layer_2 = sigmoid((layer_1.dot synapse_1))

    # how much did we miss the target value?
    layer_2_error = y - layer_2

    if (j % 10000) == 0 && j > 5000
      # if this 10k iteration's error is greater than the last iteration, break out
      if (layer_2_error.abs).mean < last_mean_error
        puts "delta after #{j} iterations: #{(layer_2_error.abs).mean} )"
        last_mean_error = (layer_2_error.abs).mean
      else
        puts "break: #{(layer_2_error.abs).mean} > #{last_mean_error}"
        break
      end
    end

    # in what direction is the target value?
    # were we really sure? if so, don't change too much.
    layer_2_delta = layer_2_error * dtanh(layer_2)

    # how much did each l1 value contribute to the l2 error (according to the weights)?
    layer_1_error = layer_2_delta.dot(synapse_1.transpose)

    # in what direction is the target l1?
    # were we really sure? if so, don't change too much.
    layer_1_delta = layer_1_error * dtanh(layer_1)

    synapse_1_weight_update = (layer_1.transpose).dot(layer_2_delta)
    synapse_0_weight_update = (layer_0.transpose).dot(layer_1_delta)

    if j > 0
      # Bit array does not support arithmetic operation. Cast to Numo::Int32.cast,
      # see https://github.com/ruby-numo/numo-narray/issues/65#issuecomment-323665534
      synapse_0_direction_count += (Numo::Int32.cast((synapse_0_weight_update > 0)) - Numo::Int32.cast((prev_synapse_0_weight_update > 0))).abs
      synapse_1_direction_count += (Numo::Int32.cast((synapse_1_weight_update > 0)) - Numo::Int32.cast((prev_synapse_1_weight_update > 0))).abs
    end

    synapse_1 += alpha * synapse_1_weight_update
    synapse_0 += alpha * synapse_0_weight_update

    prev_synapse_0_weight_update = synapse_0_weight_update
    prev_synapse_1_weight_update = synapse_1_weight_update

    # One dot per iteration, plus the iteration number every 100th one.
    print "."
    print j if j % 100 == 0
  end

  now = Time.now

  # Keep the trained weights as plain arrays together with the vocabulary
  # and categories they were trained on; #save_model persists this hash.
  @synapse = {
    synapse0: synapse_0.to_a,
    synapse1: synapse_1.to_a,
    datetime: now.strftime("%Y-%m-%d %H:%M"),
    words: @words,
    klasses: @categories
  }
end
output() click to toggle source
# File lib/smalltext.rb, line 150
# Reader for the output rows held in @output.
def output
  @output
end
prepare_bow() click to toggle source
# File lib/smalltext.rb, line 123
# Turns every entry of @documents into one training row and one output row.
#
# For each [tokens, category] document the tokens are stemmed in place, a
# 0/1 row over @words is appended to @training, and a row containing a
# single 1 at the category's index in @categories is appended to @output.
#
# Returns @documents.
def prepare_bow
  # Template output row: one 0 per category.
  blank_row = Array.new(@categories.size, 0)

  @documents.each do |tokens, category|
    tokens.map!(&:stem)

    # 1 where the vocabulary word occurs in this document, 0 otherwise.
    @training << @words.map { |word| tokens.include?(word) ? 1 : 0 }

    target = blank_row.dup
    target[@categories.index(category)] = 1
    @output.push(target)
  end
end
preprocess() click to toggle source
# File lib/smalltext.rb, line 97
# Rebuilds the vocabulary, categories, documents and bag-of-words rows from
# @training_data.
#
# All derived state is reset first, so calling this more than once (for
# example training again after adding items) does not duplicate documents
# or training rows and does not stem earlier tokens a second time.
#
# Returns whatever #prepare_bow returns.
def preprocess
  # Start from a clean slate; everything below is derived from @training_data.
  @words = []
  @categories = []
  @documents = []
  @training = []
  @output = []

  @training_data.each do |pattern|
    # Tokenize each word in the sentence.
    tokens = @tokenizer.tokenize(pattern[:sentence])

    # Collect the vocabulary and keep the document with its category.
    @words += tokens
    @documents.push([tokens, pattern[:category]])

    # Remember each category once, in order of first appearance.
    @categories.push(pattern[:category]) unless @categories.include?(pattern[:category])
  end

  # Ignored tokens are removed before stemming; stems are then de-duplicated.
  @ignore_words.each { |ign| @words.delete(ign) }
  @words.map!(&:stem)
  @words.uniq!

  prepare_bow
end
sigmoid(x) click to toggle source

compute sigmoid nonlinearity

# File lib/smalltext.rb, line 320
# Logistic sigmoid: 1 / (1 + exp(-x)), using Numo::NMath.exp.
#
# x - value (or Numo array) to squash.
def sigmoid(x)
  denominator = 1 + Numo::NMath.exp(-x)
  1 / denominator
end
sigmoid_output_to_derivative(output) click to toggle source

convert output of sigmoid function to its derivative

# File lib/smalltext.rb, line 324
# Computes output * (1 - output), the sigmoid derivative expressed in terms
# of the sigmoid's own output.
#
# output - value (or array) already produced by the sigmoid.
def sigmoid_output_to_derivative(output)
  complement = 1 - output
  output * complement
end
softmax(w) click to toggle source

using softmax as output layer is recommended for classification where outputs are mutually exclusive

# File lib/smalltext.rb, line 329
# Softmax of w: exp(w - max(w)) divided by the sum of those exponentials.
#
# w - Numo array of scores.
#
# The maximum is subtracted before exponentiating, so the largest exponent
# is 0.
def softmax(w)
  shifted = w - w.max
  exponentials = Numo::NMath.exp(shifted)
  exponentials / exponentials.sum
end
tanh(x) click to toggle source

using tanh over logistic sigmoid for the hidden layer is recommended

# File lib/smalltext.rb, line 336
# Hyperbolic tangent of x; delegates to Numo::NMath.tanh.
#
# x - value (or Numo array) to transform.
def tanh(x)
  Numo::NMath.tanh(x)
end
think(sentence, show_details=false) click to toggle source
# File lib/smalltext.rb, line 191
# Runs one forward pass of the model stored in @synapse for a sentence.
#
# sentence     - text to evaluate; it is downcased before being encoded.
# show_details - when true, prints the sentence and its bag-of-words vector
#                (and is forwarded to #bow).
#
# Returns the #softmax of the output layer.
def think(sentence, show_details=false)
  # Input layer: bag of words over the known vocabulary.
  input_layer = bow(sentence.downcase, words, show_details)
  puts "sentence: #{sentence},\nbow: #{input_layer}" if show_details

  # Hidden layer: sigmoid of input x first weight matrix.
  hidden_layer = sigmoid(input_layer.dot(@synapse[:synapse0]))

  # Output layer: softmax of hidden x second weight matrix.
  softmax(hidden_layer.dot(@synapse[:synapse1]))
end
training() click to toggle source
# File lib/smalltext.rb, line 146
# Reader for the bag-of-words rows held in @training.
def training
  @training
end
words() click to toggle source
# File lib/smalltext.rb, line 158
# Reader for the vocabulary held in @words.
def words
  @words
end