class DecisionTree::Node

Public Class Methods

new(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil) click to toggle source
# File lib/decision-tree.rb, line 37
def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
    @parent_node = parent_node
    @path = if path.nil?
        Array.new
    else
        path
    end

    @threshold = threshold

    @algorithm = if algorithm=='c45' or algorithm=='id3'
        algorithm
    else
        raise "Unknown algorithm"
    end

    @dimension = if dimension.nil?
        entries[0][:features].size
    else
        dimension
    end

    @columns = if columns.nil?
        @dimension.times.map{|i| "feature_#{i}"}
    elsif columns.size != @dimension
        raise "The number of columns is incorrect"
    else
        columns
    end


    @labels = entries.map{|x| x[:label]}
    @entropy = @labels.entropy
    @child_nodes = Hash.new

    return if @path.size == @dimension
    return if @entropy==0.0

    @path << choose_best_feature(entries)

    if @algorithm == 'id3'
           build_child_nodes(entries)
    elsif algorithm=='c45'
           if feature_type=='num'
                   build_child_nodes_with_continuous_value(entries)
           else
                   build_child_nodes(entries)
           end
    end
end

Public Instance Methods

feature_index() click to toggle source
# File lib/decision-tree.rb, line 89
def feature_index
        @path[-1]
end
feature_name() click to toggle source
# File lib/decision-tree.rb, line 94
def feature_name
        @columns[ @path[-1] ].split(':')[0]
end
feature_type() click to toggle source
# File lib/decision-tree.rb, line 99
def feature_type
        t = @columns[ @path[-1] ].split(':')[1]
        t || 'string'
end
predict(vector, default=nil) click to toggle source
# File lib/decision-tree.rb, line 140
def predict(vector, default=nil)
   if @child_nodes.size==0
           probability = Hash.new(0)
           @labels.each{|k| probability[k] += 1 }
           probability.each{|k,v| probability[k] = v / @labels.size.to_f }
           return probability
   else
           if @algorithm=='c45' and feature_type=='num'
                   curr_value = vector[feature_index]

                   sorted_nodes = @child_nodes.sort_by{|k,v| k.to_f }
                   last_node = sorted_nodes[0][1]
                   sorted_nodes[1..-1].to_a.each do |feature_value,child_node|
                           break if curr_value.to_f < feature_value.to_f
                           last_node = child_node
                   end

                  return last_node.predict(vector,default)
           else
                   feature_value = vector[feature_index]
                   return default if not @child_nodes.has_key?(feature_value)
                  return @child_nodes[feature_value].predict(vector,default)
          end
        end
end
to_pseudo_code(buff=nil,indent="") click to toggle source
# File lib/decision-tree.rb, line 105
    def to_pseudo_code(buff=nil,indent="")
            buff = Array.new if buff.nil?

    if @child_nodes.size==0
        result = @labels.to_set.to_a
        if result.size==1
            buff << "#{indent}return #{result[0]}"
        else
            buff << "#{indent}return #{@labels}"
        end
        return buff
    end

    if @algorithm=='c45' and feature_type=='num'
           sorted_nodes = @child_nodes.sort_by{|k,v| k.to_f }
           sorted_nodes[1..-1].to_a.each do |feature_value,child_node|
                buff << "#{indent}if(#{feature_name} >= #{feature_value}){"
                child_node.to_pseudo_code(buff, indent+"  " )
                buff << "#{indent}}"
            end
            buff << "#{indent}else{"
            sorted_nodes[0][1].to_pseudo_code(buff, indent+"  " )
            buff << "#{indent}}"
    else
           @child_nodes.each do |feature_value,child_node|
                buff << "#{indent}if(#{feature_name} == #{feature_value}){"
                child_node.to_pseudo_code(buff, indent+"  " )
                buff << "#{indent}}"
            end
    end

    return buff
end

Private Instance Methods

build_child_nodes(entries) click to toggle source
# File lib/decision-tree.rb, line 190
def build_child_nodes(entries)

        buff = Hash.new{|h,feature_value| h[feature_value] = Array.new}
        entries.each do |e|
                feature_value = e[:features][feature_index]
                buff[feature_value] << e
        end

        buff.each do |feature_value,child_entries|
                @child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup)
        end
end
build_child_nodes_with_continuous_value(entries) click to toggle source
# File lib/decision-tree.rb, line 204
def build_child_nodes_with_continuous_value(entries)

        buff = Hash.new{|h,feature_value| h[feature_value] = Array.new}
        sorted_entries = entries.sort_by{|e| e[:features][feature_index].to_f }

        last_label = nil #sorted_entries[0][:label].to_s
        last_value = nil # sorted_entries[0][:features][feature_index]

        sorted_entries.each_with_index do |e, i|

                feature_value = e[:features][feature_index]

                if last_label != e[:label].to_s
                        last_value = feature_value.to_s
                        last_label = e[:label].to_s
                end

                buff[last_value] << e
        end

        buff.each do |feature_value,child_entries|
                @child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup)
        end

end
choose_best_feature(entries) click to toggle source
# File lib/decision-tree.rb, line 167
def choose_best_feature(entries)

        labels = entries.map{|x| x[:label]}

        max_ig = {index: -1, ig: -1.0}
        @dimension.times do |i|
                next if @path.include?(i)
                child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels)

    ig = if @algorithm=='id3'
        @entropy - child_entropy
    else# c45
        gain = (@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy
        gain = 0 if gain.nan?
        gain
    end

    max_ig = {index: i, ig: ig} if ig > max_ig[:ig]
        end
        return max_ig[:index]
end