class DecisionTree::Node
Public Class Methods
new(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
click to toggle source
# File lib/decision-tree.rb, line 37 def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil) @parent_node = parent_node @path = if path.nil? Array.new else path end @threshold = threshold @algorithm = if algorithm=='c45' or algorithm=='id3' algorithm else raise "Unknown algorithm" end @dimension = if dimension.nil? entries[0][:features].size else dimension end @columns = if columns.nil? @dimension.times.map{|i| "feature_#{i}"} elsif columns.size != @dimension raise "The number of columns is incorrect" else columns end @labels = entries.map{|x| x[:label]} @entropy = @labels.entropy @child_nodes = Hash.new return if @path.size == @dimension return if @entropy==0.0 @path << choose_best_feature(entries) if @algorithm == 'id3' build_child_nodes(entries) elsif algorithm=='c45' if feature_type=='num' build_child_nodes_with_continuous_value(entries) else build_child_nodes(entries) end end end
Public Instance Methods
feature_index()
click to toggle source
# File lib/decision-tree.rb, line 89 def feature_index @path[-1] end
feature_name()
click to toggle source
# File lib/decision-tree.rb, line 94 def feature_name @columns[ @path[-1] ].split(':')[0] end
feature_type()
click to toggle source
# File lib/decision-tree.rb, line 99 def feature_type t = @columns[ @path[-1] ].split(':')[1] t || 'string' end
predict(vector, default=nil)
click to toggle source
# File lib/decision-tree.rb, line 140 def predict(vector, default=nil) if @child_nodes.size==0 probability = Hash.new(0) @labels.each{|k| probability[k] += 1 } probability.each{|k,v| probability[k] = v / @labels.size.to_f } return probability else if @algorithm=='c45' and feature_type=='num' curr_value = vector[feature_index] sorted_nodes = @child_nodes.sort_by{|k,v| k.to_f } last_node = sorted_nodes[0][1] sorted_nodes[1..-1].to_a.each do |feature_value,child_node| break if curr_value.to_f < feature_value.to_f last_node = child_node end return last_node.predict(vector,default) else feature_value = vector[feature_index] return default if not @child_nodes.has_key?(feature_value) return @child_nodes[feature_value].predict(vector,default) end end end
to_pseudo_code(buff=nil,indent="")
click to toggle source
# File lib/decision-tree.rb, line 105 def to_pseudo_code(buff=nil,indent="") buff = Array.new if buff.nil? if @child_nodes.size==0 result = @labels.to_set.to_a if result.size==1 buff << "#{indent}return #{result[0]}" else buff << "#{indent}return #{@labels}" end return buff end if @algorithm=='c45' and feature_type=='num' sorted_nodes = @child_nodes.sort_by{|k,v| k.to_f } sorted_nodes[1..-1].to_a.each do |feature_value,child_node| buff << "#{indent}if(#{feature_name} >= #{feature_value}){" child_node.to_pseudo_code(buff, indent+" " ) buff << "#{indent}}" end buff << "#{indent}else{" sorted_nodes[0][1].to_pseudo_code(buff, indent+" " ) buff << "#{indent}}" else @child_nodes.each do |feature_value,child_node| buff << "#{indent}if(#{feature_name} == #{feature_value}){" child_node.to_pseudo_code(buff, indent+" " ) buff << "#{indent}}" end end return buff end
Private Instance Methods
build_child_nodes(entries)
click to toggle source
# File lib/decision-tree.rb, line 190 def build_child_nodes(entries) buff = Hash.new{|h,feature_value| h[feature_value] = Array.new} entries.each do |e| feature_value = e[:features][feature_index] buff[feature_value] << e end buff.each do |feature_value,child_entries| @child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup) end end
build_child_nodes_with_continuous_value(entries)
click to toggle source
# File lib/decision-tree.rb, line 204 def build_child_nodes_with_continuous_value(entries) buff = Hash.new{|h,feature_value| h[feature_value] = Array.new} sorted_entries = entries.sort_by{|e| e[:features][feature_index].to_f } last_label = nil #sorted_entries[0][:label].to_s last_value = nil # sorted_entries[0][:features][feature_index] sorted_entries.each_with_index do |e, i| feature_value = e[:features][feature_index] if last_label != e[:label].to_s last_value = feature_value.to_s last_label = e[:label].to_s end buff[last_value] << e end buff.each do |feature_value,child_entries| @child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup) end end
choose_best_feature(entries)
click to toggle source
# File lib/decision-tree.rb, line 167 def choose_best_feature(entries) labels = entries.map{|x| x[:label]} max_ig = {index: -1, ig: -1.0} @dimension.times do |i| next if @path.include?(i) child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels) ig = if @algorithm=='id3' @entropy - child_entropy else# c45 gain = (@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy gain = 0 if gain.nan? gain end max_ig = {index: i, ig: ig} if ig > max_ig[:ig] end return max_ig[:index] end