class Rarff::Relation
Attributes
attributes[RW]
instances[R]
name[RW]
Public Class Methods
new(name='')
click to toggle source
# File lib/rarff.rb, line 129
# Sets up an empty relation.
#
# name:: value stored as the relation name (defaults to the empty string).
#
# The attribute list and the instance list both start out as fresh,
# independent empty arrays.
def initialize(name='')
  @name = name
  @attributes = []
  @instances = []
end
Public Instance Methods
create_attributes(attr_parse=false)
click to toggle source
# File lib/rarff.rb, line 176
# Infers an attribute definition for every column of @instances.
#
# For each column, the first non-nil value found (scanning rows in order)
# decides the attribute; generated attributes are named "Attr<column index>".
#
# attr_parse:: when false (default) the Ruby class of the value decides the
#              type: Numeric -> ATTRIBUTE_NUMERIC, String -> ATTRIBUTE_STRING,
#              true/false -> ATTRIBUTE_BOOLEAN. When true, String values that
#              look like a decimal number become ATTRIBUTE_NUMERIC; any other
#              String keeps an attribute that is already present at that
#              index, or becomes ATTRIBUTE_STRING if there is none.
#
# Raises Exception when there are no instances or the first row is empty,
# and (in non-parse mode) when a value is of an unsupported class.
#
# Returns the first instance row (the result of the final each_index).
def create_attributes(attr_parse=false)
  if @instances.nil? or @instances.empty? or @instances[0].empty?
    raise Exception, "Not enough data to create ARFF attributes"
  end

  # Tracks which columns have been decided. A column is only undecided
  # after the first row if that row held nil in it.
  attributes_defined = {}

  @instances.each do |row|
    row.each_with_index do |col, j|
      next if attributes_defined[j] or col.nil?

      attributes_defined[j] = true # whatever happens, this column is now decided

      if attr_parse
        if col.kind_of?(String)
          if col =~ /^\-?\d+\.?\d*$/
            @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
          else
            # Do not let the NUMERIC fallback below claim a column whose
            # first value is plainly not a number; an attribute already
            # present at this index is left untouched.
            @attributes[j] ||= Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
          end
        end
        # Non-String values (e.g. the Integer 0 filler produced by
        # expand_sparse) are never regex-matched: Object#=~ no longer exists
        # on Ruby >= 3.2. They fall through to the fallback below.
        next # this column is finished
      end

      # No parsing - classify the value by its Ruby class
      if col.kind_of?(Numeric)
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
      elsif col.kind_of?(String)
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
      elsif col == false or col == true # exactly equal to a boolean
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_BOOLEAN)
      else
        raise Exception, "Could not parse attribute to ARFF data type: #{col.inspect}"
      end
    end
  end

  # Make sure every column of the first row has a definition, because
  # otherwise needless errors are thrown later on.
  @instances[0].each_index do |i|
    @attributes[i] ||= Attribute.new("Attr#{i}", ATTRIBUTE_NUMERIC)
  end
end
expand_sparse(str)
click to toggle source
# File lib/rarff.rb, line 244
# Expands one sparse-format data row into a dense array.
#
# str:: text of the form "{index value, index value, ...}"; the surrounding
#       braces are optional and are stripped if present.
#
# Returns an array with one slot per entry of @attributes, pre-filled with
# the Integer 0. Each "index value" pair stores value (kept as a String) at
# position index. Only the first run of whitespace separates index from
# value, so a value that itself contains spaces stays in one piece.
# Whitespace around a pair is ignored and empty pairs are skipped.
def expand_sparse(str)
  arr = Array.new(@attributes.size, 0)

  str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).each do |pr|
    index, value = pr.strip.split(/\s+/, 2)
    next if index.nil? # blank entry, e.g. from "{,1 5}"

    arr[index.to_i] = value
  end

  arr
end
instances=(instances, parse=false)
click to toggle source
Assigns instances to the internal array. parse: whether to parse numeric-looking strings into numeric attributes.
# File lib/rarff.rb, line 169
# Replaces the stored instances and regenerates the attribute definitions
# from them.
#
# instances:: the new rows; stored as-is (not copied).
# parse::     forwarded to create_attributes as its attr_parse argument.
#
# When invoked directly (e.g. via send) the result of create_attributes is
# returned.
def instances=(instances, parse=false)
  @instances = instances

  create_attributes(parse)
end
parse(str)
click to toggle source
# File lib/rarff.rb, line 136
# Reads ARFF text line by line and fills in @name, @attributes and
# @instances.
#
# str:: the ARFF document as one String; it is split on "\n".
#
# Blank lines and comment lines are skipped. Relation, attribute and data
# marker lines are recognised case-insensitively. Once the data marker has
# been seen, every remaining line is stored as an instance (sparse rows go
# through expand_sparse) and create_attributes(true) is re-run after each
# one.
#
# NOTE(review): my_scan is defined outside this block; this code relies on
# it returning a truthy value when the pattern matched and yielding the
# captured groups to the block - confirm against its definition.
#
# TODO: Doesn't handle commas in quoted attributes.
def parse(str)
  in_data_section = false

  str.split("\n").each do |line|
    next if line =~ /^\s*$/
    next if line =~ /^\s*#{COMMENT_MARKER}/

    # Header lines
    next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |relation_name| @name = relation_name }
    next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |attr_name, attr_type|
      @attributes.push(Attribute.new(attr_name, attr_type))
    }
    next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }

    # Everything below is data section handling
    next unless in_data_section

    # Sparse ARFF row
    # TODO: Factor duplication with non-sparse data below
    # (previous pattern: /^\s*#{SPARSE_ARFF_BEGIN}(.*)#{SPARSE_ARFF_END}\s*$/)
    trimmed = line.gsub(/^\s*(.*)\s*$/, "\\1")
    next if trimmed.my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |captures|
      @instances << expand_sparse(captures.first)
      create_attributes(true)
    }

    # Ordinary comma-separated row
    next if line.my_scan(/^\s*(.*)\s*$/) { |captures|
      fields = captures.first.split(/,\s*/).map { |field|
        # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
        field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
      }
      @instances << fields
      create_attributes(true)
    }
  end
end
set_string_attributes_to_nominal(column_indices = nil)
click to toggle source
Converts all String-type attributes into nominal attributes, which are more useful in WEKA because more techniques handle nominal attributes than string attributes.
column_indices is an optional argument specifying the columns that are to be set to nominal (0-based indexes). If nil (the default), all columns are included.
# File lib/rarff.rb, line 225
# Turns string attributes into nominal attributes whose value set is the
# distinct values found in that column of @instances, in order of first
# appearance.
#
# column_indices:: optional collection of 0-based column indexes to
#                  convert; nil (the default) means every column whose
#                  attribute type equals ATTRIBUTE_STRING.
#
# nil cells are treated as missing data and never become a nominal value;
# a selected column that contains only nil is left as a string attribute.
#
# Returns the hash of collected values, keyed by column index.
def set_string_attributes_to_nominal(column_indices = nil)
  nominals = {}

  # The data is stored row-major, so the columns have to be collected
  # while walking the rows.
  @instances.each do |row|
    row.each_with_index do |value, col_index|
      next if value.nil? # missing value, not a category
      next unless @attributes[col_index].type == ATTRIBUTE_STRING
      next unless column_indices.nil? or column_indices.include?(col_index)

      nominals[col_index] ||= {}
      nominals[col_index][value] = true
    end
  end

  nominals.each do |index, values|
    @attributes[index].type = "{#{values.keys.join(',')}}"
  end
end
to_arff(sparse=false)
click to toggle source
# File lib/rarff.rb, line 254
# Renders the relation as ARFF text: the relation line, one line per
# attribute (each attribute's to_s), the data marker, then one line per
# instance.
#
# sparse:: when false (default) every cell is written, nil cells as
#          MISSING, joined by ", ". When true each cell is written as
#          "<index> <value>", and nil cells as well as numeric cells equal
#          to 0 are left out.
#
# String-typed cells containing a comma, whitespace or "+" are wrapped in
# single quotes; date-typed cells are wrapped in double quotes.
#
# TODO: Doesn't handle cases in which strings already contain quotes or are
# already quoted.
def to_arff(sparse=false)
  header = "#{RELATION_MARKER} #{@name}\n" + @attributes.join("\n") + "\n#{DATA_MARKER}\n"

  data_lines = @instances.map do |inst|
    cells = inst.map_with_index do |col, i|
      # First pass - quote strings with spaces, and dates
      unless col.nil?
        attr_type = @attributes[i].type
        if attr_type =~ /^#{ATTRIBUTE_STRING}$/i
          col = "'" + col + "'" if col =~ /[,\s+]/
        elsif attr_type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
          col = '"' + col + '"'
        end
      end

      # Second pass - produce the text for this cell
      if !sparse
        col.nil? ? MISSING : col
      elsif col.nil? || (@attributes[i].type =~ /^#{ATTRIBUTE_NUMERIC}$/i && col == 0)
        nil
      else
        "#{i} #{col}"
      end
    end

    cells = cells.reject { |cell| cell.nil? } if sparse
    cells.join(', ')
  end

  header + data_lines.join("\n")
end
to_s()
click to toggle source
# File lib/rarff.rb, line 301
# String form of the relation: whatever to_arff returns when called with
# its default arguments.
def to_s
  to_arff
end