class BioDSL::CSV

Class for manipulating CSV or table files. Allow reading and writing of gzip and bzip2 data. Auto-convert data types. Returns lines, arrays or hashes.

Public Class Methods

new(io) click to toggle source

Constructor method for CSV.

# File lib/BioDSL/csv.rb, line 109
# Constructor method for CSV.
#
# Keeps a reference to the given +io+ object and resets the parser state:
# the default delimiter is set, while header, selected field indexes and
# column types start out as nil.
def initialize(io)
  @io = io
  # "\s" inside a double-quoted Ruby string literal is a single space.
  @delimiter = "\s"
  @header = @fields = @types = nil
end
open(*args) { |new(io)| ... } click to toggle source
# File lib/BioDSL/csv.rb, line 56
# Open a CSV source and wrap the resulting io in a new CSV object.
#
# All arguments are passed on unchanged to IO.open.
# NOTE(review): IO is resolved lexically here and is presumably a
# project-specific IO class rather than ::IO - confirm.
#
# Without a block the new CSV object is returned and the caller remains
# responsible for the underlying io. With a block the CSV object is yielded,
# the io is closed once the block finishes (also when it raises), and the
# value of the block is returned.
def self.open(*args)
  io = IO.open(*args)

  return new(io) unless block_given?

  begin
    yield new(io)
  ensure
    # The block form owns the handle, so release it here instead of leaking
    # it until garbage collection.
    io.close if io.respond_to?(:close)
  end
end
read_array(file, options = {}) click to toggle source

Method that reads all CSV data from a file into an array of arrays (array of rows) which is returned. In the default mode all columns are read. Using the select option subselects the columns based on a given Array or, if a header line is present, a given Hash. Vice versa for the reject option. Header lines are prefixed with ‘#’ and are returned if the include_header option is given.

Options:

* include_header
* delimiter.
* select.
* reject.
# File lib/BioDSL/csv.rb, line 78
# Read all rows of +file+ into memory and return them as an array of arrays.
#
# The file is opened through the class-level open method and +options+ is
# handed unchanged to each_array; every yielded row is appended to the result
# in the order it was yielded.
def self.read_array(file, options = {})
  [].tap do |rows|
    open(file) do |csv|
      csv.each_array(options) { |fields| rows.push(fields) }
    end
  end
end
read_hash(file, options = {}) click to toggle source

Method that reads all CSV data from a file into an array of hashes (array of rows) which is returned. In the default mode all columns are read. Using the select option subselects the columns based on a given Array or, if a header line is present, a given Hash. Vice versa for the reject option. Header lines are prefixed with ‘#’.

Options:

* delimiter.
* select.
* reject.
# File lib/BioDSL/csv.rb, line 98
# Read all rows of +file+ into memory and return them as an array of hashes.
#
# The file is opened through the class-level open method and +options+ is
# handed unchanged to each_hash; every yielded hash is collected in the order
# it was yielded.
def self.read_hash(file, options = {})
  records = []

  open(file) do |csv|
    csv.each_hash(options) do |record|
      records.push(record)
    end
  end

  records
end

Public Instance Methods

each_array(options = {}) { |map(&:to_s)| ... } click to toggle source

Method to iterate over a CSV IO object yielding arrays or an enumerator

CSV.each_array(options={}) { |item| block } -> ary
CSV.each_array(options={})                  -> Enumerator

Options:

* :include_header -
* :delimiter      -
* :select         -
* :reject         -
# File lib/BioDSL/csv.rb, line 135
# Iterate over the lines of the CSV io, yielding each data row as an array.
#
# Each line is chomped and empty lines are skipped. The remaining lines are
# split on options[:delimiter], falling back to the delimiter set at
# construction. Lines starting with '#' are never yielded as data; the
# current header is yielded (as strings) for such a line when
# options[:include_header] is set. Data rows are reduced to the selected
# columns, if any, and converted with convert_types before being yielded.
#
# Returns self when a block is given, otherwise an Enumerator that iterates
# with the same +options+.
def each_array(options = {})
  # Pass the options on to the enumerator; otherwise the deferred iteration
  # would silently run with default settings.
  return to_enum(:each_array, options) unless block_given?

  delimiter = options[:delimiter] || @delimiter

  @io.each do |line|
    line.chomp!
    next if line.empty?

    fields = line.split(delimiter)

    # Header and column selection are resolved lazily from the first line
    # that reaches this point, for header lines and data lines alike.
    get_header(fields, options) unless @header
    get_fields(fields, options) unless @fields

    if line[0] == '#'
      yield @header.map(&:to_s) if options[:include_header]
    else
      fields = fields.values_at(*@fields) if @fields

      # Column types are determined once, from the first data row.
      determine_types(fields) unless @types

      yield fields.convert_types(@types)
    end
  end

  self
end
each_hash(options = {}) { |hash| ... } click to toggle source

Method to iterate over a CSV IO object yielding hashes or an enumerator

CSV.each_hash(options={}) { |item| block } -> hash
CSV.each_hash(options={})                  -> Enumerator

Options:

* :delimiter      -
* :select         -
* :reject         -
# File lib/BioDSL/csv.rb, line 174
# Iterate over the rows of the CSV io, yielding each row as a hash that maps
# header names to field values.
#
# Rows are obtained from each_array with the same +options+. Returns self
# when a block is given, otherwise an Enumerator that iterates with the same
# +options+.
def each_hash(options = {})
  # Without this guard the yield below raises LocalJumpError when no block
  # is given.
  return to_enum(:each_hash, options) unless block_given?

  each_array(options) do |array|
    hash = {}

    # each_array already applies convert_types to data rows; the second call
    # is kept so existing behavior is unchanged.
    array.convert_types(@types).each_with_index do |field, i|
      hash[@header[i]] = field
    end

    yield hash
  end

  self
end
skip(num) click to toggle source

Method to skip a given number of non-empty lines.

# File lib/BioDSL/csv.rb, line 118
# Advance the io past +num+ non-empty lines.
#
# Lines are read one at a time with gets; lines that are empty after
# chomping are consumed but not counted. Reading stops early when the io
# runs out of lines. Nothing is read when +num+ is 0.
def skip(num)
  remaining = num

  until remaining == 0
    line = @io.gets
    break unless line

    line.chomp!
    remaining -= 1 unless line.empty?
  end
end

Private Instance Methods

determine_types(fields) click to toggle source

Method that determines the data types used in an array of fields.

# File lib/BioDSL/csv.rb, line 280
# Determine the conversion method for every column from one row of fields.
#
# Each field is converted with to_num and mapped to :to_i, :to_f or :to_s
# depending on whether the result is an Integer, a Float or a String (nil
# for anything else). The resulting list is stored in @types and returned.
def determine_types(fields)
  @types = fields.map do |field|
    # Integer instead of Fixnum: Fixnum was removed in Ruby 3.2 and, on
    # Ruby < 2.4, did not cover large (Bignum) integers.
    case field.to_num
    when Integer then :to_i
    when Float   then :to_f
    when String  then :to_s
    end
  end
end
get_fields(fields, options) click to toggle source

Method to determine the indexes of fields to be parsed and store these in @fields. Options:

* :select - list of column indexes, names or a range to select.
* :reject - list of column indexes, names or a range to reject.
# File lib/BioDSL/csv.rb, line 241
# Determine the indexes of the columns to keep and store them in @fields;
# @header is reduced to the same columns. Nothing is changed when neither
# :select nor :reject is given.
#
# Options:
#
# * :select - list of column indexes, names or a range to select.
# * :reject - list of column indexes, names or a range to reject.
#
# Raises CSVError when columns are addressed by name but no header is set.
def get_fields(fields, options)
  if options[:select]
    # Integer instead of Fixnum: Fixnum was removed in Ruby 3.2.
    if options[:select].first.is_a? Integer
      @fields = options[:select]
    else
      fail CSVError, 'No header found' unless @header

      @fields = options[:select].map { |name| @header.index(name.to_sym) }
    end

    @header = @header.values_at(*@fields)
  elsif options[:reject]
    if options[:reject].first.is_a? Integer
      # to_a expands a Range and leaves an Array untouched.
      @fields = (0...fields.size).to_a - options[:reject].to_a
    else
      fail CSVError, 'No header found' unless @header

      rejected = options[:reject].map(&:to_sym)

      # Walk the indexes instead of building a name => index hash, so that
      # duplicate column names do not collapse into a single entry.
      @fields = @header.each_index.reject { |i| rejected.include?(@header[i]) }
    end

    @header = @header.values_at(*@fields)
  end
end
get_header(fields, options) click to toggle source

Method to set the @header given a list of fields (a row). Options:

* :select - list of column indexes, names or a range to select.
* :reject - list of column indexes, names or a range to reject.
# File lib/BioDSL/csv.rb, line 194
# Set @header from a list of fields (a row) and validate the :select or
# :reject option against it.
#
# When the first field starts with '#', that character is stripped (the
# given +fields+ array is modified in place) and the fields become the
# header names. Otherwise the names V0, V1, ... are generated, one per
# field. Returns the header as an array of symbols.
#
# Options:
#
# * :select - list of column indexes, names or a range to select.
# * :reject - list of column indexes, names or a range to reject.
#
# Raises CSVError when a selected or rejected column index is out of bounds
# or a selected or rejected name is not in the header.
def get_header(fields, options)
  if fields[0][0] == '#'
    fields[0] = fields[0][1..-1]
    @header = fields.map(&:to_sym)
  else
    @header = Array.new(fields.size) { |i| "V#{i}".to_sym }
  end

  if options[:select]
    # Integer instead of Fixnum: Fixnum was removed in Ruby 3.2.
    if options[:select].first.is_a? Integer
      out_of_bounds = options[:select].select { |c| c >= @header.size }

      unless out_of_bounds.empty?
        fail CSVError, "Selected columns out of bounds: #{out_of_bounds}"
      end
    else
      options[:select].each do |value|
        unless @header.include? value.to_sym
          fail CSVError, "Selected value: #{value} not in header: " \
            " #{@header}"
        end
      end
    end
  elsif options[:reject]
    if options[:reject].first.is_a? Integer
      # Report the offending columns, i.e. those at or beyond the header size.
      out_of_bounds = options[:reject].select { |c| c >= @header.size }

      unless out_of_bounds.empty?
        fail CSVError, "Rejected columns out of bounds: #{out_of_bounds}"
      end
    else
      options[:reject].each do |value|
        unless @header.include? value.to_sym
          fail CSVError, "Rejected value: #{value} not found in header: " \
            "#{@header}"
        end
      end
    end
  end

  @header
end