class BioDSL::MergeTable

Merge records on a given key with tabular data from one or more files.

merge_table reads in one or more tabular files and merges any records in the stream with identical values for a given key. The values for the given key must be unique in the tabular files, but not necessarily in the stream.

Consult read_table for details on how the tabular files are read.

The stats for merge_table include the following values:

Usage

merge_table(<input: <glob>>, <key: <string>>[, columns: <list>
            [, keys: <list>[, skip: <uint>[, delimiter: <string>]]]])

Options

Examples

Consider the following two files:

test1.tab:

#ID ORGANISM
1   parrot
2   eel
3   platypus
4   beetle

test2.tab:

#ID COUNT
1   5423
2   34
3   2423
4   234

We can merge the data with merge_table like this:

BD.new.
read_table(input: "test1.tab").
merge_table(input: "test2.tab", key: :ID).
dump.
run

{:ID=>1, :ORGANISM=>"parrot", :COUNT=>5423}
{:ID=>2, :ORGANISM=>"eel", :COUNT=>34}
{:ID=>3, :ORGANISM=>"platypus", :COUNT=>2423}
{:ID=>4, :ORGANISM=>"beetle", :COUNT=>234}

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for MergeTable.

@param options [Hash]

Options hash.

@option options [String] :input

Input glob expression.

@option options [String, Symbol] :key

Key used to merge.

@option options [Array] :keys

List of key identifiers to use for each column.

@option options [Array] :columns

List of columns to read in that order.

@option options [Integer] :skip

Number of initial lines to skip.

@option options [String] :delimiter

Delimiter to use for separating columns.

@return [MergeTable] Class instance.

# File lib/BioDSL/commands/merge_table.rb, line 117
# Set up a MergeTable command from the given options hash.
#
# The options are validated (check_options) and completed with defaults
# before the merge key and the optional column identifiers are converted to
# symbols.
#
# @param options [Hash] Options hash; :key is read unconditionally, :keys
#   only when it is set.
def initialize(options)
  @options = options

  check_options
  defaults

  key_list = @options[:keys]

  @table = {}
  @key   = @options[:key].to_sym
  # nil when no :keys option was given, so callers can test `if @keys`.
  @keys  = (key_list.map(&:to_sym) if key_list)
end

Public Instance Methods

lmb() click to toggle source

Return command lambda for merge_table.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/merge_table.rb, line 131
# Build the command lambda for merge_table.
#
# When called, the lambda first loads the lookup table through
# parse_input_tables and then forwards every record from input to output.
# A record whose value for @key has an entry in @table is merged with that
# table row (values from the table row win on key clashes, as with
# Hash#merge); all other records are forwarded unchanged.
#
# @return [Proc] Command lambda taking input, output and status.
def lmb
  lambda do |input, output, status|
    # NOTE(review): status_init presumably populates @status from STATS —
    # it is defined outside this file; confirm.
    status_init(status, STATS)
    parse_input_tables

    input.each do |record|
      @status[:records_in] += 1

      value = record[@key]
      row   = value && @table[value]

      if row
        @status[:merged] += 1
        record = record.merge(row)
      else
        @status[:non_merged] += 1
      end

      output << record
      @status[:records_out] += 1
    end

    @status[:rows_total] = @status[:rows_matched] + @status[:rows_unmatched]
  end
end

Private Instance Methods

add_row(record) click to toggle source

Add a given record to the table hash.

@param record [Hash] BioDSL record.

@raise [RuntimeError] if duplicate values are found.

# File lib/BioDSL/commands/merge_table.rb, line 203
# Store a table row in @table, indexed by the row's value for @key.
#
# Rows without a value for @key are only counted (:rows_unmatched) and are
# not stored. Rows with a value are counted as :rows_matched.
#
# @param record [Hash] Row read from a tabular file.
#
# @raise [RuntimeError] if check_duplicate finds the key value already stored.
def add_row(record)
  value = record[@key]

  return @status[:rows_unmatched] += 1 unless value

  check_duplicate(record)

  @status[:rows_matched] += 1
  @table[value] = record
end
check_duplicate(record) click to toggle source

Check if a given record is already added to the table and raise if so.

@param record [Hash] BioDSL record.

@raise [RuntimeError] if duplicate values are found.

# File lib/BioDSL/commands/merge_table.rb, line 220
# Raise if @table already holds a row for the record's value of @key.
#
# @param record [Hash] Row about to be added to the table.
#
# @raise [RuntimeError] if a row with the same key value is already stored.
def check_duplicate(record)
  value = record[@key]

  raise "Duplicate values found for key: #{@key} value: #{value}" if @table[value]
end
check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/merge_table.rb, line 158
# Run the option checks for merge_table against @options.
#
# NOTE(review): the options_* helpers are defined outside this file; going by
# their names they check allowed keys, required keys, file existence, list
# uniqueness and a non-negative :skip — confirm against their definitions.
def check_options
  permitted = %i[input key keys columns skip delimiter]

  options_allowed(@options, *permitted)
  options_required(@options, :input, :key)
  options_files_exist(@options, :input)
  options_list_unique(@options, :keys, :columns)
  options_assert(@options, ':skip >= 0')
end
defaults() click to toggle source

Set default options.

# File lib/BioDSL/commands/merge_table.rb, line 168
# Fill in default option values: :skip becomes 0 when it is unset (or falsy).
#
# @return [Object] The effective :skip value.
def defaults
  return @options[:skip] if @options[:skip]

  @options[:skip] = 0
end
parse_input_tables() click to toggle source

Parse input table files and add each row to a table hash.

# File lib/BioDSL/commands/merge_table.rb, line 173
# Read every file matched by the :input option and add each row to @table.
#
# For each file the first :skip lines are skipped, then every row is yielded
# as a hash; when @keys is set the row is passed through trim_record before
# being handed to add_row.
def parse_input_tables
  reader_options = { delimiter: @options[:delimiter],
                     select: @options[:columns] }

  options_glob(@options[:input]).each do |path|
    BioDSL::CSV.open(path) do |csv|
      csv.skip(@options[:skip])

      csv.each_hash(**reader_options) do |row|
        trim_record(row) if @keys

        add_row(row)
      end
    end
  end
end
trim_record(record) click to toggle source

Rename the leading keys of the given record to the identifiers given in the :keys option.

@param record [Hash] BioDSL record.

# File lib/BioDSL/commands/merge_table.rb, line 191
# Rename the leading keys of the given record, in place, to the identifiers
# in @keys: the i'th key of the record is replaced by @keys[i] while its
# value is kept. Keys beyond the first @keys.size entries are left untouched.
#
# All old keys are removed before any new key is inserted. Interleaving the
# two steps loses data when a new identifier equals an original column that
# has not been processed yet (e.g. swapping two column names), because the
# freshly inserted entry is deleted again on a later iteration.
#
# @param record [Hash] BioDSL record; modified in place.
#
# @return [Array] The original leading key/value pairs.
def trim_record(record)
  # Snapshot the leading pairs first; the record is mutated below.
  leading = record.first(@keys.size)

  leading.each { |old_key, _value| record.delete(old_key) }
  leading.each_with_index { |(_old_key, value), i| record[@keys[i]] = value }
end