class BioDSL::CountValues

Count the number of times the values of the given keys exist in the stream.

count_values counts the values for a given comma-separated list of keys.

Usage

count_values(<keys: <list>>)

Options

Examples

Consider the following two-column table in the file `test.tab`:

Human   H1
Human   H2
Human   H3
Dog     D1
Dog     D2
Mouse   M1

To count the values of both columns we first read the table with read_table and then pass the result to count_values:

BD.new.
read_table(input: "test.tab").
count_values(keys: [:V0, :V1]).
dump.
run

{:V0=>"Human", :V1=>"H1", :V0_COUNT=>3, :V1_COUNT=>1}
{:V0=>"Human", :V1=>"H2", :V0_COUNT=>3, :V1_COUNT=>1}
{:V0=>"Human", :V1=>"H3", :V0_COUNT=>3, :V1_COUNT=>1}
{:V0=>"Dog", :V1=>"D1", :V0_COUNT=>2, :V1_COUNT=>1}
{:V0=>"Dog", :V1=>"D2", :V0_COUNT=>2, :V1_COUNT=>1}
{:V0=>"Mouse", :V1=>"M1", :V0_COUNT=>1, :V1_COUNT=>1}

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for CountValues.

@param options [Hash] Options hash. @option options [Array] :keys List of keys whose values to count.

@return [CountValues] Instance of class.

# File lib/BioDSL/commands/count_values.rb, line 76
# Build a CountValues command from its options hash.
#
# The options are validated via check_options before anything is read
# from them.
#
# @param options [Hash] Options hash.
# @option options [Array] :keys Keys whose values are to be counted;
#   each entry is converted with #to_sym.
def initialize(options)
  @options = options

  check_options

  @keys = @options[:keys].map { |name| name.to_sym }

  # Two-level tally: key => value => number of occurrences. The outer hash
  # creates a zero-defaulting inner hash the first time a key is touched.
  @count_hash = Hash.new { |tally, name| tally[name] = Hash.new(0) }
end

Public Instance Methods

lmb() click to toggle source

Return the command lambda for the count_values command.

@return [Proc] Return command lambda.

# File lib/BioDSL/commands/count_values.rb, line 88
# Build the lambda that executes the count_values command.
#
# When called, the lambda passes +status+ and STATS to status_init, then
# runs process_input followed by process_output, both against the first
# value yielded by TmpDir.create('count_values').
#
# @return [Proc] Lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    # Only the first yielded value (used as the temp file path) is needed.
    TmpDir.create('count_values') do |spool_path, _|
      process_input(input, spool_path)
      process_output(output, spool_path)
    end
  end
end

Private Instance Methods

check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/count_values.rb, line 102
# Validate the options hash: :keys is the only option passed to
# options_allowed, and the same option is passed to options_required.
#
# The allowed check runs before the required check.
def check_options
  opts = @options

  options_allowed(opts, :keys)
  options_required(opts, :keys)
end
process_input(input, tmp_file) click to toggle source

Save the serialized stream to a temporary file while counting the requested values.

@param input [Enumerator] Input stream. @param tmp_file [String] Path to temp file.

# File lib/BioDSL/commands/count_values.rb, line 112
# Spool the input stream to a temporary file while tallying the values of
# the requested keys.
#
# Every record is appended to a BioDSL::Serializer wrapped around
# +tmp_file+ (opened in binary write mode). For each key in @keys that the
# record contains, the tally for that key/value pair in @count_hash is
# incremented. @status[:records_in] is incremented once per record.
#
# @param input    [Enumerator] Input stream.
# @param tmp_file [String]     Path to temp file.
def process_input(input, tmp_file)
  File.open(tmp_file, 'wb') do |ios|
    BioDSL::Serializer.new(ios) do |s|
      input.each do |record|
        # Iterating purely for the side effect on @count_hash, so use `each`
        # rather than `map` (which would build and discard an array for
        # every record).
        @keys.each do |key|
          @count_hash[key][record[key]] += 1 if record.key? key
        end

        @status[:records_in] += 1

        s << record
      end
    end
  end
end
process_output(output, tmp_file) click to toggle source

Output serialized stream to the output stream including value counts.

@param output [Enumerator::Yielder] Output stream. @param tmp_file [String] Path to temp file with serialized input stream.

# File lib/BioDSL/commands/count_values.rb, line 132
# Replay the spooled stream to the output stream, adding value counts.
#
# Records are read back from a BioDSL::Serializer wrapped around
# +tmp_file+ (opened in binary read mode). For each key in @keys that a
# record contains, a "<key>_COUNT" symbol entry is set to the tally stored
# in @count_hash for that key/value pair. Each record is then appended to
# +output+ and @status[:records_out] is incremented.
#
# @param output   [Enumerator::Yielder] Output stream.
# @param tmp_file [String] Path to temp file with serialized input stream.
def process_output(output, tmp_file)
  # The "<key>_COUNT" symbols depend only on @keys, so build them once
  # instead of once per record and key.
  count_keys = @keys.map { |key| [key, :"#{key}_COUNT"] }

  File.open(tmp_file, 'rb') do |ios|
    BioDSL::Serializer.new(ios) do |s|
      s.each do |record|
        # Side effects only, hence `each` rather than `map`.
        count_keys.each do |key, count_key|
          next unless record.key? key

          record[count_key] = @count_hash[key][record[key]]
        end

        output << record
        @status[:records_out] += 1
      end
    end
  end
end