class BioDSL::Sort

Sort records in the stream.

Sort records in the stream by a given key. Sorting on multiple keys is currently not supported.

Usage

sort(key: <value>[, reverse: <bool>[, block_size: <uint>]])

Options

Examples

Consider the following table in the file `test.tab`:

#COUNT  ORGANISM
4 Dog
3 Cat
1 Eel

To sort this according to COUNT in ascending order do:

BD.new.read_table(input: "test.tab").sort(key: :COUNT).dump.run

{:COUNT=>1, :ORGANISM=>"Eel"}
{:COUNT=>3, :ORGANISM=>"Cat"}
{:COUNT=>4, :ORGANISM=>"Dog"}

And in descending order:

BD.new.
read_table(input: "test.tab").
sort(key: :COUNT, reverse: true).
dump.
run

{:COUNT=>4, :ORGANISM=>"Dog"}
{:COUNT=>3, :ORGANISM=>"Cat"}
{:COUNT=>1, :ORGANISM=>"Eel"}

The type of the value determines the sort order; string values are sorted alphabetically:

BD.new.read_table(input: "test.tab").sort(key: :ORGANISM).dump.run

{:COUNT=>3, :ORGANISM=>"Cat"}
{:COUNT=>4, :ORGANISM=>"Dog"}
{:COUNT=>1, :ORGANISM=>"Eel"}

And reverse alphabetic order:

BD.new.
read_table(input: "test.tab").
sort(key: :ORGANISM, reverse: true).
dump.
run

{:COUNT=>1, :ORGANISM=>"Eel"}
{:COUNT=>4, :ORGANISM=>"Dog"}
{:COUNT=>3, :ORGANISM=>"Cat"}

Constants

SORT_BLOCK_SIZE
STATS

Public Class Methods

new(options) click to toggle source

Constructor for Sort.

@param options [Hash] Options hash.

@option options [String,Symbol] :key
@option options [Boolean] :reverse
@option options [Integer] :block_size

@return [Sort] Class instance.

# File lib/BioDSL/commands/sort.rb, line 108
# Constructor for Sort.
#
# check_options runs right after @options is stored, i.e. before
# options[:key] is dereferenced, so option validation gets the chance to
# reject a missing :key instead of the constructor failing on nil.to_sym.
#
# @param options [Hash] Options hash.
# @option options [String,Symbol] :key
# @option options [Boolean] :reverse
# @option options [Integer] :block_size
#
# @return [Sort] Class instance.
def initialize(options)
  @options = options

  check_options

  @block_size = options[:block_size] || SORT_BLOCK_SIZE
  @key        = options[:key].to_sym
  @files      = []
  @records    = []
  @size       = 0
  @pqueue     = pqueue_init
  @fds        = nil
end

Public Instance Methods

lmb() click to toggle source

Return command lambda for Sort.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/sort.rb, line 124
# Return command lambda for Sort.
#
# The lambda buffers incoming records in @records while tracking their
# accumulated string size in @size; whenever that size exceeds @block_size
# the buffer is handed to save_block. After the input is exhausted the
# remaining buffer is saved, the block files are opened, and the records are
# emitted through the pqueue helpers.
#
# @return [Proc] Command lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    # presumably initializes @status with the STATS counters - confirm in status_init
    status_init(status, STATS)

    input.each do |rec|
      @status[:records_in] += 1
      @records.push(rec)
      @size += rec.to_s.size

      next unless @size > @block_size

      save_block
    end

    # Flush whatever is left in the buffer before merging.
    save_block
    open_block_files
    fill_pqueue
    output_pqueue(output)
  end
end

Private Instance Methods

check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/sort.rb, line 145
# Check options.
#
# Runs the option helpers against @options: only :key, :reverse and
# :block_size are passed as allowed, :key is passed as required, :reverse is
# restricted to nil/true/false, and :block_size is asserted to be positive.
def check_options
  opts = @options

  options_allowed(opts, *%i[key reverse block_size])
  options_required(opts, :key)
  options_allowed_values(opts, reverse: [nil, true, false])
  options_assert(opts, ':block_size >  0')
end
fill_pqueue() click to toggle source

Fill the pqueue with the first record from each of the file descriptors.

# File lib/BioDSL/commands/sort.rb, line 196
# Fill the pqueue with the first record from each of the file descriptors.
#
# Each entry pushed is a [record, index] pair, where index is the position of
# the descriptor in @fds. Descriptors already at EOF contribute nothing.
def fill_pqueue
  @fds.each.with_index do |io, index|
    BioDSL::Serializer.new(io) do |reader|
      next if io.eof?

      @pqueue << [reader.next_entry, index]
    end
  end
end
open_block_files() click to toggle source

Open all sorted files.

# File lib/BioDSL/commands/sort.rb, line 190
# Open all sorted files.
#
# Every entry of @files is opened for binary reading and the resulting
# handles are stored in @fds. An at_exit hook is registered that closes the
# handles when the process terminates.
def open_block_files
  @fds = @files.map { |file| File.open(file, 'rb') }
  at_exit { @fds.each(&:close) }
end
output_pqueue(output) click to toggle source

Output all records from the pqueue, refilling it after each pop with the next record from the file descriptor the popped record came from.

@param output [Enumerator::Yielder] Output stream.

# File lib/BioDSL/commands/sort.rb, line 208
# Drain the pqueue into the output stream.
#
# Each popped entry is a [record, index] pair; the record is emitted and the
# queue is then topped up with the next record from @fds[index], unless that
# descriptor has reached EOF.
#
# @param output [Enumerator::Yielder] Output stream.
def output_pqueue(output)
  until @pqueue.empty?
    entry, source = @pqueue.pop

    output << entry
    @status[:records_out] += 1

    io = @fds[source]

    BioDSL::Serializer.new(io) do |reader|
      next if io.eof?

      @pqueue << [reader.next_entry, source]
    end
  end
end
pqueue_init() click to toggle source

Initialize pqueue

# File lib/BioDSL/commands/sort.rb, line 153
# Initialize pqueue.
#
# Queue entries are arrays whose first element is a record; entries are
# compared on record[@key]. The comparison direction is flipped depending on
# @options[:reverse], which is read on every comparison.
def pqueue_init
  PQueue.new do |a, b|
    lhs = a.first[@key]
    rhs = b.first[@key]

    @options[:reverse] ? lhs <=> rhs : rhs <=> lhs
  end
end
save_block() click to toggle source

Sort the current block of records and save it.

# File lib/BioDSL/commands/sort.rb, line 164
# Sort the buffered records and save them as one block.
#
# Records are sorted on @key (the symbolized :key option computed once in the
# constructor) and reversed when @options[:reverse] is set, then written out
# via serialize_records. The buffer and its size counter are reset
# afterwards. Does nothing when the buffer is empty.
def save_block
  return if @records.empty?

  @records.sort_by! { |record| record[@key] }
  @records.reverse! if @options[:reverse]

  serialize_records

  @records = []
  @size    = 0
end
serialize_records() click to toggle source

Save sorted records to file.

# File lib/BioDSL/commands/sort.rb, line 177
# Save sorted records to file.
#
# A Tempfile is created to reserve a path, and its own descriptor is closed
# immediately (the file itself is kept on disk). The records in @records are
# then written through a separate binary-mode handle on that path, so no
# descriptor is left open per saved block. The Tempfile object is appended to
# @files.
def serialize_records
  file = Tempfile.new('sort')
  file.close # release the descriptor only; the file is not unlinked

  File.open(file.path, 'wb') do |ios|
    BioDSL::Serializer.new(ios) do |serializer|
      @records.each { |record| serializer << record }
    end
  end

  @files << file
end