module BioDSL::Kmer

Module containing methods for manipulating sequence kmers.

Public Class Methods

to_oligos(kmers, kmer_size) click to toggle source

Debug method to convert an array of binary encoded kmers to nucleotide oligos.

# File lib/BioDSL/seq/kmer.rb, line 36
# Debug helper that decodes binary encoded kmers back into nucleotide oligos.
#
# Each kmer is rendered as a zero padded binary string of kmer_size * 2 digits
# and read two bits at a time: 00 => a, 01 => t, 10 => c, 11 => g.
#
# kmers     - collection of Integer encoded kmers (iterated with #each).
# kmer_size - number of nucleotides per kmer.
#
# Returns an Array of lowercase oligo Strings, one per kmer, in input order.
# Raises RuntimeError if a two character chunk is not one of the four codes.
def self.to_oligos(kmers, kmer_size)
  decode    = { '00' => 'a', '01' => 't', '10' => 'c', '11' => 'g' }
  bit_width = kmer_size * 2
  oligos    = []

  kmers.each do |kmer|
    pairs = format("%0#{bit_width}b", kmer).scan(/.{2}/)

    oligos << pairs.inject('') do |oligo, pair|
      oligo << decode.fetch(pair) { fail "unknown m #{pair}" }
    end
  end

  oligos
end

Public Instance Methods

to_kmers(options) click to toggle source

Method that returns a sorted array of unique kmers, which are integer representations of DNA/RNA sequence oligos where A is encoded in two bits as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides are ignored. The following options apply:

* kmer_size: kmer size in the range 1-12.
* step_size: step size in the range 1-12 (default=1).
* score_min: drop kmers with quality score below this.
# File lib/BioDSL/seq/kmer.rb, line 67
# Validates the kmer options and delegates kmer extraction to the C routines
# to_kmers_C / to_kmers_qual_C, writing into a reusable @kmer_ary buffer.
#
# options - Hash-like object read with #[]; it is NOT modified:
#           :kmer_size - required, must be within 1..12.
#           :step_size - optional, must be within 1..12 (default 1).
#           :score_min - optional, defaults to Seq::SCORE_MAX; only validated
#                        and used when the sequence has quality scores (@qual).
#
# Returns whatever the selected C routine returns (presumably the sorted
# array of unique integer kmers - confirm against the C implementation).
# Raises KmerError on a missing or out-of-range option.
def to_kmers(options)
  # Read options into locals rather than assigning defaults with ||=, so the
  # caller's hash is left untouched and may even be frozen.
  kmer_size = options[:kmer_size]
  step_size = options[:step_size] || 1
  # NOTE(review): defaulting to SCORE_MAX looks strict for a minimum score;
  # confirm how to_kmers_qual_C compares against it.
  score_min = options[:score_min] || Seq::SCORE_MAX

  fail KmerError, 'No kmer_size' unless kmer_size

  unless (1..12).include?(kmer_size)
    fail KmerError, "Bad kmer_size: #{kmer_size}"
  end

  unless (1..12).include?(step_size)
    fail KmerError, "Bad step_size: #{step_size}"
  end

  if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).include?(score_min)
    fail KmerError, "score minimum: #{score_min} out of " \
                    "range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
  end

  # One slot per possible kmer of this size.
  size = Seq::DNA.size**kmer_size

  # Reuse the buffer from a previous call when its size still fits; otherwise
  # allocate a new one.
  if defined?(@kmer_ary) && (@kmer_ary.count == size)
    @kmer_ary.zero!
  else
    @kmer_ary = BioDSL::CAry.new(size, 1)
  end

  if @qual
    to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
                    kmer_size, step_size, score_min, Seq::SCORE_BASE)
  else
    to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
               kmer_size, step_size)
  end
end

Private Instance Methods

naive(options) click to toggle source
# File lib/BioDSL/seq/kmer.rb, line 264
# Pure Ruby sliding window kmer extraction: collects every kmer_size long
# subsequence (step 1) as an upcased String.
#
# Windows are skipped when they contain anything other than A, T, U, C or G,
# or when the window has quality scores, a minimum score was given, and the
# window's lowest score (oligo.scores_min) is below it.
#
# options - Hash-like object read with #[]:
#           :kmer_size  - window length.
#           :score_min  - optional minimum quality score, the same key that
#                         to_kmers uses.
#           :scores_min - legacy spelling of :score_min, still accepted.
#
# Returns an Array of upcased oligo Strings in sequence order (duplicates kept).
def naive(options)
  kmer_size = options[:kmer_size]
  # Previously only :scores_min was read, so a caller passing the documented
  # :score_min key had the quality filter silently skipped.
  score_min = options[:score_min] || options[:scores_min]

  (0..length - kmer_size).each_with_object([]) do |i, oligos|
    oligo = self[i...i + kmer_size]
    bases = oligo.seq.upcase

    # \A and \z anchor the whole string; ^ and $ would accept an oligo with an
    # embedded newline as long as one of its lines was clean.
    next unless bases =~ /\A[ATUCG]+\z/
    next if oligo.qual && score_min && (oligo.scores_min < score_min)

    oligos << bases
  end
end
naive_bin(options) click to toggle source
# File lib/BioDSL/seq/kmer.rb, line 281
# Pure Ruby sliding window kmer extraction that returns each kmer_size long
# subsequence (step 1) as an Integer, two bits per base:
# A = 00, T = 01, U = 01, C = 10, G = 11 (first base in the highest bits).
#
# Windows are skipped when they contain anything other than A, T, U, C or G,
# or when the window has quality scores, a minimum score was given, and the
# window's lowest score (oligo.scores_min) is below it.
#
# options - Hash-like object read with #[]:
#           :kmer_size  - window length.
#           :score_min  - optional minimum quality score, the same key that
#                         to_kmers uses.
#           :scores_min - legacy spelling of :score_min, still accepted.
#
# Returns an Array of Integers in sequence order (duplicates kept).
def naive_bin(options)
  kmer_size = options[:kmer_size]
  # Previously only :scores_min was read, so a caller passing the documented
  # :score_min key had the quality filter silently skipped.
  score_min = options[:score_min] || options[:scores_min]

  (0..length - kmer_size).each_with_object([]) do |i, kmers|
    oligo = self[i...i + kmer_size]
    bases = oligo.seq.upcase

    # U is allowed here: the old pattern /^[ATCG]+$/ dropped RNA oligos, which
    # left the 'U' encoding below unreachable. \A and \z anchor the whole
    # string so an embedded newline cannot slip through.
    next unless bases =~ /\A[ATUCG]+\z/
    next if oligo.qual && score_min && (oligo.scores_min < score_min)

    bin = bases.each_char.inject(0) do |acc, base|
      code = case base
             when 'T', 'U' then 1
             when 'C'      then 2
             when 'G'      then 3
             else               0 # 'A'
             end

      (acc << 2) | code
    end

    kmers << bin
  end
end