class BioDSL::Seq
Class for manipulating sequences.
Constants
- DNA
Residue alphabets
- INDELS
- Orf
Struct for holding an ORF.
- PROTEIN
- RNA
- SCORE_BASE
Quality scores bases
- SCORE_MAX
- SCORE_MIN
Attributes
Public Class Methods
# File lib/BioDSL/seq.rb, line 109 def self.check_name_pair(entry1, entry2) if entry1.seq_name =~ /^([^ ]+) \d:/ name1 = Regexp.last_match[1] elsif entry1.seq_name =~ %r{^(.+)\/\d$} name1 = Regexp.last_match[1] else fail SeqError, "Could not match sequence name: #{entry1.seq_name}" end if entry2.seq_name =~ /^([^ ]+) \d:/ name2 = Regexp.last_match[1] elsif entry2.seq_name =~ %r{^(.+)\/\d$} name2 = Regexp.last_match[1] else fail SeqError, "Could not match sequence name: #{entry2.seq_name}" end fail SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2 end
Class method that generates all possible oligos of a specifed length and type.
# File lib/BioDSL/seq.rb, line 83 def self.generate_oligos(length, type) fail SeqError, "Bad length: #{length}" if length <= 0 case type.downcase when :dna then alph = DNA when :rna then alph = RNA when :protein then alph = PROTEIN else fail SeqError, "Unknown sequence type: #{type}" end oligos = [''] (1..length).each do list = [] oligos.each do |oligo| alph.each { |char| list << oligo + char } end oligos = list end oligos end
Initialize a sequence object with the following options:
-
:seq_name Name of the sequence.
-
:seq The sequence.
-
:qual An Illumina type quality scores string.
# File lib/BioDSL/seq.rb, line 134 def initialize(options = {}) @seq_name = options[:seq_name] @seq = options[:seq] @type = options[:type] @qual = options[:qual] return unless @seq && @qual return if @seq.length == @qual.length fail SeqError, 'Sequence length and score length mismatch: ' \ "#{@seq.length} != #{@qual.length}" end
Class method to instantiate a new Sequence object given a Biopiece record.
# File lib/BioDSL/seq.rb, line 72 def self.new_bp(record) seq_name = record[:SEQ_NAME] seq = record[:SEQ] type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE] qual = record[:SCORES] new(seq_name: seq_name, seq: seq, type: type, qual: qual) end
Public Instance Methods
Method to add two Seq
objects.
# File lib/BioDSL/seq.rb, line 401 def +(other) new_entry = Seq.new new_entry.seq = @seq + other.seq new_entry.type = @type if @type == other.type new_entry.qual = @qual + other.qual if @qual && other.qual new_entry end
Method to concatenate sequence entries.
# File lib/BioDSL/seq.rb, line 410 def <<(entry) fail SeqError, 'sequences of different types' unless @type == entry.type fail SeqError, 'qual is missing in one entry' unless @qual.class == entry.qual.class @seq << entry.seq @qual << entry.qual unless entry.qual.nil? self end
Index method for Seq
objects.
# File lib/BioDSL/seq.rb, line 422 def [](*args) entry = Seq.new entry.seq_name = @seq_name.dup unless @seq_name.nil? entry.seq = @seq[*args] || '' entry.type = @type entry.qual = @qual[*args] || '' unless @qual.nil? entry end
Index assignment method for Seq
objects.
# File lib/BioDSL/seq.rb, line 433 def []=(*args, entry) @seq[*args] = entry.seq[*args] @qual[*args] = entry.qual[*args] unless @qual.nil? self end
Method that complements sequence including ambiguity codes.
# File lib/BioDSL/seq.rb, line 314 def complement fail SeqError, 'Cannot complement 0 length sequence' if length == 0 entry = Seq.new(seq_name: @seq_name, type: @type, qual: @qual) if dna? entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn') elsif rna? entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn') else fail SeqError, "Cannot complement sequence type: #{@type}" end entry end
Method that complements sequence including ambiguity codes.
# File lib/BioDSL/seq.rb, line 333 def complement! fail SeqError, 'Cannot complement 0 length sequence' if length == 0 if dna? @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn') elsif rna? @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn') else fail SeqError, "Cannot complement sequence type: #{@type}" end self end
Method that returns the residue compositions of a sequence in a hash where the key is the residue and the value is the residue count.
# File lib/BioDSL/seq.rb, line 443 def composition comp = Hash.new(0); @seq.upcase.each_char do |char| comp[char] += 1 end comp end
Method that returns true is a given sequence type is DNA
.
# File lib/BioDSL/seq.rb, line 202 def dna? @type == :dna end
Method to find open reading frames (ORFs).
# File lib/BioDSL/seq.rb, line 608 def each_orf(options = {}) size_min = options[:size_min] || 0 size_max = options[:size_max] || length start_codons = options[:start_codons] || 'ATG,GTG,AUG,GUG' stop_codons = options[:stop_codons] || 'TAA,TGA,TAG,UAA,UGA,UAG' pick_longest = options[:pick_longest] orfs = [] pos_beg = 0 regex_start = Regexp.new(start_codons.split(',').join('|'), true) regex_stop = Regexp.new(stop_codons.split(',').join('|'), true) while pos_beg && pos_beg < length - size_min pos_beg = @seq.index(regex_start, pos_beg) next unless pos_beg pos_end = @seq.index(regex_stop, pos_beg) next unless pos_end orf_length = (pos_end - pos_beg) + 3 if (orf_length % 3) == 0 if size_min <= orf_length && orf_length <= size_max subseq = self[pos_beg...pos_beg + orf_length] orfs << Orf.new(subseq, pos_beg, pos_end + 2) end end pos_beg += 1 end if pick_longest orf_hash = {} orfs.each { |orf| orf_hash[orf.stop] = orf unless orf_hash[orf.stop] } orfs = orf_hash.values end if block_given? orfs.each { |orf| yield orf } else return orfs end end
Method to determine the Edit Distance between two Sequence objects (case insensitive).
# File lib/BioDSL/seq.rb, line 361 def edit_distance(entry) Levenshtein.distance(@seq, entry.seq) end
Method that generates a random sequence of a given length and type.
# File lib/BioDSL/seq.rb, line 366 def generate(length, type) fail SeqError, "Cannot generate seq length < 1: #{length}" if length <= 0 case type when :dna then alph = DNA when :rna then alph = RNA when :protein then alph = PROTEIN else fail SeqError, "Unknown sequence type: #{type}" end seq_new = Array.new(length) { alph[rand(alph.size)] }.join('') @seq = seq_new @type = type seq_new end
Method to determine the Hamming
Distance between two Sequence objects (case insensitive).
# File lib/BioDSL/seq.rb, line 351 def hamming_distance(entry, options = {}) if options[:ambiguity] BioDSL::Hamming.distance(@seq, entry.seq, options) else BioDSL::Hamming.distance(@seq.upcase, entry.seq.upcase, options) end end
Method that returns the percentage of hard masked residues or N’s in a sequence.
# File lib/BioDSL/seq.rb, line 455 def hard_mask ((@seq.upcase.scan('N').size.to_f / (length - indels).to_f) * 100). round(2) end
Return the number indels in a sequence.
# File lib/BioDSL/seq.rb, line 174 def indels regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/) @seq.scan(regex).size end
Method to remove indels from seq and qual if qual.
# File lib/BioDSL/seq.rb, line 180 def indels_remove if @qual.nil? @seq.delete!(Regexp.escape(INDELS.join(''))) else na_seq = NArray.to_na(@seq, 'byte') na_qual = NArray.to_na(@qual, 'byte') mask = NArray.byte(length) INDELS.each do |c| mask += na_seq.eq(c.ord) end mask = mask.eq(0) @seq = na_seq[mask].to_s @qual = na_qual[mask].to_s end self end
Returns the length of a sequence.
# File lib/BioDSL/seq.rb, line 167 def length @seq.nil? ? 0 : @seq.length end
Hard masks sequence residues where the corresponding quality scoreis below a given cutoff.
# File lib/BioDSL/seq.rb, line 468 def mask_seq_hard!(cutoff) fail SeqError, 'seq is nil' if @seq.nil? fail SeqError, 'qual is nil' if @qual.nil? fail SeqError, "cufoff value: #{cutoff} out of range: " \ "#{SCORE_MIN}..#{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX). include? cutoff na_seq = NArray.to_na(@seq.upcase, 'byte') na_qual = NArray.to_na(@qual, 'byte') mask = (na_qual - SCORE_BASE) < cutoff mask *= na_seq.ne('-'.ord) na_seq[mask] = 'N'.ord @seq = na_seq.to_s self end
Soft masks sequence residues where the corresponding quality score is below a given cutoff. Masked sequence will be lowercased and remaining will be uppercased.
# File lib/BioDSL/seq.rb, line 490 def mask_seq_soft!(cutoff) fail SeqError, 'seq is nil' if @seq.nil? fail SeqError, 'qual is nil' if @qual.nil? fail SeqError, "cufoff value: #{cutoff} out of range: " \ "#{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX). include? cutoff na_seq = NArray.to_na(@seq.upcase, 'byte') na_qual = NArray.to_na(@qual, 'byte') mask = (na_qual - SCORE_BASE) < cutoff mask *= na_seq.ne('-'.ord) na_seq[mask] ^= ' '.ord @seq = na_seq.to_s self end
Method that returns true is a given sequence type is protein.
# File lib/BioDSL/seq.rb, line 212 def protein? @type == :protein end
Method that determines if a quality score string can be absolutely identified as base 33.
# File lib/BioDSL/seq.rb, line 511 def qual_base33? @qual.match(/[!-:]/) ? true : false end
Method that determines if a quality score string may be base 64.
# File lib/BioDSL/seq.rb, line 516 def qual_base64? @qual.match(/[K-h]/) ? true : false end
Method to coerce quality scores to be within the 0-40 range.
# File lib/BioDSL/seq.rb, line 534 def qual_coerce!(encoding) fail SeqError, 'Missing qual' if @qual.nil? case encoding when :base_33 then qual_coerce_C(@qual, @qual.length, 33, 73) # !-J when :base_64 then qual_coerce_C(@qual, @qual.length, 64, 104) # @-h else fail SeqError, "unknown quality score encoding: #{encoding}" end self end
Method to convert quality scores.
# File lib/BioDSL/seq.rb, line 548 def qual_convert!(from, to) unless from == :base_33 || from == :base_64 fail SeqError, "unknown quality score encoding: #{from}" end unless to == :base_33 || to == :base_64 fail SeqError, "unknown quality score encoding: #{to}" end if from == :base_33 && to == :base_64 qual_convert_C(@qual, @qual.length, 31) # += 64 - 33 elsif from == :base_64 && to == :base_33 # Handle negative Solexa values from -5 to -1 (set these to 0). qual_coerce_C(@qual, @qual.length, 64, 104) qual_convert_C(@qual, @qual.length, -31) # -= 64 - 33 end self end
Method to determine if a quality score is valid accepting only 0-40 range.
# File lib/BioDSL/seq.rb, line 521 def qual_valid?(encoding) fail SeqError, 'Missing qual' if @qual.nil? case encoding when :base_33 then return true if @qual.match(/^[!-I]*$/) when :base_64 then return true if @qual.match(/^[@-h]*$/) else fail SeqError, "unknown quality score encoding: #{encoding}" end false end
Method to reverse the sequence.
# File lib/BioDSL/seq.rb, line 295 def reverse entry = Seq.new( seq_name: @seq_name, seq: @seq.reverse, type: @type, qual: (@qual ? @qual.reverse : @qual) ) entry end
Method to reverse the sequence.
# File lib/BioDSL/seq.rb, line 307 def reverse! @seq.reverse! @qual.reverse! if @qual self end
Method that returns true is a given sequence type is RNA
.
# File lib/BioDSL/seq.rb, line 207 def rna? @type == :rna end
Method to calculate and return the max quality score.
# File lib/BioDSL/seq.rb, line 589 def scores_max fail SeqError, 'Missing qual in entry' if @qual.nil? na_qual = NArray.to_na(@qual, 'byte') na_qual -= SCORE_BASE na_qual.max end
Method to calculate and return the mean quality score.
# File lib/BioDSL/seq.rb, line 569 def scores_mean fail SeqError, 'Missing qual in entry' if @qual.nil? na_qual = NArray.to_na(@qual, 'byte') na_qual -= SCORE_BASE na_qual.mean end
Method to run a sliding window of a specified size across a Phred type scores string and calculate for each window the mean score and return the minimum mean score.
# File lib/BioDSL/seq.rb, line 601 def scores_mean_local(window_size) fail SeqError, 'Missing qual in entry' if @qual.nil? scores_mean_local_C(@qual, @qual.length, SCORE_BASE, window_size) end
Method to calculate and return the min quality score.
# File lib/BioDSL/seq.rb, line 579 def scores_min fail SeqError, 'Missing qual in entry' if @qual.nil? na_qual = NArray.to_na(@qual, 'byte') na_qual -= SCORE_BASE na_qual.min end
Method to return a new Seq
object with shuffled sequence.
# File lib/BioDSL/seq.rb, line 385 def shuffle Seq.new( seq_name: @seq_name, seq: @seq.split('').shuffle!.join, type: @type, qual: @qual ) end
Method to shuffle a sequence randomly inline.
# File lib/BioDSL/seq.rb, line 395 def shuffle! @seq = @seq.split('').shuffle!.join self end
Method that returns the percentage of soft masked residues or lower cased residues in a sequence.
# File lib/BioDSL/seq.rb, line 462 def soft_mask ((@seq.scan(/[a-z]/).size.to_f / (length - indels).to_f) * 100).round(2) end
Method that given a Seq
entry returns a FASTA entry (a string).
# File lib/BioDSL/seq.rb, line 243 def to_fasta(wrap = nil) fail SeqError, 'Missing seq_name' if @seq_name.nil? || @seq_name == '' fail SeqError, 'Missing seq' if @seq.nil? || @seq.empty? seq_name = @seq_name.to_s seq = @seq.to_s unless wrap.nil? seq.gsub!(/(.{#{wrap}})/) do |match| match << $INPUT_RECORD_SEPARATOR end seq.chomp! end ">#{seq_name}#{$INPUT_RECORD_SEPARATOR}#{seq}#{$INPUT_RECORD_SEPARATOR}" end
Method that given a Seq
entry returns a FASTQ entry (a string).
# File lib/BioDSL/seq.rb, line 262 def to_fastq fail SeqError, 'Missing seq_name' if @seq_name.nil? fail SeqError, 'Missing seq' if @seq.nil? fail SeqError, 'Missing qual' if @qual.nil? seq_name = @seq_name.to_s seq = @seq.to_s qual = @qual.to_s "@#{seq_name}#{$RS}#{seq}#{$RS}+#{$RS}#{qual}#{$RS}" end
Method that generates a unique key for a DNA
sequence and return this key as a Fixnum.
# File lib/BioDSL/seq.rb, line 276 def to_key key = 0 @seq.upcase.each_char do |char| key <<= 2 case char when 'A' then key |= 0 when 'C' then key |= 1 when 'G' then key |= 2 when 'T' then key |= 3 else fail SeqError, "Bad residue: #{char}" end end key end
Method that guesses and returns the sequence type by inspecting the first 100 residues.
# File lib/BioDSL/seq.rb, line 149 def type_guess fail SeqError, 'Guess failed: sequence is nil' if @seq.nil? case @seq[0...100].downcase when /[flpqie]/ then return :protein when /[u]/ then return :rna else return :dna end end
Method that guesses and sets the sequence type by inspecting the first 100 residues.
# File lib/BioDSL/seq.rb, line 161 def type_guess! @type = type_guess self end