class Bio::Alignment::CIGAR
Parse a CIGAR
string An example from Exonerate output. Ideally will also allow SAM file input to be used.
1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 63 | | || | ||||||||||||||||||| ||||||||||||||||||||||||||||||||||| 1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 62
ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG… cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17 I are not counted in reference Regexp (from SAM specification) - but in exonerate the number comes first: ([0-9][MIDNSHP])
|*
Attributes
Public Class Methods
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 27 def initialize(string,ref=nil,source="") # strip out whitespace string.gsub!(/\s+/,"") # Auto-detect source if not supplied if !(Bio::Alignment::CIGAR.regexps.keys.include? source) Bio::Alignment::CIGAR.regexps.each do |k,v| # Look for match at start of string if m = string.match(v) source = k if m.offset(0)[0] == 0 end end if source == "" raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect." end end # Make an array of pairs of of cigar elements: @pairs = string.scan(Bio::Alignment::CIGAR.regexps[source]) if source == "exonerate" @pairs.map!{|pair| [pair[0].to_s, pair[1].to_i]} else # Provision to have number and identifier the other way round @pairs.map!{|pair| [pair[1].to_s, pair[0].to_i]} end # Include reference sequence if provided @reference = ref # Check length of reference = sum(M+D)? #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length end
Public Instance Methods
TODO combine adjacent operations of the same type into a single pair
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 198 def combine_adjacent end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 96 def deleted_length count_type("D") end
Output hgnc variant format given reference position. Only deletions can be accurately annotated from the cigar string; insertions or wild type seqeunces return nil NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 148 def hgnc(reference_pos=0,insertions=[],type="g",*subs) if insertions if insertions.is_a? String insertions = [insertions] end end first_match = true total = 0 hgnc_format = [] @pairs.each do |pair| case pair[0] when "M" #break if first_match == false reference_pos += pair[1] total += pair[1] first_match = false when "D" deleted_bases = @reference[total,pair[1]].upcase if (pair[1] == 1) string = (reference_pos + 1).to_s else string = (reference_pos + 1).to_s + "_" + (reference_pos + pair[1]).to_s end string = string + "del" + deleted_bases hgnc_format << string total += pair[1] when "I" inserted_bases = (insertions.length == 0) ? "N" : insertions.shift hgnc_format << (reference_pos).to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase end end # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings subs = subs.first # >1 arguments discarded if subs if (subs.length > 0 && (subs.is_a? Array)) hgnc_format = hgnc_format + subs end end if hgnc_format.length == 0 nil elsif hgnc_format.length == 1 type.to_s + "." + hgnc_format[0] else type.to_s + "." + "[" + hgnc_format.join(";") + "]" end end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 100 def inserted_length count_type("I") end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 104 def masked_length count_type(/[SH]/) end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 92 def matched_length count_type("M") end
Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation, followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 204 def positions(type) total = 0 qtotal = 0 hash = Hash.new{|h,k| h[k] = []} @pairs.each do |pair| if pair[0].match(type) hash[$&] << [total, pair[1], qtotal] end total += pair[1] if pair[0].match Bio::Alignment::CIGAR.reference_operations qtotal += pair[1] if pair[0].match Bio::Alignment::CIGAR.query_operations end hash end
Output a representation of the query: replace deleted portions with “-”, flag insertions with “*” or sim. Optionally provide the sequence (or symbols to use) of insertions, in order of appearence. Should be able to accept an array TODO: Add support for substitution highlighting (e.g lowercasing)
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 119 def query(insertions=nil) if (insertions && (insertions.is_a? String)) insertions = [insertions] end sequence = [] total = 0 @pairs.each do |pair| if pair[0].match("M") sequence << @reference[total..total+pair[1]-1].upcase total += pair[1] end if pair[0].match("I") if (insertions) insertion = insertions.shift.to_s else insertion = '['+pair[1].to_s+']' end sequence << insertion end if pair[0].match("D") pair[1].times{ sequence << "-" } total += pair[1] end end sequence.join("") end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 112 def query_length count_type(Bio::Alignment::CIGAR.query_operations) end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 108 def reference_length count_type(Bio::Alignment::CIGAR.reference_operations) end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 87 def remove_empty! self.pairs.keep_if{|pair| pair[1] != 0 } self end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 80 def remove_small!(threshold=1) # Deletions convert to matches, insertions just remove deletions_to_matches(threshold) remove_small_nonmatches(threshold) self end
Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 60 def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations) new_array = iterate_pairs(@pairs,offset,length,regexp) # Return a CIGAR instance with just the new alignment new_string = new_array.join(" ") # -1 from offset as ruby string starts at zero new_cigar = Bio::Alignment::CIGAR.new(new_string,@reference[offset-1,length]) new_cigar.remove_empty! end
Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 71 def subcigar(offset,length) # No regexp - includes everything self.subalignment(offset,length,//) end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 76 def unmasked(offset,length) self.subalignment(offset,length,/[MDI]/) end
Private Instance Methods
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 229 def count_type(type) sum = 0 @pairs.each do |pair| if pair[0].match(type) sum += pair[1] end end sum end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 221 def deletions_to_matches(threshold) self.pairs = @pairs.each{|pair| pair[0].sub!("D","M") if pair[1] <= threshold} end
# File lib/bio-sam-mutation/bio/alignment/cigar.rb, line 225 def remove_small_nonmatches(threshold) self.pairs = @pairs.keep_if{|pair| pair[0] == "M" || pair[1] > threshold} end