class Bio::MAF::Tiler
Tiles a given genomic interval. Inspired by: lib/bx/align/tools/tile.py in bx-python
Attributes
The character used to fill regions where no sequence data is available for a particular species. Defaults to ‘*`. @return [String]
The species of interest to extract from the MAF
file. Will be set as a {Parser#sequence_filter} for parsing. Defaults to the keys of {#species_map}.
@return [Array<String>]
A hash mapping species to their desired output names.
@return [Hash]
Public Class Methods
# File lib/bio/maf/tiler.rb, line 35 def initialize @species_map = {} self.fill_char = '*' self.remove_absent_species = true end
Public Instance Methods
Tile sequences to build a new {Bio::BioAlignment::Alignment Alignment} object. This will have one {Bio::BioAlignment::Sequence Sequence} per entry in {#species} or {#species_map}, in the same order. Each sequence will have an {Bio::BioAlignment::Sequence#id id} given by {#species_map} or, if none is present, the identifier from {#species}.
@return [Bio::BioAlignment::Alignment] @api public
# File lib/bio/maf/tiler.rb, line 181 def build_bio_alignment out = output_text.to_a Bio::BioAlignment::Alignment.new(out.collect { |e| e[1] }, out.collect { |e| e[0] }) end
Set the character to be used for filling regions with no sequence data from the MAF
file or a reference sequence. @param c [String] a one-character String to fill with
# File lib/bio/maf/tiler.rb, line 44 def fill_char=(c) unless c.is_a?(String) && c.length == 1 raise ArgumentError, "not a single character: #{c.inspect}" end @fill_char = c end
# File lib/bio/maf/tiler.rb, line 163 def non_fill_re fill_esc = Regexp.escape(fill_char) Regexp.compile("[^#{fill_esc}]") end
# File lib/bio/maf/tiler.rb, line 168 def output_text species_for_output.zip(tile()).reject { |s, t| t.nil? } end
# File lib/bio/maf/tiler.rb, line 73 def ref_data(range) if reference if reference.respond_to? :read_interval reference.read_interval(range.begin, range.end) elsif reference.is_a? String reference.slice(range) else raise "Unhandled reference data source: #{reference}" end else nil end end
Set the reference sequence. This can be a {Pathname} or a {String} giving the path to an optionally-gzipped FASTA file, an open {IO} stream to a FASTA file, a String containing FASTA data, or a {FASTARangeReader} instance.
@param source [FASTARangeReader, String, Pathname, readline]
# File lib/bio/maf/tiler.rb, line 57 def reference=(source) ref = case when source.is_a?(FASTARangeReader) source when source.respond_to?(:seek) # open file FASTARangeReader.new(source) when source.respond_to?(:start_with?) && source.start_with?('>') # FASTA string FASTARangeReader.new(StringIO.new(source)) else FASTARangeReader.new(source.to_s) end @reference = ref end
# File lib/bio/maf/tiler.rb, line 199 def runs(mask) cur = nil cur_start = nil mask.each_with_index do |obj, i| if ! cur.equal?(obj) yield(cur_start...i, cur) if cur cur = obj cur_start = i end end yield(cur_start...mask.size, cur) end
# File lib/bio/maf/tiler.rb, line 91 def species_for_output species_to_use.collect { |s| species_map[s] || s } end
# File lib/bio/maf/tiler.rb, line 87 def species_to_use species || species_map.keys end
Return an array of tiled sequence data, in the order given by {#species_to_use}. @return [Array<String>]
# File lib/bio/maf/tiler.rb, line 98 def tile parser.sequence_filter[:only_species] = species_to_use parser.opts[:remove_gaps] = true LOG.debug { "finding blocks covering interval #{interval}." } blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] } mask = Array.new(interval.length, :ref) i_start = interval.zero_start i_end = interval.zero_end if reference LOG.debug { "using a #{reference.class} reference." } ref_region = ref_data(i_start...i_end) end LOG.debug "tiling #{blocks.count} blocks." blocks.each do |block| ref = block.ref_seq LOG.debug { "tiling with block #{ref.start}-#{ref.end}" } slice_start = [i_start, ref.start].max slice_end = [i_end, ref.end].min mask.fill(block, (slice_start - i_start)...(slice_end - i_start)) end text = [] species_to_use.each { |s| text << '' } nonref_text = text[1...text.size] runs(mask) do |range, block| g_range = (range.begin + i_start)...(range.end + i_start) if block == :ref # not covered by an alignment block # use the reference sequence if given, otherwise 'N' range_size = range.end - range.begin text[0] << if ref_region ref_region.slice(range) else 'N' * range_size end fill_text = fill_char * range_size nonref_text.each { |t| t << fill_text } else # covered by an alignment block t_range = block.ref_seq.text_range(g_range) species_to_use.each_with_index do |species, i| sp_text = text[i] seq = block.sequences.find { |s| s.source == species || s.species == species } if seq # got alignment text sp_text << seq.text.slice(t_range) else # no alignment for this one here, use the fill char sp_text << fill_char * (t_range.end - t_range.begin) end end end end if remove_absent_species non_fill = non_fill_re LOG.debug { "searching for non-fill characters with #{non_fill}" } text.each_with_index do |seq, i| unless non_fill.match(seq) text[i] = nil end end end text end
Write a FASTA representation of the tiled sequences to the given output stream.
@param [#puts] f the output stream to write the FASTA data to. @api public
# File lib/bio/maf/tiler.rb, line 192 def write_fasta(f) output_text.each do |sp_out, text| f.puts ">#{sp_out}" f.puts text end end