class Bio::Ucsc::File::Twobit
Constants
- BINCODE
- TwoBitHeader
- TwoBitRecord
Attributes
filename[R]
header[R]
offsets[R]
Public Class Methods
load(filename)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 35 def self.load(filename) two_bit = nil ::File.open(filename, 'rb') {|f|two_bit = f.read} tbq = Bio::Ucsc::File::ByteQueue.new(two_bit) twobit_header = TwoBitHeader.new twobit_header.signature = tbq.next(4).unpack('L').first twobit_header.version = tbq.next(4).unpack('L').first twobit_header.sequence_count = tbq.next(4).unpack('L').first twobit_header.reserved = tbq.next(4).unpack('L').first offsets = Hash.new twobit_header.sequence_count.times do name_length = tbq.next(1).unpack('C').first offsets[tbq.next(name_length).unpack('a*').first] = tbq.next(4).unpack('L').first end new(tbq, filename, twobit_header, offsets) end
new(twobit_queue, filename, header, offsets)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 21 def initialize(twobit_queue, filename, header, offsets) @tbq = twobit_queue @filename = filename @header = header @offsets = offsets @records = Hash.new end
open(filename) { |load| ... }
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 55 def self.open(filename) if block_given? yield self.load(filename) else return self.load(filename) end end
Public Instance Methods
find_by_interval(interval)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 114 def find_by_interval(interval) # does not support an interval involving multiple N-blocks interval = Bio::Ucsc::Gi.wrap(interval) seq = self.find_by_interval_raw(interval) @records[interval.chrom].n_block_intervals.map do |nb| if interval.overlapped?(nb) case interval.compare(nb) when :equal,:contains seq = 'N' * interval.overlap(nb) when :contained_by left_len = nb.chr_start - interval.chr_start right_len = interval.chr_end - nb.chr_end case when left_len > 0 && right_len > 0 seq = seq[0 .. (left_len-1)] + 'N' * (seq.length - left_len - right_len) + seq[-right_len .. -1] when left_len == 0 && right_len > 0 seq = 'N' * (seq.length - left_len - right_len) + seq[-right_len .. -1] when left_len > 0 && right_len == 0 seq = seq[0 .. (left_len-1)] + 'N' * (seq.length - left_len - right_len) else raise "This should not happen! (Bio::Ucsc::File::Twobit#find_by_interval)" end when :left_overlapped left_len = nb.chr_end - interval.chr_start + 1 seq[0, left_len] = 'N' * left_len when :right_overlapped right_len = interval.chr_end - nb.chr_start + 1 seq[-right_len, right_len] = 'N' * right_len when :right_adjacent, :right_off # expecting that N-blocks are sorted # return Bio::Sequence::NA.new(seq) seq end end end #Bio::Sequence::NA.new(seq) seq end
find_by_interval_raw(interval)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 161 def find_by_interval_raw(interval) interval = Bio::Ucsc::Gi.wrap(interval) byte_count, byte_mod = interval.zero_start.divmod 4 chrom_top = self.records(interval.chrom).packed_dna_offset div_start, mod_start = interval.zero_start.divmod 4 div_end, mod_end = interval.zero_end.divmod 4 div_len, mod_len = interval.length.divmod 4 byte_length = div_end - div_start + 1 @tbq.index = chrom_top + div_start bytes = @tbq.next(byte_length).unpack('C*') seq = bytes_to_nucleotides(bytes) seq[mod_start..(-1-(4-mod_end))] end
inspect()
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 31 def inspect "#<#{self.class}:0x#{self.__id__.to_s(16)} @filename=#{@filename} ...>" end
records(chrom)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 63 def records(chrom) return @records[chrom] if @records[chrom] @tbq.index = @offsets[chrom] @records[chrom] = TwoBitRecord.new @records[chrom].dna_size = @tbq.next(4).unpack('L').first n_block_count = @tbq.next(4).unpack('L').first n_block_starts = Array.new n_block_count.times do n_block_starts << @tbq.next(4).unpack('L').first end n_block_sizes = Array.new n_block_count.times do n_block_sizes << @tbq.next(4).unpack('L').first end @records[chrom].n_block_intervals = Array.new n_block_count.times do |idx| @records[chrom].n_block_intervals << Bio::GenomicInterval.zero_based(chrom, n_block_starts[idx], n_block_starts[idx]+n_block_sizes[idx]) end mask_block_count = @tbq.next(4).unpack('L').first mask_block_starts = Array.new mask_block_count.times do mask_block_starts << @tbq.next(4).unpack('L').first end mask_block_sizes = Array.new mask_block_count.times do mask_block_sizes << @tbq.next(4).unpack('L').first end @records[chrom].mask_block_intervals = Array.new mask_block_count.times do |idx| @records[chrom].mask_block_intervals << Bio::GenomicInterval.zero_based(chrom, mask_block_starts[idx], mask_block_starts[idx]+mask_block_sizes[idx]) end @records[chrom].reserved = @tbq.next(4).unpack('L').first @records[chrom].packed_dna_offset = @tbq.index @records[chrom] end
subseq(interval)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 110 def subseq(interval) find_by_interval(interval) end
Private Instance Methods
byte_to_nucleotides(byte)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 186 def byte_to_nucleotides(byte) BINCODE[byte >> 6] + BINCODE[(byte >> 4) & 0b11] + BINCODE[(byte >> 2) & 0b11] + BINCODE[byte & 0b11] end
bytes_to_nucleotides(bytes)
click to toggle source
# File lib/bio-ucsc/file/twobit.rb, line 178 def bytes_to_nucleotides(bytes) results = "" bytes.each do |byte| results << byte_to_nucleotides(byte) end results end