class MgNu::Parser::Genbank
Constants
- InvalidGenbankFile
- LOCUS_REGEX
Attributes
file[R]
genbank_instances[RW]
Public Class Methods
new(filename)
click to toggle source
Creates a new Genbank parser for the given file.
# File lib/mgnu/parser/genbank.rb, line 15
# Creates a parser bound to +filename+ and starts with an empty list of
# parsed records.
#
# filename - path of the file to read; it must exist and be readable.
#
# Raises RuntimeError when +filename+ is nil/false, or when the file is
# missing or unreadable. In both cases the problem is reported through
# +error+ before raising.
#
# Note: the handle stored in @file is left open; nothing in this method
# closes it.
def initialize(filename)
  @genbank_instances = []

  unless filename
    error("MgNu::Parser::Genbank#parse: need a filename")
    raise "no filename given!"
  end

  # File.exist? rather than File.exists?: the latter was deprecated and then
  # removed in Ruby 3.2, where calling it raises NoMethodError.
  unless File.exist?(filename) && File.readable?(filename)
    error("MgNu::Parser::Genbank#parse: problems with filename")
    raise "File doesn't exist or is not readable!"
  end

  @file = File.open(filename)
end
Public Instance Methods
parse(debug=false)
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 31
# Parses sections from +file+ until end of file is reached.
#
# debug - stored in @debug; the parsing methods emit +info+ messages when it
#         is truthy.
#
# Returns +genbank_instances+.
def parse(debug=false)
  @debug = debug
  parse_section until file.eof?
  genbank_instances
end
parse_features(buffer, genbank)
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 138
# Converts the lines of a FEATURES block into feature objects appended to
# +genbank.features+.
#
# buffer  - array of lines; a leading line starting with "FEATURES" is
#           removed from it in place.
# genbank - record whose +features+ collection receives the parsed features.
def parse_features(buffer, genbank)
  buffer.shift if buffer.first =~ /^FEATURES/

  split_at_features(buffer.join("\n")).each do |feature_text|
    genbank.features << MgNu::Genbank::Feature.parse(feature_text)
  end
end
parse_references(buffer, genbank)
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 147
# Splits the joined +buffer+ lines at header tags and appends one parsed
# reference per chunk to +genbank.references+.
#
# buffer  - array of lines covering the reference entries.
# genbank - record whose +references+ collection receives the results.
def parse_references(buffer, genbank)
  split_at_header_tag(buffer.join("\n")).each do |reference_text|
    genbank.references << MgNu::Genbank::Reference.parse(reference_text)
  end
end
parse_section()
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 40
# Parses a single record from +file+, starting at the next line, and appends
# the resulting MgNu::Genbank object to +genbank_instances+.
#
# The record is consumed in stages with +parse_until+: LOCUS, DEFINITION,
# ACCESSION, VERSION/DBLINK, KEYWORDS/SEGMENT, SOURCE/REFERENCE/COMMENT,
# FEATURES and finally the sequence.
# NOTE(review): this relies on parse_until (not shown here) returning the
# lines read so far and leaving the matching line for the next stage —
# confirm against its definition.
#
# A whitespace-only line where a LOCUS line is expected is skipped silently.
#
# Raises InvalidGenbankFile when the line is neither blank nor a LOCUS line.
def parse_section
  locus_line = file.readline
  md = locus_line.match(LOCUS_REGEX)

  unless md
    raise InvalidGenbankFile, "Missing or malformed LOCUS line." unless locus_line =~ /^\s*$/
    return
  end

  genbank = MgNu::Genbank.new
  info("found a LOCUS line") if @debug
  genbank.locus = MgNu::Genbank::Locus.new(*md.captures)
  info("LOCUS name #{genbank.locus.name}") if @debug

  # DEFINITION: collapse to a single line and drop the final character.
  buffer = parse_until(file, /^ACCESSION/)
  if buffer.join =~ /^DEFINITION\s+(.+)$/m
    genbank.definition = $1.gsub(/\n/, ' ').gsub(/\s{2,}/, ' ').strip.chop
    info genbank.definition if @debug
  end

  # ACCESSION: the first token is the primary accession, any remaining
  # tokens are secondary accession numbers.
  buffer = parse_until(file, /^VERSION/)
  if buffer.join =~ /^ACCESSION\s+(.+)$/
    temp = $1.strip.squeeze(' ').split("\s")
    genbank.accession, genbank.secondary_accession = temp.shift, temp
  end
  info "ACCESSION: #{genbank.accession}" if @debug

  # VERSION (with optional GI:nnn token) and DBLINK.
  buffer = parse_until(file, /^KEYWORDS/)
  buffer.each do |line|
    if line =~ /^VERSION\s+(.+)$/
      $1.strip.squeeze(' ').split.each do |version|
        if version =~ /GI:(\d+)/
          genbank.geninfo_identifier = $1.to_i
        else
          genbank.version = version
        end
      end
    elsif line =~ /^DBLINK\s+(.+)$/
      genbank.dblink = $1.strip.squeeze(' ')
    end
  end

  # KEYWORDS and optional SEGMENT; lines matching neither are treated as
  # keyword continuation lines.
  buffer = parse_until(file, /^SOURCE/)
  keyword_lines = []
  buffer.each do |line|
    if line =~ /^KEYWORDS\s+(.+)$/
      keyword_lines << $1.strip.squeeze(' ')
    elsif line =~ /^SEGMENT\s+(.+)$/
      genbank.segment = $1.strip.squeeze(' ')
    else
      keyword_lines << line
    end
  end
  k = keyword_lines.join
  unless k == "."
    k_array = k.split(/;\s*/) # keywords are separated by semicolons
    # Guard against an empty keyword list: k_array[-1] would be nil and
    # chop! would raise NoMethodError.
    unless k_array.empty?
      k_array[-1].chop! # gets rid of the period after the last keyword
      genbank.keywords = k_array
    end
  end

  # SOURCE, followed by optional REFERENCE entries and an optional COMMENT.
  buffer = parse_until(file, /^FEATURES/)
  ri = buffer.index { |l| l =~ /^REFERENCE/ }
  ci = buffer.index { |l| l =~ /^COMMENT/ }

  # lstrip, not lstrip!: the bang form returns nil when a line has no leading
  # whitespace left (e.g. a bare "COMMENT" line), which made the chained
  # squeeze raise NoMethodError.
  comment_text = lambda do |lines|
    lines.map { |line| line.gsub(/^COMMENT/, '').lstrip.squeeze(' ') }.join("\n")
  end

  if ri && ci
    genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri-1])
    parse_references(buffer[ri..ci-1], genbank)
    genbank.comment = comment_text.call(buffer[ci..-1])
  elsif ri
    genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri-1])
    parse_references(buffer[ri..-1], genbank)
  elsif ci
    genbank.source = MgNu::Genbank::Source.parse(buffer[0..ci-1])
    genbank.comment = comment_text.call(buffer[ci..-1])
  else
    # neither references nor comment line
    genbank.source = MgNu::Genbank::Source.parse(buffer)
  end

  if @debug
    info genbank.source.common_name
    info genbank.source.organism
    info genbank.source.lineage
  end

  parse_features(parse_until(file, /^ORIGIN/), genbank)
  info "features count: #{genbank.features.length}" if @debug

  parse_sequence(parse_until(file, /\/\//), genbank)
  info "sequence length: #{genbank.sequence.try(:length) || 0}" if @debug

  file.readline # consumes end of section line //
  genbank_instances << genbank
end
parse_sequence(buffer, genbank)
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 154
# Builds the record's sequence from the lines of the sequence block and then
# assigns each feature its own sub-sequence.
#
# buffer  - array of lines; the first entry (the ORIGIN line) is removed in
#           place before processing.
# genbank - record to update. Its +sequence+ becomes an MgNu::Sequence built
#           from the remaining lines with all digits and whitespace removed,
#           or nil when no lines remain.
def parse_sequence(buffer, genbank)
  buffer.shift # drop ORIGIN line

  if @debug
    info("inside parse_sequence")
    info("buffer is #{buffer.length}")
  end

  if buffer.empty?
    genbank.sequence = nil
  else
    residues = buffer.join.gsub(/[\d\s]+/, "")
    genbank.sequence = MgNu::Sequence.new(:value => residues)
    genbank.features.each do |feature|
      feature.sequence = feature.location.get_sequence(genbank.sequence.value)
    end
  end
end
split_at_features(str)
click to toggle source
# File lib/mgnu/parser/genbank.rb, line 178
# Splits +str+ into chunks, starting a new chunk at every line that begins
# with exactly five whitespace characters followed by a non-whitespace
# character. The newline that precedes a split point stays at the end of
# the previous chunk.
#
# The split uses zero-width lookarounds instead of inserting a "\001"
# sentinel and splitting on it, so input that already contains that control
# character is no longer cut apart (and the character is no longer lost).
#
# str - the text to split.
#
# Returns an Array of Strings (empty when +str+ is empty).
def split_at_features(str)
  str.split(/(?<=\n)(?=\s{5}\S)/)
end
split_at_header_tag(str)
click to toggle source
Splits the string at lines that begin with a capital letter, i.e. with no leading whitespace.
# File lib/mgnu/parser/genbank.rb, line 173
# Splits +str+ into chunks, starting a new chunk at every line whose first
# character is a capital letter A-Z. The newline that precedes a split point
# stays at the end of the previous chunk.
#
# The split uses zero-width lookarounds instead of inserting a "\001"
# sentinel and splitting on it, so input that already contains that control
# character is no longer cut apart (and the character is no longer lost).
#
# str - the text to split.
#
# Returns an Array of Strings (empty when +str+ is empty).
def split_at_header_tag(str)
  str.split(/(?<=\n)(?=[A-Z])/)
end