class GeneValidator::RawSequences
Gets the raw sequences for each hit in a BLAST output file
Public Class Methods
index_raw_seq_file(raw_seq_file = opt[:raw_sequences])
click to toggle source
Index the raw sequences file…
# File lib/genevalidator/get_raw_sequences.rb, line 46
#
# Reduces every FASTA description line in +raw_seq_file+ to its identifier,
# rewrites the file in place, and builds an offset index of its records.
#
# The index maps each identifier to a two-element array [start, end] of
# positions within the rewritten content. It is stored in
# config[:raw_seq_file_load] and also dumped as YAML to "<basename>.idx"
# inside dirs[:tmp_dir]; that path is recorded in
# config[:raw_seq_file_index].
#
# @param raw_seq_file [String] path of the FASTA file (rewritten in place)
# @return [nil]
def index_raw_seq_file(raw_seq_file = opt[:raw_sequences])
  # Leave only the identifiers in the FASTA description lines (everything
  # from the first space to the end of a line is dropped). File.binread
  # closes its handle before returning, so no read handle is left open when
  # the same path is reopened for writing on the next line.
  content = File.binread(raw_seq_file).gsub(/ .*/, '')
  File.open(raw_seq_file, 'w+') { |f| f.write(content) }

  # Identifiers, and the position at which each '>' record begins.
  keys = content.scan(/>(.*)\n/).flatten
  starts = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }

  # A record ends where the next one starts; the final record ends at
  # content.length - 1.
  # NOTE(review): the final record therefore uses a different end convention
  # from the others - confirm against the code that reads this index.
  last_position = starts.length - 1
  index_hash = {}
  keys.each_with_index do |key, i|
    finish = i == last_position ? content.length - 1 : starts[i + 1]
    index_hash[key] = [starts[i], finish]
  end

  # Keep the index in memory and persist it next to the other temp files.
  fname = File.basename(raw_seq_file)
  config[:raw_seq_file_index] = File.join(dirs[:tmp_dir], "#{fname}.idx")
  config[:raw_seq_file_load] = index_hash
  File.open(config[:raw_seq_file_index], 'w') { |f| YAML.dump(index_hash, f) }

  nil
end
init()
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 18
#
# Announces the extraction step on stderr and derives the working paths from
# the BLAST output file name.
#
# Sets @blast_file (the tabular file wins when both the XML and the tabular
# options are present), opt[:raw_sequences] ("<basename>.raw_seq") and
# @index_file ("<basename>.index"), the latter two inside dirs[:tmp_dir].
def init
  warn '==> Extracting fasta sequences for each BLAST HSP from the' \
       ' BLAST database'

  # Checked in this order so a tabular file overrides an XML file.
  %i[blast_xml_file blast_tabular_file].each do |source|
    @blast_file = opt[source] if opt[source]
  end

  blast_basename = File.basename(@blast_file)
  tmp_path_for = ->(suffix) { File.join(dirs[:tmp_dir], "#{blast_basename}.#{suffix}") }

  opt[:raw_sequences] = tmp_path_for.call('raw_seq')
  @index_file = tmp_path_for.call('index')
end
run()
click to toggle source
Obtains raw_sequences from BLAST output file…
# File lib/genevalidator/get_raw_sequences.rb, line 32
#
# Entry point: prepares the working paths, produces the raw sequences file
# for the hits in the BLAST output, then indexes that file.
#
# When opt[:db] matches /remote/ the raw sequences file is written directly
# via write_a_raw_seq_file; otherwise an index file of hit identifiers is
# written first and handed to FetchRawSequences.extract_from_local_db.
def run
  init

  remote_db = opt[:db].match?(/remote/)
  write_a_raw_seq_file(opt[:raw_sequences], 'remote') if remote_db
  unless remote_db
    write_an_index_file(@index_file, 'local')
    FetchRawSequences.extract_from_local_db(true, nil, @index_file)
  end

  index_raw_seq_file(opt[:raw_sequences])
end
Private Class Methods
iterate_tabular(file, db_type)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 113
#
# Walks the rows of the tabular BLAST output named by
# opt[:blast_tabular_file] and writes one entry per row to +file+.
#
# Column names come from opt[:blast_tabular_options], split on spaces and
# commas; lines starting with '#' are skipped. For a 'remote' db_type, or a
# row without an 'sseqid', the value returned by
# FetchRawSequences.extract_from_remote_db for the row's 'sacc' is written;
# otherwise the 'sseqid' itself is written.
#
# @param file [#puts] destination the entries are written to
# @param db_type [String] 'remote' forces the remote lookup for every row
# @raise [BLASTDBError] if an 'sseqid' contains "|BL_ORD_ID|" (any case)
def iterate_tabular(file, db_type)
  column_names = opt[:blast_tabular_options].split(/[ ,]/)
  table = CSV.parse(File.read(opt[:blast_tabular_file]),
                    col_sep: "\t", skip_lines: /^#/, headers: column_names)

  table.each do |record|
    subject_id = record['sseqid']
    raise BLASTDBError if subject_id =~ /\|BL_ORD_ID\|/i

    entry = if db_type == 'remote' || subject_id.nil?
              FetchRawSequences.extract_from_remote_db(record['sacc'])
            else
              subject_id
            end
    file.puts entry
  end
end
iterate_xml(file, db_type)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 99
#
# Walks every hit of every iteration in the BLAST XML output named by
# opt[:blast_xml_file] and writes one entry per hit to +file+.
#
# For a 'remote' db_type, or a hit without a hit_id, the value returned by
# FetchRawSequences.extract_from_remote_db for the hit's accession is
# written; otherwise the accession itself is written.
#
# @param file [#puts] destination the entries are written to
# @param db_type [String] 'remote' forces the remote lookup for every hit
# @raise [BLASTDBError] if a hit_id contains "|BL_ORD_ID|"
def iterate_xml(file, db_type)
  iterations = Bio::BlastXMLParser::XmlIterator.new(opt[:blast_xml_file]).to_enum

  iterations.each do |iteration|
    iteration.each do |hit|
      hit_identifier = hit.hit_id
      raise BLASTDBError if hit_identifier =~ /\|BL_ORD_ID\|/

      needs_remote_lookup = db_type == 'remote' || hit_identifier.nil?
      entry = if needs_remote_lookup
                FetchRawSequences.extract_from_remote_db(hit.accession)
              else
                hit.accession
              end
      file.puts entry
    end
  end
end
write_an_index_file(output_file, db_type)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 76
#
# Creates (or truncates) +output_file+ and fills it from whichever BLAST
# output is configured: iterate_xml runs when opt[:blast_xml_file] is set
# and iterate_tabular runs when opt[:blast_tabular_file] is set.
#
# Any failure is reported on stderr and terminates the process with exit
# status 1: a BLASTDBError gets a dedicated hint about '-parse_seqids', every
# other StandardError gets a generic message followed by the underlying
# error, so the real cause is not lost.
#
# @param output_file [String] path of the file to write
# @param db_type [String] passed through to the iterate_* helpers
def write_an_index_file(output_file, db_type)
  # The block form closes the file on every exit path, including the
  # SystemExit raised by the handlers below.
  File.open(output_file, 'w+') do |file|
    iterate_xml(file, db_type) if opt[:blast_xml_file]
    iterate_tabular(file, db_type) if opt[:blast_tabular_file]
  end
rescue BLASTDBError
  warn '*** BLAST Database Error: Genevalidator requires BLAST' \
       " databases to be created with the '-parse_seqids' argument."
  warn ' See https://github.com/wurmlab/genevalidator' \
       '#setting-up-a-blast-database for more information'
  exit 1
rescue StandardError => e
  warn '*** Error: There was an error in analysing the BLAST'
  warn ' output file. Please ensure that BLAST output file'
  warn ' is in the correct format and then try again. If you'
  warn ' are using a remote database, please ensure that you'
  warn ' have internet access.'
  # Surface the underlying failure instead of discarding it.
  warn " Cause: #{e.class}: #{e.message}"
  exit 1
end
Also aliased as: write_a_raw_seq_file