class GeneValidator::FetchRawSequences
Public Class Methods
batch_raw_seq_cmd(index_file)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 186
# Assembles the blastdbcmd invocation used to fetch, in a single call, every
# entry listed in a batch file.
# Params:
# +index_file+:: path passed to blastdbcmd's -entry_batch option
# Output:
# String holding the complete shell command. The database is read from
# opt[:db] and blastdbcmd is told to write its output to opt[:raw_sequences].
def batch_raw_seq_cmd(index_file)
  [
    'blastdbcmd',
    "-entry_batch '#{index_file}'",
    "-db '#{opt[:db]}'",
    "-outfmt '%f'",
    "-out '#{opt[:raw_sequences]}'"
  ].join(' ')
end
extract_from_index(identifier)
click to toggle source
Gets raw sequence by fasta identifier from a fasta index file Params: identifier
: String Output: String with the nucleotide sequence corresponding to the identifier
# File lib/genevalidator/get_raw_sequences.rb, line 158
# Looks up one sequence in the raw sequences file using the byte offsets
# stored for +identifier+ in config[:raw_seq_file_load].
# Params:
# +identifier+:: key into the offset index; its entry is read as
#                [start offset, end offset]
# Output:
# String with the sequence letters (newlines removed), or the String 'Error'
# when anything goes wrong (unknown identifier, unreadable file, no FASTA
# record in the slice).
def extract_from_index(identifier)
  bounds = config[:raw_seq_file_load][identifier]
  start = bounds[0]
  # Read only the slice of the file that holds this record.
  record = IO.binread(opt[:raw_sequences], bounds[1] - start, start)
  # Group 1 is the header line, group 2 the (possibly multi-line) sequence.
  record.match(/>([^\n]*)\n([A-Za-z\n]*)/)[2].delete("\n")
rescue StandardError
  # The caller treats 'Error' as the cue to try another fetching method.
  'Error'
end
extract_from_local_db(batch, accno = nil, idx_file = nil)
click to toggle source
Gets raw sequence by accession number from a given database Params: accno
: accession number as String db
: database as String Output: String with the nucleotide sequence corresponding to the accession
# File lib/genevalidator/get_raw_sequences.rb, line 174
# Runs blastdbcmd against the local database and returns everything the
# command printed (stdout and stderr combined).
# Params:
# +batch+:: when truthy, run the batch command built from +idx_file+;
#           otherwise fetch the single entry +accno+
# +accno+:: accession number as String (single mode only)
# +idx_file+:: path of the batch entry file (batch mode only)
# Output:
# String with the captured command output. In single mode this is the
# sequence itself (or blastdbcmd's error text). In batch mode, output that
# mentions "Error" is additionally handed to failed_raw_sequences.
def extract_from_local_db(batch, accno = nil, idx_file = nil)
  cmd = batch ? batch_raw_seq_cmd(idx_file) : single_raw_seq_cmd(accno)
  efile = Tempfile.new('blast_out')
  begin
    # Use POSIX redirection: backticks run /bin/sh, and `&>` is a bash
    # extension that a POSIX shell reads as "run in background, then
    # truncate the file", which leaves the capture file empty.
    `#{cmd} >#{efile.path} 2>&1`
    raw_seqs = efile.read
    failed_raw_sequences(raw_seqs) if batch && raw_seqs =~ /Error/
    raw_seqs # when obtaining a single raw_seq, this contains the sequence
  ensure
    # Only reached once the Tempfile exists, so cleanup cannot hit nil.
    efile.close
    efile.unlink
  end
end
extract_from_remote_db(accession, db_seq_type = 'protein')
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 208
# Fetches one FASTA record from NCBI E-utilities: an esearch request
# registers the accession on the history server, then an efetch request
# retrieves the record by the returned QueryKey / WebEnv pair.
# Params:
# +accession+:: accession (search term) as String
# +db_seq_type+:: name of the Entrez database to query (default 'protein')
# Output:
# String with the efetch response body minus its final character.
# NOTE(review): when the esearch response has no QueryKey/WebEnv element the
# +match+ calls return nil and a NoMethodError propagates - confirm whether
# callers should get an 'Error' string instead.
def extract_from_remote_db(accession, db_seq_type = 'protein')
  # Escape both values so identifiers containing characters such as '|' or
  # spaces still yield a parseable URI; plain accessions are left unchanged.
  db = URI.encode_www_form_component(db_seq_type)
  term = URI.encode_www_form_component(accession)
  uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' \
        "db=#{db}&retmax=1&usehistory=y&term=#{term}/"
  result = Net::HTTP.get(URI.parse(uri))
  query = result.match(%r{<\bQueryKey\b>([\w\W\d]+)</\bQueryKey\b>})[1]
  web_env = result.match(%r{<\bWebEnv\b>([\w\W\d]+)</\bWebEnv\b>})[1]
  uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
        'rettype=fasta&retmode=text&retstart=0&retmax=1&' \
        "db=#{db}&query_key=#{query}&WebEnv=#{web_env}"
  result = Net::HTTP.get(URI.parse(uri))
  # Drop the last character of the response (its trailing newline).
  result[0..result.length - 2]
end
failed_raw_sequences(blast_output)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 195
# Scans blastdbcmd output for "OID not found" errors and, for each accession
# reported missing, fetches the sequence remotely and appends it to the
# opt[:raw_sequences] file.
# Params:
# +blast_output+:: String with the captured blastdbcmd output
# Output:
# +blast_output+ itself (the return value of each_line).
def failed_raw_sequences(blast_output)
  blast_output.each_line do |line|
    # \S+ (rather than \w+) so identifiers containing '.', '|' etc. match.
    missing = line.match(/Error: (\S+): OID not found/)
    # Lines that are not an "OID not found" error carry no accession; skip
    # them instead of failing on a nil match.
    next unless missing

    acc = missing[1]
    warn "\nCould not find sequence '#{acc}' within the" \
         ' BLAST database.'
    warn "Attempting to obtain sequence '#{acc}' from" \
         ' remote BLAST databases.'
    File.open(opt[:raw_sequences], 'a+') do |f|
      f.puts extract_from_remote_db(acc)
    end
  end
end
run(identifier, accession)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 137
# Obtains the raw sequence for a hit, trying the cheapest source first.
# Params:
# +identifier+:: key used to look the sequence up in the raw sequences index
# +accession+:: accession used when querying a BLAST database
# Output:
# The sequence String, or nil when no source produced one.
def run(identifier, accession)
  # 1. The previously created raw sequences file, when one is configured.
  raw_seq = opt[:raw_sequences] ? extract_from_index(identifier) : nil

  # 2. Otherwise ask the database: remotely when opt[:db] names a remote
  #    database, locally (single-entry mode) in every other case.
  if raw_seq.nil? || raw_seq =~ /Error/i
    raw_seq = if opt[:db] =~ /remote/
                extract_from_remote_db(accession)
              else
                extract_from_local_db(false, accession)
              end
  end

  # A result that still mentions an error counts as "not found".
  return nil if raw_seq =~ /Error/i

  raw_seq
end
single_raw_seq_cmd(accession)
click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 191
# Assembles the blastdbcmd invocation that fetches a single entry.
# Params:
# +accession+:: value passed to blastdbcmd's -entry option
# Output:
# String holding the complete shell command; the database is read from
# opt[:db].
def single_raw_seq_cmd(accession)
  # '%%s' yields the literal '%s' that is handed to -outfmt.
  format("blastdbcmd -entry '%s' -db '%s' -outfmt '%%s'", accession, opt[:db])
end