class GeneValidator::FetchRawSequences

Public Class Methods

batch_raw_seq_cmd(index_file) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 186
# Builds the blastdbcmd command line for a batch extraction.
# Params:
# +index_file+: path passed to blastdbcmd's -entry_batch option
# Output:
# String with the shell command; the database comes from opt[:db] and the
# command's -out target is opt[:raw_sequences].
def batch_raw_seq_cmd(index_file)
  source = "-entry_batch '#{index_file}' -db '#{opt[:db]}'"
  target = "-outfmt '%f' -out '#{opt[:raw_sequences]}'"
  ['blastdbcmd', source, target].join(' ')
end
extract_from_index(identifier) click to toggle source

Gets the raw sequence by FASTA identifier from a FASTA index file. Params: identifier: String. Output: String with the nucleotide sequence corresponding to the identifier.

# File lib/genevalidator/get_raw_sequences.rb, line 158
# Reads one record out of the opt[:raw_sequences] file using the byte
# offsets stored for +identifier+ in config[:raw_seq_file_load].
# Params:
# +identifier+: key into config[:raw_seq_file_load]; the stored value is
#               indexed as [start_offset, end_offset]
# Output:
# String with the record's sequence lines joined together (newlines
# removed), or the String 'Error' when anything goes wrong.
def extract_from_index(identifier)
  offsets  = config[:raw_seq_file_load][identifier]
  start    = offsets[0]
  length   = offsets[1] - start
  record   = IO.binread(opt[:raw_sequences], length, start)
  # Second capture group = the sequence body following the ">header" line.
  residues = record[/>([^\n]*)\n([A-Za-z\n]*)/, 2]
  residues.delete("\n")
rescue StandardError
  # Sentinel value: lets the caller fall back to another fetching method.
  'Error'
end
extract_from_local_db(batch, accno = nil, idx_file = nil) click to toggle source

Gets the raw sequence by accession number from a given database. Params: accno: accession number as String; db: database as String. Output: String with the nucleotide sequence corresponding to the accession.

# File lib/genevalidator/get_raw_sequences.rb, line 174
def extract_from_local_db(batch, accno = nil, idx_file = nil)
  cmd = batch ? batch_raw_seq_cmd(idx_file) : single_raw_seq_cmd(accno)
  efile = Tempfile.new('blast_out')
  `#{cmd} &>#{efile.path}`
  raw_seqs = efile.read
  failed_raw_sequences(raw_seqs) if batch && raw_seqs =~ /Error/
  raw_seqs # when obtaining a single raw_seq, this contains the sequence
ensure
  efile.close
  efile.unlink
end
extract_from_remote_db(accession, db_seq_type = 'protein') click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 208
# Fetches a FASTA record from NCBI E-utilities in two HTTP requests: an
# esearch call for +accession+ (with history enabled), then an efetch call
# using the QueryKey and WebEnv values parsed out of the esearch response.
# Params:
# +accession+: accession number as String (interpolated into the URL as is)
# +db_seq_type+: value of the E-utilities +db+ parameter
# Output:
# String with the efetch response minus its final character
# (presumably a trailing newline - TODO confirm).
# Raises NoMethodError when the esearch response lacks either tag.
def extract_from_remote_db(accession, db_seq_type = 'protein')
  eutils     = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'
  search_url = "#{eutils}/esearch.fcgi?" \
               "db=#{db_seq_type}&retmax=1&usehistory=y&term=#{accession}/"
  search_xml = Net::HTTP.get(URI.parse(search_url))
  query_key  = %r{<\bQueryKey\b>([\w\W\d]+)</\bQueryKey\b>}.match(search_xml)[1]
  web_env    = %r{<\bWebEnv\b>([\w\W\d]+)</\bWebEnv\b>}.match(search_xml)[1]
  fetch_url  = "#{eutils}/efetch.fcgi?" \
               'rettype=fasta&retmode=text&retstart=0&retmax=1&' \
               "db=#{db_seq_type}&query_key=#{query_key}&WebEnv=#{web_env}"
  fasta      = Net::HTTP.get(URI.parse(fetch_url))
  fasta[0..fasta.length - 2]
end
failed_raw_sequences(blast_output) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 195
# Recovers the sequences blastdbcmd reported as missing: for every
# "Error: <accession>: OID not found" line in +blast_output+, a warning is
# printed and the record returned by extract_from_remote_db is appended to
# the opt[:raw_sequences] file.
# Params:
# +blast_output+: String with the diagnostics printed by blastdbcmd
# Output:
# +blast_output+ itself (the value of each_line).
def failed_raw_sequences(blast_output)
  blast_output.each_line do |line|
    # Only "OID not found" lines name a missing accession. Blank lines and
    # any other diagnostics are skipped rather than raising NoMethodError
    # on a failed match, which would abort the remaining recoveries.
    missing = line.match(/Error: (\w+): OID not found/)
    next unless missing

    acc = missing[1]
    warn "\nCould not find sequence '#{acc}' within the" \
         ' BLAST database.'
    warn "Attempting to obtain sequence '#{acc}' from" \
         ' remote BLAST databases.'
    File.open(opt[:raw_sequences], 'a+') do |f|
      f.puts extract_from_remote_db(acc)
    end
  end
end
run(identifier, accession) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 137
# Obtains a raw sequence, trying the cheapest source first.
# Params:
# +identifier+: key handed to extract_from_index
# +accession+: accession number handed to the database lookups
# Output:
# The sequence, or nil when the final result still contains "error"
# (case-insensitive) or nothing was found.
def run(identifier, accession)
  # 1. Indexed lookup, only available when a raw_sequences file is set.
  sequence = opt[:raw_sequences] ? extract_from_index(identifier) : nil

  # 2. Fall back to exactly one database lookup: remote when opt[:db]
  #    mentions "remote", local otherwise.
  if sequence.nil? || sequence =~ /Error/i
    sequence = if opt[:db] =~ /remote/
                 extract_from_remote_db(accession)
               else
                 extract_from_local_db(false, accession)
               end
  end

  # 3. An error message is never handed back as if it were a sequence.
  return nil if sequence =~ /Error/i

  sequence
end
single_raw_seq_cmd(accession) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 191
# Builds the blastdbcmd command line that prints a single entry.
# Params:
# +accession+: value passed to blastdbcmd's -entry option
# Output:
# String with the shell command; the database comes from opt[:db].
def single_raw_seq_cmd(accession)
  entry    = "-entry '#{accession}'"
  database = "-db '#{opt[:db]}'"
  ['blastdbcmd', entry, database, "-outfmt '%s'"].join(' ')
end