class GeneValidator::RawSequences

Gets the raw sequences for each hit in a BLAST output file

Public Class Methods

index_raw_seq_file(raw_seq_file = opt[:raw_sequences]) click to toggle source

Strips each FASTA header in the raw sequences file down to its identifier, then indexes the file by identifier.

# File lib/genevalidator/get_raw_sequences.rb, line 46
# Reduces every FASTA header in +raw_seq_file+ to its identifier (everything
# from the first space to the end of the line is dropped), rewrites the file
# in place, and records where each record sits in the rewritten content.
#
# The index maps identifier => [start, finish]. Offsets are byte offsets,
# because the file is read in binary mode. The index is stored in
# config[:raw_seq_file_load] and dumped as YAML to
# "<dirs[:tmp_dir]>/<basename of raw_seq_file>.idx"; that path is stored in
# config[:raw_seq_file_index].
#
# raw_seq_file - path of the FASTA file to index (defaults to
#                opt[:raw_sequences]).
#
# Returns nil.
def index_raw_seq_file(raw_seq_file = opt[:raw_sequences])
  # leave only the identifiers in the fasta description;
  # File.binread closes its handle before returning, so no descriptor leaks
  content = File.binread(raw_seq_file).gsub(/ .*/, '')
  File.write(raw_seq_file, content)

  # index the fasta file: header names and the offset of each '>' record
  keys   = content.scan(/>(.*)\n/).flatten
  starts = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }

  # make an index hash
  index_hash = {}
  keys.each_with_index do |key, i|
    # a record ends where the next one starts; the last record ends on the
    # final byte of the content.
    # NOTE(review): the last end offset is content.length - 1 - confirm the
    # reader of this index does not lose a residue when the file lacks a
    # trailing newline.
    finish = i == starts.length - 1 ? content.length - 1 : starts[i + 1]
    index_hash[key] = [starts[i], finish]
  end

  # create FASTA index
  fname = File.basename(raw_seq_file)
  config[:raw_seq_file_index] = File.join(dirs[:tmp_dir], "#{fname}.idx")
  config[:raw_seq_file_load]  = index_hash

  File.open(config[:raw_seq_file_index], 'w') do |f|
    YAML.dump(index_hash, f)
  end
  nil
end
init() click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 18
# Announces the extraction step on stderr and derives the working paths from
# the BLAST output file name.
#
# @blast_file becomes the XML file when one is configured and is overridden
# by the tabular file when that is configured too. The raw sequence path is
# stored in opt[:raw_sequences] and the index path in @index_file; both live
# under dirs[:tmp_dir].
def init
  warn '==> Extracting fasta sequences for each BLAST HSP from the' \
       ' BLAST database'

  # later keys win, so the tabular file takes precedence over the XML file
  %i[blast_xml_file blast_tabular_file].each do |key|
    @blast_file = opt[key] if opt[key]
  end

  base_name = File.basename(@blast_file)
  tmp_dir   = dirs[:tmp_dir]
  opt[:raw_sequences] = File.join(tmp_dir, "#{base_name}.raw_seq")
  @index_file         = File.join(tmp_dir, "#{base_name}.index")
end
run() click to toggle source

Obtains the raw sequences for the hits in the BLAST output file, from either a remote or a local BLAST database, and then indexes them.

# File lib/genevalidator/get_raw_sequences.rb, line 32
# Entry point: prepares the paths, collects the hit sequences and indexes
# the resulting raw sequence file.
#
# When opt[:db] contains "remote", the sequences are written straight to
# opt[:raw_sequences]. Otherwise an index file of hit identifiers is written
# to @index_file and handed to FetchRawSequences.extract_from_local_db.
#
# Returns whatever index_raw_seq_file returns.
def run
  init

  uses_remote_db = opt[:db].match?(/remote/)
  if uses_remote_db
    write_a_raw_seq_file(opt[:raw_sequences], 'remote')
  else
    write_an_index_file(@index_file, 'local')
    FetchRawSequences.extract_from_local_db(true, nil, @index_file)
  end

  index_raw_seq_file(opt[:raw_sequences])
end

Private Class Methods

iterate_tabular(file, db_type) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 113
# Walks the rows of the tabular BLAST output (opt[:blast_tabular_file]) and
# writes one entry per row to +file+.
#
# Column names come from opt[:blast_tabular_options], split on spaces and
# commas; lines starting with '#' are skipped.
#
# file    - writable IO-like object receiving one line per row.
# db_type - 'remote' to fetch every sequence through
#           FetchRawSequences.extract_from_remote_db; with any other value
#           the row's sseqid is written, unless it is missing.
#
# Raises BLASTDBError when an sseqid contains "|BL_ORD_ID|" (any case).
def iterate_tabular(file, db_type)
  column_names = opt[:blast_tabular_options].split(/[ ,]/)
  table = CSV.parse(File.read(opt[:blast_tabular_file]),
                    col_sep: "\t", skip_lines: /^#/, headers: column_names)
  fetch_remotely = db_type == 'remote'

  table.each do |row|
    subject_id = row['sseqid']
    raise BLASTDBError if subject_id&.match?(/\|BL_ORD_ID\|/i)

    # rows without an sseqid fall back to a remote lookup by accession
    entry = if fetch_remotely || subject_id.nil?
              FetchRawSequences.extract_from_remote_db(row['sacc'])
            else
              subject_id
            end
    file.puts entry
  end
end
iterate_xml(file, db_type) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 99
# Walks every hit of every iteration in the BLAST XML output
# (opt[:blast_xml_file]) and writes one entry per hit to +file+.
#
# file    - writable IO-like object receiving one line per hit.
# db_type - 'remote' to fetch every sequence through
#           FetchRawSequences.extract_from_remote_db; with any other value
#           the hit's accession is written, unless the hit has no hit_id.
#
# Raises BLASTDBError when a hit_id contains "|BL_ORD_ID|".
def iterate_xml(file, db_type)
  fetch_remotely = db_type == 'remote'
  iterations = Bio::BlastXMLParser::XmlIterator.new(opt[:blast_xml_file]).to_enum

  iterations.each do |iteration|
    iteration.each do |hit|
      hit_id = hit.hit_id
      raise BLASTDBError if hit_id&.match?(/\|BL_ORD_ID\|/)

      # hits without an id fall back to a remote lookup by accession
      entry = if fetch_remotely || hit_id.nil?
                FetchRawSequences.extract_from_remote_db(hit.accession)
              else
                hit.accession
              end
      file.puts entry
    end
  end
end
write_a_raw_seq_file(output_file, db_type)
Alias for: write_an_index_file
write_an_index_file(output_file, db_type) click to toggle source
# File lib/genevalidator/get_raw_sequences.rb, line 76
# Creates (or truncates) +output_file+ and fills it with one entry per BLAST
# hit, reading the XML output when opt[:blast_xml_file] is set and the
# tabular output when opt[:blast_tabular_file] is set.
#
# output_file - path of the file to write.
# db_type     - passed through to iterate_xml / iterate_tabular
#               (the visible callers pass 'remote' or 'local').
#
# Any failure is reported on stderr and terminates the process via `exit 1`
# (which raises SystemExit): a BLASTDBError gets a message about the
# '-parse_seqids' argument, any other StandardError a generic one.
def write_an_index_file(output_file, db_type)
  # block form guarantees the handle is closed on every exit path
  File.open(output_file, 'w+') do |file|
    iterate_xml(file, db_type) if opt[:blast_xml_file]
    iterate_tabular(file, db_type) if opt[:blast_tabular_file]
  end
rescue BLASTDBError
  warn '*** BLAST Database Error: Genevalidator requires BLAST' \
       " databases to be created with the '-parse_seqids' argument."
  warn '    See https://github.com/wurmlab/genevalidator' \
       '#setting-up-a-blast-database for more information'
  exit 1
rescue StandardError
  warn '*** Error: There was an error in analysing the BLAST'
  warn '    output file. Please ensure that BLAST output file'
  warn '    is in the correct format and then try again. If you'
  warn '    are using a remote database, please ensure that you'
  warn '    have internet access.'
  exit 1
end
Also aliased as: write_a_raw_seq_file