class GeneValidator::DuplicationValidation

This class contains the methods necessary for finding duplicated subsequences in the predicted gene

Attributes

index_file_name[R]
raw_seq_file[R]
raw_seq_file_load[R]

Public Class Methods

new(prediction, hits) click to toggle source
Calls superclass method
# File lib/genevalidator/validation_duplication.rb, line 90
def initialize(prediction, hits)
  super
  @short_header      = 'Duplication'
  @header            = 'Duplication'
  @description       = 'Check whether there is a duplicated subsequence' \
                       ' in the predicted gene by counting the hsp' \
                       ' residue coverage of the prediction, for each hit.'
  @cli_name          = 'dup'
  @raw_seq_file      = opt[:raw_sequences]
  @index_file_name   = config[:raw_seq_file_index]
  @raw_seq_file_load = config[:raw_seq_file_load]
  @db                = opt[:db]
  @num_threads       = opt[:mafft_threads]
  @type              = config[:type]
end

Public Instance Methods

check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage, ranges_prediction) click to toggle source
# File lib/genevalidator/validation_duplication.rb, line 226
def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage,
                            ranges_prediction)
  # for each hsp of the curent hit
  # iterate through the alignment and count the matching residues
  [*(0..hit_alignment.length - 1)].each do |i|
    residue_hit   = hit_alignment[i]
    residue_query = query_alignment[i]
    next if [' ', '+', '-'].include?(residue_hit)
    next if residue_hit != residue_query
    # indexing in blast starts from 1
    idx_hit   = i + (hsp.hit_from - 1) -
                hit_alignment[0..i].scan(/-/).length
    idx_query = i + (hsp.match_query_from - 1) -
                query_alignment[0..i].scan(/-/).length
    coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query)
  end
  coverage
end
find_local_alignment(hit, prediction, hsp) click to toggle source

Only run if the BLAST output does not contain hit alignmment

# File lib/genevalidator/validation_duplication.rb, line 199
def find_local_alignment(hit, prediction, hsp)
  # indexing in blast starts from 1
  hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
  query_local = prediction.raw_sequence[hsp.match_query_from -
                                        1..hsp.match_query_to - 1]

  # in case of nucleotide prediction sequence translate into protein
  # use translate with reading frame 1 because
  # to/from coordinates of the hsp already correspond to the
  # reading frame in which the prediction was read to match this hsp
  if @type == :nucleotide
    s = Bio::Sequence::NA.new(query_local)
    query_local = s.translate
  end

  opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
         '--thread', @num_threads.to_s]
  mafft = Bio::MAFFT.new('mafft', opt)

  # local alignment for hit and query
  seqs = [hit_local, query_local]
  report = mafft.query_align(seqs)
  report.alignment.map(&:to_s)
rescue StandardError
  raise NoMafftInstallationError
end
in_range?(ranges, idx) click to toggle source
# File lib/genevalidator/validation_duplication.rb, line 245
def in_range?(ranges, idx)
  ranges.each { |range| return true if range.member?(idx) }
  false
end
run(n = 10) click to toggle source

Check duplication in the first n hits Output: DuplicationValidationOutput object

# File lib/genevalidator/validation_duplication.rb, line 110
def run(n = 10)
  raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
  raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
               hits[0].is_a?(Query)

  start = Time.new
  # get the first n hits
  n_hits = [n - 1, @hits.length].min
  less_hits = @hits[0..n_hits]
  # get raw sequences for less_hits
  less_hits.delete_if do |hit|
    if hit.raw_sequence.nil?
      hit.raw_sequence = FetchRawSequences.run(hit.identifier,
                                               hit.accession_no)
    end
    hit.raw_sequence.nil? ? true : false
  end

  raise NoInternetError if less_hits.length.zero?

  averages = []

  less_hits.each do |hit|
    coverage = Array.new(hit.length_protein, 0)
    # each residue of the protein should be evluated once only
    ranges_prediction = []

    hit.hsp_list.each do |hsp|
      # align subsequences from the hit and prediction that match
      if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
        hit_alignment   = hsp.hit_alignment
        query_alignment = hsp.query_alignment
      else
        align = find_local_alignment(hit, prediction, hsp)
        hit_alignment   = align[0]
        query_alignment = align[1]
      end

      coverage = check_multiple_coverage(hit_alignment, query_alignment,
                                         hsp, coverage, ranges_prediction)

      ranges_prediction << (hsp.match_query_from..hsp.match_query_to)
    end
    overlap = coverage.reject(&:zero?)
    if overlap != []
      averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
    end
  end

  # if all hsps match only one time
  if averages.reject { |x| x == 1 } == []
    @validation_report = DuplicationValidationOutput.new(@short_header,
                                                         @header,
                                                         @description, 1,
                                                         averages)
    @validation_report.run_time = Time.now - start
    return @validation_report
  end

  pval = wilcox_test(averages)

  @validation_report = DuplicationValidationOutput.new(@short_header,
                                                       @header,
                                                       @description, pval,
                                                       averages)
  @run_time = Time.now - start
  @validation_report
rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue NoMafftInstallationError
  @validation_report = ValidationReport.new('Mafft error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoMafftInstallationError
rescue NoInternetError
  @validation_report = ValidationReport.new('Internet error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoInternetError
rescue StandardError
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end
wilcox_test(averages) click to toggle source

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!

# File lib/genevalidator/validation_duplication.rb, line 253
def wilcox_test(averages)
  wilcox = Statsample::Test.wilcoxon_signed_rank(
    Daru::Vector.new(averages),
    Daru::Vector.new(Array.new(averages.length, 1))
  )

  averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z
end