class GeneValidator::GeneMergeValidation

This class contains the methods necessary for checking whether there is evidence that the prediction is a merge of multiple genes

Attributes

hits[R]
prediction[R]

Public Class Methods

new(prediction, hits, boundary = 10) click to toggle source

Initilizes the object Params: prediction: a Sequence object representing the blast query hits: a vector of Sequence objects (representing blast hits) plot_path: name of the input file, used when generatig the plot files boundary: the offset of the hit from which we start analysing the hit

Calls superclass method
# File lib/genevalidator/validation_gene_merge.rb, line 106
def initialize(prediction, hits, boundary = 10)
  super
  @short_header = 'GeneMerge'
  @header       = 'Gene Merge'
  @description  = 'Check whether BLAST hits make evidence about a merge' \
                  ' of two genes that match the predicted gene.'
  @cli_name     = 'merge'
  @boundary     = boundary
end

Public Instance Methods

plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits) click to toggle source

Generates a json file containing data used for plotting the start/end of the matched region offsets in the prediction Param slope: slope of the linear regression line y_intercept: the ecuation of the line is y= slope*x + y_intercept output: location where the plot will be saved in jped file format hits: array of Sequence objects

# File lib/genevalidator/validation_gene_merge.rb, line 229
def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
  pairs = hits.map do |hit|
    Pair.new(hit.hsp_list.map(&:match_query_from).min,
             hit.hsp_list.map(&:match_query_to).max)
  end

  data = hits.map do |hit|
    { 'x' => hit.hsp_list.map(&:match_query_from).min,
      'y' => hit.hsp_list.map(&:match_query_to).max,
      'color' => 'red' }
  end

  Plot.new(data,
           :scatter,
           'Gene Merge Validation: Start/end of matching hit coord. on' \
           ' query (1 point/hit)',
           '',
           'Start Offset (most left hsp)',
           'End Offset (most right hsp)',
           y_intercept.to_s,
           slope.to_s)
end
plot_matched_regions(hits = @hits) click to toggle source

Generates a json file containing data used for plotting the matched region of the prediction for each hit Param output: location where the plot will be saved in jped file format hits: array of Sequence objects prediction: Sequence objects

# File lib/genevalidator/validation_gene_merge.rb, line 190
def plot_matched_regions(hits = @hits)
  no_lines = hits.length

  hits_less = hits[0..[no_lines, hits.length - 1].min]

  data = hits_less.each_with_index.map do |hit, i|
    { 'y' => i,
      'start' => hit.hsp_list.map(&:match_query_from).min,
      'stop' => hit.hsp_list.map(&:match_query_to).max,
      'color' => 'black',
      'dotted' => 'true' }
  end .flatten +
         hits_less.each_with_index.map do |hit, i|
           hit.hsp_list.map do |hsp|
             { 'y' => i,
               'start' => hsp.match_query_from,
               'stop' => hsp.match_query_to,
               'color' => 'orange' }
           end
         end .flatten

  Plot.new(data,
           :lines,
           'Gene Merge Validation: Query coord covered by blast hit' \
           ' (1 line/hit)',
           '',
           'Offset in Prediction',
           'Hit Number',
           hits_less.length)
end
run() click to toggle source

Validation test for gene merge Output: GeneMergeValidationOutput object

# File lib/genevalidator/validation_gene_merge.rb, line 120
def run
  raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
  raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)

  start = Time.now

  pairs = hits.map do |hit|
    Pair.new(hit.hsp_list.map(&:match_query_from).min,
             hit.hsp_list.map(&:match_query_to).max)
  end
  xx_0 = pairs.map(&:x)
  yy_0 = pairs.map(&:y)

  # minimum start shoud be at 'boundary' residues
  xx = xx_0.map { |x| x < @boundary ? @boundary : x }

  # maximum end should be at length - 'boundary' residues
  yy = yy_0.map do |y|
    if y > @prediction.raw_sequence.length - @boundary
      @prediction.raw_sequence.length - @boundary
    else
      y
    end
  end

  line_slope = slope(xx, yy, (1..hits.length).map { |x| 1 / (x + 0.0) })
  ## YW - what is this weighting?

  unimodality = false
  if unimodality_test(xx, yy)
    unimodality = true
    lm_slope = 0.0
  else
    lm_slope = line_slope[1]
  end

  y_intercept = line_slope[0]

  @validation_report = GeneMergeValidationOutput.new(@short_header, @header,
                                                     @description, lm_slope,
                                                     unimodality)
  plot1 = if unimodality
            plot_2d_start_from
          else
            plot_2d_start_from(lm_slope, y_intercept)
          end

  @validation_report.plot_files.push(plot1)
  plot2 = plot_matched_regions
  @validation_report.plot_files.push(plot2)
  @validation_report.run_time = Time.now - start
  @validation_report
rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue StandardError
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end
slope(xx, yy, weights = nil) click to toggle source

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx: Array of integers yy : Array of integers weights: Array of integers Output: The ecuation of the regression line: [y slope]

# File lib/genevalidator/validation_gene_merge.rb, line 261
def slope(xx, yy, weights = nil)
  weights = Array.new(hits.length, 1) if weights.nil?

  # calculate the slope
  xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] }
  yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] }

  denominator = weights.reduce(0) { |a, e| a + e }

  x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)
  y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)

  numerator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean))
  end

  denominator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * ((xx[e] - x_mean)**2))
  end

  slope = numerator / (denominator + 0.0)
  y_intercept = y_mean - (slope * x_mean)

  [y_intercept, slope]
end
slope_statsample(xx, yy) click to toggle source

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx : Array of integers yy : Array of integers Output: The ecuation of the regression line: [y slope]

# File lib/genevalidator/validation_gene_merge.rb, line 295
def slope_statsample(xx, yy)
  sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
  [sr.a, sr.b]
end
unimodality_test(xx, yy) click to toggle source

xx and yy are the projections of the 2-d data on the two axes

# File lib/genevalidator/validation_gene_merge.rb, line 302
def unimodality_test(xx, yy)
  mean_x = xx.mean
  median_x = xx.median
  mode_x = xx.mode
  sd_x = xx.standard_deviation

  if sd_x == 0
    cond1_x = true
    cond2_x = true
    cond3_x = true
  else
    cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6)
    cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
    cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
  end

  mean_y = yy.mean
  median_y = yy.median
  mode_y = yy.mode
  sd_y = yy.standard_deviation

  if sd_y == 0
    cond1_y = true
    cond2_y = true
    cond3_y = true
  else
    cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6)
    cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
    cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
  end

  if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y
    true
  else
    false
  end
end