class GeneValidator::GeneMergeValidation
This class contains the methods necessary for checking whether there is evidence that the prediction is a merge of multiple genes
Attributes
Public Class Methods
Initilizes the object Params: prediction
: a Sequence
object representing the blast query hits
: a vector of Sequence
objects (representing blast hits) plot_path
: name of the input file, used when generatig the plot files boundary
: the offset of the hit from which we start analysing the hit
# File lib/genevalidator/validation_gene_merge.rb, line 106 def initialize(prediction, hits, boundary = 10) super @short_header = 'GeneMerge' @header = 'Gene Merge' @description = 'Check whether BLAST hits make evidence about a merge' \ ' of two genes that match the predicted gene.' @cli_name = 'merge' @boundary = boundary end
Public Instance Methods
Generates a json file containing data used for plotting the start/end of the matched region offsets in the prediction Param slope
: slope of the linear regression line y_intercept
: the ecuation of the line is y= slope*x + y_intercept output
: location where the plot will be saved in jped file format hits
: array of Sequence objects
# File lib/genevalidator/validation_gene_merge.rb, line 229 def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits) pairs = hits.map do |hit| Pair.new(hit.hsp_list.map(&:match_query_from).min, hit.hsp_list.map(&:match_query_to).max) end data = hits.map do |hit| { 'x' => hit.hsp_list.map(&:match_query_from).min, 'y' => hit.hsp_list.map(&:match_query_to).max, 'color' => 'red' } end Plot.new(data, :scatter, 'Gene Merge Validation: Start/end of matching hit coord. on' \ ' query (1 point/hit)', '', 'Start Offset (most left hsp)', 'End Offset (most right hsp)', y_intercept.to_s, slope.to_s) end
Generates a json file containing data used for plotting the matched region of the prediction for each hit Param output
: location where the plot will be saved in jped file format hits
: array of Sequence objects prediction
: Sequence objects
# File lib/genevalidator/validation_gene_merge.rb, line 190 def plot_matched_regions(hits = @hits) no_lines = hits.length hits_less = hits[0..[no_lines, hits.length - 1].min] data = hits_less.each_with_index.map do |hit, i| { 'y' => i, 'start' => hit.hsp_list.map(&:match_query_from).min, 'stop' => hit.hsp_list.map(&:match_query_to).max, 'color' => 'black', 'dotted' => 'true' } end .flatten + hits_less.each_with_index.map do |hit, i| hit.hsp_list.map do |hsp| { 'y' => i, 'start' => hsp.match_query_from, 'stop' => hsp.match_query_to, 'color' => 'orange' } end end .flatten Plot.new(data, :lines, 'Gene Merge Validation: Query coord covered by blast hit' \ ' (1 line/hit)', '', 'Offset in Prediction', 'Hit Number', hits_less.length) end
Validation test for gene merge Output: GeneMergeValidationOutput
object
# File lib/genevalidator/validation_gene_merge.rb, line 120 def run raise NotEnoughHitsError if hits.length < opt[:min_blast_hits] raise unless prediction.is_a?(Query) && hits[0].is_a?(Query) start = Time.now pairs = hits.map do |hit| Pair.new(hit.hsp_list.map(&:match_query_from).min, hit.hsp_list.map(&:match_query_to).max) end xx_0 = pairs.map(&:x) yy_0 = pairs.map(&:y) # minimum start shoud be at 'boundary' residues xx = xx_0.map { |x| x < @boundary ? @boundary : x } # maximum end should be at length - 'boundary' residues yy = yy_0.map do |y| if y > @prediction.raw_sequence.length - @boundary @prediction.raw_sequence.length - @boundary else y end end line_slope = slope(xx, yy, (1..hits.length).map { |x| 1 / (x + 0.0) }) ## YW - what is this weighting? unimodality = false if unimodality_test(xx, yy) unimodality = true lm_slope = 0.0 else lm_slope = line_slope[1] end y_intercept = line_slope[0] @validation_report = GeneMergeValidationOutput.new(@short_header, @header, @description, lm_slope, unimodality) plot1 = if unimodality plot_2d_start_from else plot_2d_start_from(lm_slope, y_intercept) end @validation_report.plot_files.push(plot1) plot2 = plot_matched_regions @validation_report.plot_files.push(plot2) @validation_report.run_time = Time.now - start @validation_report rescue NotEnoughHitsError @validation_report = ValidationReport.new('Not enough evidence', :warning, @short_header, @header, @description) rescue StandardError @validation_report = ValidationReport.new('Unexpected error', :error, @short_header, @header, @description) @validation_report.errors.push 'Unexpected Error' end
Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx: Array
of integers yy : Array
of integers weights: Array
of integers Output: The ecuation of the regression line: [y slope]
# File lib/genevalidator/validation_gene_merge.rb, line 261 def slope(xx, yy, weights = nil) weights = Array.new(hits.length, 1) if weights.nil? # calculate the slope xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] } yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] } denominator = weights.reduce(0) { |a, e| a + e } x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0) y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0) numerator = (0...xx.length).reduce(0) do |a, e| a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean)) end denominator = (0...xx.length).reduce(0) do |a, e| a + (weights[e] * ((xx[e] - x_mean)**2)) end slope = numerator / (denominator + 0.0) y_intercept = y_mean - (slope * x_mean) [y_intercept, slope] end
Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx : Array
of integers yy : Array
of integers Output: The ecuation of the regression line: [y slope]
# File lib/genevalidator/validation_gene_merge.rb, line 295 def slope_statsample(xx, yy) sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale) [sr.a, sr.b] end
xx and yy are the projections of the 2-d data on the two axes
# File lib/genevalidator/validation_gene_merge.rb, line 302 def unimodality_test(xx, yy) mean_x = xx.mean median_x = xx.median mode_x = xx.mode sd_x = xx.standard_deviation if sd_x == 0 cond1_x = true cond2_x = true cond3_x = true else cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6) cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3) cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3) end mean_y = yy.mean median_y = yy.median mode_y = yy.mode sd_y = yy.standard_deviation if sd_y == 0 cond1_y = true cond2_y = true cond3_y = true else cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6) cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3) cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3) end if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y true else false end end