class GeneValidator::AlignmentValidation
This class contains the methods necessary for validations based on multiple alignment
Attributes
Public Class Methods
Initilizes the object Params: prediction
: a Sequence
object representing the blast query hits
: a vector of Sequence
objects (representing blast hits) plot_path
: name of the fasta file
# File lib/genevalidator/validation_alignment.rb, line 106 def initialize(prediction, hits) super @short_header = 'MissingExtraSequences' @cli_name = 'align' @header = 'Missing/Extra Sequences' @description = 'Finds missing and extra sequences in the' \ ' prediction, based on the multiple alignment of' \ ' the best hits. Also counts the percentage of' \ ' the conserved regions that appear in the' \ ' prediction.' @raw_seq_file = opt[:raw_sequences] @index_file_name = config[:raw_seq_file_index] @raw_seq_file_load = config[:raw_seq_file_load] @db = opt[:db] @multiple_alignment = [] @num_threads = opt[:mafft_threads] @type = config[:type] end
Public Instance Methods
converts an array of integers into array of ranges
# File lib/genevalidator/validation_alignment.rb, line 379 def array_to_ranges(ar) prev = ar[0] ranges = ar.slice_before do |e| prev2 = prev prev = e prev2 + 1 != e end.map { |a| a[0]..a[-1] } ranges end
Returns the percentage of consesnsus residues from the ma that are in the prediction Params: prediction_raw
: String
corresponding to the prediction sequence consensus
: String
corresponding to the statistical model Output: Fixnum
with the score
# File lib/genevalidator/validation_alignment.rb, line 308 def consensus_validation(prediction_raw, consensus) return 1 if prediction_raw.length != consensus.length # no of conserved residues among the hits no_conserved_residues = consensus.length - consensus.scan(/[\?-]/).length return 1 if no_conserved_residues.zero? # no of conserved residues from the hita that appear in the prediction no_conserved_pred = consensus.split(//).each_index.count { |j| consensus[j] != '-' && consensus[j] != '?' && consensus[j] == prediction_raw[j] } no_conserved_pred / (no_conserved_residues + 0.0) end
Returns the percentage of extra sequences in the prediction with respect to the statistical model Params: prediction
: String
corresponding to the prediction sequence sm
: String
corresponding to the statistical model Output: Fixnum
with the score
# File lib/genevalidator/validation_alignment.rb, line 289 def extra_sequence_validation(prediction_raw, sm) return 1 if prediction_raw.length != sm.length # find residues that are in the prediction # but not in the statistical model no_insertions = 0 (0..sm.length - 1).each do |i| no_insertions += 1 if prediction_raw[i] != '-' && sm[i] == '-' end no_insertions / (sm.length + 0.0) end
Returns the percentage of gaps in the prediction with respect to the statistical model Params: prediction
: String
corresponding to the prediction sequence sm
: String
corresponding to the statistical model Output: Fixnum
with the score
# File lib/genevalidator/validation_alignment.rb, line 270 def gap_validation(prediction_raw, sm) return 1 if prediction_raw.length != sm.length # find gaps in the prediction and # not in the statistical model no_gaps = 0 (0..sm.length - 1).each do |i| no_gaps += 1 if prediction_raw[i] == '-' && sm[i] != '-' end no_gaps / (sm.length + 0.0) end
Returns the consensus regions among a set of multiple aligned sequences i.e positions where there is the same element in all sequences Params: ma
: array of +String+s, corresponding to the multiple aligned sequences Output: String
with the consensus regions
# File lib/genevalidator/validation_alignment.rb, line 257 def get_consensus(ma) align = Bio::Alignment.new(ma) align.consensus end
Builds a statistical model from a set of multiple aligned sequences based on PSSM (Position Specific Matrix) Params: ma
: array of +String+s, corresponding to the multiple aligned sequences threshold
: percentage of genes that are considered in statistical model Output: String
representing the statistical model Array
with the maximum frequeny of the majoritary residue for each position
# File lib/genevalidator/validation_alignment.rb, line 332 def get_sm_pssm(ma, threshold = 0.7) sm = '' freq = [] (0..ma[0].length - 1).each do |i| freqs = Hash.new(0) ma.map { |seq| seq[i] }.each { |res| freqs[res] += 1 } # get the residue with the highest frequency max_freq = freqs.map { |_res, n| n }.max residue = freqs.map { |res, n| n == max_freq ? res : [] }.flatten[0] freq << residue == '-' ? 0 : (max_freq / ma.length.to_f) sm += (max_freq / ma.length.to_f) >= threshold ? residue : '?' end [sm, freq] end
Returns true if the string contains only letters and false otherwise
# File lib/genevalidator/validation_alignment.rb, line 373 def isalpha(str) !str.match(/[^A-Za-z]/) end
Builds the multiple alignment between all the hits and the prediction using MAFFT tool Also creates a fasta file with the alignment Params: prediction
: a Sequence
object representing the blast query hits
: a vector of Sequience
objects (usually representing blast hits) path
: path of mafft installation Output: Array
of +String+s, corresponding to the multiple aligned sequences the prediction is the last sequence in the vector
# File lib/genevalidator/validation_alignment.rb, line 226 def multiple_align_mafft(prediction, hits) raise unless prediction.is_a?(Query) && hits[0].is_a?(Query) opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet', '--thread', @num_threads.to_s] mafft = Bio::MAFFT.new('mafft', opt) sequences = hits.map do |h| # remove the seq id - as MAFFT sometimes has an issue with this f = Bio::FastaFormat.new(h.raw_sequence) # check if fasta sequence otherwise returne original entry f.seq.empty? ? h.raw_sequence : f.seq end sequences.push(prediction.protein_translation) report = mafft.query_align(sequences) alignment = report.alignment.map(&:to_s) raise NoMafftInstallationError if alignment.empty? alignment rescue StandardError raise NoMafftInstallationError end
Generates a json file cotaining data used for plotting lines for multiple hits alignment, prediction and statistical model Params: freq
: String
residue frequency from the statistical model output
: plot_path of the json file ma
: String
array with the multiple alignmened hits and prediction
# File lib/genevalidator/validation_alignment.rb, line 397 def plot_alignment(freq, ma = @multiple_alignment) # get indeces of consensus in the multiple alignment consensus = get_consensus(ma[0..ma.length - 2]) consensus_idxs = consensus.split(//).each_index.select { |j| isalpha(consensus[j]) } consensus_ranges = array_to_ranges(consensus_idxs) consensus_all = get_consensus(ma) consensus_all_idxs = consensus_all.split(//).each_index.select { |j| isalpha(consensus_all[j]) } consensus_all_ranges = array_to_ranges(consensus_all_idxs) match_alignment = ma[0..ma.length - 2].each_with_index.map { |seq, _j| seq.split(//).each_index.select { |j| isalpha(seq[j]) } } match_alignment_ranges = match_alignment.map { |e| array_to_ranges(e) } query_alignment = ma[ma.length - 1].split(//).each_index.select { |j| isalpha(ma[ma.length - 1][j]) } query_alignment_ranges = array_to_ranges(query_alignment) len = ma[0].length # plot statistical model data = freq.each_with_index.map { |h, j| { 'y' => ma.length, 'start' => j, 'stop' => j + 1, 'color' => 'orange', 'height' => h } } + # hits match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten + ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten + # plot prediction [{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] + query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten + # plot consensus consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten y_axis_values = 'Prediction' (1..ma.length - 1).each { |i| y_axis_values << ", hit #{i}" } y_axis_values << ', Statistical Model' Plot.new(data, :align, 'Missing/Extra sequences Validation: Multiple Align. &' \ 'Statistical model of hits', 'Aligned Hit Sequence, red; Conserved Region, Yellow', 'Offset in the Alignment', '', ma.length + 1, y_axis_values) end
Remove isolated residues inside long gaps from a given sequence Params: seq
:String
: sequence of residues len
:Fixnum
: number of isolated residues to be removed Output: String
: the new sequence
# File lib/genevalidator/validation_alignment.rb, line 356 def remove_isolated_residues(seq, len = 2) gap_starts = seq.to_enum(:scan, /(-\w{1,#{len}}-)/i).map { |_m| $`.size + 1 } # remove isolated residues gap_starts.each do |i| (i..i + len - 1).each { |j| seq[j] = '-' if isalpha(seq[j]) } end # remove isolated gaps res_starts = seq.to_enum(:scan, /([?\w]-{1,2}[?\w])/i).map { |_m| $`.size + 1 } res_starts.each do |i| (i..i + len - 1).each { |j| seq[j] = '?' if seq[j] == '-' } end seq end
Find gaps/extra regions based on the multiple alignment of the first n hits Output: AlignmentValidationOutput
object
# File lib/genevalidator/validation_alignment.rb, line 130 def run n = opt[:min_blast_hits] < 10 ? 10 : opt[:min_blast_hits] n = 50 if n > 50 raise NotEnoughHitsError if hits.length < n raise unless prediction.is_a?(Query) && hits[0].is_a?(Query) start = Time.new # get the first n hits n_hits = [n - 1, @hits.length].min less_hits = @hits[0..n_hits] # get raw sequences for less_hits less_hits.delete_if do |hit| if hit.raw_sequence.nil? hit.raw_sequence = FetchRawSequences.run(hit.identifier, hit.accession_no) end hit.raw_sequence.nil? ? true : false end raise NoInternetError if less_hits.length.zero? # in case of nucleotide prediction sequence translate into protein # translate with the reading frame of all hits considered for alignment reading_frames = less_hits.map(&:reading_frame).uniq raise ReadingFrameError if reading_frames.length != 1 if @type == :nucleotide s = Bio::Sequence::NA.new(prediction.raw_sequence) prediction.protein_translation = s.translate(reading_frames[0]) end # multiple align sequences from less_hits with the prediction # the prediction is the last sequence in the vector @multiple_alignment = multiple_align_mafft(prediction, less_hits) out = get_sm_pssm(@multiple_alignment[0..@multiple_alignment.length - 2]) sm = out[0] freq = out[1] # remove isolated residues from the predicted sequence index = @multiple_alignment.length - 1 prediction_raw = remove_isolated_residues(@multiple_alignment[index]) # remove isolated residues from the statistical model sm = remove_isolated_residues(sm) a1 = get_consensus(@multiple_alignment[0..@multiple_alignment.length - 2]) plot1 = plot_alignment(freq) gaps = gap_validation(prediction_raw, sm) extra_seq = extra_sequence_validation(prediction_raw, sm) consensus = consensus_validation(prediction_raw, a1) @validation_report = AlignmentValidationOutput.new(@short_header, @header, @description, gaps, extra_seq, consensus) @validation_report.plot_files.push(plot1) @validation_report.run_time = Time.now - start @validation_report rescue NotEnoughHitsError @validation_report = ValidationReport.new('Not enough evidence', :warning, @short_header, @header, @description) rescue NoMafftInstallationError @validation_report = ValidationReport.new('Mafft error', :error, @short_header, @header, @description) @validation_report.errors.push NoMafftInstallationError rescue NoInternetError @validation_report = ValidationReport.new('Internet error', :error, @short_header, @header, @description) @validation_report.errors.push NoInternetError rescue ReadingFrameError @validation_report = ValidationReport.new('Multiple reading frames', :error, @short_header, @header, @description) @validation_report.errors.push 'Multiple reading frames Error' rescue StandardError @validation_report = ValidationReport.new('Unexpected error', :error, @short_header, @header, @description) @validation_report.errors.push 'Unexpected Error' end