class Ms::ErrorRate::Sbv

Sample Bias Validator

Constants

AASEQ_EXT
LENGTH_EXT

Attributes

frequency_indicator_opposite[RW]
indicator_by_aaseq[RW]

a hash by aaseq giving a value between 0 and 1 telling how much of an indicator the hit is

indicators_signify_true_hit[RW]

boolean

size_to_freq[RW]

Public Class Methods

generate_hashes(pep_to_prot_file, opts={}) { |values_at(*(split('-')))| ... } click to toggle source

if a protein hash is given, will yield the return an array of values generated with the value from keying each protein of the peptide. Otherwise, will yield each peptide in turn

# File lib/ms/error_rate/sbv.rb, line 13
def self.generate_hashes(pep_to_prot_file, opts={})
  op = { :aaseq_ext => AASEQ_EXT,
    :length_ext => LENGTH_EXT,
    :file_ext => '.yml',
    :type_code => '',
    :protein_hash => nil, 
    :stderr_counter => true,
  }.merge(opts)

  base = pep_to_prot_file.chomp(File.extname(pep_to_prot_file)) 
  freqs = Hash.new {|h,k| h[k] = 0.0 }
  counts = Hash.new {|h,k| h[k] = 0 }
  (fileout1, fileout2) = [:aaseq_ext, :length_ext].map do |type_ext|
    base + '.' + op[:type_code] + '.' + op[type_ext] + op[:file_ext]
  end
  protein_hash = op[:protein_hash]
  pep_count = 0
  if op[:stderr_counter]
    $stderr.print "[working, 100,000 peptides = '.'] "
    $stderr.flush
  end
  File.open(fileout1 , 'w') do |out|
    IO.foreach(pep_to_prot_file) do |line|
      (pep, prot_string) = line.chomp!.split(': ')

      total_transmembrane = 0 
      total_known = 0
      answ = 
        if protein_hash
          yield( protein_hash.values_at(*(prot_string.split('-'))) )
        else
          yield(pep)
        end
      out.puts "#{pep}: #{answ}"
      freqs[pep.size] += answ
      counts[pep.size] += 1
      pep_count += 1
      if pep_count % 100000 == 0 && op[:stderr_counter]
        $stderr.print '.'
        $stderr.flush
      end
    end
  end
  $stderr.puts "DONE!" if op[:stderr_counter]
  avg_freq_ar = {}
  freqs.each do |k,v|
    avg_freq_ar[k] = v / counts[k]
  end
  File.open(fileout2, 'w') {|out| out.print avg_freq_ar.to_yaml }
  [fileout1, fileout2] 
end
new(indicator_by_aaseq_hash, size_to_freq, frequency_indicator_opposite, indicators_signify_true_hit=true) click to toggle source
# File lib/ms/error_rate/sbv.rb, line 78
def initialize(indicator_by_aaseq_hash, size_to_freq, frequency_indicator_opposite, indicators_signify_true_hit=true)
  @indicators_signify_true_hit = indicators_signify_true_hit
  @frequency_indicator_opposite = frequency_indicator_opposite
  @indicator_by_aaseq = indicator_by_aaseq_hash
  @tot_num_indicators = 0.0
  @tot_num = 0
end

Public Instance Methods

calculate_background_frequency() click to toggle source
# File lib/ms/error_rate/sbv.rb, line 104
def calculate_background_frequency
  @aaseq_to_fraction
end
update_precision(aaseq) click to toggle source

returns the cumulative precision (fraction of true positives among total hits) frequency_of_indicators is the probability that a generic amino acid sequence will be an indicator (this may variable by sequence length).

# File lib/ms/error_rate/sbv.rb, line 90
def update_precision(aaseq)
  @tot_num_indicators << indicator_by_aaseq[aaseq]
  @tot_num += 1
  @frequency_of_indicators_sum += @size_to_freq[aaseq.size]
  # FP Indicator
  value = @tot_num_indicators * (1.0 - @frequency_indicator_opposite) * @frequency_of_indicators_sum / (@tot_num**2)
  precision = 
    if @indicators_signify_true_hit 
      value  # a true indicator type (gives precision)
    else  # false indicator type
      1 - value  # 1 - fdr == precision
    end
end