class MzID::BatchParser

class to parse an mzIdentML file

Public Class Methods

new(file) click to toggle source
Calls superclass method MzID::BaseParser::new
# File lib/mzid/batch_parser.rb, line 12
def initialize(file)
  super(file)
  @pep_ev_h = Hash.new
  @db_seq_h = Hash.new
  cache_ids
end

Public Instance Methods

cache_db_seq_entries(root) click to toggle source

store database sequence entries (ids)

# File lib/mzid/batch_parser.rb, line 66
def cache_db_seq_entries(root)
  dbseq_lst = root.xpath('//DBSequence')
  dbseq_lst.each do |dnode|
    id = dnode["id"]
    acc_id = dnode["accession"]
    @db_seq_h[id] = acc_id
  end
end
cache_pep_ev(root) click to toggle source

store peptide evidence sequences in hash for lookup

# File lib/mzid/batch_parser.rb, line 47
def cache_pep_ev(root)
  pep_ev_lst = root.xpath('//PeptideEvidence')
  pep_ev_lst.each do |pnode|
    id = pnode["id"]
    
    @pep_ev_h[id] = 
      PeptideEvidence.new(:id => pnode["id"],
                          :db_seq_ref => pnode["dBSequence_ref"],
                          :pep_id => pnode["peptide_ref"],
                          :start_pos => pnode["start"].to_i,
                          :end_pos => pnode["end"].to_i,
                          :pre => pnode["pre"],
                          :post => pnode["post"],
                          :prot_id => @db_seq_h[pnode["dBSequence_ref"]].to_sym)
  end
end
each_psm(use_pbar=nil) { |psm| ... } click to toggle source

iterate through each psm

# File lib/mzid/batch_parser.rb, line 77
def each_psm(use_pbar=nil)
  hit_values = File.open(@mzid_file) do |io|
    doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
    doc.remove_namespaces!
    root = doc.root
    # get list of identifications
    spec_results = root.xpath('//SpectrumIdentificationResult')
    pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
    spec_results.each do |sres|
      #
      psms_of_spec = sres.xpath('.//SpectrumIdentificationItem')
      # go over each PSM from the spectra
      psms_of_spec.each do |psm_node|
        # get peptide evidence list
        pep_ev_raw_lst = psm_node.xpath('.//PeptideEvidenceRef')
        pep_ev_lst = pep_ev_raw_lst.map do |penode|
          pep_ev_ref_id = penode["peptideEvidence_ref"]
          @pep_ev_h[pep_ev_ref_id]
        end 
        # get cvparams
        cvlst = psm_node.xpath('.//cvParam')
        # find spectral prob
        tmp_lst = cvlst.select{|v| v['name'] == "MS-GF:SpecEValue"}
        spec_prob = tmp_lst[0]['value']
        # get peptide
        pep_seq = @pep_h[psm_node['peptide_ref']]
        # get spectrum id/ref number
        spec_id = psm_node['id']
        spec_num = spec_id.split("_")[1].to_i
        spec_ref = spec_id.split("_")[-1].to_i
        # store in object
        psm = PSM.new(:spec_num => spec_num, 
                      :spec_ref => spec_ref, 
                      :pep => pep_seq, 
                      :spec_prob => spec_prob.to_f,
                      :mods => (@mod_h.has_key?(psm_node['peptide_ref']) ? @mod_h[psm_node['peptide_ref']] : nil),
                      :pep_ev => pep_ev_lst
                      )
        # yield psm object
        yield psm
      end
      pbar.inc if use_pbar
    end
    pbar.finish if use_pbar
  end
end
each_spectrum(use_pbar=nil) { |spec_lst| ... } click to toggle source

for each spectrum, return a list of PSM objects for that spectrum

# File lib/mzid/batch_parser.rb, line 126
def each_spectrum(use_pbar=nil)
  spec_lst = []
  self.each_psm(use_pbar) do |psm|
    if spec_lst.empty? then
      spec_lst.push(psm) 
    else
      if spec_lst[-1].get_spec_num == psm.get_spec_num then
        spec_lst.push(psm)
      else # found new spec num, yield psm list
        yield spec_lst
        spec_lst = [psm] # add new to list
      end
    end
  end
  yield spec_lst
end

Private Instance Methods

cache_ids() click to toggle source

store peptide sequences in hash for lookup

# File lib/mzid/batch_parser.rb, line 21
def cache_ids()
  hit_values = File.open(@mzid_file) do |io|
    doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS | Nokogiri::XML::ParseOptions::STRICT)
    doc.remove_namespaces!
    root = doc.root
    
    cache_db_seq_entries(root)
    cache_pep_ev(root)
    
    peptide_lst = root.xpath('//Peptide')
    @pep_h = Hash.new
    @mod_h = Hash.new
    peptide_lst.each do |pnode|
      
      pep_id = pnode['id']
      pep_seq = get_peptide_sequence(pnode)
      mod_line = get_modifications(pnode)
      @pep_h[pep_id] = pep_seq 
      @mod_h[pep_id] = mod_line 
    end
    
  end
end