class MzID::StreamingParser

Class for parsing an mzIdentML file in a streaming (i.e., memory-efficient) manner.

Public Class Methods

new(file, use_pbar = nil) click to toggle source
Calls superclass method MzID::BatchParser::new
# File lib/mzid/streaming_parser.rb, line 12
# Build a streaming parser for an mzIdentML file.
#
# file     - passed through unchanged to the superclass constructor
# use_pbar - remembered in @use_pbar and used as the default progress-bar
#            flag of cache_ids and each_psm (nil by default)
def initialize(file, use_pbar = nil)
  @use_pbar = use_pbar
  # running count of SpectrumIdentificationResult nodes; get_num_elements
  # increments it and each_psm uses it as the progress-bar total
  @num_spec = 0
  # NOTE(review): the instance state above is assigned before super runs --
  # presumably the superclass constructor calls methods that read it;
  # confirm before reordering these statements
  super(file)
end

Public Instance Methods

cache_ids(use_pbar = @use_pbar) click to toggle source

Store peptide sequences in a hash for lookup.

# File lib/mzid/streaming_parser.rb, line 37
# Stream through the mzIdentML file once and cache what later PSM lookups need:
#   * Peptide         -> @pep_h[id] (via get_peptide_sequence) and
#                        @mod_h[id] (via get_modifications)
#   * DBSequence      -> handed to cache_db_seq_entries
#   * PeptideEvidence -> handed to cache_pep_ev
#
# use_pbar - when truthy, display a "Caching" progress bar whose total is the
#            sum of the three element counts reported by get_num_elements
#
# The file handle is opened in block form so it is closed even if parsing
# raises part-way through.
def cache_ids(use_pbar = @use_pbar)
  # counting pass; as a side effect get_num_elements also updates @num_spec
  num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

  @pep_h = Hash.new
  @mod_h = Hash.new
  pbar = ProgressBar.new("Caching", num_pep+num_db_seq+num_pep_ev) if use_pbar
  cached_names = ["Peptide", "DBSequence", "PeptideEvidence"]

  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each do |node|
      next unless cached_names.include?(node.name)

      # re-parse just this element as a small standalone document
      fragment = Nokogiri::XML.parse(node.outer_xml)
      fragment.remove_namespaces!
      root = fragment.root

      case node.name
      when "Peptide"
        pep_id = root["id"]
        # skip (and do not tick the progress bar) if this id was already cached
        next if @pep_h.has_key?(pep_id)
        @pep_h[pep_id] = get_peptide_sequence(root)
        @mod_h[pep_id] = get_modifications(root)
      when "DBSequence"
        cache_db_seq_entries(root)
      when "PeptideEvidence"
        cache_pep_ev(root)
      end
      pbar.inc if use_pbar
    end
  end
  pbar.finish if use_pbar
end
cache_pep_ev(root) click to toggle source

Store peptide evidence entries in a hash for lookup.

# File lib/mzid/streaming_parser.rb, line 103
# Register each PeptideEvidence element of the parsed fragment in @pep_ev_h,
# keyed by the element's "id" attribute.
#
# root - node answering xpath; every match supplies the "id",
#        "dBSequence_ref", "start" and "end" attributes
#
# The protein id is looked up in @db_seq_h through the dBSequence_ref
# attribute and converted to a symbol.
def cache_pep_ev(root)
  root.xpath('//PeptideEvidence').each do |ev_node|
    db_ref = ev_node["dBSequence_ref"]
    evidence = PeptideEvidence.new(:db_seq_ref => db_ref,
                                   :start_pos => ev_node["start"].to_i,
                                   :end_pos => ev_node["end"].to_i,
                                   :prot_id => @db_seq_h[db_ref].to_sym)
    @pep_ev_h[ev_node["id"]] = evidence
  end
end
each_psm(use_pbar=@use_pbar) { |psm| ... } click to toggle source

Iterate through each PSM.

# File lib/mzid/streaming_parser.rb, line 122
# Stream through the mzIdentML file and yield one PSM per
# SpectrumIdentificationItem found inside each SpectrumIdentificationResult.
#
# use_pbar - when truthy, display a "PSMs" progress bar sized by @num_spec
#            and ticked once per matching SpectrumIdentificationResult node
#
# Yields the object built by get_psm for each item.
# Returns an Enumerator over the same PSMs when called without a block.
#
# The file handle is opened in block form so it is closed when iteration
# finishes, raises, or is left early with break.
def each_psm(use_pbar=@use_pbar)
  return enum_for(:each_psm, use_pbar) unless block_given?

  pbar = ProgressBar.new("PSMs", @num_spec) if use_pbar
  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each do |node|
      next if node.name != "SpectrumIdentificationResult"
      # re-parse just this spectrum result as a small standalone document
      fragment = Nokogiri::XML.parse(node.outer_xml)
      fragment.remove_namespaces!
      fragment.root.xpath('.//SpectrumIdentificationItem').each do |psm_node|
        yield get_psm(psm_node)
      end
      pbar.inc if use_pbar
    end
  end
  pbar.finish if use_pbar
end

Private Instance Methods

get_num_elements(use_pbar=@use_pbar) click to toggle source

First pass through the file, just counting element types.

# File lib/mzid/streaming_parser.rb, line 20
# Counting pass over the mzIdentML file.
#
# use_pbar - currently unused; kept so existing callers keep working
#
# Returns [num_pep, num_db_seq, num_pep_ev]: how many reader nodes were named
# "Peptide", "DBSequence" and "PeptideEvidence" respectively.
# Side effect: sets @num_spec to the number of nodes named
# "SpectrumIdentificationResult". It is assigned rather than accumulated, so
# calling this method more than once no longer inflates the total.
#
# NOTE(review): the reader may also yield closing-tag nodes carrying the same
# name, in which case elements with children are counted twice -- confirm
# against the progress-bar totals before adding a node_type filter.
def get_num_elements(use_pbar=@use_pbar)
  counts = Hash.new(0)
  # block form closes the file handle once the pass is done (or fails)
  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each { |node| counts[node.name] += 1 }
  end
  @num_spec = counts["SpectrumIdentificationResult"]
  [counts["Peptide"], counts["DBSequence"], counts["PeptideEvidence"]]
end
get_psm(psm_node) click to toggle source

Given an XML node of a PSM, return the PSM.

# File lib/mzid/streaming_parser.rb, line 146
# Build a PSM object from one SpectrumIdentificationItem node.
#
# psm_node - node answering xpath and attribute lookup with []
#
# Fields of the returned PSM:
#   :spec_num  - second "_"-separated field of the node's "id", as an integer
#   :spec_ref  - last "_"-separated field of the node's "id", as an integer
#   :pep       - @pep_h entry for the node's "peptide_ref"
#   :spec_prob - "value" of the first cvParam named "MS-GF:SpecEValue", as a float
#   :mods      - @mod_h entry for "peptide_ref", or nil when there is none
#   :pep_ev    - @pep_ev_h entries for every PeptideEvidenceRef child
def get_psm(psm_node)
  # resolve each evidence reference to its cached entry
  evidence = psm_node.xpath('.//PeptideEvidenceRef').map do |ref_node|
    @pep_ev_h[ref_node["peptideEvidence_ref"]]
  end

  # spectral probability comes from the first matching cvParam
  e_value_param = psm_node.xpath('.//cvParam').find { |cv| cv['name'] == "MS-GF:SpecEValue" }
  raw_prob = e_value_param['value']

  pep_ref = psm_node['peptide_ref']
  id_fields = psm_node['id'].split("_")
  mods = @mod_h[pep_ref] if @mod_h.has_key?(pep_ref)

  PSM.new(:spec_num => id_fields[1].to_i,
          :spec_ref => id_fields[-1].to_i,
          :pep => @pep_h[pep_ref],
          :spec_prob => raw_prob.to_f,
          :mods => mods,
          :pep_ev => evidence)
end