class MzID::FilteredStreamingParser
Class that parses an mzIdentML file in a streaming (i.e., memory-efficient) manner. It performs multi-pass filtering so that only the smallest possible data structures are kept in memory: 1) first collect counts of elements; 2) get the list of peptide evidence from PSMs that pass the filter; 3) (the third step is left blank in the original documentation).
Attributes
pep_ev_h_dbseqRef[RW]
Public Class Methods
new(file, sp_thresh = 10.0**-10, use_pbar = nil)
click to toggle source
Calls superclass method
# Constructor. Sets @num_spec to 0, creates empty Hash tables for the
# peptide-evidence lookups (including @pep_ev_h_dbseqRef), then forwards
# +file+ and +use_pbar+ to the superclass initializer via super(file, use_pbar).
#
# file::      passed straight through to the superclass.
# sp_thresh:: defaults to 10.0**-10.
#             NOTE(review): not referenced anywhere in the visible body and not
#             forwarded to super -- confirm whether it is meant to be stored.
# use_pbar::  defaults to nil; passed straight through to the superclass.
#
# NOTE(review): this listing is flattened onto one line, so it cannot be told
# from here whether the @pep_ev_h_protID assignment that follows the lone "#"
# is live code or commented out -- verify against the repository file.
# File lib/mzid/filtered_streaming_parser.rb, line 16 def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil) @num_spec = 0 # @pep_ev_h_protID = Hash.new @pep_ev_h_startPos = Hash.new @pep_ev_h_endPos = Hash.new @pep_ev_h_dbseqRef = Hash.new super(file, use_pbar) end
Public Instance Methods
cache_db_seq_entries(root)
click to toggle source
store database sequence entries (ids)
# File lib/mzid/filtered_streaming_parser.rb, line 141
#
# Records every DBSequence element found via root.xpath('//DBSequence') in
# @db_seq_h: the element's "id" attribute (as a Symbol) maps to its
# "accession" attribute (as a Symbol).
#
# root:: object answering xpath(query) with a collection of nodes that
#        support ["attribute"] lookup.
#
# Returns whatever +each+ returns for the xpath result. Raises NoMethodError
# if a node has no "id" or no "accession" attribute (nil has no to_sym).
def cache_db_seq_entries(root)
  root.xpath('//DBSequence').each do |entry|
    seq_key = entry["id"].to_sym
    @db_seq_h[seq_key] = entry["accession"].to_sym
  end
end
cache_ids(use_pbar = @use_pbar)
click to toggle source
store peptide sequences in hash for lookup
# File lib/mzid/filtered_streaming_parser.rb, line 43
#
# Fills the lookup tables by reading @mzid_file three times:
#   1. a Nokogiri::XML::Reader pass over "Peptide" nodes, filling @pep_h
#      (sequence) and @mod_h (modifications), both keyed by the Peptide "id"
#      attribute converted to a Symbol;
#   2. a line scan for DBSequence lines, filling @db_seq_h
#      (id Symbol => accession Symbol);
#   3. a line scan for PeptideEvidence lines, filling @pep_ev_h_dbseqRef
#      (id Symbol => dBSequence_ref Symbol).
# Element counts and the final table sizes are printed to stdout.
#
# use_pbar:: when truthy, a ProgressBar is shown for each pass.
#
# @pep_h and @mod_h are (re)created here; @db_seq_h and @pep_ev_h_dbseqRef are
# only written to, so they must already exist.
#
# Passes 2 and 3 are regex scans over raw lines: they only see elements whose
# opening tag is preceded on its line by whitespace alone, and they raise
# NoMethodError when such a line lacks the expected attribute.
def cache_ids(use_pbar = @use_pbar)
  num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

  puts "SPEC:\t#{@num_spec}"
  puts "PEP:\t#{num_pep}"
  puts "DB:\t#{num_db_seq}"
  puts "PEPEV:\t#{num_pep_ev}"

  @pep_h = Hash.new
  @mod_h = Hash.new

  # --- pass 1: peptides ---
  # NOTE(review): the bar total is num_pep/2 -- presumably get_num_elements
  # counts each Peptide node twice; confirm against that method.
  pbar1 = ProgressBar.new("peptides", num_pep/2) if use_pbar
  # Block form of File.open so the handle is closed when the pass ends
  # (previously the File given to the Reader was never closed).
  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each do |node|
      next unless node.name == "Peptide"

      # parse the local peptide entry on its own
      tmp_node = Nokogiri::XML.parse(node.outer_xml)
      tmp_node.remove_namespaces!
      root = tmp_node.root
      pep_id = root["id"].to_sym
      # skip if this peptide id was already handled
      next if @pep_h.has_key?(pep_id)

      @pep_h[pep_id] = get_peptide_sequence(root)
      @mod_h[pep_id] = get_modifications(root)
      pbar1.inc if use_pbar
    end
  end
  pbar1.finish if use_pbar

  # --- pass 2: database sequences ---
  pbar2 = ProgressBar.new("db_seq", num_db_seq) if use_pbar
  IO.foreach(@mzid_file) do |line|
    next if !line.match(/^\s+<DBSequence\s/)
    prot_id = line.match(/accession=\"([\w|\|]+)/)[1]
    db_id = line.match(/id=\"(\w+)/)[1]
    @db_seq_h[db_id.to_sym] = prot_id.to_sym
    pbar2.inc if use_pbar
  end
  pbar2.finish if use_pbar

  # --- pass 3: peptide evidence ---
  pbar3 = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
  IO.foreach(@mzid_file) do |line|
    next if !line.match(/^\s+<PeptideEvidence\s/)
    db_id = line.match(/dBSequence_ref=\"(\w+)/)[1]
    pep_ev = line.match(/id=\"(\w+)/)[1]
    @pep_ev_h_dbseqRef[pep_ev.to_sym] = db_id.to_sym
    pbar3.inc if use_pbar
  end
  pbar3.finish if use_pbar

  puts "PEP_H SIZE:\t#{@pep_h.size}"
  puts "DBSEQ_H SIZE:\t#{@db_seq_h.size}"
  puts "PEP_EV_H SIZE:\t#{@pep_ev_h_dbseqRef.size}"
end
cache_ids2(use_pbar = @use_pbar)
click to toggle source
# File lib/mzid/filtered_streaming_parser.rb, line 27
#
# Placeholder with an empty body: accepts the progress-bar flag (defaulting to
# @use_pbar), does nothing, and returns nil.
def cache_ids2(use_pbar = @use_pbar); end
cache_pep_ev(root)
click to toggle source
store peptide evidence sequences in hash for lookup
# File lib/mzid/filtered_streaming_parser.rb, line 152
#
# Records every PeptideEvidence element found via
# root.xpath('//PeptideEvidence') in @pep_ev_h_dbseqRef: the element's "id"
# attribute (as a Symbol) maps to its "dBSequence_ref" attribute (as a Symbol).
#
# root:: object answering xpath(query) with a collection of nodes that
#        support ["attribute"] lookup.
#
# Returns whatever +each+ returns for the xpath result. Raises NoMethodError
# if a node has no "id" or no "dBSequence_ref" attribute (nil has no to_sym).
def cache_pep_ev(root)
  root.xpath('//PeptideEvidence').each do |evidence|
    ev_key = evidence["id"].to_sym
    @pep_ev_h_dbseqRef[ev_key] = evidence["dBSequence_ref"].to_sym
  end
end
each_psm(use_pbar=@use_pbar) { |psm| ... }
click to toggle source
iterate through each psm
# File lib/mzid/filtered_streaming_parser.rb, line 175
#
# Yields one PSM (built by get_psm) for every SpectrumIdentificationItem found
# under every SpectrumIdentificationResult in @mzid_file.
#
# use_pbar:: when truthy, a ProgressBar that advances once per
#            SpectrumIdentificationResult is shown.
#
# Without a block an Enumerator is returned instead of parsing the file and
# then failing with LocalJumpError at the first yield.
#
# Note: this method parses the whole document with Nokogiri::XML.parse (strict,
# blanks dropped) rather than using the streaming Reader.
def each_psm(use_pbar=@use_pbar)
  return enum_for(:each_psm, use_pbar) unless block_given?

  parse_opts = Nokogiri::XML::ParseOptions::DEFAULT_XML |
               Nokogiri::XML::ParseOptions::NOBLANKS |
               Nokogiri::XML::ParseOptions::STRICT

  File.open(@mzid_file) do |io|
    doc = Nokogiri::XML.parse(io, nil, nil, parse_opts)
    doc.remove_namespaces!

    # one result per spectrum; each may carry several PSMs
    spec_results = doc.root.xpath('//SpectrumIdentificationResult')
    pbar = ProgressBar.new("PSMs", spec_results.size) if use_pbar
    spec_results.each do |sres|
      sres.xpath('.//SpectrumIdentificationItem').each do |psm_node|
        yield get_psm(psm_node)
      end
      pbar.inc if use_pbar
    end
    pbar.finish if use_pbar
  end
end
get_prot_id(pep_ev_id)
click to toggle source
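Returns the entry of the database-sequence table that a peptide-evidence id refers to. (The text originally shown here was leftover commented-out code rather than a description: def get_pep_ev_protID(pid) @pep_ev_h_protID end)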
# File lib/mzid/filtered_streaming_parser.rb, line 32
#
# Resolves a peptide-evidence id to the value stored for it in @db_seq_h, going
# through @pep_ev_h_dbseqRef (peptide-evidence id => database-sequence id).
#
# pep_ev_id:: Symbol id, or a String id (e.g. a raw XML attribute value). Both
#             tables are keyed by Symbol, so Strings are converted first;
#             previously a String id silently produced nil.
#
# Returns the @db_seq_h entry, or nil when either lookup misses.
def get_prot_id(pep_ev_id)
  key = pep_ev_id.is_a?(String) ? pep_ev_id.to_sym : pep_ev_id
  @db_seq_h[@pep_ev_h_dbseqRef[key]]
end
get_psm(psm_node)
click to toggle source
Given an XML node of a PSM, return the PSM.
# File lib/mzid/filtered_streaming_parser.rb, line 200
#
# Builds a PSM from one PSM XML node.
#
# psm_node:: node answering xpath(query) and ["attribute"]; it must carry
#            "peptide_ref" and "id" attributes and a cvParam child named
#            "MS-GF:SpecEValue".
#
# The PSM receives:
#   :spec_num / :spec_ref - integers taken from the 2nd and the last
#                           "_"-separated fields of the node's "id";
#   :pep                  - @pep_h entry for the referenced peptide;
#   :spec_prob            - the SpecEValue as a Float;
#   :mods                 - @mod_h entry for the referenced peptide, or nil;
#   :pep_ev               - peptideEvidence_ref values as Symbols.
#
# Raises NoMethodError when the SpecEValue cvParam or a required attribute is
# missing.
def get_psm(psm_node)
  # peptide evidence references carried by this PSM
  pep_ev_lst = psm_node.xpath('.//PeptideEvidenceRef').map do |penode|
    penode["peptideEvidence_ref"].to_sym
  end

  # spectral probability lives in the cvParam named MS-GF:SpecEValue
  spec_e_value = psm_node.xpath('.//cvParam').select { |cv| cv['name'] == "MS-GF:SpecEValue" }.first
  spec_prob = spec_e_value['value']

  # @pep_h and @mod_h are both keyed by the peptide id as a Symbol. The mods
  # lookup used to be done with the raw String, so it never matched and :mods
  # was always nil.
  pep_key = psm_node['peptide_ref'].to_sym

  id_fields = psm_node['id'].split("_")

  PSM.new(:spec_num => id_fields[1].to_i,
          :spec_ref => id_fields[-1].to_i,
          :pep => @pep_h[pep_key],
          :spec_prob => spec_prob.to_f,
          :mods => @mod_h[pep_key],
          :pep_ev => pep_ev_lst)
end
write_to_file(outfile, use_pbar=@use_pbar)
click to toggle source
load PSMs into memory, and go back to perform lookup for prot ids
# File lib/mzid/filtered_streaming_parser.rb, line 227
#
# Streams @mzid_file with Nokogiri::XML::Reader and, for every PeptideEvidence
# node, extracts its id, start/end positions and dBSequence_ref.
#
# outfile::  NOTE(review): not used -- the visible body never writes anything,
#            and the extracted values are discarded. The method looks
#            unfinished; the output step still has to be implemented.
# use_pbar:: when truthy, a ProgressBar is shown.
#
# Fixes relative to the previous listing:
# * the progress-bar total referenced the undefined local +num_db_seq+
#   (NameError whenever use_pbar was truthy). It now uses the number of cached
#   peptide-evidence entries.
#   NOTE(review): this assumes cache_ids has already filled
#   @pep_ev_h_dbseqRef; confirm this is the intended total.
# * a stray trailing comma made +start_pos+ an Array holding both positions;
# * the file handle given to the Reader is now closed via the block form of
#   File.open;
# * the unused timestamp local was removed.
def write_to_file(outfile, use_pbar=@use_pbar)
  pbar3 = ProgressBar.new("Caching pep_ev", @pep_ev_h_dbseqRef.size) if use_pbar

  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each do |node|
      next unless node.name == "PeptideEvidence"

      # parse the local PeptideEvidence entry on its own
      tmp_node = Nokogiri::XML.parse(node.outer_xml)
      tmp_node.remove_namespaces!
      tmp_node.root.xpath('//PeptideEvidence').each do |pnode|
        # placeholders: extracted but not yet consumed (see note above)
        id = pnode["id"]
        start_pos = pnode["start"].to_i
        end_pos = pnode["end"].to_i
        db_seq_ref = pnode["dBSequence_ref"].to_sym
      end
      pbar3.inc if use_pbar
    end
  end

  pbar3.finish if use_pbar
end