class MzID::StreamingParserLines
Class to parse an mzIdentML file in a streaming (i.e., memory-efficient) manner, not using any XML parsing library and only exploiting the structure of mzIdentML files.
Public Class Methods
new(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true)
click to toggle source
Calls superclass method
MzID::StreamingParser::new
# File lib/mzid/streaming_parser_lines.rb, line 14
#
# Prepare the per-instance state of the parser and then delegate to the
# superclass constructor.
#
# file      - passed through to the superclass constructor
# sp_thresh - accepted for interface compatibility; not read by this method
# use_pbar  - passed through to the superclass constructor
# tda_flag  - stored in @tda_flag
def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true)
  @tda_flag = tda_flag
  @num_spec = 0
  # NOTE(review): in the flattened listing these four assignments follow a
  # bare "#", so they may have been commented out originally -- confirm
  # against the real file. They are kept live here.
  @pep_ev_h_protID = {}
  @pep_ev_h_startPos = {}
  @pep_ev_h_endPos = {}
  @pep_ev_h_dbseqRef = {}
  # state above is set before the superclass constructor runs
  super(file, use_pbar)
end
Public Instance Methods
cache_ids(use_pbar = @use_pbar)
click to toggle source
store peptide sequences in hash for lookup
# File lib/mzid/streaming_parser_lines.rb, line 44
#
# Fill the lookup tables used when PSMs are reported:
#
#   @pep_h     peptide id (Symbol)          -> result of get_peptide_sequence
#   @mod_h     peptide id (Symbol)          -> result of get_modifications
#   @db_seq_h  DBSequence id (Symbol)       -> accession (Symbol)
#   @pep_ev_h  PeptideEvidence id (Symbol)  -> PeptideEvidence
#
# Peptide elements are read through Nokogiri's streaming reader; DBSequence
# and PeptideEvidence elements are picked up by scanning the file line by
# line, which relies on each such element starting on its own line.
#
# use_pbar - when truthy, a ProgressBar is shown for each of the three passes
#
# Raises NoMethodError (from nil) when a DBSequence line lacks accession/id
# or a PeptideEvidence line lacks dBSequence_ref/id.
def cache_ids(use_pbar = @use_pbar)
  num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

  @pep_h = {}
  @mod_h = {}

  # first capture group of +re+ in +text+, or nil when there is no match
  capture = lambda do |text, re|
    found = re.match(text)
    found && found[1]
  end

  # --- pass 1: Peptide items
  pbar1 = ProgressBar.new("peptides", num_pep/2) if use_pbar
  # block form so the file handle is closed once the reader is exhausted
  # (or an exception is raised) instead of being leaked
  File.open(@mzid_file) do |xml_io|
    Nokogiri::XML::Reader(xml_io).each do |node|
      next unless node.name == "Peptide"
      # parse local peptide entry
      tmp_node = Nokogiri::XML.parse(node.outer_xml)
      tmp_node.remove_namespaces!
      root = tmp_node.root
      pep_id = root["id"].to_sym
      # skip if this peptide id was already handled
      next if @pep_h.has_key?(pep_id)
      pep_seq = get_peptide_sequence(root)
      mod_line = get_modifications(root)
      @pep_h[pep_id] = pep_seq
      @mod_h[pep_id] = mod_line
      pbar1.inc if use_pbar
    end
  end
  pbar1.finish if use_pbar

  # --- pass 2: DBSequence items
  dbseq_re = /^\s*<DBSequence\s/
  pbar2 = ProgressBar.new("db_seq", num_db_seq) if use_pbar
  File.foreach(@mzid_file) do |line|
    next unless dbseq_re.match(line)
    # NOTE(review): the accession pattern stops at the first character that
    # is neither a word character nor "|" -- confirm that is intended.
    prot_id = line.match(/accession=\"([\w|\|]+)/)[1]
    db_id = line.match(/id=\"(\w+)/)[1]
    @db_seq_h[db_id.to_sym] = prot_id.to_sym
    pbar2.inc if use_pbar
  end
  pbar2.finish if use_pbar

  # --- pass 3: PeptideEvidence items
  pepev_re = /^\s*<PeptideEvidence\s/
  pbar3 = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
  File.foreach(@mzid_file) do |line|
    next unless pepev_re.match(line)
    db_key = line.match(/dBSequence_ref=\"(\w+)/)[1].to_sym
    ev_key = line.match(/id=\"(\w+)/)[1].to_sym
    # start/end/isDecoy may be absent on a PeptideEvidence element; a missing
    # position is stored as nil and a missing isDecoy as "false" rather than
    # aborting the whole run on nil[1]
    start_txt = capture.call(line, /start=\"(\d+)/)
    end_txt = capture.call(line, /end=\"(\d+)/)
    start_pos = start_txt && start_txt.to_i
    end_pos = end_txt && end_txt.to_i
    is_decoy = capture.call(line, /isDecoy=\"(\w+)\"/) || "false"
    # NOTE(review): legacy lookup table; only written when it exists, since
    # the flattened listing does not show whether this write was live.
    @pep_ev_h_dbseqRef[ev_key] = db_key if @pep_ev_h_dbseqRef
    @pep_ev_h[ev_key] = PeptideEvidence.new(:db_seq_ref => db_key,
                                            :start_pos => start_pos,
                                            :end_pos => end_pos,
                                            :is_decoy => is_decoy)
    pbar3.inc if use_pbar
  end
  pbar3.finish if use_pbar
end
each_psm(use_pbar=@use_pbar) { |curr_psm| ... }
click to toggle source
Iterate through each PSM, identifying them by parsing the file one line at a time, which is faster than using an XML parser.
# File lib/mzid/streaming_parser_lines.rb, line 107
#
# Yield one PSM per SpectrumIdentificationItem, reading the mzIdentML file
# one line at a time instead of going through an XML parser. This relies on
# the opening tag, each PeptideEvidenceRef, the MS-GF:SpecEValue cvParam and
# the closing tag each sitting on their own line.
#
# use_pbar - when truthy, show a ProgressBar that advances once per line
#
# Yields the PSM built for each item once its closing tag is reached.
# Returns an Enumerator over the PSMs when called without a block.
def each_psm(use_pbar=@use_pbar)
  return enum_for(:each_psm, use_pbar) unless block_given?

  specid_item_re = /^\s+<SpectrumIdentificationItem\s/
  pepevref_re = /^\s+<PeptideEvidenceRef\s/
  # trailing \s* so the pattern also matches lines terminated by CRLF
  specprob_re = /name=\"MS-GF:SpecEValue\"\/>\s*$/
  specid_item_end_re = /^\s+<\/SpectrumIdentificationItem>\s*$/

  if use_pbar
    # count the lines in-process: shelling out to `wc -l` breaks on paths
    # containing spaces or shell metacharacters and on systems without wc
    num_lines = File.foreach(@mzid_file).count
    pbar = ProgressBar.new("PSMs", num_lines)
  end

  curr_psm = nil
  File.foreach(@mzid_file) do |line|
    pbar.inc if use_pbar
    # lines not pertaining to a spectrum ID item match none of the branches
    if line =~ specid_item_re
      # beginning of spectrum ID item
      spec_id_id = line.match(/id=\"(\w+)/)[1]
      # the number between the first and second "_" of the item id
      spec_num = spec_id_id.split("_")[1].to_i
      pep_key = line.match(/peptide_ref=\"(\w+)/)[1].to_sym
      curr_psm = PSM.new(:spec_num => spec_num,
                         :pep => @pep_h[pep_key],
                         :mods => @mod_h[pep_key])
    elsif line =~ pepevref_re
      pep_ev = line.match(/peptideEvidence_ref=\"(\w+)/)[1]
      curr_psm.add_pep_ev(pep_ev.to_sym) if curr_psm
    elsif line =~ specprob_re
      sprob = line.match(/value=\"([\d|\w|\.|-]+)\"/)[1]
      curr_psm.set_spec_prob(sprob.to_f) if curr_psm
    elsif line =~ specid_item_end_re
      yield curr_psm
      curr_psm = nil # drop the finished PSM
    end
  end
  pbar.finish if use_pbar
end
get_is_decoy(pep_ev_id)
click to toggle source
# File lib/mzid/streaming_parser_lines.rb, line 38
#
# Fetch the entry stored under +pep_ev_id+ in @pep_ev_h and return the
# value of its get_is_decoy reader.
def get_is_decoy(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_is_decoy
end
get_pep_end(pep_ev_id)
click to toggle source
# File lib/mzid/streaming_parser_lines.rb, line 37
#
# Fetch the entry stored under +pep_ev_id+ in @pep_ev_h and return the
# value of its get_end_pos reader.
def get_pep_end(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_end_pos
end
get_pep_start(pep_ev_id)
click to toggle source
# File lib/mzid/streaming_parser_lines.rb, line 36
#
# Fetch the entry stored under +pep_ev_id+ in @pep_ev_h and return the
# value of its get_start_pos reader.
def get_pep_start(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_start_pos
end
get_prot_id(pep_ev_id)
click to toggle source
get a protein ID from a PeptideEvidenceID
# File lib/mzid/streaming_parser_lines.rb, line 27
#
# Resolve a PeptideEvidence id to a protein id: take the DBSequence
# reference of the entry stored under +pep_ev_id+ in @pep_ev_h, then return
# whatever @db_seq_h holds for that reference.
def get_prot_id(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  @db_seq_h[evidence.get_db_seq_ref]
end
write_to_csv(outfile="result.csv", use_pbar=@use_pbar)
click to toggle source
Load each PSM in turn and go back to look up its protein IDs, writing one tab-separated row per PeptideEvidence to the output file.
# File lib/mzid/streaming_parser_lines.rb, line 147
#
# Write every PSM to a tab-separated file. A PSM produces one row per
# PeptideEvidence reference it carries; each row holds the spectrum number,
# peptide, spec prob, decoy flag (only when @tda_flag is set), protein id,
# start, end, and the number of PeptideEvidence references of that PSM.
#
# outfile  - path of the file to (over)write
# use_pbar - forwarded to each_psm
def write_to_csv(outfile="result.csv", use_pbar=@use_pbar)
  header = ["#spec_num", "peptide", "spec_prob", "decoy", "prot_ids", "start", "end", "num_prot"]
  header.delete("decoy") if !@tda_flag

  # options are passed as keywords: a positional options hash is rejected by
  # CSV.open on Ruby 3
  CSV.open(outfile, "w", :col_sep => "\t") do |csv|
    csv << header
    # pass use_pbar on so an explicit argument is honoured
    each_psm(use_pbar) do |psm|
      pep_seq = psm.get_pep
      spec_num = psm.get_spec_num
      sp_prob = psm.get_spec_prob
      pep_ev_ref_lst = psm.get_pep_ev
      # number of proteins with matching peptide
      num_prot = pep_ev_ref_lst.size
      # one output row per PeptideEvidence
      pep_ev_ref_lst.each do |pepev|
        row = [spec_num, pep_seq, sp_prob]
        row << get_is_decoy(pepev) if @tda_flag
        row << get_prot_id(pepev) << get_pep_start(pepev) << get_pep_end(pepev) << num_prot
        csv << row
      end
    end
  end
end