class Peptide

Attributes

charge[RW]
indistinguishable_peptides[RW]
modifications[RW]
modified_sequence[RW]
probability[RW]
protein_name[RW]
sequence[RW]

Stripped sequence (no modifications)

theoretical_neutral_mass[RW]

Public Class Methods

from_mzid(xmlnode,mzid_doc) click to toggle source

<ProteinDetectionHypothesis id=“PAG_0_1” dBSequence_ref=“JEMP01000193.1_rev_g3500.t1 280755” passThreshold=“false”>

<PeptideHypothesis peptideEvidence_ref="PepEv_1">
        <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
</PeptideHypothesis>
<cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
<cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
<cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>

</ProteinDetectionHypothesis>

# File lib/protk/peptide.rb, line 75
def from_mzid(xmlnode,mzid_doc)
        pep=new()
        pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
        best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
        # require 'byebug';byebug if !best_psm
        pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
        pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
        pep.charge = best_psm.attributes['chargeState'].to_i
        pep.protein_name = mzid_doc.get_dbsequence(xmlnode.parent,xmlnode.parent.attributes['dBSequence_ref']).attributes['accession']


        pep
end
from_protxml(xmlnode) click to toggle source
# File lib/protk/peptide.rb, line 37
def from_protxml(xmlnode)
        pep=new()
        pep.sequence=xmlnode['peptide_sequence']
        pep.probability=xmlnode['nsp_adjusted_probability'].to_f
        pep.charge=xmlnode['charge'].to_i

        # This deal with the case where mods are on the primary peptide
        #
        mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')

        # The pepXML spec says there can be multiple modification_info's but in practice there never is.
        # We assume either 1 or 0
        if ( mod_info_node.length > 0 )
                throw "Encountered multiple modification_info nodes for a peptide" if mod_info_node.length > 1
                pep.modified_sequence = mod_info_node[0]['modified_peptide']
                mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
                # require 'byebug';byebug
                pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
        end

        # This deals with indistinguishable peptides
        #
        ips = xmlnode.find('protxml:indistinguishable_peptide','protxml:http://regis-web.systemsbiology.net/protXML')
        # require 'byebug';byebug
        pep.indistinguishable_peptides = ips.collect { |e| IndistinguishablePeptide.from_protxml(e) }

        pep
end
from_sequence(seq,charge=nil) click to toggle source
# File lib/protk/peptide.rb, line 89
def from_sequence(seq,charge=nil)
        pep=new()

        pep.modifications = pep.modifications_from_sequence(seq)
        pep.modified_sequence = seq

        seq = seq.sub(/^n\[[0-9]+?\]/,"")
        pep.sequence = seq.gsub(/[0-9\.\[\]]/,"")
        pep.charge=charge
        pep
end
new() click to toggle source
# File lib/protk/peptide.rb, line 105
def initialize()

end

Public Instance Methods

as_protxml() click to toggle source
# File lib/protk/peptide.rb, line 27
def as_protxml
        node = XML::Node.new('peptide')
        node['peptide_sequence']=self.sequence.to_s
        node['charge']=self.charge.to_s
        node['nsp_adjusted_probability']=self.probability.to_s
        node['calc_neutral_pep_mass']=self.theoretical_neutral_mass.to_s
        node
end
coords_in_protein(prot_seq,reverse=false) click to toggle source

Expects prot_seq not to contain explicit stop codon (ie * at end) AA coords are 0-based unlike genomic coords which are 1 based

# File lib/protk/peptide.rb, line 132
def coords_in_protein(prot_seq,reverse=false)
        if reverse
                pep_index = prot_seq.reverse.index(self.sequence.reverse)
                raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
                pep_start_i = pep_index
        else
                pep_start_i = prot_seq.index(self.sequence)
                raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?                 
        end
        pep_end_i = pep_start_i+self.sequence.length
        {:start => pep_start_i,:end => pep_end_i}
end
gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info) click to toggle source
# File lib/protk/peptide.rb, line 270
def gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info)
        cds_id = parent_record.id
        mod_sequence = record_info[:modified_sequence]
        this_id = mod_sequence ? "#{cds_id}.#{mod_sequence}" : "#{cds_id}.#{self.sequence}"
        this_id << ".#{self.charge}" unless self.charge.nil?
        mod = record_info[:mod]
        this_id << ".#{mod.position}.#{mod.mass}" unless mod.nil?
        score = self.probability.nil? ? "." : self.probability.to_s
        record_type = mod.nil? ? record_info[:type] : "#{record_info[:type]}_#{mod.amino_acid}"
        gff_string = "#{parent_record.seqid}\tMSMS\t#{record_type}\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
        Bio::GFF::GFF3::Record.new(gff_string)
end
gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"}) click to toggle source
# File lib/protk/peptide.rb, line 207
def gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"})
        on_reverse_strand = (parent_record.strand=="-") ? true : false
        ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort

        # Initial position is the number of NA's from the start of translation
        #
        pep_nalen = seqlen*3

        i = 0; #Current protein position (in nucleic acids)

        pep_start_i = aa_coords[:start]*3
        pep_end_i = pep_start_i+seqlen*3
        gff_records=[]
        ordered_cds_records.each do |cds_record|

                gff_record = nil
                fragment_len = 0
                if on_reverse_strand

                        in_peptide = (i<pep_end_i) && (i>=pep_start_i)
                        before_len = [pep_start_i-i,0].max

                        if in_peptide
                                fragment_end = cds_record.end
                                fragment_len = [cds_record.length,pep_end_i-i].min
                                fragment_start = fragment_end-fragment_len+1
                                gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
                        elsif before_len>0
                                fragment_end = cds_record.end - before_len
                                fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
                                fragment_start = fragment_end - fragment_len + 1
                                if fragment_len>0
                                        gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
                                end
                        else
                                gff_record=nil
                        end                         
                else
                        in_peptide = (i<pep_end_i) && (i>=pep_start_i)
                        before_len = [pep_start_i-i,0].max
                        if in_peptide
                                fragment_start = cds_record.start
                                fragment_len = [cds_record.length,pep_end_i-i].min
                                fragment_end = fragment_start+fragment_len-1
                                gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
                        elsif before_len>0
                                fragment_start = cds_record.start + before_len
                                fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
                                fragment_end = fragment_start + fragment_len-1
                                if fragment_len>0
                                        gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
                                end
                        else
                                gff_record = nil
                        end

                end
                i+=cds_record.length
                gff_records << gff_record unless gff_record.nil?
        end
        gff_records
end
modifications_from_sequence(seq) click to toggle source
# File lib/protk/peptide.rb, line 109
def modifications_from_sequence(seq)

        seq = seq.sub(/^n\[[0-9]+?\]/,"")
        offset = 0
        mods = seq.enum_for(:scan, /([A-Z])\[([0-9\.]+)\]/).map {
                pm = PeptideMod.from_data(Regexp.last_match.begin(0)+1-offset,Regexp.last_match.captures[0],Regexp.last_match.captures[1].to_f)
                offset += Regexp.last_match.captures[1].length+2
                pm
        }

        # if ( seq == "N[115]VMN[115]LTPAETQ[129]QLHAALESQLSPGELAK" )
        #     require 'byebug';byebug
        #     puts "hi"
        # end


        mods
end
mods_to_gff3_records(prot_seq,parent_record,cds_records) click to toggle source
# File lib/protk/peptide.rb, line 173
def mods_to_gff3_records(prot_seq,parent_record,cds_records)

        throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
        throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

        pep_aa_coords = coords_in_protein(prot_seq,false)

        mod_records = []
        
        unless ( self.modifications.nil? )
                self.modifications.each { |mod|
                        prot_position = mod.position+pep_aa_coords[:start]
                        mod_aa_coords = {:start => prot_position, :end => prot_position+1}
                        mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => self.modified_sequence})    
                }
        end

        unless ( self.indistinguishable_peptides.nil? )
                self.indistinguishable_peptides.each { |ip|
                        unless ( ip.modifications.nil? )
                                ip.modifications.each { |mod|
                                        prot_position = mod.position+pep_aa_coords[:start]-1
                                        mod_aa_coords = {:start => prot_position, :end => prot_position+1}
                                        mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => ip.modified_sequence})    
                                }
                        end
                } 
        end

        mod_records.flatten

end
to_gff3_records(prot_seq,parent_record,cds_records) click to toggle source

Returns a list of fragments (hashes with start and end) in GFF style (1 based) genomic coordinates

Assumes that cds_coords is inclusive of the entire protein sequence including start-met

We assume that gff records conform to the spec

www.sequenceontology.org/gff3.shtml

This part of the spec is crucial

  • The START and STOP codons are included in the CDS.

  • That is, if the locations of the start and stop codons are known,

  • the first three base pairs of the CDS should correspond to the start codon

  • and the last three correspond the stop codon.

We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)

# File lib/protk/peptide.rb, line 163
def to_gff3_records(prot_seq,parent_record,cds_records)

        throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
        throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

        aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates

        gff_records_for_coords_in_protein(aa_coords,self.sequence.length,parent_record,cds_records)           
end