class Bio::UniProt

Description

Parser class for UniProtKB/SwissProt and TrEMBL database entry.

See the UniProtKB document files and manuals.

Examples

str = File.read("p53_human.swiss")
obj = Bio::UniProtKB.new(str)
obj.entry_id #=> "P53_HUMAN"

References

Public Instance Methods

aalen()
Alias for: sequence_length
aaseq()
Alias for: seq
cc(topic = nil) click to toggle source

returns contents in the CC lines.

returns an object of contents in the TOPIC.

returns contents of the “ALTERNATIVE PRODUCTS”.

  • Bio::UniProtKB#cc('ALTERNATIVE PRODUCTS') -> Hash

    {'Event' => str, 
     'Named isoforms' => int,  
     'Comment' => str,
     'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
    
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=15;
    ...
    CC         placentae isoforms. All tissues differentially splice exon 13;
    CC       Name=A; Synonyms=no del;
    CC         IsoId=P15529-1; Sequence=Displayed;

returns contents of the “DATABASE”.

  • Bio::UniProtKB#cc('DATABASE') -> Array

    [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
    
    CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].

returns contents of the “MASS SPECTROMETRY”.

  • Bio::UniProtKB#cc('MASS SPECTROMETRY') -> Array

    [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
    
    CC   -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].

CC lines (>=0, optional)

CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
CC       IN LIVER, KIDNEY, LUNG AND BRAIN.

CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.

See also www.expasy.org/sprot/userman.html#CC_line

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the contents of the CC (comment) lines.
    #
    # Without +topic+ the result is a Hash that maps every topic name to an
    # Array holding the text of each comment block found for that topic.
    # With +topic+ the comments of that topic are returned: a few topics are
    # joined into one String, a few are handed to a dedicated cc_* parser,
    # 'DATABASE' is parsed inline into an Array of Hashes, and everything else
    # is returned as the raw Array.  A topic that does not occur in the entry
    # yields nil.
    #
    # The parsed Hash is cached in @data['CC'].  When the entry has no CC
    # lines (or only the copyright statement) an empty Hash is returned,
    # whatever +topic+ is.
    #
    # Raises RuntimeError when a comment block does not look like "TOPIC: text".
    def cc(topic = nil)
      unless @data['CC']
        cc = {}
        comment_border = '-' * (77 - 4 + 1)
        dlm = /-!- /

        # 12KD_MYCSM has no CC lines.
        return cc if get('CC').size == 0

        # Removing the copyright statement.
        cc_raw = fetch('CC').sub(/ *---.+---/m, '')

        # Not any CC Lines without the copyright statement.
        return cc if cc_raw == ''

        begin
          cc_raw = cc_raw.split(/#{comment_border}/)[0]
          cc_raw = cc_raw.sub(dlm, '')
          cc_raw.split(dlm).each do |tmp|
            tmp = tmp.strip

            if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
              key = $1
              # Collapses "- " into "-" unless "AND" follows; presumably this
              # undoes a line wrap after a hyphen -- TODO confirm.
              body = $2.gsub(/- (?!AND)/, '-').strip
              (cc[key] ||= []) << body
            else
              raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                     '', get('CC'), ''].join("\n")
            end
          end
        rescue NameError
          # NoMethodError is a subclass of NameError, so a nil that slips
          # through the parsing above is reported here as well.
          return {} if fetch('CC') == ''

          raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
                 "\n'#{get('CC')}'\n", "(#{$!})"].join
        end

        @data['CC'] = cc
      end

      return @data['CC'] if topic.nil?

      comments = @data['CC'][topic]
      # Without this guard the join/parser branches below fail on nil.
      return nil if comments.nil?

      case topic
      when 'ALTERNATIVE PRODUCTS'
        cc_alternative_products(comments)
      when 'BIOPHYSICOCHEMICAL PROPERTIES'
        cc_biophysiochemical_properties(comments)
      when 'CATALITIC ACTIVITY'
        # NOTE(review): the topic is spelled 'CATALITIC', so a
        # 'CATALYTIC ACTIVITY' comment falls through to the raw Array branch.
        # Left as is because cc_catalytic_activity is not visible here --
        # confirm it exists before correcting the spelling.
        cc_catalytic_activity(comments)
      when 'CAUTION'
        cc_caution(comments)
      when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
           'FUNCTION', 'INDUCTION'
        comments.join('')
      when 'INTERACTION'
        cc_interaction(comments)
      when 'MASS SPECTROMETRY'
        cc_mass_spectrometry(comments)
      when 'PATHWAY'
        cc_pathway(comments)
      when 'RNA EDITING'
        cc_rna_editing(comments)
      when 'SUBCELLULAR LOCATION'
        cc_subcellular_location(comments)
      when 'WEB RESOURCE'
        cc_web_resource(comments)
      when 'DATABASE'
        # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
        comments.map do |e|
          db = { 'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil }
          # The last character (the closing period) is dropped first.
          e.sub(/.$/, '').split(/;/).each do |line|
            case line
            when /NAME=(.+)/
              db['NAME'] = $1
            when /NOTE=(.+)/
              db['NOTE'] = $1
            when /WWW="(.+)"/
              db['WWW'] = $1
            when /FTP="(.+)"/
              db['FTP'] = $1
            end
          end
          db
        end
      else
        # ALLERGEN, COFACTOR, DOMAIN, SIMILARITY, ... and unknown topics.
        comments
      end
    end
dr(key = nil) click to toggle source

Bio::UniProtKB#dr

     # File lib/bio/db/embl/uniprotkb.rb
     # Returns the DR (database cross-reference) data.
     #
     # Without +key+ the value of embl_dr is returned unchanged.  With +key+
     # the rows stored under that key are converted into Hashes; an unknown
     # key yields an empty Array.
     def dr(key = nil)
       return embl_dr unless key

       rows = embl_dr[key] || []
       rows.map do |row|
         { 'Accession'      => row[0],
           'Version'        => row[1],
           ' '              => row[2],
           'Molecular Type' => row[3] }
       end
     end
Also aliased as: embl_dr
dt(key = nil) click to toggle source

returns a Hash of information in the DT lines.

hash keys: 
  ['created', 'sequence', 'annotation']

Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format was changed, and the word “annotation” is no longer used in DT lines. Despite the change, the hash key 'annotation' is still used to keep compatibility.

returns a String of information in the DT lines by a given key.

DT Line; date (3/entry)

DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.)
DT DD-MMM-YYYY (sequence version NN)
DT DD-MMM-YYYY (entry version NN)

The format has been changed in UniProtKB release 7.0 of 07-Feb-2006. Below is the older format.

Old format of DT Line; date (3/entry)

DT DD-MMM-YYYY (rel. NN, Created)
DT DD-MMM-YYYY (rel. NN, Last sequence update)
DT DD-MMM-YYYY (rel. NN, Last annotation update)
    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the DT (date) lines as a Hash with the keys 'created',
    # 'sequence' and 'annotation' (first, second and third DT line), or the
    # String stored under +key+ when a key is given.
    #
    # The Hash is cached in @data['DT'].  A DT line that is missing from the
    # entry is reported as an empty String instead of raising NoMethodError.
    def dt(key = nil)
      return dt[key] if key
      return @data['DT'] if @data['DT']

      lines = get('DT').split(/\n/)
      # Strips the two-letter line code and the three blanks that follow it.
      created, sequence, annotation = (0..2).map do |i|
        lines[i].to_s.sub(/\w{2}   /, '').strip
      end

      @data['DT'] = {
        'created'    => created,
        'sequence'   => sequence,
        'annotation' => annotation
      }
    end
embl_dr(key = nil)

Backup Bio::EMBLDB#dr as embl_dr

Alias for: dr
entry()
Alias for: entry_id
entry_id() click to toggle source

returns a ENTRY_NAME in the ID line.

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the 'ENTRY_NAME' field of the ID line, as looked up by id_line.
    def entry_id
      entry_name_field = 'ENTRY_NAME'
      id_line(entry_name_field)
    end
Also aliased as: entry_name, entry
entry_name()
Alias for: entry_id
ft(feature_key = nil) click to toggle source

returns contents in the feature table.

Examples

sp = Bio::UniProtKB.new(entry)
ft = sp.ft
ft.class #=> Hash
ft.keys.each do |feature_key|
  ft[feature_key].each do |feature|
    feature['From'] #=> '1'
    feature['To']   #=> '21'
    feature['Description'] #=> ''
    feature['FTId'] #=> ''
    feature['diff'] #=> []
    feature['original'] #=> [feature_key, '1', '21', '', '']
  end
end
  • Bio::UniProtKB#ft -> Hash

    {FEATURE_KEY => [{'From' => int, 'To' => int, 
                      'Description' => aStr, 'FTId' => aStr,
                      'diff' => [original_residues, changed_residues],
                      'original' => aAry }],...}

returns an Array of the information about the feature_name in the feature table.

FT Line; feature table data (>=0, optional)

Col     Data item
-----   -----------------
 1- 2   FT
 6-13   Feature name 
15-20   `FROM' endpoint
22-27   `TO' endpoint
35-75   Description (>=0 per key)
-----   -----------------

Note: 'FROM' and 'TO' endpoints are allowed to contain non-numerical characters including '<', '>' or '?'. (c.f. '<1', '?42')

See also www.expasy.org/sprot/userman.html#FT_line

     # File lib/bio/db/embl/uniprotkb.rb
     # Returns the feature table (FT lines).
     #
     # Without +feature_key+ the result is a Hash that maps each feature name
     # to an Array of Hashes with the keys 'From', 'To', 'Description',
     # 'FTId', 'diff' and 'original'.  With +feature_key+ only the Array for
     # that feature name is returned (nil when the entry has no such feature).
     # The Hash is cached in @data['FT'].
     #
     # For VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features 'diff' is set to
     # [original_residues, changed_residues]; a "Missing" description yields
     # the residues taken from seq and an empty String.
     #
     # Raises RuntimeError ("Invalid FT Lines ...") when anything goes wrong
     # while parsing.
     def ft(feature_key = nil)
       return ft[feature_key] if feature_key
       return @data['FT'] if @data['FT']

       hash = {}
       begin
         table = []
         get('FT').split("\n").each do |line|
           if line =~ /^FT   \w/
             # A line that opens a feature: the fields sit in fixed columns.
             feature = line.chomp.ljust(74)
             table << [feature[ 5..12].strip,   # Feature Name
                       feature[14..19].strip,   # From
                       feature[21..26].strip,   # To
                       feature[34..74].strip]   # Description
           else
             # Any other line is appended to the feature opened last.
             # sub (not sub!) so that a line without the "FT " prefix is kept
             # instead of being turned into nil.
             table.last << line.chomp.sub(/^FT +/, '')
           end
         end

         # Joining Description lines
         table = table.map do |feature|
           ftid = feature.pop if feature.last =~ /FTId=/
           if feature.size > 4
             feature = [feature[0],
                        feature[1],
                        feature[2],
                        feature[3, feature.size - 3].join(' ')]
           end
           feature << (ftid || '')
         end

         table.each do |feature|
           entry = {
             # Removing the first non-digit ('<', '>' or '?') in FROM/TO.
             'From' => feature[1].sub(/\D/, '').to_i,
             'To'   => feature[2].sub(/\D/, '').to_i,
             'Description' => feature[3],
             'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
             'diff' => [],
             'original' => feature
           }
           (hash[feature[0]] ||= []) << entry

           case feature[0]
           when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
             original_res = nil
             changed_res = nil
             case entry['Description']
             when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
               # Copy both captures before any further regexp call resets them.
               original_res = $1
               changed_res = $2
               original_res = original_res.gsub(/ /, '').strip
               changed_res = changed_res.gsub(/ /, '').strip
             when /Missing/i
               original_res = seq.subseq(entry['From'], entry['To'])
               changed_res = ''
             end
             entry['diff'] = [original_res, changed_res]
           end
         end
       rescue
         raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{get('FT')}'\n"
       end

       @data['FT'] = hash
     end
gene_name() click to toggle source

returns a String of the first gene name in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the first element of gene_names, or nil when gene_names itself
    # returns nil or is empty.
    def gene_name
      names = gene_names
      return nil unless names

      names.first
    end
gene_names() click to toggle source

returns a Array of gene names in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the gene names stored in @data['GN'].
    #
    # When the first GN record is a Hash, the :name of every record is
    # returned; otherwise the first element of @data['GN'] is returned as is
    # (nil when the list is empty).
    def gene_names
      gn # set @data['GN'] if it hasn't been already done
      records = @data['GN']
      head = records.first
      return head unless head.instance_of?(Hash)

      records.map { |record| record[:name] }
    end
gn() click to toggle source

returns gene names in the GN line.

New UniProt/SwissProt format:

where <gene record> is:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}

Old format:

GN Line: Gene name(s) (>=0, optional)

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the parsed GN line, cached in @data['GN'].
    #
    # gn_uniprot_parser is used when the line contains one of the structured
    # tokens (Name=, ORFNames=, OrderedLocusNames=, Synonyms=); otherwise
    # gn_old_parser is used.
    def gn
      return @data['GN'] if @data['GN']

      structured_tokens = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
      raw = fetch('GN')
      @data['GN'] = if structured_tokens.any? { |token| token =~ raw }
                      gn_uniprot_parser
                    else
                      gn_old_parser
                    end
      @data['GN']
    end
hi() click to toggle source

The HI line

Bio::UniProtKB#hi #=> Array of Hashes

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the HI line as an Array of Hashes, cached in @data['HI'].
    #
    # Each ". "-separated part becomes a Hash with 'Category' (text before
    # ": "), 'Keyword' (last "; "-separated item, trailing period removed)
    # and 'Keywords' (the items before it).
    def hi
      return @data['HI'] if @data['HI']

      entries = @data['HI'] = []
      fetch('HI').split(/\. /).each do |part|
        category, keywords = part.split(': ')
        keywords = keywords.split('; ')
        keyword = keywords.pop
        keyword.sub!(/\.$/, '')
        entries << { 'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword }
      end
      entries
    end
id_line(key = nil) click to toggle source

returns a Hash of the ID line.

returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']

ID Line (since UniProtKB release 9.0 of 31-Oct-2006)

ID   P53_HUMAN               Reviewed;         393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"

ID Line (older style)

ID   P53_HUMAN      STANDARD;      PRT;   393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
   # File lib/bio/db/embl/uniprotkb.rb
   # Returns the ID line as a Hash ('ENTRY_NAME', 'DATA_CLASS',
   # 'MOLECULE_TYPE', 'SEQUENCE_LENGTH'), or the single value stored under
   # +key+.  The Hash is cached in @data['ID'].
   #
   # When the fifth blank-separated field is "AA." the line is in the format
   # used after UniProtKB release 9.0 of 31-Oct-2006
   # (http://www.uniprot.org/docs/sp_news.htm), which has no molecule type.
   def id_line(key = nil)
     return id_line[key] if key

     @data['ID'] ||= begin
       fields = @orig['ID'].split(/ +/)
       without_molecule_type = (fields[4].to_s.chomp == 'AA.')

       molecule_type = without_molecule_type ? nil : fields[3].sub(/;/, '')
       length_field  = without_molecule_type ? fields[3] : fields[4]

       { 'ENTRY_NAME'      => fields[1],
         'DATA_CLASS'      => fields[2].sub(/;/, ''),
         'MOLECULE_TYPE'   => molecule_type,
         'SEQUENCE_LENGTH' => length_field.to_i }
     end
   end
molecule() click to toggle source

returns a MOLECULE_TYPE in the ID line.

A short-cut for Bio::UniProtKB#id_line('MOLECULE_TYPE').

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the 'MOLECULE_TYPE' field of the ID line, as looked up by id_line.
    def molecule
      molecule_type_field = 'MOLECULE_TYPE'
      id_line(molecule_type_field)
    end
Also aliased as: molecule_type
molecule_type()
Alias for: molecule
oh() click to toggle source

The OH Line;

OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the OH line as an Array of Hashes with the keys 'NCBI_TaxID'
    # and 'HostName', cached in @data['OH'].
    #
    # The line is split on ". "; 'HostName' is the text after
    # "NCBI_TaxID=<digits>; " without a trailing period, or nil when absent.
    #
    # Raises ArgumentError when a part contains no "NCBI_TaxID=<digits>;".
    def oh
      @data['OH'] ||= fetch('OH').split(". ").map do |record|
        taxid_match = /NCBI_TaxID=(\d+);/.match(record)
        unless taxid_match
          raise ArgumentError, ["Error: Invalid OH line format (#{entry_id}):",
                                $!, "\n", get('OH'), "\n"].join
        end

        host_name = record[/NCBI_TaxID=\d+; (.+)/, 1]
        host_name = host_name.sub(/\.$/, '') if host_name

        { 'NCBI_TaxID' => taxid_match[1], 'HostName' => host_name }
      end
    end
os(num = nil) click to toggle source

returns an Array of Hashes, or a String of the OS line when an index is given.

  • Bio::EMBLDB#os -> Array

[{'name' => '(Human)', 'os' => 'Homo sapiens'}, 
 {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
{'name' => "(Human)", 'os' => 'Homo sapiens'}

OS Line; organism species (>=1)

OS   Genus species (name).
OS   Genus species (name0) (name1).
OS   Genus species (name0) (name1).
OS   Genus species (name0), G s0 (name0), and G s (name0) (name1).
OS   Homo sapiens (Human), and Rattus norvegicus (Rat)
OS   Hippotis sp. Clark and Watts 825.
OS   unknown cyperaceous sp.
    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the OS line as an Array of Hashes with the keys 'os' (organism)
    # and 'name' (the parenthesised part, nil when there is none), cached in
    # @data['OS'].  With +num+ the entry at that index is returned as the
    # String "<os> <name>".
    #
    # Raises RuntimeError when a part of the line has no recognisable
    # organism name.
    def os(num = nil)
      @data['OS'] ||= fetch('OS').split(/, and|, /).map do |part|
        unless part =~ /(\w+ *[\w \:\'\+\-\.]+[\w\.])/
          raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n"
        end
        organism = $1

        { 'name' => part[/(\(.+\))/, 1], 'os' => organism }
      end

      return @data['OS'] unless num

      # EX. "Trifolium repens (white clover)"
      selected = @data['OS'][num]
      "#{selected['os']} #{selected['name']}"
    end
ox() click to toggle source

returns a Hash of organism taxonomy cross-references.

OX Line; organism taxonomy cross-reference (>=1 per entry)

OX   NCBI_TaxID=1234;
OX   NCBI_TaxID=1234, 2345, 3456, 4567;
    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the OX line as a Hash that maps each database name (the text
    # before '=') to an Array of its comma-separated references.  The Hash
    # is cached in @data['OX'].
    def ox
      @data['OX'] ||= begin
        items = fetch('OX').sub(/\.$/, '').split(/;/)
        items.each_with_object({}) do |item, xrefs|
          db, refs = item.strip.split(/=/)
          xrefs[db] = refs.split(/, */)
        end
      end
    end
protein_name() click to toggle source

returns the proposed official name of the protein. Returns a String.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name, which is normally taken from the “RecName: Full=” or “SubName: Full=” line at the beginning of the DE lines. Unlike the parser for the old format, there is no special treatment for fragments or precursors.

For old format, the method parses the DE lines and returns the protein name as a String.

DE Line; description (>=1)

"DE #{OFFICIAL_NAME} (#{SYNONYM})"
"DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
OFFICIAL_NAME  1/entry
SYNONYM        >=0
CONTAINS       >=0
    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the protein name as a String.
    #
    # When parse_DE_line_rel14 yields a result (cached in @data['DE']), the
    # 'Full' value of the first RecName/SubName record that has one is
    # returned, or "" when there is none.  Otherwise the old-format DE text
    # is used: everything before the first '(' of the part preceding '[',
    # with ' (Fragment)' appended when that part mentions "fragment".
    def protein_name
      @data['DE'] ||= parse_DE_line_rel14(get('DE'))
      records = @data['DE']

      unless records
        # old format (before Rel. 13.x)
        de_line = fetch('DE')
        return "" unless de_line

        head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
        name = head[/^[^(]*/].strip
        name << ' (Fragment)' if head =~ /fragment/i
        return name
      end

      # since UniProtKB release 14.0 of 22-Jul-2008
      full_name = nil
      records.each do |record|
        next unless record[0] == 'RecName' || record[0] == 'SubName'

        pair = record[1..-1].find { |field| field[0] == 'Full' }
        if pair
          full_name = pair[1]
          break
        end
      end
      full_name.to_s
    end
ref() click to toggle source

returns contents in the R lines.

where <reference information Hash> is:

{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}

R Lines

  • RN RC RP RX RA RT RL RG

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the reference (R*) lines as an Array with one Hash per
    # reference, cached in @data['R'].
    #
    # The raw text is split in front of every "RN   " line.  Within one
    # reference the text of each line code (RN, RC, RP, RX, RA, RT, RL, RG)
    # is concatenated, a blank after every line, and then passed through the
    # matching set_XX method.
    #
    # Raises RuntimeError when a line does not start with a known line code.
    def ref
      return @data['R'] if @data['R']

      line_codes = %w[RN RC RP RX RA RT RL RG]
      @data['R'] = get('R').split(/\nRN   /).map do |block|
        # Every block but the first lost its "RN   " prefix to the split.
        block = 'RN   ' + block unless /^RN   / =~ block

        fields = {}
        line_codes.each { |code| fields[code] = '' }

        block.split("\n").each do |line|
          matched = /^(R[NPXARLCTG])   (.+)/.match(line)
          raise "Invalid format in R lines, \n[#{line}]\n" unless matched

          fields[matched[1]] += matched[2] + ' '
        end

        line_codes.each { |code| fields[code] = send("set_#{code}", fields[code]) }
        fields
      end
      @data['R']
    end
references() click to toggle source

returns Bio::Reference object from Bio::EMBLDB::Common#ref.

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns a References object built from ref, cached in
    # @data['references'].
    #
    # Every reference Hash is turned into a Reference: 'RA' is split into
    # 'authors', 'RT' becomes 'title', 'RL' is split into 'journal',
    # 'volume', 'issue', 'pages' and 'year' when it has the form
    # "Journal VOL (ISSUE), FROM-TO (YEAR)" (otherwise the whole value is the
    # 'journal'), and every 'RX' tag is stored under its downcased name.
    def references
      return @data['references'] if @data['references']

      list = ref.map do |entry|
        attrs = Hash.new('')
        entry.each do |code, value|
          case code
          when 'RA'
            attrs['authors'] = value.split(/, /)
          when 'RT'
            attrs['title'] = value
          when 'RL'
            citation = /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/.match(value)
            if citation
              attrs['journal'] = citation[1]
              attrs['volume']  = citation[2]
              attrs['issue']   = citation[3]
              attrs['pages']   = citation[4]
              attrs['year']    = citation[5]
            else
              attrs['journal'] = value
            end
          when 'RX' # PUBMED, MEDLINE, DOI
            value.each { |tag, xref| attrs[tag.downcase] = xref }
          end
        end
        Reference.new(attrs)
      end

      @data['references'] = References.new(list)
    end
seq() click to toggle source

returns a Bio::Sequence::AA of the amino acid sequence.

blank Line; sequence data (>=1)

     # File lib/bio/db/embl/uniprotkb.rb
     # Returns the sequence as a Sequence::AA built from the untagged lines
     # with every blank and every run of digits removed.  The object is
     # cached in @data[''].
     def seq
       @data[''] ||= begin
         residues = fetch('').gsub(/ |\d+/, '')
         Sequence::AA.new(residues)
       end
     end
Also aliased as: aaseq
sequence_length() click to toggle source

returns a SEQUENCE_LENGTH in the ID line.

A short-cut for Bio::UniProtKB#id_line('SEQUENCE_LENGTH').

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the 'SEQUENCE_LENGTH' field of the ID line, as looked up by
    # id_line.
    def sequence_length
      length_field = 'SEQUENCE_LENGTH'
      id_line(length_field)
    end
Also aliased as: aalen
set_RN(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
    # Returns a copy of the collected RN text without leading and trailing
    # whitespace.
    def set_RN(data)
      stripped = data.strip
      stripped
    end
sq(key = nil) click to toggle source

returns a Hash of contents in the SQ lines.

  • Bio::UniProtKB#sq -> hsh

returns a value of a key given in the SQ lines.

  • Bio::UniProtKB#sq(key) -> int or str

  • Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',

    'CRC64']

SQ Line; sequence header (1/entry)

SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;

MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).

     # File lib/bio/db/embl/uniprotkb.rb
     # Returns the SQ line as the Hash {'aalen' => Integer, 'MW' => Integer,
     # 'CRC64' => String} (cached in @data['SQ']), or a single value of it.
     #
     # A +key+ matching /mw/, /molecular/ or /weight/ selects 'MW'; a key
     # matching /len/, /length/ or /AA/ selects 'aalen'; any other key is
     # looked up in the Hash as is.
     #
     # Raises RuntimeError when the SQ line cannot be parsed.
     def sq(key = nil)
       unless @data['SQ']
         parsed = /(\d+) AA\; (\d+) MW; (.+) CRC64;/.match(fetch('SQ'))
         raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless parsed

         @data['SQ'] = { 'aalen' => parsed[1].to_i,
                         'MW'    => parsed[2].to_i,
                         'CRC64' => parsed[3] }
       end
       return @data['SQ'] unless key

       field = case key
               when /mw/, /molecular/, /weight/ then 'MW'
               when /len/, /length/, /AA/ then 'aalen'
               else key
               end
       @data['SQ'][field]
     end
synonyms() click to toggle source

returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. To keep compatibility with the old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.

For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.

    # File lib/bio/db/embl/uniprotkb.rb
    # Returns the synonyms of the protein as an Array of Strings.
    #
    # When parse_DE_line_rel14 yields a result (cached in @data['DE']), the
    # values of the RecName, SubName and AltName records are collected up to
    # the first Includes/Contains record, skipping any value equal to
    # protein_name.  'EC' values are prefixed with "EC ", 'Allergen' and
    # 'CD_antigen' values with "<tag>=".  Otherwise the old-format DE text is
    # used: every parenthesised part outside [...] that does not mention
    # "fragment".
    def synonyms
      names = []
      @data['DE'] ||= parse_DE_line_rel14(get('DE'))
      records = @data['DE']

      if records
        # since UniProtKB release 14.0 of 22-Jul-2008
        records.each do |record|
          break if record[0] == 'Includes' || record[0] == 'Contains'
          next unless %w[RecName SubName AltName].include?(record[0])

          record[1..-1].each do |field|
            tag = field[0]
            value = field[1]
            next unless value && value != protein_name

            names << case tag
                     when 'EC' then "EC " + value
                     when 'Allergen', 'CD_antigen' then tag + '=' + value
                     else value
                     end
          end
        end
      elsif (de_line = fetch('DE'))
        # old format (before Rel. 13.x)
        # ignore stuff between [ and ].  That's the "contains" part
        without_contains = de_line.sub(/\[.*\]/, '')
        without_contains.scan(/\([^)]+/) do |synonym|
          next if synonym =~ /fragment/i

          names << synonym[1..-1].strip # index to remove the leading (
        end
      end
      names
    end

Private Instance Methods

cc_alternative_products(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
    # Parses the "ALTERNATIVE PRODUCTS" comment.
    #
    # +data+ is the Array of comment Strings of that topic; nil is passed
    # through as nil.  Returns a Hash with 'Event' (Array of Strings, or ""
    # when absent), 'Named isoforms' (String), 'Comment' (String) and
    # 'Variants' (one element per "Name=...Sequence=...;" group, as parsed by
    # cc_alternative_products_variants).
    def cc_alternative_products(data)
      # The guard has to come before the join, which would fail on nil.
      return data unless data

      ap = data.join('')

      # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
      tmp = { 'Event' => "", 'Named isoforms' => "", 'Comment' => "",
              'Variants' => [] }
      if /Event=(.+?);/ =~ ap
        tmp['Event'] = $1.sub(/;/, '').split(/, /)
      end
      if /Named isoforms=(\S+?);/ =~ ap
        tmp['Named isoforms'] = $1
      end
      if /Comment=(.+?);/m =~ ap
        tmp['Comment'] = $1
      end
      ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
        tmp['Variants'] << cc_alternative_products_variants(ent)
      end
      tmp
    end
cc_alternative_products_variants(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
    # Parses one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" group.
    #
    # Returns a Hash keyed by the field names found in +data+; the values of
    # 'Sequence', 'Synonyms' and 'IsoId' are split on ", " into Arrays, any
    # other value is kept as a String.
    def cc_alternative_products_variants(data)
      defaults = { 'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => [] }
      list_fields = ['Sequence', 'Synonyms', 'IsoId']

      data.split(/; /).each_with_object(defaults) do |field, variant|
        parts = field.split(/=/)
        key = parts[0]
        value = parts[1]
        value = value.sub(/;/, '').split(/, /) if list_fields.include?(key)
        variant[key] = value
      end
    end
cc_biophysiochemical_properties(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
    # Parses the "BIOPHYSICOCHEMICAL PROPERTIES" comment.
    #
    # Only the first element of +data+ is examined.  Returns a Hash with
    # 'Absorption' and 'Kinetic parameters' (Hashes filled with whatever
    # sub-fields are present) and 'pH dependence', 'Redox potential' and
    # 'Temperature dependence' (Strings, "" when absent).
    def cc_biophysiochemical_properties(data)
      text = data[0]

      absorption = {}
      kinetics = {}
      properties = { 'Absorption' => absorption,
                     'Kinetic parameters' => kinetics,
                     'pH dependence' => "",
                     'Redox potential' => "",
                     'Temperature dependence' => "" }

      absorption['Abs(max)'] = $1 if text =~ /Absorption: Abs\(max\)=(.+?);/
      absorption['Note'] = $1 if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/

      if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
        kinetics['KM'] = $1
        kinetics['Vmax'] = $2
      end
      kinetics['Note'] = $1 if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/

      properties['pH dependence'] = $1 if text =~ /pH dependence: (.+?);/
      properties['Redox potential'] = $1 if text =~ /Redox potential: (.+?);/
      properties['Temperature dependence'] = $1 if text =~ /Temperature dependence: (.+?);/

      properties
    end
cc_caution(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
    # Concatenates the "CAUTION" comment blocks into one String, without a
    # separator.
    def cc_caution(data)
      separator = ''
      data.join(separator)
    end
cc_interaction(data) click to toggle source

returns contents in a line of the CC INTERACTION section.

CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "INTERACTION" comment.
     #
     # Returns one Hash per "<partner>; NbExp=<n>; IntAct=<ids>;" group found
     # in the joined +data+, with the keys 'SP_Ac', 'identifier', 'NbExp',
     # 'IntAct' (Array) and 'optional_identifier'.  A partner written as
     # "AC:ID" is split at the colon; a partner containing "Self" uses
     # entry_id for both 'SP_Ac' and 'identifier'.
     def cc_interaction(data)
       groups = data.join('').scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)

       groups.map do |group|
         partner, nb_exp, intact = group.map { |field| field.strip }

         accession = nil
         identifier = nil
         if partner =~ /^(.+):(.+)/
           accession = $1
           identifier = $2.split(' ')[0]
         elsif partner =~ /Self/
           accession = entry_id
           identifier = entry_id
         end

         # A further blank-separated word after "AC:ID" is kept separately.
         optional_identifier = partner =~ /^.+:.+ (.+)/ ? $1 : nil

         { 'SP_Ac' => accession,
           'identifier' => identifier,
           'NbExp' => nb_exp,
           'IntAct' => intact.split(', '),
           'optional_identifier' => optional_identifier }
       end
     end
cc_mass_spectrometry(data) click to toggle source
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "MASS SPECTROMETRY" comments.
     #
     # MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
     #
     # nil is passed through as nil.  Otherwise one Hash per comment is
     # returned with the keys 'MW', 'MW_ERR', 'METHOD', 'RANGE' and 'NOTE'
     # (nil when the field is absent); all values are Strings.
     def cc_mass_spectrometry(data)
       return data unless data

       # Tried in this order; the first pattern that matches a field wins.
       field_patterns = [[/MW=(.+)/, 'MW'],
                         [/MW_ERR=(.+)/, 'MW_ERR'],
                         [/METHOD=(.+)/, 'METHOD'],
                         [/RANGE=(\d+-\d+)/, 'RANGE'], # RANGE class ?
                         [/NOTE=(.+)/, 'NOTE']]

       data.map do |comment|
         mass = { 'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
                  'NOTE' => nil }
         # The last character (the closing period) is dropped first.
         comment.sub(/.$/, '').split(/;/).each do |field|
           found = nil
           _, name = field_patterns.find { |pattern, _| found = pattern.match(field) }
           mass[name] = found[1] if name
         end
         mass
       end
     end
cc_pathway(data) click to toggle source
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "PATHWAY" comment.
     #
     # Every comment loses its trailing period and is split on "; ", " and "
     # or ": "; only the result for the first comment is returned (nil when
     # +data+ is empty).
     def cc_pathway(data)
       split_comments = data.map do |comment|
         comment.sub(/\.$/, '').split(/; | and |: /)
       end
       split_comments.first
     end
cc_rna_editing(data) click to toggle source
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "RNA EDITING" comment.
     #
     # Returns a Hash with 'Modified_positions' (Array of Strings, taken from
     # the text between "Modified_positions=" and the next '.' or ';') and
     # 'Note' (the text after "Note=", "" when absent).
     #
     # Raises ArgumentError when the joined +data+ has no
     # "Modified_positions=" field.
     def cc_rna_editing(data)
       text = data.join('')

       unless text =~ /Modified_positions=(.+?)(\.|;)/
         raise ArgumentError, "Invalid CC RNA Editing lines (#{entry_id}):#{$!}\n#{get('CC')}"
       end
       positions = $1.sub(/\.$/, '').split(', ')

       note = text =~ /Note=(.+)/ ? $1 : ""

       { 'Modified_positions' => positions, 'Note' => note }
     end
cc_subcellular_location(data) click to toggle source
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "SUBCELLULAR LOCATION" comment.
     #
     # Every comment is split on ". " and each part again on "; ", with a
     # trailing period removed from every item; only the nested Array for the
     # first comment is returned (nil when +data+ is empty).
     def cc_subcellular_location(data)
       parsed = data.map do |comment|
         comment.split('. ').map do |location|
           location.split('; ').map { |term| term.sub(/\.$/, '') }
         end
       end
       parsed.first
     end
cc_web_resource(data) click to toggle source
     # File lib/bio/db/embl/uniprotkb.rb
     # Parses the "WEB RESOURCE" comments.
     #
     # Returns one Hash per comment with the keys 'Name', 'Note' and 'URL'
     # (nil when absent).  The field names Name/Note are accepted both
     # capitalised and in upper case; the URL is taken from between the
     # double quotes.
     def cc_web_resource(data)
       data.map do |comment|
         blank = { 'Name' => nil, 'Note' => nil, 'URL' => nil }
         comment.split(';').each_with_object(blank) do |field, entry|
           if field =~ /(Name|Note)\=(.+)/
             key = $1
             entry[key] = $2.strip
           elsif field =~ /(NAME|NOTE)\=(.+)/
             key = $1.downcase.capitalize
             entry[key] = $2.strip
           elsif field =~ /URL\=\"(.+)\"/
             entry['URL'] = $1.strip
           end
         end
       end
     end
gn_old_parser() click to toggle source

returns contents in the old style GN line.

GN Line: Gene name(s) (>=0, optional)

GN   HNS OR DRDX OR OSMZ OR BGLY.
GN   CECA1 AND CECA2.
GN   CECA1 AND (HOGE OR FUGA).

GN NAME1 [(AND|OR) NAME]+.

Bio::UniProtKB#gn -> Array # AND

#gn[0] -> Array   # OR
#gene_names -> Array
    # File lib/bio/db/embl/uniprotkb.rb
# Parses the old style GN line, e.g. "CECA1 AND (HOGE OR FUGA).".
#
# The fetched GN text is split on " AND "; inside each part parentheses are
# removed and the text is split on " OR ", with every name stripped.
# The result (an Array of Arrays, empty when there is no GN line) is stored
# in @data['GN'] and returned.
def gn_old_parser
  @data['GN'] =
    if get('GN').size > 0
      fetch('GN').sub(/\.$/, '').split(/ AND /).map do |group|
        group.gsub(/\(|\)/, '').split(/ OR /).map { |name| name.strip }
      end
    else
      Array.new
    end
end
gn_uniprot_parser() click to toggle source

returns contents in the structured GN line. The new format of the GN line is:

GN   Name=<name>; Synonyms=<name1>[, <name2>...]; OrderedLocusNames=<name1>[, <name2>...];
GN   ORFNames=<name1>[, <name2>...];

Returns an Array of gene records, where each <gene record> is a Hash of the form:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}
    # File lib/bio/db/embl/uniprotkb.rb
# Parses the structured GN line ("Name=...; Synonyms=...; ...").
#
# Several gene records may be given, separated by a stand-alone "and".
# The result is stored in @data['GN'] and returned as an Array with one
# Hash per gene record:
#   {:name => str, :synonyms => [str, ...], :loci => [str, ...], :orfs => [str, ...]}
def gn_uniprot_parser
  @data['GN'] = Array.new
  gn_line = fetch('GN').strip
  # The separator must be surrounded by whitespace; a bare /and/ would also
  # cut gene names that merely contain those letters (e.g. "Hand1").
  gn_line.split(/\s+and\s+/).each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    record.each_line(';') do |element|
      # chomp(';') drops the terminating ';' only when it is present, so a
      # final element without one keeps its last character.
      case element
      when /Name=/
        gene_hash[:name] = $'.chomp(';')
      when /Synonyms=/
        gene_hash[:synonyms] = $'.chomp(';').split(/\s*,\s*/)
      when /OrderedLocusNames=/
        gene_hash[:loci] = $'.chomp(';').split(/\s*,\s*/)
      when /ORFNames=/
        gene_hash[:orfs] = $'.chomp(';').split(/\s*,\s*/)
      end
    end
    @data['GN'] << gene_hash
  end
  @data['GN']
end
parse_DE_line_rel14(str) click to toggle source

(private) parses DE line (description lines) since UniProtKB release 14.0 of 22-Jul-2008

Returns an array of arrays, or nil if the DE lines are not in the release 14 format.

www.uniprot.org/docs/sp_news.htm

    # File lib/bio/db/embl/uniprotkb.rb
# (private) Parses DE (description) lines written in the structured format.
#
# Returns nil when +str+ has no line starting with
# "DE   RecName: ", "DE   AltName: " or "DE   SubName: ".
# Otherwise returns an Array with one entry per category line, in input
# order:
#   [ "RecName", [ "Full", "..." ], [ "Short", "..." ], ... ]
#   [ "Includes" ]  or  [ "Contains" ]
#   [ "Flags", [ "Precursor", ... ] ]
# "key=value" continuation lines are appended to the most recent
# RecName/AltName/SubName entry.  Lines that cannot be interpreted are
# reported through +warn+ and skipped.
def parse_DE_line_rel14(str)
  # Returns early if the text is not in the structured format.
  return nil unless /^DE   (RecName|AltName|SubName)\: / =~ str

  entries = []
  current = nil # entry that "key=value" pairs are appended to
  str.each_line do |line|
    if (m = /^DE   (Includes|Contains)\: *$/.match(line))
      entries.push([ m[1] ])
      current = nil
      next
    elsif (m = /^DE   *(RecName|AltName|SubName)\: +(.*)/.match(line))
      current = [ m[1] ]
      entries.push(current)
      payload = m[2]
    elsif (m = /^DE   *(Flags)\: +(.*)/.match(line))
      entries.push([ m[1], m[2].strip.split(/\s*\;\s*/) ])
      current = nil
      next
    elsif (m = /^DE   *(.*)/.match(line))
      payload = m[1]
    else
      warn "Warning: skipped DE line in unknown format: #{line.inspect}"
      next
    end

    pair = /\A([^\=]+)\=(.*)/.match(payload)
    unless pair
      warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
      next
    end

    unless current
      # A "key=value" line appeared without a preceding category line.
      warn "Warning: unknown category in DE line: #{line.inspect}"
      current = [ '' ]
      entries.push(current)
    end
    current.push([ pair[1], pair[2].sub(/\;\s*\z/, '') ])
  end
  entries
end
set_RA(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Cleans the text of RA (reference authors) lines: returns a copy of +data+
# with the first ';' (plus any following spaces) that ends a line removed.
def set_RA(data)
  data.sub(/; *$/, '')
end
set_RC(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Parses the text of RC (reference comment) lines such as
# "STRAIN=A, B, and C; TISSUE=Liver;".
#
# Returns a flat Array with one Hash per listed value:
#   [{'Token' => 'STRAIN', 'Text' => 'A'}, {'Token' => 'STRAIN', 'Text' => 'B'}, ...]
# Tokens are words starting with S, T or P; each value list is split on
# ", and " and ", ".
def set_RC(data)
  # The value is matched non-greedily so that every "TOKEN=text;" item is
  # captured on its own; a greedy match would swallow all following items
  # up to the last ';' into the first token's text.
  data.scan(/([STP]\w+)=(.+?);/).map { |token, texts|
    texts.split(/, and |, /).map { |text|
      {'Token' => token, 'Text' => text}
    }
  }.flatten
end
set_RG(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Splits the text of RG (reference group) lines on "; " and returns the
# resulting Array of strings.
def set_RG(data)
  data.split('; ')
end
set_RL(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the text of RL (reference location) lines with leading and
# trailing whitespace removed.
def set_RL(data)
  data.strip
end
set_RP(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Parses the text of RP (reference position) lines.
#
# The text is stripped, the first period ending a line is removed, and the
# remainder is split on ", AND " or ", " (case-insensitive).  Each item is
# stripped and has double spaces collapsed to single ones.
# Returns an Array of strings.
def set_RP(data)
  trimmed = data.strip.sub(/\.$/, '')
  trimmed.split(/, AND |, /i).map do |item|
    item.strip.gsub('  ', ' ')
  end
end
set_RT(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Cleans the text of RT (reference title) lines: removes the first ';'
# (plus following spaces) that ends a line, then removes double quotes
# found at the start or end of any line.
def set_RT(data)
  data.sub(/; *$/, '').gsub(/(^"|"$)/, '')
end
set_RX(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Parses the text of RX (reference cross-reference) lines such as
# "MEDLINE=85158382; PubMed=3886542; DOI=10.1000/xyz;".
#
# Returns a Hash with the keys 'MEDLINE', 'PubMed' and 'DOI'; a key maps to
# the text between "KEY=" and the next ';', or to nil when absent.
def set_RX(data)
  rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
  rx.keys.each do |db|
    found = /#{db}=(.+?);/.match(data)
    rx[db] = found[1] if found
  end
  rx
end