class Bio::UniProt
Description¶ ↑
Parser class for UniProtKB/SwissProt and TrEMBL
database entry.
See the UniProtKB
document files and manuals.
Examples¶ ↑
str = File.read("p53_human.swiss") obj = Bio::UniProtKB.new(str) obj.entry_id #=> "P53_HUMAN"
References
¶ ↑
-
The
UniProt
Knowledgebase (UniProtKB
) www.uniprot.org/help/uniprotkb -
The Universal Protein Resource (
UniProt
) uniprot.org/ -
The UniProtKB/SwissProt/TrEMBL User Manual www.uniprot.org/docs/userman.html
Public Instance Methods
returns contents in the CC lines.
-
Bio::UniProtKB#cc
-> Hash
returns an object of contents in the TOPIC.
-
Bio::UniProtKB#cc(TOPIC)
-> Array w/in Hash, Hash
returns contents of the “ALTERNATIVE PRODUCTS”.
-
Bio::UniProtKB#cc
('ALTERNATIVE PRODUCTS') -> Hash{'Event' => str, 'Named isoforms' => int, 'Comment' => str, 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]} CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative splicing; Named isoforms=15; ... CC placentae isoforms. All tissues differentially splice exon 13; CC Name=A; Synonyms=no del; CC IsoId=P15529-1; Sequence=Displayed;
returns contents of the “DATABASE”.
-
Bio::UniProtKB#cc
('DATABASE') -> Array[{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...] CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
returns contents of the “MASS SPECTROMETRY”.
-
Bio::UniProtKB#cc
('MASS SPECTROMETRY') -> Array[{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...] CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
CC lines (>=0, optional)¶ ↑
CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT CC IN LIVER, KIDNEY, LUNG AND BRAIN. CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK; CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
See also www.expasy.org/sprot/userman.html#CC_line
# File lib/bio/db/embl/uniprotkb.rb, line 774
#
# Returns the contents of the CC (comment) lines.
#
# topic:: optional topic name (String), e.g. 'FUNCTION' or 'DATABASE'.
#
# Without +topic+ the whole table is returned: a Hash mapping each topic
# name to an Array of comment bodies.  With +topic+ the entries of that
# topic are returned, post-processed by the matching cc_* helper where one
# exists.  Returns nil when the entry has CC lines but none for +topic+, and
# an empty Hash when the entry has no CC lines at all (with or without
# +topic+).
#
# Raises RuntimeError when a comment block is not of the form "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    cc = Hash.new
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return cc if get('CC').size == 0

    cc_raw = fetch('CC')

    # Removing the copyright statement.
    cc_raw.sub!(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return cc if cc_raw == ''

    begin
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          body = $2
          # Re-join words that were hyphenated across a line break.
          body.gsub!(/- (?!AND)/, '-')
          body.strip!
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this one clause also
      # covers a nil intermediate result while parsing.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  return @data['CC'] if topic.nil?

  entries = @data['CC'][topic]
  # Topic not present in this entry: answer nil instead of failing on
  # nil.join / nil.map further down.
  return nil if entries.nil?

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(entries)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(entries)
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the key is spelled 'CATALITIC'; entries filed under
    # 'CATALYTIC ACTIVITY' therefore fall through to the raw Array below.
    # Left unchanged because correcting it would alter the return type
    # seen by existing callers -- confirm before renaming.
    cc_catalytic_activity(entries)
  when 'CAUTION'
    cc_caution(entries)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    entries.join('')
  when 'INTERACTION'
    cc_interaction(entries)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(entries)
  when 'PATHWAY'
    cc_pathway(entries)
  when 'RNA EDITING'
    cc_rna_editing(entries)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(entries)
  when 'WEB RESOURCE'
    cc_web_resource(entries)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries.map do |e|
      db = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      e.sub(/.$/, '').split(/;/).each do |line|
        case line
        when /NAME=(.+)/
          db['NAME'] = $1
        when /NOTE=(.+)/
          db['NOTE'] = $1
        when /WWW="(.+)"/
          db['WWW'] = $1
        when /FTP="(.+)"/
          db['FTP'] = $1
        end
      end
      db
    end
  else
    # Every other topic is returned as the raw Array of comment bodies.
    entries
  end
end
# File lib/bio/db/embl/uniprotkb.rb, line 1131
#
# Returns the database cross-references.
#
# Without +key+ the result of embl_dr is returned untouched.  With +key+
# the records stored under that key are returned as an Array of Hashes
# (empty when the key is absent).  The third field is stored under the
# key ' ' (a single blank), exactly as callers currently receive it.
def dr(key = nil)
  return embl_dr unless key

  records = embl_dr[key] || []
  records.map do |fields|
    {
      'Accession'      => fields[0],
      'Version'        => fields[1],
      ' '              => fields[2],
      'Molecular Type' => fields[3]
    }
  end
end
returns a Hash of information in the DT lines.
hash keys: ['created', 'sequence', 'annotation']
Since UniProtKB
release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.
returns a String of information in the DT lines by a given key.
DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.) DT DD-MMM-YYYY (sequence version NN) DT DD-MMM-YYYY (entry version NN)
The format has been changed in UniProtKB
release 7.0 of 07-Feb-2006. Below is the older format.
Old format of DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYYY (rel. NN, Created) DT DD-MMM-YYYY (rel. NN, Last sequence update) DT DD-MMM-YYYY (rel. NN, Last annotation update)
# File lib/bio/db/embl/uniprotkb.rb, line 157
#
# Returns the DT (date) lines.
#
# Without +key+ a Hash with the keys 'created', 'sequence' and
# 'annotation' is returned, holding the first, second and third DT line
# with the line code removed.  With +key+ only that value is returned.
# The parsed Hash is cached in @data['DT'].
def dt(key = nil)
  return dt[key] if key

  cached = @data['DT']
  return cached if cached

  lines = get('DT').split(/\n/)
  labels = ['created', 'sequence', 'annotation']
  parsed = {}
  labels.each_with_index do |label, index|
    # Drop the two-letter line code, then trim the remaining padding.
    parsed[label] = lines[index].sub(/\w{2} /, '').strip
  end
  @data['DT'] = parsed
end
returns a ENTRY_NAME in the ID line.
# File lib/bio/db/embl/uniprotkb.rb, line 98
#
# Returns the ENTRY_NAME field of the ID line (looked up through id_line).
def entry_id
  id_fields = id_line
  id_fields['ENTRY_NAME']
end
returns contents in the feature table.
Examples¶ ↑
sp = Bio::UniProtKB.new(entry) ft = sp.ft ft.class #=> Hash ft.keys.each do |feature_key| ft[feature_key].each do |feature| feature['From'] #=> '1' feature['To'] #=> '21' feature['Description'] #=> '' feature['FTId'] #=> '' feature['diff'] #=> [] feature['original'] #=> [feature_key, '1', '21', '', ''] end end
-
Bio::UniProtKB#ft
-> Hash{FEATURE_KEY => [{'From' => int, 'To' => int, 'Description' => aStr, 'FTId' => aStr, 'diff' => [original_residues, changed_residues], 'original' => aAry }],...}
returns an Array of the information about the feature_name in the feature table.
-
Bio::UniProtKB#ft(feature_name)
-> Array of Hash[{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
FT Line; feature table data (>=0, optional)¶ ↑
Col Data item ----- ----------------- 1- 2 FT 6-13 Feature name 15-20 `FROM' endpoint 22-27 `TO' endpoint 35-75 Description (>=0 per key) ----- -----------------
Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical characters including '<', '>' or '?'. (c.f. '<1', '?42')
See also www.expasy.org/sprot/userman.html#FT_line
# File lib/bio/db/embl/uniprotkb.rb, line 1196
#
# Returns the feature table (FT lines).
#
# feature_key:: optional feature name (String), e.g. 'VARIANT'.
#
# Without +feature_key+ a Hash is returned that maps each feature name to
# an Array of Hashes with the keys 'From', 'To' (Integers), 'Description',
# 'FTId', 'diff' and 'original'.  With +feature_key+ only the Array for
# that feature is returned (nil when absent).  The table is cached in
# @data['FT'].
#
# For VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features 'diff' holds
# [original_residues, changed_residues]; for "Missing" descriptions the
# original residues are read from seq and the changed residues are ''.
#
# Raises RuntimeError ("Invalid FT Lines ...") when parsing fails.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  table = []
  hash = {}
  begin
    get('FT').split("\n").each do |line|
      # A new feature starts with the name in columns 6-13, i.e. right
      # after "FT" and three blanks; anything else continues the
      # description of the previous feature.
      if line =~ /^FT {3}\w/
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip,   # Feature Name
                  feature[14..19].strip,   # From
                  feature[21..26].strip,   # To
                  feature[34..74].strip ]  # Description
      else
        # Non-destructive sub: never appends nil to the feature row.
        table.last << line.chomp.sub(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map { |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid || '')
    }

    table.each do |feature|
      name = feature[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To' => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (hash[name] ||= []) << entry

      case name
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Capture both groups before gsub resets the match data.
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{get('FT')}'\n"
  end

  @data['FT'] = hash
end
returns a String of the first gene name in the GN line.
# File lib/bio/db/embl/uniprotkb.rb, line 437
#
# Returns the first element of gene_names, or nil when gene_names itself
# is nil.
def gene_name
  names = gene_names
  return nil unless names

  names.first
end
returns a Array of gene names in the GN line.
# File lib/bio/db/embl/uniprotkb.rb, line 426
#
# Returns the gene names found in the GN line.
#
# When @data['GN'] holds gene record Hashes, the :name of every record
# is returned.  Otherwise the first element of @data['GN'] is returned
# as is (nil when @data['GN'] is empty).
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  head = records.first
  return head unless head.instance_of?(Hash)

  records.map { |record| record[:name] }
end
returns gene names in the GN line.
New UniProt/SwissProt format:
-
Bio::UniProtKB#gn
-> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
Old format:
-
Bio::UniProtKB#gn
-> Array # AND -
Bio::UniProtKB#gn[0]
-> Array # OR
GN Line: Gene name(s) (>=0, optional)¶ ↑
# File lib/bio/db/embl/uniprotkb.rb, line 350
#
# Returns the parsed GN line, cached in @data['GN'].
#
# The structured parser (gn_uniprot_parser) is used when the line
# contains any of "Name=", "ORFNames=", "OrderedLocusNames=" or
# "Synonyms="; otherwise gn_old_parser is used.
def gn
  return @data['GN'] if @data['GN']

  structured = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
  line = fetch('GN')
  parsed = if structured.any? { |pattern| pattern === line }
             gn_uniprot_parser
           else
             gn_old_parser
           end
  @data['GN'] = parsed
  @data['GN']
end
The HI line¶ ↑
Bio::UniProtKB#hi
#=> Array of Hash (each with the keys 'Category', 'Keywords' and 'Keyword')
# File lib/bio/db/embl/uniprotkb.rb, line 690
#
# Returns the HI line as an Array of Hashes, cached in @data['HI'].
#
# Each ". "-separated group of the form "Category: kw1; kw2; kw3" becomes
#   {'Category' => 'Category', 'Keywords' => ['kw1', 'kw2'], 'Keyword' => 'kw3'}
# i.e. the last keyword is split off (without its trailing period).
def hi
  return @data['HI'] if @data['HI']

  # The cache is assigned before parsing, so the list is filled in place.
  groups = @data['HI'] = []
  fetch('HI').split(/\. /).each do |group|
    category, keywords = group.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    groups << {'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword}
  end
  @data['HI']
end
returns a Hash of the ID line.
returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']
ID Line (since UniProtKB
release 9.0 of 31-Oct-2006)¶ ↑
ID P53_HUMAN Reviewed; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
ID Line (older style)¶ ↑
ID P53_HUMAN STANDARD; PRT; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
# File lib/bio/db/embl/uniprotkb.rb, line 73
#
# Returns the parsed ID line.
#
# Without +key+ a Hash with the keys 'ENTRY_NAME', 'DATA_CLASS',
# 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH' is returned (cached in
# @data['ID']); with +key+ only that value is returned.
#
# Two layouts are recognised by looking at the fifth blank-separated
# field: when it is "AA." the line has no molecule type (the format used
# since UniProtKB release 9.0) and 'MOLECULE_TYPE' is nil.
def id_line(key = nil)
  return id_line[key] if key

  cached = @data['ID']
  return cached if cached

  fields = @orig['ID'].split(/ +/)
  # after UniProtKB release 9.0 of 31-Oct-2006
  # (http://www.uniprot.org/docs/sp_news.htm)
  without_molecule = (fields[4].to_s.chomp == 'AA.')
  molecule_type = without_molecule ? nil : fields[3].sub(/;/, '')
  length_field = without_molecule ? fields[3] : fields[4]

  @data['ID'] = {
    'ENTRY_NAME'      => fields[1],
    'DATA_CLASS'      => fields[2].sub(/;/, ''),
    'MOLECULE_TYPE'   => molecule_type,
    'SEQUENCE_LENGTH' => length_field.to_i
  }
end
returns a MOLECULE_TYPE in the ID line.
A short-cut for Bio::UniProtKB#id_line
('MOLECULE_TYPE').
# File lib/bio/db/embl/uniprotkb.rb, line 108
#
# Returns the MOLECULE_TYPE field of the ID line (looked up through
# id_line).
def molecule
  id_fields = id_line
  id_fields['MOLECULE_TYPE']
end
The OH Line; ¶ ↑
OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line
# File lib/bio/db/embl/uniprotkb.rb, line 520
#
# Returns the OH (organism host) line as an Array of Hashes
#   {'NCBI_TaxID' => '12345', 'HostName' => 'Host name'}
# cached in @data['OH'].  'HostName' is nil when nothing follows the
# taxonomy identifier.
#
# Raises ArgumentError when a host record has no "NCBI_TaxID=<digits>;".
def oh
  return @data['OH'] if @data['OH']

  hosts = fetch('OH').split("\. ").map do |record|
    taxid = record[/NCBI_TaxID=(\d+);/, 1]
    unless taxid
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end

    host_name = record[/NCBI_TaxID=\d+; (.+)/, 1]
    host_name = host_name.sub(/\.$/, '') if host_name

    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  end
  @data['OH'] = hosts
  @data['OH']
end
returns an Array of Hashes, or a String of the OS line when an index is given.
-
Bio::EMBLDB#os -> Array
[{'name' => '(Human)', 'os' => 'Homo sapiens'}, {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
-
Bio::EPTR#os -> Hash
{'name' => "(Human)", 'os' => 'Homo sapiens'}
-
Bio::UniProtKB#os[0]
['name'] -> “(Human)” -
Bio::EPTR#os(0) -> “Homo sapiens (Human)”
OS Line; organism species (>=1)¶ ↑
OS Genus species (name). OS Genus species (name0) (name1). OS Genus species (name0) (name1). OS Genus species (name0), G s0 (name0), and G s (name0) (name1). OS Homo sapiens (Human), and Rattus norvegicus (Rat) OS Hippotis sp. Clark and Watts 825. OS unknown cyperaceous sp.
# File lib/bio/db/embl/uniprotkb.rb, line 459
#
# Returns the organisms named in the OS line.
#
# Without +num+ an Array of Hashes {'name' => '(Human)', 'os' => 'Homo
# sapiens'} is returned (cached in @data['OS']); 'name' is nil when the
# organism has no parenthesised name.  With +num+ the organism at that
# index is returned as a single String "os name".
#
# Raises RuntimeError when a part of the line contains no organism name.
def os(num = nil)
  unless @data['OS']
    organisms = fetch('OS').split(/, and|, /).map do |part|
      species = part[/(\w+ *[\w \:\'\+\-\.]+[\w\.])/, 1]
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n" unless species

      {'name' => part[/(\(.+\))/, 1], 'os' => species}
    end
    @data['OS'] = organisms
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  organism = @data['OS'][num]
  "#{organism['os']} #{organism['name']}"
end
returns a Hash of organism taxonomy cross-references.
-
Bio::UniProtKB#ox
-> Hash{'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
OX Line; organism taxonomy cross-reference (>=1 per entry)¶ ↑
OX NCBI_TaxID=1234; OX NCBI_TaxID=1234, 2345, 3456, 4567;
# File lib/bio/db/embl/uniprotkb.rb, line 503
#
# Returns the OX (taxonomy cross-reference) line as a Hash that maps a
# database name to an Array of identifiers, e.g.
#   {'NCBI_TaxID' => ['1234', '2345']}
# The result is cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  assignments = fetch('OX').sub(/\.$/, '').split(/;/).map { |e| e.strip }
  xrefs = Hash.new
  assignments.each do |assignment|
    database, identifiers = assignment.split(/=/)
    xrefs[database] = identifiers.split(/, */)
  end
  @data['OX'] = xrefs
  return @data['OX']
end
returns the proposed official name of the protein. Returns a String.
Since UniProtKB
release 14.0 of 22-Jul-2008, the DE line format have been changed. The method returns the full name which is taken from “RecName: Full=” or “SubName: Full=” line normally in the beginning of the DE lines. Unlike parser for old format, no special treatments for fragment or precursor.
For old format, the method parses the DE lines and returns the protein name as a String.
DE Line; description (>=1)¶ ↑
"DE #{OFFICIAL_NAME} (#{SYNONYM})" "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]." OFFICIAL_NAME 1/entry SYNONYM >=0 CONTEINS >=0
# File lib/bio/db/embl/uniprotkb.rb, line 250
#
# Returns the protein name as a String.
#
# When the DE lines parse as the structured format (parse_DE_line_rel14
# returns records), the value of the first "Full" sub-entry found in a
# RecName or SubName record is returned, or "" when there is none.
#
# Otherwise the old free-text format is assumed: everything before the
# first "(" of the part preceding the first "[" is returned, with
# " (Fragment)" appended when that part mentions a fragment.
def protein_name
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  records = @data['DE']

  unless records
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    return "" unless de_line

    head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
    name = head[/^[^(]*/].strip
    name << ' (Fragment)' if head =~ /fragment/i
    return name
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  records.each do |record|
    next unless record[0] == 'RecName' || record[0] == 'SubName'

    full = record[1..-1].find { |sub_entry| sub_entry[0] == 'Full' }
    return full[1].to_s if full
  end
  ''
end
returns contents in the R lines.
-
Bio::EMBLDB::Common#ref
-> [ <reference information Hash>* ]
where <reference information Hash> is:
{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
R Lines
-
RN RC RP RX RA RT RL RG
# File lib/bio/db/embl/uniprotkb.rb, line 556
#
# Returns the reference (R*) lines as an Array with one Hash per
# reference.  Each Hash has the keys 'RN', 'RC', 'RP', 'RX', 'RA', 'RT',
# 'RL' and 'RG'; the text collected for each line code is passed through
# the matching set_R* helper.  The result is cached in @data['R'].
#
# The line code may be followed by any number of blanks (flat-file lines
# carry three), so the padding never ends up in the collected text.
#
# Raises RuntimeError when a line does not start with a known R* code.
def ref
  unless @data['R']
    @data['R'] = get('R').split(/\nRN +/).map { |str|
      hash = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
              'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
      # Splitting removed the "RN" code from every reference but the first.
      str = 'RN   ' + str unless /^RN / =~ str

      str.split("\n").each do |line|
        if /^(R[NPXARLCTG]) +(.+)/ =~ line
          hash[$1] += $2 + ' '
        else
          raise "Invalid format in R lines, \n[#{line}]\n"
        end
      end

      hash['RN'] = set_RN(hash['RN'])
      hash['RC'] = set_RC(hash['RC'])
      hash['RP'] = set_RP(hash['RP'])
      hash['RX'] = set_RX(hash['RX'])
      hash['RA'] = set_RA(hash['RA'])
      hash['RT'] = set_RT(hash['RT'])
      hash['RL'] = set_RL(hash['RL'])
      hash['RG'] = set_RG(hash['RG'])

      hash
    }
  end
  @data['R']
end
returns Bio::Reference
object from Bio::EMBLDB::Common#ref
.
# File lib/bio/db/embl/uniprotkb.rb, line 650
#
# Returns a References object built from the Hashes returned by ref,
# cached in @data['references'].
#
# For every reference a Hash (default value '') is filled and handed to
# Reference.new:
# * 'RA' is split on ", " into 'authors'
# * 'RT' becomes 'title'
# * 'RL' is split into 'journal', 'volume', 'issue', 'pages' and 'year'
#   when it matches "Journal VOL (ISSUE), FROM-TO (YEAR)"; otherwise the
#   whole value becomes 'journal'
# * every tag/value pair of 'RX' is stored under the downcased tag.
def references
  return @data['references'] if @data['references']

  list = self.ref.map do |entry|
    attrs = Hash.new('')
    entry.each do |code, value|
      if code == 'RA'
        attrs['authors'] = value.split(/, /)
      elsif code == 'RT'
        attrs['title'] = value
      elsif code == 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          attrs['journal'] = $1
          attrs['volume']  = $2
          attrs['issue']   = $3
          attrs['pages']   = $4
          attrs['year']    = $5
        else
          attrs['journal'] = value
        end
      elsif code == 'RX' # PUBMED, MEDLINE, DOI
        value.each do |tag, xref|
          attrs[ tag.downcase ] = xref
        end
      end
    end
    Reference.new(attrs)
  end

  @data['references'] = References.new(list)
  @data['references']
end
returns a Bio::Sequence::AA
of the amino acid sequence.
blank Line; sequence data (>=1)
# File lib/bio/db/embl/uniprotkb.rb, line 1306
#
# Returns the sequence as a Sequence::AA object, cached in @data[''].
# Blanks and digits are removed from the raw sequence block first.
def seq
  cached = @data['']
  return cached if cached

  residues = fetch('').gsub(/ |\d+/, '')
  @data[''] = Sequence::AA.new(residues)
  return @data['']
end
returns a SEQUENCE_LENGTH in the ID line.
A short-cut for Bio::UniProtKB#id_line
('SEQUENCE_LENGTH').
# File lib/bio/db/embl/uniprotkb.rb, line 117
#
# Returns the SEQUENCE_LENGTH field of the ID line (looked up through
# id_line).
def sequence_length
  id_fields = id_line
  id_fields['SEQUENCE_LENGTH']
end
# File lib/bio/db/embl/uniprotkb.rb, line 587
#
# Normalises the text collected from the RN lines: returns a copy of
# +data+ without leading and trailing whitespace.
def set_RN(data)
  trimmed = data.strip
  trimmed
end
returns a Hash of contents in the SQ lines.
-
Bio::UniProtKB#sq -> hsh
returns a value of a key given in the SQ lines.
-
Bio::UniProtKB#sq(key) -> int or str
-
Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',
'CRC64']
SQ Line; sequence header (1/entry)¶ ↑
SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64; SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
# File lib/bio/db/embl/uniprotkb.rb, line 1278
#
# Returns the SQ (sequence header) line.
#
# Without +key+ the Hash {'aalen' => Integer, 'MW' => Integer,
# 'CRC64' => String} is returned (cached in @data['SQ']).  With +key+:
# * a key matching /mw/, /molecular/ or /weight/ returns the 'MW' value
# * a key matching /len/, /length/ or /AA/ returns the 'aalen' value
# * any other key is looked up in the Hash directly.
#
# Raises RuntimeError when the SQ line cannot be parsed.
def sq(key = nil)
  unless @data['SQ']
    header = fetch('SQ')
    unless header =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
    @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
  end

  parsed = @data['SQ']
  return parsed unless key

  if [/mw/, /molecular/, /weight/].any? { |pattern| pattern === key }
    parsed['MW']
  elsif [/len/, /length/, /AA/].any? { |pattern| pattern === key }
    parsed['aalen']
  else
    parsed[key]
  end
end
returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.
Since UniProtKB
release 14.0 of 22-Jul-2008, the DE line format have been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. For keeping compatibility with old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.
For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.
# File lib/bio/db/embl/uniprotkb.rb, line 293
#
# Returns the synonyms (unofficial and/or alternative names) as an Array
# of Strings.
#
# Structured DE lines: every sub-entry of the RecName, SubName and
# AltName records that precede the first Includes/Contains record is
# reported, except values equal to protein_name.  "EC" values are
# prefixed with "EC ", "Allergen" and "CD_antigen" values with
# "<key>=".
#
# Old free-text DE lines: every parenthesised name outside the [...]
# part is reported, except those mentioning a fragment.
def synonyms
  ary = Array.new
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']
  if parsed_de_line then
    # since UniProtKB release 14.0 of 22-Jul-2008
    # The official name does not change while iterating, so look it up
    # once instead of once per sub-entry.
    official_name = self.protein_name
    parsed_de_line.each do |a|
      case a[0]
      when 'Includes', 'Contains'
        break #the each loop
      when 'RecName', 'SubName', 'AltName'
        a[1..-1].each do |b|
          value = b[1]
          next if !value || value == official_name

          name = case b[0]
                 when 'EC'
                   "EC " + value
                 when 'Allergen', 'CD_antigen'
                   b[0] + '=' + value
                 else
                   value
                 end
          ary.push name
        end
      end #case a[0]
    end #parsed_de_line.each
  else
    # old format (before Rel. 13.x)
    if de_line = fetch('DE') then
      line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part
      line.scan(/\([^)]+/) do |synonym|
        unless synonym =~ /fragment/i then
          ary << synonym[1..-1].strip # index to remove the leading (
        end
      end
    end
  end
  return ary
end
Private Instance Methods
# File lib/bio/db/embl/uniprotkb.rb, line 913
#
# Parses the "ALTERNATIVE PRODUCTS" comment.
#
# data:: Array of comment bodies for the topic, or nil when the entry
#        has no such comment.
#
# Returns nil for nil input.  Otherwise returns a Hash with the keys
# 'Event' (Array of Strings, or "" when absent), 'Named isoforms'
# (String), 'Comment' (String) and 'Variants' (Array of the Hashes built
# by cc_alternative_products_variants).
def cc_alternative_products(data)
  # Guard before joining: the previous check ran after data.join('') and
  # therefore could never trigger.
  return data unless data

  ap = data.join('')

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
         'Variants' => []}
  if /Event=(.+?);/ =~ ap
    tmp['Event'] = $1
    tmp['Event'] = tmp['Event'].sub(/;/,'').split(/, /)
  end
  if /Named isoforms=(\S+?);/ =~ ap
    tmp['Named isoforms'] = $1
  end
  if /Comment=(.+?);/m =~ ap
    tmp['Comment'] = $1
  end
  ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
    tmp['Variants'] << cc_alternative_products_variants(ent)
  end
  return tmp
end
# File lib/bio/db/embl/uniprotkb.rb, line 937
#
# Parses one isoform description of the "ALTERNATIVE PRODUCTS" comment,
# e.g. "Name=A; Synonyms=no del; IsoId=P15529-1; Sequence=Displayed;".
#
# Returns a Hash that starts as {'Name' => '', 'Synonyms' => [],
# 'IsoId' => [], 'Sequence' => []}.  Every "key=value" pair overwrites
# its key; the values of 'Sequence', 'Synonyms' and 'IsoId' are split on
# ", " into Arrays.
def cc_alternative_products_variants(data)
  list_keys = ['Sequence', 'Synonyms', 'IsoId']
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}

  data.split(/; /).each do |assignment|
    key, value = assignment.split(/=/)
    value = value.sub(/;/,'').split(/, /) if list_keys.include?(key)
    variant[key] = value
  end
  variant
end
# File lib/bio/db/embl/uniprotkb.rb, line 951
#
# Parses the first "BIOPHYSICOCHEMICAL PROPERTIES" comment body.
#
# data:: Array of comment bodies; only data[0] is examined.
#
# Returns a Hash with the keys 'Absorption' and 'Kinetic parameters'
# (Hashes that receive 'Abs(max)'/'Note' and 'KM'/'Vmax'/'Note' when the
# corresponding text is found) and 'pH dependence', 'Redox potential'
# and 'Temperature dependence' (Strings, "" when not found).
def cc_biophysiochemical_properties(data)
  text = data[0]

  absorption = {}
  kinetics = {}
  result = {'Absorption' => absorption,
            'Kinetic parameters' => kinetics,
            'pH dependence' => "",
            'Redox potential' => "",
            'Temperature dependence' => ""}

  absorption['Abs(max)'] = $1 if text =~ /Absorption: Abs\(max\)=(.+?);/
  absorption['Note'] = $1 if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/

  if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    kinetics['KM'] = $1
    kinetics['Vmax'] = $2
  end
  kinetics['Note'] = $1 if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/

  result['pH dependence'] = $1 if text =~ /pH dependence: (.+?);/
  result['Redox potential'] = $1 if text =~ /Redox potential: (.+?);/
  result['Temperature dependence'] = $1 if text =~ /Temperature dependence: (.+?);/
  result
end
# File lib/bio/db/embl/uniprotkb.rb, line 986
#
# Returns the "CAUTION" comment bodies concatenated into one String
# (no separator is inserted between them).
def cc_caution(data)
  bodies = data
  bodies.join('')
end
returns contents in a line of the CC INTERACTION section.
CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
# File lib/bio/db/embl/uniprotkb.rb, line 995
#
# Parses the "INTERACTION" comment, e.g.
#   P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
#
# Returns an Array with one Hash per interaction:
# 'SP_Ac'::               text before the ":" of the partner field, or
#                         entry_id when the partner is "Self"
# 'identifier'::          first word after the ":", or entry_id for "Self"
# 'NbExp'::               the NbExp value (String)
# 'IntAct'::              the IntAct identifiers (Array of Strings)
# 'optional_identifier':: text after the last blank of an "AC:ID ..."
#                         partner field, otherwise nil
def cc_interaction(data)
  joined = data.join('')
  joined.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/).map do |fields|
    partner, nb_exp, intact = fields.map { |field| field.strip }

    accession = nil
    identifier = nil
    if partner =~ /^(.+):(.+)/
      accession = $1
      identifier = $2.split(' ')[0]
    elsif partner =~ /Self/
      accession = self.entry_id
      identifier = self.entry_id
    end
    optional = partner[/^.+:.+ (.+)/, 1]

    {'SP_Ac' => accession,
     'identifier' => identifier,
     'NbExp' => nb_exp,
     'IntAct' => intact.split(', '),
     'optional_identifier' => optional}
  end
end
# File lib/bio/db/embl/uniprotkb.rb, line 1023
#
# Parses the "MASS SPECTROMETRY" comments of the form
#   MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# data:: Array of comment bodies, or nil.
#
# Returns nil for nil input, otherwise an Array with one Hash per
# comment holding the keys 'MW', 'MW_ERR', 'METHOD', 'RANGE' and 'NOTE'
# (Strings, nil when the field is absent).
def cc_mass_spectrometry(data)
  return data unless data

  # Checked in this order; the first pattern that matches a field wins.
  fields = [['MW',     /MW=(.+)/],
            ['MW_ERR', /MW_ERR=(.+)/],
            ['METHOD', /METHOD=(.+)/],
            ['RANGE',  /RANGE=(\d+-\d+)/],   # RANGE class ?
            ['NOTE',   /NOTE=(.+)/]]

  data.map do |comment|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    comment.sub(/.$/,'').split(/;/).each do |part|
      fields.each do |name, pattern|
        found = pattern.match(part)
        next unless found

        mass[name] = found[1]
        break
      end
    end
    mass
  end
end
# File lib/bio/db/embl/uniprotkb.rb, line 1050
#
# Parses the "PATHWAY" comments.
#
# Every comment body loses its trailing period and is split on "; ",
# " and " and ": "; the parts of the first comment are returned as an
# Array of Strings (nil when +data+ is empty).
def cc_pathway(data)
  split_comments = data.map do |comment|
    comment.sub(/\.$/, '').split(/; | and |: /)
  end
  split_comments.first
end
# File lib/bio/db/embl/uniprotkb.rb, line 1058
#
# Parses the "RNA EDITING" comment.
#
# data:: Array of comment bodies; they are concatenated before parsing.
#
# Returns {'Modified_positions' => [String, ...], 'Note' => String}
# ('Note' is "" when the comment has none).
#
# Raises ArgumentError when the comment has no "Modified_positions="
# field.
def cc_rna_editing(data)
  data = data.join('')
  entry = {'Modified_positions' => [], 'Note' => ""}
  if data =~ /Modified_positions=(.+?)(\.|;)/
    entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
  else
    # No exception is being handled here, so $! is not part of the message.
    raise ArgumentError, "Invalid CC RNA Editing lines (#{entry_id}):\n#{get('CC')}"
  end
  if data =~ /Note=(.+)/
    entry['Note'] = $1
  end
  entry
end
# File lib/bio/db/embl/uniprotkb.rb, line 1074
#
# Parses the "SUBCELLULAR LOCATION" comments.
#
# Every comment body is split on ". " into locations and every location
# on "; " into terms (each without a trailing period).  The nested Array
# built for the first comment is returned (nil when +data+ is empty).
def cc_subcellular_location(data)
  parsed = data.map do |comment|
    comment.split('. ').map do |location|
      location.split('; ').map { |term| term.sub(/\.$/, '') }
    end
  end
  parsed.first
end
# File lib/bio/db/embl/uniprotkb.rb, line 1092
#
# Parses the "WEB RESOURCE" comments.
#
# Returns an Array with one Hash per comment holding the keys 'Name',
# 'Note' and 'URL' (nil when absent).  The field names "Name"/"Note" are
# accepted as written and in upper case ("NAME"/"NOTE"); the URL is the
# text between the double quotes of a URL="..." field.
def cc_web_resource(data)
  data.map do |comment|
    entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    comment.split(';').each do |field|
      if field =~ /(Name|Note)\=(.+)/
        key = $1
        entry[key] = $2.strip
      elsif field =~ /(NAME|NOTE)\=(.+)/
        key = $1.downcase.capitalize
        entry[key] = $2.strip
      elsif field =~ /URL\=\"(.+)\"/
        entry['URL'] = $1.strip
      end
    end
    entry
  end
end
returns contents in the old style GN line.
GN Line: Gene name(s) (>=0, optional)¶ ↑
GN HNS OR DRDX OR OSMZ OR BGLY. GN CECA1 AND CECA2. GN CECA1 AND (HOGE OR FUGA). GN NAME1 [(AND|OR) NAME]+.
Bio::UniProtKB#gn
-> Array # AND
#gn[0] -> Array # OR #gene_names -> Array
# File lib/bio/db/embl/uniprotkb.rb, line 374
#
# Parses the old style GN line, e.g. "CECA1 AND (HOGE OR FUGA).".
#
# Returns (and stores in @data['GN']) an Array with one element per
# " AND "-separated group; each element is the Array of the
# " OR "-separated names of that group, without parentheses.  An entry
# without GN lines yields an empty Array.
def gn_old_parser
  groups = Array.new
  if get('GN').size > 0
    and_groups = fetch('GN').sub(/\.$/,'').split(/ AND /)
    groups = and_groups.map do |group|
      alternatives = group.gsub(/\(|\)/,'').split(/ OR /)
      alternatives.map { |name| name.strip }
    end
  end
  @data['GN'] = groups
end
returns contents in the structured GN line. The new format of the GN line is:
GN Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...]; GN ORFNames=[, ...];
-
Bio::UniProtKB#gn
-> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
# File lib/bio/db/embl/uniprotkb.rb, line 400
#
# Parses the structured GN line:
#   Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...]; ORFNames=[, ...];
# Several genes are separated by the word "and" following the ";" that
# closes the previous gene.
#
# Returns (and stores in @data['GN']) an Array with one Hash per gene:
#   { :name => '...', :synonyms => [...], :loci => [...], :orfs => [...] }
def gn_uniprot_parser
  @data['GN'] = Array.new
  gn_line = fetch('GN').strip
  # Split only on an "and" that directly follows a terminating ";", so
  # names or synonyms that merely contain "and" (e.g. Hand1) stay intact.
  records = gn_line.split(/(?<=;)\s*and\s+/)
  records.each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    record.each_line(';') do |element|
      # $' is the text after the matched key; chomp drops the closing ";".
      case element
      when /Name=/ then
        gene_hash[:name] = $'.chomp(';')
      when /Synonyms=/ then
        gene_hash[:synonyms] = $'.chomp(';').split(/\s*,\s*/)
      when /OrderedLocusNames=/ then
        gene_hash[:loci] = $'.chomp(';').split(/\s*,\s*/)
      when /ORFNames=/ then
        gene_hash[:orfs] = $'.chomp(';').split(/\s*,\s*/)
      end
    end
    @data['GN'] << gene_hash
  end
  return @data['GN']
end
(private) parses DE line (description lines) since UniProtKB
release 14.0 of 22-Jul-2008
Return array containing array.
www.uniprot.org/docs/sp_news.htm
# File lib/bio/db/embl/uniprotkb.rb, line 176
#
# (private) Parses the structured DE (description) lines.
#
# str:: the raw DE lines, one per line, each starting with the "DE" line
#       code followed by blanks.
#
# Returns nil when +str+ has no line starting a RecName, AltName or
# SubName category (i.e. it is not in the structured format).  Otherwise
# returns an Array of records:
# * ['RecName' | 'AltName' | 'SubName', [subcategory, text], ...]
# * ['Includes'] or ['Contains'] for a section header
# * ['Flags', [flag, ...]]
# Lines that cannot be interpreted are reported with warn and skipped.
def parse_DE_line_rel14(str)
  # Returns if it is not the new format since Rel.14.  The line code may
  # be followed by any number of blanks (three in flat files).
  return nil unless /^DE +(RecName|AltName|SubName)\: / =~ str
  ret = []
  cur = nil
  str.each_line do |line|
    case line
    when /^DE +(Includes|Contains)\: *$/
      cur = [ $1 ]
      ret.push cur
      # A section header carries no sub-entries of its own.
      cur = nil
      next
    when /^DE +(RecName|AltName|SubName)\: +(.*)/
      category = $1
      subcat_and_desc = $2
      cur = [ category ]
      ret.push cur
    when /^DE +(Flags)\: +(.*)/
      category = $1
      desc = $2
      flags = desc.strip.split(/\s*\;\s*/)
      cur = [ category, flags ]
      ret.push cur
      cur = nil
      next
    when /^DE *(.*)/
      # Continuation line: belongs to the category opened before.
      subcat_and_desc = $1
    else
      warn "Warning: skipped DE line in unknown format: #{line.inspect}"
      next
    end
    case subcat_and_desc
    when nil
      # does nothing
    when /\A([^\=]+)\=(.*)/
      subcat = $1
      desc = $2
      desc.sub!(/\;\s*\z/, '')
      unless cur
        warn "Warning: unknown category in DE line: #{line.inspect}"
        cur = [ '' ]
        ret.push cur
      end
      cur.push [ subcat, desc ]
    else
      warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
    end
  end
  ret
end
# File lib/bio/db/embl/uniprotkb.rb, line 625
#
# Normalises the text collected from the RA (authors) lines: returns a
# copy of +data+ without the final ";" and any blanks following it.
def set_RA(data)
  without_terminator = data.sub(/; *$/, '')
  without_terminator
end
# File lib/bio/db/embl/uniprotkb.rb, line 591
#
# Parses the text collected from the RC (reference comment) lines, e.g.
# "STRAIN=K12; TISSUE=Liver, and Brain;".
#
# Returns an Array of {'Token' => token, 'Text' => text} Hashes, one per
# value; values of one token are separated by ", " or ", and ".  Only
# tokens starting with S, T or P are recognised.
def set_RC(data)
  # The value is matched lazily so that each "TOKEN=text;" pair is picked
  # up on its own instead of everything up to the last ";".
  data.scan(/([STP]\w+)=(.+?);/).map { |token, text|
    text.split(/, and |, /).map { |value|
      {'Token' => token, 'Text' => value}
    }
  }.flatten
end
# File lib/bio/db/embl/uniprotkb.rb, line 641
#
# Splits the text collected from the RG (reference group) lines on "; "
# and returns the parts as an Array of Strings.
def set_RG(data)
  groups = data.split('; ')
  groups
end
# File lib/bio/db/embl/uniprotkb.rb, line 636
#
# Normalises the text collected from the RL (reference location) lines:
# returns a copy of +data+ without leading and trailing whitespace.
def set_RL(data)
  location = data.strip
  location
end
# File lib/bio/db/embl/uniprotkb.rb, line 600
#
# Parses the text collected from the RP (reference position) lines, e.g.
# "NUCLEOTIDE SEQUENCE [MRNA], AND VARIANT ARG-72.".
#
# Returns an Array of Strings: the text without its trailing period,
# split on ", AND " / ", " (case-insensitive), each part trimmed and
# with runs of blanks collapsed to a single blank.
def set_RP(data)
  data = data.strip
  data = data.sub(/\.$/, '')
  data.split(/, AND |, /i).map {|x|
    # squeeze collapses repeated blanks; substituting a single blank with
    # a single blank would leave the text unchanged.
    x.strip.squeeze(' ')
  }
end
# File lib/bio/db/embl/uniprotkb.rb, line 630
#
# Normalises the text collected from the RT (reference title) lines:
# removes the final ";" (with any blanks following it) and then a double
# quote at the start and at the end of a line.
def set_RT(data)
  title = data.sub(/; *$/, '')
  title.gsub(/(^"|"$)/, '')
end
# File lib/bio/db/embl/uniprotkb.rb, line 610
#
# Parses the text collected from the RX (reference cross-reference)
# lines, e.g. "MEDLINE=85230577; PubMed=4006916; DOI=10.1000/x;".
#
# Returns a Hash with the keys 'MEDLINE', 'PubMed' and 'DOI'; each value
# is the identifier found up to the next ";", or nil when absent.
def set_RX(data)
  rx = {}
  ['MEDLINE', 'PubMed', 'DOI'].each do |database|
    rx[database] = (data =~ /#{database}=(.+?);/) ? $1 : nil
  end
  rx
end