class Bio::CNLS::Screenscraper
A class used to automatically submit amino acid sequences to the cNLS webserver and parse the HTML results.
Constants
- ACCEPTABLE_CUTOFFS
Public Class Methods
get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
click to toggle source
# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 93
#
# Post an amino acid sequence to the cNLS Mapper web form and return the
# body of the HTTP response, unparsed.
#
# amino_acid_sequence:: sequence sent as the form's 'typedseq' field
# cut_off:: score cutoff sent as the form's 'cut_off' field. Must be a
#           member of ACCEPTABLE_CUTOFFS; per the error message below these
#           are strings, not floats (default '3.0').
# seconds_pause:: number of seconds to sleep after a successful request
#
# Raises Exception when the cutoff is not in ACCEPTABLE_CUTOFFS, or when the
# server's response is not a Net::HTTPOK.
# NOTE(review): raising the root Exception class bypasses a bare `rescue`;
# it is kept as-is so that existing callers see the same exception class.
def self.get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  unless ACCEPTABLE_CUTOFFS.include?(cut_off)
    raise Exception, "Specified cutoff `#{cut_off}' for the cNLS screenscraper is invalid. Valid cutoffs are #{ACCEPTABLE_CUTOFFS.join(', ')}. They are strings, not floating point values."
  end

  # retrieve the webpage
  res = Net::HTTP.post_form(URI.parse('http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_y.cgi'),
                            {'cut_off' => cut_off, 'typedseq' => amino_acid_sequence})

  # Anything other than 200 OK is reported as an error. The message must
  # interpolate cut_off (the parameter); the previous `cutoff` was an
  # undefined name and turned this diagnostic into a NameError.
  unless res.kind_of?(Net::HTTPOK)
    raise Exception, "Failed to retrieve cNLS, internet connectivity problem? Using cutoff/sequence #{cut_off}/#{amino_acid_sequence}"
  end

  # pause the specified number of seconds so the server is not hammered
  sleep seconds_pause

  res.body
end
parse_html_result(html)
click to toggle source
Given the HTML of a cNLS result page, return a parsed Result object that is easier to work with programmatically.
# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 114
#
# Turn the HTML of a cNLS prediction page into a Result.
#
# The page is expected to contain one table for monopartite and one for
# bipartite predictions. Each table is either a "hit" table (three columns:
# position, sequence, score, with rows separated by a <br> inside the cell)
# or one of two known "no hits" layouts.
#
# html:: the page text to parse
#
# Returns the Result, with one Result::MonopartiteNLS per monopartite row
# followed by one Result::BipartiteNLS per bipartite row pushed onto
# its signals.
#
# Raises Exception if the page reports an over-long query, or if either
# table matches neither its hit layout nor a no-hits layout.
def self.parse_html_result(html)
  parsed = Result.new

  # Hit tables. The two patterns differ only in the "Predicted X NLS"
  # heading text.
  monopartite_regex = /Predicted monopartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
  bipartite_regex = /Predicted bipartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i

  # Empty tables, first layout: <code></code> cells.
  monopartite_no_hits = /Predicted monopartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
  bipartite_no_hits = /Predicted bipartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i

  # Empty tables, second layout: self-closed <code /> cells.
  monopartite_no_hits2 = /Predicted monopartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
  bipartite_no_hits2 = /Predicted bipartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i

  # Markup that separates successive rows inside one captured column.
  split_regex = /<\/code><\/big><\/strong><br.{0,2}><strong><big><code>/

  # Split the three captured columns of a hit table into rows and push one
  # signal per position onto the result. new_signal is called once per row
  # to build the empty signal object of the appropriate class.
  add_rows = lambda do |table, new_signal|
    starts, residues, values = (1..3).map { |column| table[column].split(split_regex) }
    starts.zip(residues, values) do |start, residue, value|
      signal = new_signal.call
      signal.position = start.to_i
      signal.sequence = residue
      signal.score = value.to_f
      parsed.signals.push signal
    end
  end

  # The server answers over-long queries with an error page; check that
  # before attempting to read any table.
  if html.match(/Query sequence should be < 5000 aa/)
    raise Exception, "Query sequence provided was too long (> 5000 aa)"
  end

  # Monopartite table: rows, a recognised empty table, or a parse failure.
  mono_table = html.match(monopartite_regex)
  if mono_table
    add_rows.call(mono_table, lambda { Result::MonopartiteNLS.new })
  elsif !(html.match(monopartite_no_hits) || html.match(monopartite_no_hits2))
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for monopartite signals, but the whole document is likely problematic.\n#{html}"
  end

  # Bipartite table: same three possibilities.
  bi_table = html.match(bipartite_regex)
  if bi_table
    add_rows.call(bi_table, lambda { Result::BipartiteNLS.new })
  elsif !(html.match(bipartite_no_hits) || html.match(bipartite_no_hits2))
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for bipartite signals, monopartite signals seemed to be parsed OK.\n#{html}"
  end

  parsed
end
submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
click to toggle source
Contact the cNLS prediction server and submit the amino acid sequence for prediction. Return a Bio::CNLS::Result
object. Pause for seconds_pause seconds after each request, so as not to overload the server.
# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 85
#
# Fetch the raw prediction page for the given sequence and cutoff via
# get_raw_html_result (passing seconds_pause through to it), then hand the
# page to parse_html_result and return whatever that returns.
def self.submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  parse_html_result(
    get_raw_html_result(amino_acid_sequence, cut_off, seconds_pause)
  )
end