class Bio::CNLS::Screenscraper

A class used to automatically submit amino acid sequences to the cNLS webserver and parse the HTML results.

Constants

ACCEPTABLE_CUTOFFS

Public Class Methods

get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1) click to toggle source
# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 93
# POST an amino acid sequence to the cNLS Mapper CGI script and return the
# body of the HTTP response (the raw HTML result page).
#
# amino_acid_sequence:: sent as the 'typedseq' form field.
# cut_off:: sent as the 'cut_off' form field; must be a member of
#           ACCEPTABLE_CUTOFFS (strings, not floats).
# seconds_pause:: passed to +sleep+ after a successful request, so that
#                 repeated calls do not hammer the server.
#
# Raises Exception when the cutoff is not acceptable, or when the server does
# not answer with an HTTP 200 (Net::HTTPOK) response.
def self.get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  unless ACCEPTABLE_CUTOFFS.include?(cut_off)
    raise Exception, "Specified cutoff `#{cut_off}' for the cNLS screenscraper is invalid. Valid cutoffs are #{ACCEPTABLE_CUTOFFS.join(', ')}. They are strings, not floating point values."
  end

  # retrieve the webpage
  res = Net::HTTP.post_form(
    URI.parse('http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_y.cgi'),
    {'cut_off' => cut_off, 'typedseq' => amino_acid_sequence}
  )

  # Anything other than a 200 response is reported to the caller. The message
  # must interpolate the +cut_off+ parameter: the previous `cutoff` spelling
  # was an undefined name and turned this branch into a NameError.
  unless res.kind_of?(Net::HTTPOK)
    raise Exception, "Failed to retrieve cNLS, internet connectivity problem? Using cutoff/sequence #{cut_off}/#{amino_acid_sequence}"
  end

  # pause the specified number of seconds
  sleep seconds_pause

  res.body
end
parse_html_result(html) click to toggle source

Given HTML corresponding to a result, return a parse object that is more programmatically palatable.

# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 114
# Turn the HTML of a cNLS Mapper result page into a Result whose +signals+
# list holds one Result::MonopartiteNLS / Result::BipartiteNLS per predicted
# signal (monopartite entries first, then bipartite).
#
# Raises Exception if the page reports an over-long query, or if either the
# monopartite or the bipartite table cannot be recognised.
def self.parse_html_result(html)
  parsed = Result.new

  # The "hits" patterns for the two tables are identical apart from the
  # "Predicted X NLS" heading. Each captures the position, sequence and score
  # cells, in that order.
  mono_hits_re = /Predicted monopartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
  bi_hits_re = /Predicted bipartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i

  # Two spellings of an empty table are recognised: "<code></code>" cells and
  # self-closing "<code />" cells.
  mono_empty_re = /Predicted monopartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
  bi_empty_re = /Predicted bipartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
  mono_empty_alt_re = /Predicted monopartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
  bi_empty_alt_re = /Predicted bipartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i

  # Separator between consecutive entries inside a single captured cell.
  entry_separator = /<\/code><\/big><\/strong><br.{0,2}><strong><big><code>/

  # Split the three captured cells into parallel lists and append one signal
  # object of the given class per position entry.
  append_signals = lambda do |captures, signal_class|
    position_list = captures[1].split(entry_separator)
    sequence_list = captures[2].split(entry_separator)
    score_list = captures[3].split(entry_separator)

    position_list.each_index do |idx|
      signal = signal_class.new
      signal.position = position_list[idx].to_i
      signal.sequence = sequence_list[idx]
      signal.score = score_list[idx].to_f
      parsed.signals.push signal
    end
  end

  # The server answers with this notice instead of result tables when the
  # query is too long.
  if html.match(/Query sequence should be < 5000 aa/)
    raise Exception, "Query sequence provided was too long (> 5000 aa)"
  end

  # Monopartite table: either hits, a recognised empty table, or a parse failure.
  mono_captures = html.match(mono_hits_re)
  if mono_captures
    append_signals.call(mono_captures, Result::MonopartiteNLS)
  elsif !(html.match(mono_empty_re) || html.match(mono_empty_alt_re))
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for monopartite signals, but the whole document is likely problematic.\n#{html}"
  end

  # Bipartite table: same three outcomes.
  bi_captures = html.match(bi_hits_re)
  if bi_captures
    append_signals.call(bi_captures, Result::BipartiteNLS)
  elsif !(html.match(bi_empty_re) || html.match(bi_empty_alt_re))
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for bipartite signals, monopartite signals seemed to be parsed OK.\n#{html}"
  end

  parsed
end
submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1) click to toggle source

Contact the cNLS prediction server and submit the amino acid sequence for prediction. Return a Bio::CNLS::Result object. Pause for seconds_pause seconds after each request, so as not to overload the server.

# File lib/bio/cnls_screenscraper/cnls_screenscraper.rb, line 85
# Fetch the prediction page for +amino_acid_sequence+ and hand the HTML to
# parse_html_result, returning whatever that produces.
#
# All three arguments are forwarded unchanged to get_raw_html_result, which
# performs the request and the post-request pause.
def self.submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  parse_html_result(
    get_raw_html_result(amino_acid_sequence, cut_off, seconds_pause)
  )
end