class Bio::HMMER::Report

A parser class for a search report by hmmsearch or hmmpfam program in the HMMER package.

Examples

Examples
 #for multiple reports in a single output file (example.hmmpfam)
 Bio::HMMER.reports(File.read("example.hmmpfam")) do |report|
   report.program['name']
   report.parameter['HMM file']
   report.query_info['Query sequence']
   report.hits.each do |hit|
     hit.accession
     hit.description
     hit.score
     hit.evalue
     hit.hsps.each do |hsp|
       hsp.accession
       hsp.domain
       hsp.evalue
       hsp.midline
   end
 end

References

Constants

DELIMITER

Delimiter of each entry for Bio::FlatFile support.

Attributes

domain_top_hits[R]

statistics by hmmsearch. Keys are 'Total memory', 'Satisfying E cutoff' and 'Total hits'.

histogram[R]

statistics by hmmsearch.

hits[R]
hsps[R]

Returns an Array of Bio::HMMER::Report::Hsp objects. Under special circumstances, some HSPs do not have parent Hit objects. If you want to access such HSPs, use this method.

parameter[R]

A hash contains parameters used. Valid keys are 'HMM file' and 'Sequence file'.

program[R]

A Hash contains program information used. Valid keys are 'name', 'version', 'copyright' and 'license'.

query_info[R]

A hash contains the query information. Valid keys are 'query sequence', 'Accession' and 'Description'.

statistical_detail[R]

statistics by hmmsearch. Keys are 'mu', 'lambda', 'chi-sq statistic' and 'P(chi-square)'.

total_seq_searched[R]

statistics by hmmsearch.

whole_seq_top_hits[R]

statistics by hmmsearch. Keys are 'Total memory', 'Satisfying E cutoff' and 'Total hits'.

Public Class Methods

new(data) click to toggle source

Parses a HMMER search report (by hmmpfam or hmmsearch program) and reutrns a Bio::HMMER::Report object.

Examples

hmmpfam_report = Bio::HMMER::Report.new(File.read("hmmpfam.out"))

hmmsearch_report = Bio::HMMER::Report.new(File.read("hmmsearch.out"))
    # File lib/bio/appl/hmmer/report.rb
156 def initialize(data)
157 
158   # The input data is divided into six data fields, i.e. header,
159   # query infomation, hits, HSPs, alignments and search statistics.
160   # However, header and statistics data don't necessarily exist.
161   subdata, is_hmmsearch = get_subdata(data)
162 
163   # if header exists, parse it
164   if subdata["header"]
165     @program, @parameter = parse_header_data(subdata["header"])
166   else
167     @program, @parameter = [{}, {}]
168   end
169 
170   @query_info = parse_query_info(subdata["query"])
171   @hits       = parse_hit_data(subdata["hit"])
172   @hsps       = parse_hsp_data(subdata["hsp"], is_hmmsearch)
173 
174   if @hsps != []
175     # split alignment subdata into an array of alignments
176     aln_ary = subdata["alignment"].split(/^\S+.*?\n/).slice(1..-1)
177 
178     # append alignment information to corresponding Hsp
179     aln_ary.each_with_index do |aln, i|
180       @hsps[i].set_alignment(aln)
181     end
182   end
183 
184   # assign each Hsp object to its parent Hit
185   hits_hash = {}
186   @hits.each do |hit|
187     hits_hash[hit.accession] = hit
188   end
189   @hsps.each do |hsp|
190     if hits_hash.has_key?(hsp.accession)
191       hits_hash[hsp.accession].append_hsp(hsp)
192     end
193   end
194 
195   # parse statistics (for hmmsearch)
196   if is_hmmsearch
197     @histogram, @statistical_detail, @total_seq_searched, \
198     @whole_seq_top_hits, @domain_top_hits = \
199     parse_stat_data(subdata["statistics"])
200   end
201 
202 end

Public Instance Methods

each() { |hit| ... } click to toggle source

Iterates each hit (Bio::HMMER::Report::Hit).

    # File lib/bio/appl/hmmer/report.rb
206 def each
207   @hits.each do |hit|
208     yield hit
209   end
210 end
Also aliased as: each_hit
each_hit()
Alias for: each

Private Instance Methods

get_subdata(data) click to toggle source

Bio::HMMER::Report#get_subdata

    # File lib/bio/appl/hmmer/report.rb
215 def get_subdata(data)
216   subdata = {}
217   header_prefix = '\Ahmm(search|pfam) - search'
218   query_prefix  = '^Query (HMM|sequence): .*\nAccession: '
219   hit_prefix    = '^Scores for (complete sequences|sequence family)'
220   hsp_prefix    = '^Parsed for domains:'
221   aln_prefix    = '^Alignments of top-scoring domains:\n'
222   stat_prefix   = '^\nHistogram of all scores:'
223 
224   # if header exists, get it
225   if data =~ /#{header_prefix}/
226     is_hmmsearch = ($1 == "search")   # hmmsearch or hmmpfam
227     subdata["header"] = data[/(\A.+?)(?=#{query_prefix})/m]
228   else
229     is_hmmsearch = false   # if no header, assumed to be hmmpfam
230   end
231 
232   # get query, Hit and Hsp data
233   subdata["query"] = data[/(#{query_prefix}.+?)(?=#{hit_prefix})/m]
234   subdata["hit"]   = data[/(#{hit_prefix}.+?)(?=#{hsp_prefix})/m]
235   subdata["hsp"]   = data[/(#{hsp_prefix}.+?)(?=#{aln_prefix})/m]
236 
237   # get alignment data
238   if is_hmmsearch
239     data =~ /#{aln_prefix}(.+?)#{stat_prefix}/m
240     subdata["alignment"] = $1
241   else
242     data =~ /#{aln_prefix}(.+?)\/\/\n/m
243     subdata["alignment"] = $1
244     raise "multiple reports found" if $'.length > 0
245   end
246   
247   # handle -A option of HMMER
248   cutoff_line = '\t\[output cut off at A = \d+ top alignments\]\n\z'
249   subdata["alignment"].sub!(/#{cutoff_line}/, '')
250   
251   # get statistics data
252   subdata["statistics"] = data[/(#{stat_prefix}.+)\z/m]
253 
254   [subdata, is_hmmsearch]
255 end
parse_header_data(data) click to toggle source

Bio::HMMER::Report#parse_header_data

    # File lib/bio/appl/hmmer/report.rb
260 def parse_header_data(data)
261   data =~ /\A(.+? - - -$\n)(.+? - - -$\n)\n\z/m
262   program_data   = $1
263   parameter_data = $2
264   
265   program = {}
266   program['name'], program['version'], program['copyright'], \
267   program['license'] = program_data.split(/\n/)
268   
269   parameter = {}
270   parameter_data.each_line do |x|
271     if /^(.+?):\s+(.*?)\s*$/ =~ x
272       parameter[$1] = $2
273     end
274   end
275 
276   [program, parameter]
277 end
parse_hit_data(data) click to toggle source

Bio::HMMER::Report#parse_hit_data

    # File lib/bio/appl/hmmer/report.rb
297 def parse_hit_data(data)
298   data.sub!(/.+?---\n/m, '').chop!
299   hits = []
300   return hits if data == "\t[no hits above thresholds]\n"
301   data.each_line do |l|
302     hits.push(Hit.new(l))
303   end
304   hits
305 end
parse_hsp_data(data, is_hmmsearch) click to toggle source

Bio::HMMER::Report#parse_hsp_data

    # File lib/bio/appl/hmmer/report.rb
310 def parse_hsp_data(data, is_hmmsearch)
311   data.sub!(/.+?---\n/m, '').chop!
312   hsps=[]
313   return hsps if data == "\t[no hits above thresholds]\n"
314   data.each_line do |l|
315     hsps.push(Hsp.new(l, is_hmmsearch))
316   end
317   return hsps
318 end
parse_query_info(data) click to toggle source

Bio::HMMER::Report#parse_query_info

    # File lib/bio/appl/hmmer/report.rb
282 def parse_query_info(data)
283   hash = {}
284   data.each_line do |x|
285     if /^(.+?):\s+(.*?)\s*$/ =~ x
286       hash[$1] = $2
287     elsif /\s+\[(.+)\]/ =~ x
288       hash['comments'] = $1
289     end
290   end
291   hash
292 end
parse_stat_data(data) click to toggle source

Bio::HMMER::Report#parse_stat_data

    # File lib/bio/appl/hmmer/report.rb
323 def parse_stat_data(data)
324   data.sub!(/\nHistogram of all scores:\n(.+?)\n\n\n%/m, '')
325   histogram = $1.strip
326 
327   statistical_detail = {}
328   data.sub!(/(.+?)\n\n/m, '')
329   $1.each_line do |l|
330     statistical_detail[$1] = $2.to_f if /^\s*(.+?)\s*=\s*(\S+)/ =~ l
331   end
332     
333   total_seq_searched = nil
334   data.sub!(/(.+?)\n\n/m, '')
335   $1.each_line do |l|
336     total_seq_searched = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
337   end
338     
339   whole_seq_top_hits = {}
340   data.sub!(/(.+?)\n\n/m, '')
341   $1.each_line do |l|
342     if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
343       whole_seq_top_hits[$1] = $2.to_i
344     elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
345       whole_seq_top_hits[$1] = $2
346     end
347   end
348     
349   domain_top_hits = {}
350   data.each_line do |l|
351     if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
352       domain_top_hits[$1] = $2.to_i
353     elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
354       domain_top_hits[$1] = $2
355     end
356   end
357 
358   [histogram, statistical_detail, total_seq_searched, \
359     whole_seq_top_hits, domain_top_hits]
360 end