class BioDSL::PlotScores

Create a histogram with mean sequence quality scores.

plot_scores creates a histogram of the mean values per base of the quality scores from sequence data.

Plotting is done using GNUplot which allows for different types of output the default one being crufty ASCII graphics.

If plotting scores from sequences of variable length you can use the count option to co-plot the relative count at each base position. This allow you to detect areas with a low relative count showing a high mean score.

GNUplot must be installed for plot_scores to work. Read more here:

www.gnuplot.info/

Usage

plot_scores([count: <bool>[, output: <file>[, force: <bool>
            [, terminal: <string>[, title: <string>
            [, xlabel: <string>[, ylabel: <string>
            [, test: <bool>]]]]]]]])

Options

Examples

Here we plot the mean quality scores from a FASTQ file:

read_fastq(input: "test.fq").plot_scores.run

                             Mean Quality Scores
    +             +            +             +             +            +
40 ++-------------+------------+-------------+-------------+------------+++
    |  *****************                               mean score ****** |
35 ++ ***********************                                            ++
    ****************************** **                                    |
30 +*********************************   *                                ++
    ************************************* *                              |
25 +*************************************** *                            ++
    ****************************************** *****                     |
20 +****************************************************  ** * *         ++
    ******************************************************************** *
15 +**********************************************************************+
    **********************************************************************
10 +**********************************************************************+
    **********************************************************************
 5 +**********************************************************************+
    **********************************************************************
 0 +**********************************************************************+
    +             +            +             +             +            +
    0             50          100           150           200          250
                              Sequence position

To render X11 output (i.e. instant view) use the terminal option:

read_fastq(input: "test.fq").
plot_scores(terminal: :x11).run

To generate a PNG image and save to file:

read_fastq(input: "test.fq").
plot_scores(terminal: :png, output: "plot.png").run

rubocop: enable LineLength rubocop: disable ClassLength

Constants

SCORES_MAX
STATS

Public Class Methods

new(options) click to toggle source

Constructor for PlotScores.

@param options [Hash] Options hash. @option options [Boolean] :count @option options [String] :output @option options [Boolean] :force @option options [Symbol] :terminal @option options [String] :title @option options [String] :xlabel @option options [String] :ylabel @option options [Boolean] :ylogscale @option options [Boolean] :test

@return [PlotScores] Class instance.

# File lib/BioDSL/commands/plot_scores.rb, line 132
def initialize(options)
  @options    = options
  @scores_vec = NArray.int(SCORES_MAX)
  @count_vec  = NArray.int(SCORES_MAX)
  @max        = 0

  aux_exist('gnuplot')
  check_options
  default
end

Public Instance Methods

lmb() click to toggle source

Return command lambda for plot_scores.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/plot_scores.rb, line 146
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1

      collect_plot_data(record)

      write_output(output, record)
    end

    prepare_plot_data

    plot_defaults
    plot_scores
    plot_count
    plot_output
  end
end

Private Instance Methods

check_length(scores) click to toggle source

Check if the scores string is longer than SCORES_MAX.

@raise [BioDSLError] if too long.

# File lib/BioDSL/commands/plot_scores.rb, line 207
def check_length(scores)
  return unless scores.length > SCORES_MAX
  msg = "score string too long: #{scores.length} > #{SCORES_MAX}"
  fail BioDSLError, msg
end
check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/plot_scores.rb, line 170
def check_options
  options_allowed(@options, :count, :output, :force, :terminal, :title,
                  :xlabel, :ylabel, :ylogscale, :test)
  options_allowed_values(@options, count: [true, false])
  options_allowed_values(@options, test: [true, false])
  options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
                                              :aqua, :png, :pdf])
  options_files_exist_force(@options, :output)
end
collect_plot_data(record) click to toggle source

Collect plot data from a given record.

@param record [Hash] BioDSL record.

# File lib/BioDSL/commands/plot_scores.rb, line 191
def collect_plot_data(record)
  scores = record[:SCORES]
  return unless scores && scores.length > 0

  check_length(scores)

  score_vec = NArray.to_na(scores, 'byte') - Seq::SCORE_BASE
  @scores_vec[0...scores.length] += score_vec
  @count_vec[0...scores.length] += 1

  @max = scores.length if scores.length > @max
end
default() click to toggle source

Set default options.

# File lib/BioDSL/commands/plot_scores.rb, line 181
def default
  @options[:terminal] ||= :dumb
  @options[:title] ||= 'Mean Quality Scores'
  @options[:xlabel] ||= 'Sequence Position'
  @options[:ylabel] ||= 'Mean Score'
end
mean_vec() click to toggle source

Calculate the mean scores vector.

@return [NArray] NArray with mean scores.

# File lib/BioDSL/commands/plot_scores.rb, line 228
def mean_vec
  @scores_vec[0...@max].to_f / @count_vec[0...@max]
end
plot_count() click to toggle source

Plot count data.

# File lib/BioDSL/commands/plot_scores.rb, line 257
def plot_count
  return unless @options[:count]

  style = {with: 'lines lt rgb "black"', title: '"relative count"'}

  @gp.add_dataset(style) do |plotter|
    @x.zip(@y2).each { |e| plotter << e }
  end
end
plot_defaults() click to toggle source

Set plot defaults

# File lib/BioDSL/commands/plot_scores.rb, line 233
def plot_defaults
  @gp = GnuPlotter.new
  @gp.set terminal: @options[:terminal]
  @gp.set title:    @options[:title]
  @gp.set xlabel:   @options[:xlabel]
  @gp.set ylabel:   @options[:ylabel]
  @gp.set output:   @options[:output] if @options[:output]
  @gp.set xrange:   "[#{@x.min - 1}:#{@x.max + 1}]"
  @gp.set yrange:   "[#{Seq::SCORE_MIN}:#{Seq::SCORE_MAX}]"
  @gp.set style:    'fill solid 0.5 border'
  @gp.set xtics:    'out'
  @gp.set ytics:    'out'
end
plot_output() click to toggle source

Output plot

# File lib/BioDSL/commands/plot_scores.rb, line 268
def plot_output
  if @options[:test]
    $stderr.puts @gp.to_gp
  elsif @options[:terminal] == :dumb
    puts @gp.plot
  else
    @gp.plot
  end
end
plot_scores() click to toggle source

Plot scores data.

# File lib/BioDSL/commands/plot_scores.rb, line 248
def plot_scores
  style = {with: 'boxes lc rgb "red"', title: '"mean score"'}

  @gp.add_dataset(style) do |plotter|
    @x.zip(@y1).each { |e| plotter << e }
  end
end
prepare_plot_data() click to toggle source

Prepare data to plot.

# File lib/BioDSL/commands/plot_scores.rb, line 214
def prepare_plot_data
  @max = 1 if @max == 0 # ugly fix to avaid index error

  count_vec  = @count_vec[0...@max].to_f
  count_vec *= (Seq::SCORE_MAX / @count_vec.max(0).to_f)

  @x  = (1..@max).to_a
  @y1 = mean_vec.to_a
  @y2 = count_vec.to_a
end
write_output(output, record) click to toggle source

Write record to output.

# File lib/BioDSL/commands/plot_scores.rb, line 279
def write_output(output, record)
  return unless output
  output << record
  @status[:records_out] += 1
end