class BioDSL::PlotResidueDistribution

Plot the residue distribution of sequences in the stream.

plot_residue_distribution creates a residue distribution plot per sequence position of sequences in the stream. Plotting is done using GNUplot, which allows for different types of output, the default being crufty ASCII graphics.

If plotting distributions from sequences of variable length, you can use the count option to co-plot the relative count at each base position. This allows you to explain areas with a skewed distribution.

GNUplot must be installed for plot_residue_distribution to work. Read more here:

www.gnuplot.info/

Usage

plot_residue_distribution([count: <bool>[, output: <file>
                          [, force: <bool> [, terminal: <string>
                          [, title: <string>[, xlabel: <string>
                          [, ylabel: <string>[, test: <bool>]]]]]]])

Options

Examples

Here we plot a residue distribution of a FASTA file:

BD.new.read_fasta(input: "test.fna").plot_residue_distribution.run

rubocop: disable ClassLength

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for PlotResidueDistribution.

@param options [Hash] Options hash. @option options [Boolean] :count @option options [String] :output @option options [Boolean] :force @option options [Symbol] :terminal @option options [String] :title @option options [String] :xlabel @option options [String] :ylabel @option options [Boolean] :test

@return [PlotResidueDistribution] Class instance.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 94
# Store the options, create empty tallies, then run aux_exist('gnuplot'),
# check_options and defaults in that order.
#
# @param options [Hash] Options hash (kept by reference; defaults are
#   written into it).
def initialize(options)
  @options  = options
  @gp       = nil
  @residues = Set.new
  @offset   = Set.new # Hackery: datasets that already got their 1-position offset entry.
  @total    = Hash.new(0)
  @counts   = Hash.new { |by_pos, pos| by_pos[pos] = Hash.new(0) }

  aux_exist('gnuplot')
  check_options
  defaults
end

Public Instance Methods

lmb() click to toggle source

Return command lambda for PlotResidueDistribution.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 110
# Build the command lambda. Called with (input, output, status) it counts
# residues for every record holding :SEQ, forwards each record to output
# when one is given, and finally creates and outputs the plot.
#
# @return [Proc] Command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    input.each do |record|
      @status[:records_in] += 1
      count_residues(record) if record.key?(:SEQ)

      if output
        output << record
        @status[:records_out] += 1

        # Only sequence records contribute to the outgoing sequence stats.
        next unless record.key?(:SEQ)

        @status[:sequences_out] += 1
        @status[:residues_out] += record[:SEQ].length
      end
    end

    plot_create
    plot_output
  end
end

Private Instance Methods

check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 137
# Run the option checks in a fixed order: allowed keys, allowed values for
# :terminal, :count and :test, and finally the :output file check.
def check_options
  permitted = %i[count output force terminal title xlabel ylabel test]
  terminals = %i[dumb post svg x11 aqua png pdf]

  options_allowed(@options, *permitted)
  options_allowed_values(@options, terminal: terminals)

  # Both boolean flags accept the same three values.
  %i[count test].each do |flag|
    options_allowed_values(@options, flag => [nil, true, false])
  end

  options_files_exist_force(@options, :output)
end
count_residues(record) click to toggle source

Given a record with a sequence count its residues.

@param record [Hash] BioDSL record

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 158
# Tally the residues of one sequence record, position by position.
# The sequence is upcased before counting; each character becomes a Symbol.
#
# @param record [Hash] BioDSL record holding the sequence under :SEQ.
def count_residues(record)
  seq = record[:SEQ]

  @status[:sequences_in] += 1
  @status[:residues_in] += seq.length

  seq.upcase.chars.each_with_index do |letter, pos|
    residue = letter.to_sym

    @counts[pos][residue] += 1
    @total[pos] += 1
    @residues << residue
  end
end
defaults() click to toggle source

Set default options.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 148
# Fill in any unset (nil or false) plot options with their default value.
def defaults
  fallbacks = {
    terminal: :dumb,
    title: 'Residue Distribution',
    xlabel: 'Sequence position',
    ylabel: '%'
  }

  fallbacks.each { |name, value| @options[name] ||= value }

  # Same result as before: the effective y-axis label.
  @options[:ylabel]
end
plot_colors() click to toggle source

Set plot line colors. Color scheme: en.wikipedia.org/wiki/Help:Distinguishable_colors

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 237
# Assign one RGB color to each of linetypes 1..26 and make gnuplot cycle
# through them. Color names follow the scheme referenced for this method.
def plot_colors
  palette = [
    '#FF0010', # Red
    '#191919', # Ebony
    '#0075DC', # Blue
    '#2BCE48', # Green
    '#FFFF00', # Yellow
    '#4C005C', # Damson
    '#993F00', # Caramel
    '#FFCC99', # Honeydew
    '#808080', # Iron
    '#94FFB5', # Jade
    '#8F7C00', # Khaki
    '#9DCC00', # Lime
    '#C20088', # Mallow
    '#003380', # Navy
    '#FFA405', # Orpiment
    '#FFA8BB', # Pink
    '#426600', # Quagmire
    '#F0A3FF', # Amethyst
    '#5EF1F2', # Sky
    '#00998F', # Turquoise
    '#E0FF66', # Uranium
    '#740AFF', # Violet
    '#990000', # Wine
    '#FFFF80', # Xanthin
    '#005C31', # Forest
    '#FF5005'  # Zinnia
  ]

  palette.each.with_index(1) do |rgb, number|
    @gp.set(linetype: %(#{number} lc rgb "#{rgb}"))
  end

  @gp.set(linetype: "cycle #{palette.size}")
end
plot_count() click to toggle source

Plot count data.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 195
# Add a line dataset with the per-position sequence count, scaled so the
# largest count is 100.
#
# NOTE(review): the x value here is the raw position, while the histogram
# datasets get a leading offset entry - confirm line and bars line up.
def plot_count
  peak  = @total.values.max
  style = {using: '1:2', with: 'lines lw 2 lt rgb "black"',
           title: '"count"'}

  @gp.add_dataset(style) do |plotter|
    @counts.each_key do |pos|
      # Emit the offset row only once; add? is nil when :count is already there.
      plotter << [0, 0.0] if @offset.add?(:count)
      plotter << [pos, 100 * @total[pos].to_f / peak]
    end
  end
end
plot_create() click to toggle source

Create plot.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 171
# Create the plot object, apply defaults, add one dataset per residue in
# reverse sorted order, and optionally the count dataset.
def plot_create
  @gp = GnuPlotter.new
  plot_defaults

  descending = @residues.sort.reverse
  descending.each.with_index { |residue, rank| plot_residue(residue, rank) }

  return unless @options[:count]

  plot_count
end
plot_defaults() click to toggle source

Set plot defaults

rubocop: disable MethodLength

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 212
# Apply the plot-wide gnuplot settings in a fixed order, then the colors.
# :output is only set when the option is given; :key and :style are set
# several times on purpose, so the settings are kept as an ordered list.
def plot_defaults
  settings = [
    [:terminal, @options[:terminal].to_s],
    [:title,    @options[:title]],
    [:xlabel,   @options[:xlabel]],
    [:ylabel,   @options[:ylabel]]
  ]
  settings << [:output, @options[:output]] if @options[:output]
  settings.push(
    [:xtics,    'out'],
    [:ytics,    'out'],
    [:yrange,   '[0:100]'],
    [:xrange,   "[0:#{@counts.size}]"],
    [:auto,     'fix'],
    [:offsets,  '1'],
    [:key,      'outside right top vertical Left reverse noenhanced ' \
                'autotitles columnhead nobox'],
    [:key,      'invert samplen 4 spacing 1 width 0 height 0'],
    [:style,    'fill solid 0.5 border'],
    [:style,    'histogram rowstacked'],
    [:style,    'data histograms'],
    [:boxwidth, '0.75 absolute']
  )

  settings.each { |name, value| @gp.set(name => value) }

  plot_colors
end
plot_output() click to toggle source

Output plot data.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 268
# Emit the plot: in test mode the gnuplot script goes to stderr; with the
# :dumb terminal the plot result is printed to stdout; otherwise the plot
# is just run and its result returned.
def plot_output
  return $stderr.puts(@gp.to_gp) if @options[:test]

  rendered = @gp.plot
  return rendered unless @options[:terminal] == :dumb

  puts rendered
end
plot_residue(residue, i) click to toggle source

Plot residue data.

# File lib/BioDSL/commands/plot_residue_distribution.rb, line 183
# Add a histogram dataset holding the percentage of one residue at every
# position.
#
# @param residue [Symbol] Residue to plot.
# @param i [Integer] Zero-based dataset index; linetype i + 1 is used.
def plot_residue(residue, i)
  style = {using: 1, with: "histogram lt #{i + 1}", title: %("#{residue}")}

  @gp.add_dataset(style) do |plotter|
    @counts.each do |pos, dist|
      # Emit the offset entry only once per residue; add? is nil afterwards.
      plotter << 0.0 if @offset.add?(residue)
      plotter << 100 * dist[residue].to_f / @total[pos]
    end
  end
end