class GeneValidator::Cluster

Stores the values belonging to one cluster. Used for clusterization among a vector of values.

Attributes

lengths[RW]

A hash map of (length, number of occurrences) pairs.

Public Class Methods

new(lengths) click to toggle source
# File lib/genevalidator/clusterization.rb, line 158
##
# Creates a cluster around the given collection of lengths.
# Params:
# +lengths+: the (length, number of occurrences) pairs of this cluster;
#            the object is stored as given, it is not copied
def initialize(lengths)
  @lengths = lengths
end

Public Instance Methods

add(cluster) click to toggle source

Merges the current cluster with the one given as parameter. Params: cluster: the Cluster object to merge into the current one.

# File lib/genevalidator/clusterization.rb, line 267
##
# Merges the given cluster into the current one.
# Occurrence counts of lengths that are present in both clusters are summed,
# so no observation is lost by the merge.
# Params:
# +cluster+: object whose +lengths+ yields (length, number of occurrences) pairs
# Output:
# the +lengths+ collection of the given cluster (the receiver of +each+)
def add(cluster)
  cluster.lengths.each do |length, count|
    # Accumulate instead of overwriting: a plain assignment would drop the
    # occurrences this cluster already holds for the same length.
    lengths[length] = (lengths[length] || 0) + count
  end
end
density() click to toggle source

Returns the density of the cluster: how many values it contains

# File lib/genevalidator/clusterization.rb, line 177
##
# Returns the density of the cluster, i.e. the sum of the occurrence counts
# of all its lengths (0 for a cluster without lengths).
def density
  lengths.inject(0) { |total, (_length, count)| total + count }
end
deviation(clusters, queryLength) click to toggle source

Returns the deviation of a value from the values in all clusters. Params: clusters: a list of Cluster objects; queryLength: the reference length (a number) to compare against. Output: Real number.

# File lib/genevalidator/clusterization.rb, line 255
##
# Returns the deviation of a value from the values in all clusters: the
# absolute difference between +queryLength+ and the mean of the current
# cluster, divided by the standard deviation of all hits.
# Params:
# +clusters+: a list of objects whose +lengths+ yields
#             (length, number of occurrences) pairs
# +queryLength+: the reference value; it is used arithmetically, so it must
#                be a number
# Output:
# the ratio described above
def deviation(clusters, queryLength)
  # Expand every (length, count) pair into +count+ copies of the length so
  # that each individual hit contributes to the spread.
  hits = clusters.flat_map do |c|
    c.lengths.flat_map { |length, count| Array.new(count, length) }
  end

  # The former round-trip through R computed the same quantity only to have
  # it overwritten by this call, so only the Ruby computation is kept.
  sd = standard_deviation(hits)
  (queryLength - mean).abs / sd
end
distance(cluster, method = 0) click to toggle source

Returns the Euclidean distance between the current cluster and the one given as parameter. Params: cluster: Cluster object; method: 0 or 1. method = 0: do not take duplicate values into consideration; method = 1: average linkage clusterization.

# File lib/genevalidator/clusterization.rb, line 192
##
# Returns the average absolute difference between the lengths of the given
# cluster and the lengths of the current one, rounded to 4 decimals.
# Params:
# +cluster+: object whose +lengths+ yields (length, number of occurrences) pairs
# +method+: 1 weights every pair of lengths by the product of their
#           occurrence counts; any other value counts every pair once
# Output:
# the rounded group average distance
def distance(cluster, method = 0)
  weighted = method == 1
  total = 0
  # Without weighting the divisor is simply the number of compared pairs.
  divisor = weighted ? 0 : cluster.lengths.length * lengths.length

  cluster.lengths.each do |other_length, other_count|
    lengths.each do |own_length, own_count|
      gap = (other_length - own_length).abs
      if weighted
        pairs = other_count * own_count
        total += pairs * gap
        divisor += pairs
      else
        total += gap
      end
    end
  end

  # group average distance
  (total / (divisor + 0.0)).round(4)
end
get_limits() click to toggle source

Returns the interval limits of the current cluster

# File lib/genevalidator/clusterization.rb, line 285
##
# Returns the interval limits of the current cluster as a two-element array
# holding the smallest and the largest length ([nil, nil] when the cluster
# has no lengths).
def get_limits
  observed = lengths.map { |length, _count| length }
  observed.minmax
end
inside_cluster(value) click to toggle source

Returns whether the value is inside the cluster. Params: value: value to compare. Output: :ok or :shorter or :longer.

# File lib/genevalidator/clusterization.rb, line 295
##
# Returns whether the value is inside the interval spanned by the cluster.
# Both interval limits count as inside.
# Params:
# +value+: value to compare against the limits returned by +get_limits+
# Output:
# +:ok+ when the value lies within the limits, +:shorter+ when it is below
# the lower limit, +:longer+ when it is above the upper limit
def inside_cluster(value)
  left, right = get_limits

  # Explicit returns are required: a bare `:ok if ...` statement is evaluated
  # and discarded, which left only the last check able to produce a result.
  return :shorter if left > value
  return :longer if right < value

  :ok
end
mean() click to toggle source

Returns the weighted mean value of the cluster

# File lib/genevalidator/clusterization.rb, line 164
##
# Returns the weighted mean value of the cluster: every length is weighted
# by its number of occurrences.
# The final division uses the operands as they are, so integer lengths and
# counts give an integer (truncated) result, and a cluster without lengths
# raises ZeroDivisionError.
def mean
  weighted_sum, total_weight = lengths.inject([0, 0]) do |(sum, weight), (length, n)|
    [sum + length * n, weight + n]
  end
  weighted_sum / total_weight
end
print() click to toggle source

Prints the current cluster

standard_deviation(lengths = nil) click to toggle source

Returns the standard deviation of a set of values. Params: lengths: a vector of values (optional, by default it takes the values in the cluster). Output: Real number.

# File lib/genevalidator/clusterization.rb, line 235
##
# Returns the standard deviation of a set of values, measured around the
# value returned by +mean+ and normalised by (number of values - 1).
# Params:
# +lengths+: a vector of values (optional, by default every length of the
#            cluster repeated as many times as it occurs)
# Output:
# Real number
def standard_deviation(lengths = nil)
  lengths = @lengths.flat_map { |value, count| Array.new(count, value) } if lengths.nil?

  center = mean
  squared_error = lengths.inject(0) do |acc, len|
    offset = center - len
    acc + offset * offset
  end
  Math.sqrt(squared_error.to_f / (lengths.length - 1))
end
wss(lengths = nil) click to toggle source

Returns within cluster sum of squares

# File lib/genevalidator/clusterization.rb, line 215
##
# Returns the within cluster sum of squares: the sum of the squared
# differences between every value and the value returned by +mean+.
# Params:
# +lengths+: a vector of values (optional, by default every length of the
#            cluster repeated as many times as it occurs)
# Output:
# the sum of squares (0 for an empty vector)
def wss(lengths = nil)
  lengths = @lengths.flat_map { |value, count| Array.new(count, value) } if lengths.nil?

  center = mean
  lengths.inject(0) do |acc, len|
    offset = center - len
    acc + offset * offset
  end
end