class GeneValidator::Cluster
Stores the values belonging to one cluster. Used when clustering a vector of values.
Attributes
lengths: a hash map of (length, number of occurrences) pairs
Public Class Methods
# File lib/genevalidator/clusterization.rb, line 158
##
# Builds a cluster around an existing collection of lengths.
# Params:
# +lengths+:: collection kept as-is in @lengths (no copy is made); the other
#             methods of this class read it as (length, occurrences) pairs
def initialize(lengths)
  @lengths = lengths
end
Public Instance Methods
Merges the cluster given as parameter into the current cluster.
Params:
cluster: the Cluster object whose lengths are merged into the current one
# File lib/genevalidator/clusterization.rb, line 267
##
# Merges the given cluster into the current one.
# When a length is present in both clusters its occurrence counts are summed,
# so no value is lost by the merge (plain assignment would keep only the
# count of the cluster passed in).
# Params:
# +cluster+:: object whose +lengths+ yields (length, occurrences) pairs
# Output:
# the +lengths+ collection of +cluster+ (the result of iterating it)
def add(cluster)
  cluster.lengths.each do |length, count|
    # fetch with a default of 0 covers lengths not yet seen in this cluster
    lengths[length] = lengths.fetch(length, 0) + count
  end
end
Returns the density of the cluster: how many values it contains
# File lib/genevalidator/clusterization.rb, line 177
##
# Returns the density of the cluster, i.e. the total number of values it
# holds: the sum of the occurrence counts over all (length, occurrences)
# pairs. An empty cluster has density 0.
def density
  lengths.inject(0) { |total, pair| total + pair[1] }
end
Returns the deviation of a value from the values in all clusters.
Params:
clusters: a list of Cluster objects
queryLength: the reference length (a number) whose deviation is measured. Output: Real number
# File lib/genevalidator/clusterization.rb, line 255
##
# Returns the deviation of a value from the values in all clusters: the
# absolute distance between +queryLength+ and the mean of the current
# cluster, divided by the standard deviation of all values in +clusters+
# (as computed by +standard_deviation+).
# Params:
# +clusters+:: a list of objects whose +lengths+ yields (length, occurrences) pairs
# +queryLength+:: the reference length (used numerically)
# Output:
# Real number
def deviation(clusters, queryLength)
  # Expand every (length, occurrences) pair into `occurrences` copies of the
  # length so the standard deviation is taken over the individual values.
  hits = clusters.flat_map do |c|
    c.lengths.flat_map { |length, count| Array.new(count, length) }
  end
  # The standard deviation used to be computed in an external R session
  # first and then immediately overwritten by this value; that dead
  # round-trip has been removed.
  sd = standard_deviation(hits)
  (queryLength - mean).abs / sd
end
Returns the Euclidean distance between the current cluster and the one given as parameter.
Params:
cluster: Cluster object
method: 0 or 1. method = 0: do not take duplicate values into consideration; method = 1: average linkage clusterization
# File lib/genevalidator/clusterization.rb, line 192 def distance(cluster, method = 0) d = 0 norm = 0 cluster.lengths.each do |elem1| lengths.each do |elem2| if method == 1 d += elem1[1] * elem2[1] * (elem1[0] - elem2[0]).abs norm += elem1[1] * elem2[1] else d += (elem1[0] - elem2[0]).abs norm = cluster.lengths.length * lengths.length end end end # group average distance d /= (norm + 0.0) d.round(4) end
Returns the interval limits of the current cluster
# File lib/genevalidator/clusterization.rb, line 285
##
# Returns the interval limits of the current cluster.
# Output:
# two-element array [smallest length, largest length]; [nil, nil] when the
# cluster holds no lengths
def get_limits
  keys = lengths.map(&:first)
  [keys.min, keys.max]
end
Returns whether the value is inside the cluster Params: value
: value to compare Output: :ok or :shorter or :longer
# File lib/genevalidator/clusterization.rb, line 295
##
# Returns whether the value is inside the cluster.
# The interval is closed: a value equal to either limit is inside.
# Params:
# +value+:: value to compare against the limits returned by +get_limits+
# Output:
# :ok when the value lies within the limits, :shorter when it is below the
# lower limit, :longer when it is above the upper limit
def inside_cluster(value)
  left, right = get_limits
  # Explicit early returns: a bare `:sym if cond` statement is not a return,
  # so only the final expression of the method would decide the result.
  return :shorter if value < left
  return :longer if value > right

  :ok
end
Returns the weighted mean value of the cluster
# File lib/genevalidator/clusterization.rb, line 164
##
# Returns the weighted mean value of the cluster: every length weighted by
# its number of occurrences.
# Output:
# sum(length * occurrences) / sum(occurrences). The division is performed
# on the accumulated values as they are, so with Integer lengths and counts
# the result is truncated to an Integer, and an empty cluster raises
# ZeroDivisionError.
def mean
  weighted_total = lengths.inject(0) { |acc, (length, n)| acc + length * n }
  total_weight = lengths.inject(0) { |acc, (_length, n)| acc + n }
  weighted_total / total_weight
end
Prints the current cluster
# File lib/genevalidator/clusterization.rb, line 275
##
# Prints the current cluster through +warn+: a header line with the mean
# and the density, one "length, occurrences" line per entry in ascending
# order, and a closing separator line.
def print
  warn "Cluster: mean = #{mean}, density = #{density}"
  lengths.sort.each { |length, count| warn "#{length}, #{count}" }
  warn '--------------------------'
end
Returns the standard deviation of a set of values Params: lengths
: a vector of values (optional, by default it takes the values in the cluster) Output: Real number
# File lib/genevalidator/clusterization.rb, line 235
##
# Returns the standard deviation of a set of values, measured around the
# mean of the current cluster (+mean+), with an n - 1 divisor.
# Params:
# +lengths+:: a vector of values (optional; by default every length of the
#             cluster repeated as many times as it occurs)
# Output:
# Real number. With exactly one value the divisor is zero, so the result is
# NaN or Infinity rather than a finite number.
def standard_deviation(lengths = nil)
  values = lengths
  values = @lengths.flat_map { |length, count| Array.new(count, length) } if values.nil?
  center = mean
  squared_sum = values.inject(0) do |acc, value|
    delta = center - value
    acc + delta * delta
  end
  Math.sqrt(squared_sum.to_f / (values.length - 1))
end
Returns within cluster sum of squares
# File lib/genevalidator/clusterization.rb, line 215
##
# Returns the within-cluster sum of squares: the sum of the squared
# differences between each value and the mean of the current cluster.
# Params:
# +lengths+:: a vector of values (optional; by default every length of the
#             cluster repeated as many times as it occurs)
# Output:
# the sum of squares; 0 when there are no values
def wss(lengths = nil)
  values = lengths.nil? ? @lengths.flat_map { |length, count| Array.new(count, length) } : lengths
  center = mean
  values.map { |value| (center - value) * (center - value) }.inject(0, :+)
end