class GeneValidator::HierarchicalClusterization
Attributes
clusters[RW]
values[RW]
Public Class Methods
new(values)
click to toggle source
Object initialization Params: values
:vector of values
# File lib/genevalidator/clusterization.rb, line 314
##
# Object initialization.
# Params:
# +values+:: vector of values; stored as-is (not copied) in @values
#
# @clusters starts out as an empty list.
def initialize(values)
  @values = values
  @clusters = []
end
Public Instance Methods
hierarchical_clusterization(no_clusters = 0, distance_method = 0, vec = @values, debug = false)
click to toggle source
Performs a hierarchical clusterization until the densest cluster is obtained, the distance between clusters is sufficiently big, or the desired number of clusters is obtained. Params: no_clusters
: stop test (number of clusters) distance_method
: distance method (method 0 or method 1) vec
: a vector of values (by default the values from initialization) debug
: display debug information Output: vector of Cluster
objects
# File lib/genevalidator/clusterization.rb, line 417
##
# Bottom-up clusterization of a one-dimensional vector of values.
#
# Every distinct value starts in its own Cluster (value => number of
# occurrences). On each iteration the two closest *adjacent* clusters (in
# ascending value order) are merged, until one of the stop conditions holds:
# 1. +no_clusters+ is non-zero and at most +no_clusters+ clusters remain;
# 2. +no_clusters+ is 0 and the means of the closest pair differ by more than
#    a quarter of the value range;
# 3. +no_clusters+ is 0 and the freshly merged cluster has a density greater
#    than half the number of input values.
#
# Params:
# +no_clusters+:: stop test (desired number of clusters); 0 means "use the
#                 distance/density thresholds instead"
# +distance_method+:: distance method, passed through to Cluster#distance
# +vec+:: vector of values (by default the values from initialization)
# +debug+:: when true, progress information is written with +warn+
# Output:
# vector of Cluster objects; the same vector is also stored in @clusters
# (on every return path, so a later most_dense_cluster call sees it).
def hierarchical_clusterization(no_clusters = 0, distance_method = 0,
                                vec = @values, debug = false)
  vec = vec.sort
  # Nothing to cluster; also avoids nil - nil in the range computation below.
  return @clusters = [] if vec.empty?

  # Thresholds
  threshold_distance = 0.25 * (vec.max - vec.min)
  threshold_density = (0.5 * vec.length).to_i

  # Histogram of the input: initially each distinct value is its own cluster.
  # A single-element input simply yields one cluster here, so no special case
  # is needed (the old special case fell through and counted the value twice).
  clusters = vec.group_by { |x| x }.sort_by { |value, _| value }.map do |value, group|
    warn "len #{value} appears #{group.length} times" if debug
    Cluster.new({ value => group.length })
  end
  clusters.each(&:print) if debug

  # Each iteration merges the two closest adjacent clusters. Requiring at
  # least two clusters keeps clusters[cluster + 1] valid in every iteration.
  iteration = 0
  while clusters.length > 1
    # stop condition 1 (<= so that asking for more clusters than there are
    # distinct values terminates instead of merging everything)
    break if no_clusters != 0 && clusters.length <= no_clusters

    iteration += 1
    warn "\nIteration #{iteration}" if debug

    min_distance = 100_000_000
    cluster = 0
    density = 0
    clusters.each_cons(2).with_index do |(left, right), i|
      dist = left.distance(right, distance_method)
      warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
      current_density = left.density + right.density
      # Closest pair wins; on equal distance prefer the denser pair.
      closer = dist < min_distance
      denser_tie = dist == min_distance && density < current_density
      next unless closer || denser_tie

      min_distance = dist
      cluster = i
      density = current_density
    end

    # stop condition 2
    # the distance between the closest clusters exceeds the threshold
    if no_clusters == 0 &&
       (clusters[cluster].mean - clusters[cluster + 1].mean).abs > threshold_distance
      break
    end

    # merge clusters 'cluster' and 'cluster'+1
    warn "clusters to merge #{cluster} and #{cluster + 1}" if debug
    clusters[cluster].add(clusters[cluster + 1])
    clusters.delete_at(cluster + 1)

    if debug
      clusters.each_with_index do |elem, i|
        warn "cluster #{i}"
        elem.print
      end
    end

    # stop condition 3
    # the density of the merged cluster exceeds the threshold
    break if no_clusters == 0 && clusters[cluster].density > threshold_density
  end

  @clusters = clusters
end
hierarchical_clusterization_2d(no_clusters = 0, distance_method = 0, vec = @values, debug = false)
click to toggle source
# File lib/genevalidator/clusterization.rb, line 319
##
# Bottom-up clusterization of a vector of two-dimensional points.
#
# Every distinct element of +vec+ starts in its own PairCluster
# (element => number of occurrences). On each iteration the two closest
# clusters (any pair, not only neighbours) are merged, until:
# 1. +no_clusters+ is non-zero and at most +no_clusters+ clusters remain; or
# 3. +no_clusters+ is 0 and the freshly merged cluster has a density greater
#    than half the number of input elements.
# (There is no distance-threshold stop condition in the 2D variant.)
#
# Params:
# +no_clusters+:: stop test (desired number of clusters); 0 means "use the
#                 density threshold instead"
# +distance_method+:: distance method, passed through to PairCluster#distance
# +vec+:: vector of elements (by default the values from initialization);
#         elements must respond to +x+ and +y+ when +debug+ is true
# +debug+:: when true, progress information is written with +warn+
# Output:
# vector of PairCluster objects; the same vector is also stored in @clusters
# on every return path.
def hierarchical_clusterization_2d(no_clusters = 0, distance_method = 0,
                                   vec = @values, debug = false)
  # Threshold
  threshold_density = (0.5 * vec.length).to_i

  # Histogram of the input: initially each distinct element is its own
  # cluster. A single-element input yields one cluster here, so no special
  # case is needed (the old one fell through and counted the element twice).
  clusters = vec.group_by { |a| a }.map do |pair, group|
    warn "pair (#{pair.x} #{pair.y}) appears #{group.length} times" if debug
    PairCluster.new({ pair => group.length })
  end
  clusters.each(&:print) if debug

  # Each iteration merges the two closest clusters. Requiring at least two
  # clusters guarantees that a real pair (cluster1 < cluster2) is selected.
  iteration = 0
  while clusters.length > 1
    # stop condition 1 (<= so that asking for more clusters than there are
    # distinct elements terminates instead of merging everything)
    break if no_clusters != 0 && clusters.length <= no_clusters

    iteration += 1
    warn "\nIteration #{iteration}" if debug

    min_distance = 100_000_000
    cluster1 = 0
    cluster2 = 0
    density = 0
    (0...(clusters.length - 1)).each do |i|
      ((i + 1)...clusters.length).each do |j|
        dist = clusters[i].distance(clusters[j], distance_method)
        warn "distance between clusters #{i} and #{j} is #{dist}" if debug
        current_density = clusters[i].density + clusters[j].density
        # Closest pair wins; on equal distance prefer the denser pair.
        closer = dist < min_distance
        denser_tie = dist == min_distance && density < current_density
        next unless closer || denser_tie

        min_distance = dist
        cluster1 = i
        cluster2 = j
        density = current_density
      end
    end

    # merge clusters 'cluster1' and 'cluster2'
    warn "clusters to merge #{cluster1} and #{cluster2}" if debug
    clusters[cluster1].add(clusters[cluster2])
    # cluster2 > cluster1, so the merged cluster keeps index cluster1
    clusters.delete_at(cluster2)

    if debug
      clusters.each_with_index do |elem, i|
        warn "cluster #{i}"
        elem.print
      end
    end

    # stop condition 3
    # the density of the merged cluster exceeds the threshold
    break if no_clusters == 0 && clusters[cluster1].density > threshold_density
  end

  @clusters = clusters
end
most_dense_cluster(clusters = @clusters)
click to toggle source
Returns the cluster with the maximum density Params: clusters
: list of Cluster
objects
# File lib/genevalidator/clusterization.rb, line 510
##
# Returns the cluster with the maximum density.
# Params:
# +clusters+:: list of objects responding to +density+ (by default @clusters)
# Output:
# the first cluster whose density is the highest; the first cluster of the
# list when no density is greater than 0; +nil+ when the list is nil or empty.
def most_dense_cluster(clusters = @clusters)
  # The original `nil if clusters.nil?` lacked `return` and was a no-op.
  return nil if clusters.nil?

  max_density = 0
  densest = clusters.first
  clusters.each do |candidate|
    candidate_density = candidate.density
    # strict comparison: on ties the earliest cluster is kept
    next unless candidate_density > max_density

    max_density = candidate_density
    densest = candidate
  end
  densest
end