class SamplingHash::Sampler

Attributes

samples[R]
size[R]

Public Class Methods

new(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001) click to toggle source

Calculates sample offsets.

Parameters:

  • sample_size: Size of a sample (in bytes).

  • header_samples: Number of samples at front of data always to be included.

  • minimum_samples: Minimum number of samples to be included.

  • remaining_factor: If size is greater than minimum_samples * sample_size, this specifies the

    linear factor function used to determine the additional data used.
# File lib/sampling-hash/sampler.rb, line 15
def initialize(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001)
  @samples = []

  minimum_sampling_size = minimum_samples * sample_size
  if (size > minimum_sampling_size)
    # Continuous header samples first.
    header_samples.times { |i| @samples << [i * sample_size, sample_size] }

    # Spread the rest.
    start_offset                       = header_samples * sample_size
    remaining_size                     = size - start_offset

    remaining_minimum_samples          = [0, minimum_samples - header_samples].max
    remaining_minimum_sampling_size    = remaining_minimum_samples * sample_size

    remaining_additional_size          = remaining_size - remaining_minimum_sampling_size
    remaining_additional_sampling_size = remaining_additional_size * remaining_factor
    remaining_additional_samples       = (remaining_additional_sampling_size / sample_size).truncate

    remaining_total_samples            = remaining_minimum_samples + remaining_additional_samples
    remaining_total_sampling_size      = remaining_minimum_sampling_size + remaining_additional_sampling_size

    remaining_unsampled_size           = remaining_size - remaining_total_sampling_size
    remaining_sampling_gap             = (remaining_unsampled_size / remaining_total_samples).truncate

    # NOTE: We can not overflow since we calculated the remaining_additional_samples with integer division.
    remaining_total_samples.times do |i|
      @samples << [start_offset + i * (sample_size + remaining_sampling_gap), sample_size]
    end
  else
    total_full_samples = size / sample_size
    last_sample_size   = size - ((size / sample_size) * sample_size)

    # Simply take them all.
    total_full_samples.times { |i| @samples << [i * sample_size, sample_size] }
    @samples << [total_full_samples * sample_size, last_sample_size] if last_sample_size != 0
  end

  @size = @samples.inject(0) { |i, v| i + v[1] }
end

Public Instance Methods

each(&block) click to toggle source
# File lib/sampling-hash/sampler.rb, line 56
def each(&block)
  @samples.each(&block)
end