class Elastic::Stats::KS

Utility to determine the Kolmogorov-Smirnov difference between two sets of data fetched from Elasticsearch

Constants

MULTIPLIERS

Attributes

field[R]
from[R]
indices[R]
interval[R]
logger[RW]
query[RW]
span[R]
to[R]

Public Class Methods

new(indices, options = {}) click to toggle source

indices should include all possible indices; a single (non-Array) value is wrapped in an Array.

# File lib/elastic/stats/ks.rb, line 25
def initialize(indices, options = {})
  @indices  = indices

  options = default_options.update(options)

  @to       = options.delete(:to)
  @span     = options.delete(:span)
  @interval = options.delete(:interval)
  @field    = options.delete(:field)
  @offset   = options.delete(:offset)

  @indices = [indices] unless @indices.is_a? Array
  @from    = @to - @span
end

Public Instance Methods

calculate(current, previous, confidence) click to toggle source
# File lib/elastic/stats/ks.rb, line 55
def calculate(current, previous, confidence)
  MULTIPLIERS[confidence] * Math.sqrt(
    (
      (current.count + previous.count).to_f /
      (current.count * previous.count)
    )
  )
end
fetch(confidence = 0.05) click to toggle source
# File lib/elastic/stats/ks.rb, line 40
def fetch(confidence = 0.05)
  current  = range(@from, @to)
  previous = range(@from - @offset, @to - @offset)

  difference = Statsample::Test::KolmogorovSmirnov.new(
    current, previous
  ).d

  comparison = calculate(current, previous, confidence)
  {
    confidence: confidence, comparison: comparison,
    difference: difference, different?: (difference > comparison)
  }
end

Private Instance Methods

aggregate(from, to) click to toggle source
# File lib/elastic/stats/ks.rb, line 81
def aggregate(from, to)
  {
    field: field, interval: interval, min_doc_count: 0,
    extended_bounds: {
      min: (from * 1000),
      max:   (to * 1000)
    }
  }
end
body(from, to) click to toggle source
# File lib/elastic/stats/ks.rb, line 72
def body(from, to)
  body = Hashie::Mash.new
  body.query = query if query
  body.aggregations!.hits_per_minute!.date_histogram = aggregate(from, to)
  body
end
default_options() click to toggle source
# File lib/elastic/stats/ks.rb, line 93
def default_options
  {
    to: Time.new.to_i,
    span: (60 * 60 * 12),
    interval: '1h',
    field: '@timestamp',
    offset: (60 * 60 * 24 * 7)
  }
end
range(from, to) click to toggle source
# File lib/elastic/stats/ks.rb, line 66
def range(from, to)
  Hashie::Mash.new(
    client.search index: indices.join(','), body: body(from, to)
  ).aggregations.hits_per_minute.buckets.collect(&:doc_count)
end