class Anomaly::Detector

Constants

SQRT2PI

Attributes

eps[RW]
mean[R]
std[R]

Public Class Methods

new(examples = nil, **opts) click to toggle source
# File lib/anomaly/detector.rb, line 6
# Builds a detector and, when examples are supplied, trains it immediately.
#
# examples - rows handed unchanged to #train; when nil (or false) the
#            detector is left untrained.
# opts     - keyword options forwarded unchanged to #train.
def initialize(examples = nil, **opts)
  # Number of training examples seen so far; #trained? reads this counter.
  @m = 0
  return unless examples

  train(examples, **opts)
end

Public Instance Methods

anomaly?(x, eps = @eps) click to toggle source
# File lib/anomaly/detector.rb, line 97
# Reports whether the probability of the input falls below a threshold.
#
# x   - passed unchanged to #probability.
# eps - cutoff to compare against; defaults to the detector's current @eps.
#
# Returns an Array of booleans when #probability returns an Array,
# otherwise a single boolean.
def anomaly?(x, eps = @eps)
  likelihood = probability(x)
  return likelihood < eps unless likelihood.is_a?(Array)

  likelihood.map { |value| value < eps }
end
probability(x) click to toggle source

Limits the probability of each feature to [0, 1] to keep the probabilities on the same scale. Uses logarithms to prevent underflow.

# File lib/anomaly/detector.rb, line 78
# Estimates the probability of one feature row, or of every row in an
# Array of rows.
#
# x - a flat row of features, or an Array of such rows. The two cases are
#     told apart by checking whether the first element is an Array.
#
# Returns a single number for a flat row, or an Array with one number per
# row otherwise.
# Raises RuntimeError ("Train me first") when #trained? is false.
def probability(x)
  raise "Train me first" unless trained?

  single_row = !x.first.is_a?(Array)
  rows = single_row ? [x] : x

  scores = rows.map do |row|
    # Add up the log of each feature's density (capped at 1) and
    # exponentiate the total afterwards.
    log_total = @n.times.inject(0) do |acc, col|
      density = normal_pdf(row[col], @mean[col], @std[col])
      acc + Math.log(density > 1 ? 1 : density)
    end
    Math.exp(log_total)
  end

  single_row ? scores.first : scores
end
train(examples, eps: 0) click to toggle source
# File lib/anomaly/detector.rb, line 11
# Fits the per-feature mean and standard deviation from labelled examples
# and, when no threshold is supplied, searches for one.
#
# examples - rows whose last column is the label: 0 marks a non-anomaly,
#            any other value an anomaly. Converted with #to_a first.
# eps      - probability threshold. When greater than 0 it is stored as-is
#            and every non-anomaly is used for training. When 0 (the
#            default) the non-anomalies are split with #partition!, the
#            held-out part plus all anomalies become a test set, and the
#            candidate threshold with the best F1 score on it is kept.
#            NOTE: a negative eps also takes the split branch but skips the
#            search, so the negative value is kept unchanged.
#
# Raises RuntimeError when there are no examples, when the first row has
# fewer than two columns, or when no row is labelled as a non-anomaly.
def train(examples, eps: 0)
  # for Numo::NArray
  # TODO make more efficient when possible
  examples = examples.to_a

  raise "No examples" if examples.empty?
  raise "Must have at least two columns" if examples.first.size < 2

  # Divide into groups since we only want to train with non-anomalies.
  non_anomalies, anomalies = examples.partition { |example| example.last == 0 }

  raise "Must have at least one non-anomaly" if non_anomalies.empty?

  @eps = eps
  if @eps > 0
    # Use all non-anomalies to train.
    training_examples = non_anomalies
  else
    training_examples, test_examples = partition!(non_anomalies)
    test_examples.concat(anomalies)
  end
  # Remove last column.
  training_examples = training_examples.map { |e| e[0..-2] }
  @m = training_examples.size
  @n = training_examples.first.size

  if defined?(Numo::SFloat)
    training_examples = Numo::SFloat.cast(training_examples)
    # Convert these to an Array for Marshal.dump
    @mean = training_examples.mean(0).to_a
    @std = training_examples.stddev(0).to_a
  elsif defined?(NMatrix)
    training_examples = NMatrix.to_na(training_examples)
    # Convert these to an Array for Marshal.dump
    @mean = training_examples.mean(1).to_a
    @std = training_examples.stddev(1).to_a
  else
    # Default to Array, since built-in Matrix does not give us a big performance advantage.
    cols = @n.times.map { |i| training_examples.map { |r| r[i] } }
    @mean = cols.map { |c| alt_mean(c) }
    @std = cols.each_with_index.map { |c, i| alt_std(c, @mean[i]) }
  end
  # A zero or NaN deviation is replaced by a tiny positive value.
  @std.map! { |std| (std == 0 || std.nan?) ? 1e-10 : std }

  if @eps == 0
    # Find the best eps among 1, 3, 5, 7 and 9 times 10**-1 .. 10**-9.
    candidates = (1..9).flat_map { |i| [1, 3, 5, 7, 9].map { |j| (j * 10**(-i)).to_f } }
    f1_scores = candidates.map { |candidate| [candidate, compute_f1_score(test_examples, candidate)] }
    @eps, _ = f1_scores.max_by { |v| v[1] }
  end
end
trained?() click to toggle source
# File lib/anomaly/detector.rb, line 71
# True when the training-example counter @m is greater than zero.
def trained?
  @m.positive?
end

Protected Instance Methods

alt_mean(x) click to toggle source

Not used when Numo::NArray or NMatrix is available.

# File lib/anomaly/detector.rb, line 152
# Arithmetic mean of a column of numbers, always returned as a Float.
#
# x - non-empty Array of numbers.
#
# The sum is converted to Float before dividing so that Integer inputs are
# not truncated by integer division (for example [1, 2] gives 1.5, not 1).
def alt_mean(x)
  x.sum.to_f / x.size
end
alt_std(x, mean) click to toggle source
# File lib/anomaly/detector.rb, line 156
# Standard deviation of a column around a given mean, dividing the summed
# squared deviations by (size - 1).
#
# x    - Array of numbers.
# mean - the value the deviations are measured from.
#
# With a single element the division is 0.0 / 0, so the result is NaN.
def alt_std(x, mean)
  squared_error = x.sum { |value| (value - mean)**2 }
  variance = squared_error.to_f / (x.size - 1)
  Math.sqrt(variance)
end
compute_f1_score(examples, eps) click to toggle source
# File lib/anomaly/detector.rb, line 125
# Scores a candidate threshold against labelled examples.
#
# examples - rows whose last column is the label (0 means non-anomaly);
#            the remaining columns are passed to #anomaly?.
# eps      - threshold handed to #anomaly? for every row.
#
# Counts true positives, false positives and false negatives and returns
# whatever #f1_score computes from those three counts.
def compute_f1_score(examples, eps)
  true_pos = false_pos = false_neg = 0

  examples.each do |row|
    is_anomaly = row.last != 0
    flagged = anomaly?(row[0..-2], eps)

    if flagged
      if is_anomaly
        true_pos += 1
      else
        false_pos += 1
      end
    elsif is_anomaly
      false_neg += 1
    end
  end

  f1_score(true_pos, false_pos, false_neg)
end
f1_score(tp, fp, fn) click to toggle source
# File lib/anomaly/detector.rb, line 143
# F1 score (harmonic mean of precision and recall) from raw counts.
#
# tp - true positives.
# fp - false positives.
# fn - false negatives.
#
# Returns 0.0 whenever the computation yields NaN, which happens when a
# denominator is zero.
def f1_score(tp, fp, fn)
  predicted_positives = (tp + fp).to_f
  actual_positives = (tp + fn).to_f

  precision = tp / predicted_positives
  recall = tp / actual_positives

  score = 2.0 * precision * recall / (precision + recall)
  return 0.0 if score.nan?

  score
end
normal_pdf(x, mean = 0, std = 1) click to toggle source
# File lib/anomaly/detector.rb, line 113
# Density of a normal distribution at x.
#
# x    - point at which to evaluate the density.
# mean - centre of the distribution (default 0).
# std  - standard deviation (default 1).
#
# Relies on the SQRT2PI constant defined elsewhere in the class.
def normal_pdf(x, mean = 0, std = 1)
  exponent = -((x - mean)**2 / (2.0 * (std**2)))
  scale = 1 / (SQRT2PI * std)
  scale * Math.exp(exponent)
end
partition!(examples, p_last = 0.2) click to toggle source

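Shuffles the examples in place and splits them into a training set and a test set; used when finding the best eps.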

# File lib/anomaly/detector.rb, line 119
# Shuffles examples in place and splits them in two.
#
# examples - Array to shuffle (mutated) and split.
# p_last   - fraction of the examples placed in the second group
#            (default 0.2); the count is rounded down.
#
# Returns a two-element Array: the remaining examples first, then the
# smaller group taken from the front of the shuffled Array.
def partition!(examples, p_last = 0.2)
  examples.shuffle!
  split_at = (examples.size * p_last).floor
  held_out = examples[0...split_at]
  remainder = examples[split_at..-1]
  [remainder, held_out]
end