class Interferon::Destinations::Datadog

Constants

ALERT_KEY

Attributes

alert_key[R]
concurrency[RW]

Public Class Methods

new(options) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 17
def initialize(options)
  %w(app_key api_key).each do |req|
    unless options[req]
      raise ArgumentError, "missing required argument #{req}"
    end
  end

  # Set dogapi timeout explicitly
  api_timeout = options['api_timeout'] || 15

  # Default parameters of Dogapi Client initialize() can be referenced from link below:
  # (as of this writing)
  # https://github.com/DataDog/dogapi-rb/blob/master/lib/dogapi/facade.rb#L14
  args = [
    options['api_key'],
    options['app_key'],
    nil, # host to talk to
    nil, # device
    true, # silent?
    api_timeout, # API timeout
  ]
  @dog = Dogapi::Client.new(*args)

  @existing_alerts = nil
  @max_mute_minutes = options['max_mute_minutes']
  @dry_run = options['dry_run']
  @alert_key = options['alert_key'] || ALERT_KEY

  # Datadog communication threads
  @concurrency = options['concurrency'] || 10
  # Fetch page size
  @page_size = options['page_size'] || 1000

  # configure retries
  @retries = options['retries'] || 3

  @stats = {
    alerts_created: 0,
    alerts_to_be_created: 0,
    alerts_updated: 0,
    alerts_to_be_updated: 0,
    alerts_deleted: 0,
    alerts_to_be_deleted: 0,
    alerts_silenced: 0,
    api_successes: 0,
    api_client_errors: 0,
    api_unknown_errors: 0,
    manually_created_alerts: 0,
  }
end
normalize_monitor_type(monitor_type) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 340
def self.normalize_monitor_type(monitor_type)
  # Convert 'query alert' type to 'metric alert' type. They can used interchangeably when
  # submitting monitors to Datadog. Datadog will automatically do the conversion to 'query
  # alert' for a "complex" query that includes multiple metrics/tags while using 'metric alert'
  # for monitors that include a single scope/metric.
  monitor_type == 'query alert' ? 'metric alert' : monitor_type
end
same_monitor_type(monitor_type_a, monitor_type_b) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 348
def self.same_monitor_type(monitor_type_a, monitor_type_b)
  normalize_monitor_type(monitor_type_a) == normalize_monitor_type(monitor_type_b)
end

Public Instance Methods

api_errors() click to toggle source
# File lib/interferon/destinations/datadog.rb, line 68
def api_errors
  @api_errors ||= []
end
create_alert(alert, people) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 141
def create_alert(alert, people)
  # create a message which includes the notifications
  # Datadog may have a race condition where alerts created in a bad state may be triggered
  # during the dry-run creation process. Delete people from dry-run alerts to avoid this
  message = generate_message(
    alert['message'],
    people,
    notify_recovery: alert['notify']['recovery']
  )

  # create the hash of options to send to datadog
  alert_options = {
    notify_audit: alert['notify']['audit'],
    notify_no_data: alert['notify_no_data'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    timeout_h: alert['timeout_h'],
  }

  unless alert['notify']['include_tags'].nil?
    alert_options[:include_tags] = alert['notify']['include_tags']
  end

  unless alert['evaluation_delay'].nil?
    alert_options[:evaluation_delay] = alert['evaluation_delay']
  end

  unless alert['new_host_delay'].nil?
    alert_options[:new_host_delay] = alert['new_host_delay']
  end

  unless alert['require_full_window'].nil?
    alert_options[:require_full_window] = alert['require_full_window']
  end

  unless alert['thresholds'].nil?
    alert_options[:thresholds] = alert['thresholds']
  end

  datadog_query = alert['metric']['datadog_query']
  existing_alert = existing_alerts[alert['name']]

  # new alert, create it
  if existing_alert.nil?
    action = :creating
    resp = create_datadog_alert(alert, datadog_query, message, alert_options)
  else
    # existing alert, modify it
    action = :updating
    resp = update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  end

  # log whenever we've encountered errors
  code = resp[0].to_i
  log_datadog_response_code(resp, code, action, alert)

  # assume this was a success
  unless code >= 400 || code == -1
    # assume this was a success
    @stats[:alerts_created] += 1 if action == :creating
    @stats[:alerts_updated] += 1 if action == :updating
    @stats[:alerts_silenced] += 1 unless alert_options[:silenced].empty?
  end

  id = resp[1].nil? ? nil : [resp[1]['id']]
  # lets key alerts by their name
  [alert['name'], id]
end
create_datadog_alert(alert, datadog_query, message, alert_options) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 210
    def create_datadog_alert(alert, datadog_query, message, alert_options)
      @stats[:alerts_to_be_created] += 1
      new_alert_text = <<-EOM
Query:
#{datadog_query}
Message:
#{message}
Options:
#{alert_options}
EOM
      log.info("creating new alert #{alert['name']}: #{new_alert_text}")

      monitor_options = {
        name: alert['name'],
        message: message,
        options: alert_options,
      }

      if @dry_run
        @dog.validate_monitor(
          alert['monitor_type'],
          datadog_query,
          monitor_options
        )
      else
        @dog.monitor(
          alert['monitor_type'],
          datadog_query,
          monitor_options
        )
      end
    end
existing_alerts() click to toggle source
# File lib/interferon/destinations/datadog.rb, line 112
def existing_alerts
  unless @existing_alerts
    alerts = fetch_existing_alerts

    # key alerts by name
    @existing_alerts = {}
    alerts.each do |alert|
      existing_alert = @existing_alerts[alert['name']]
      if existing_alert.nil?
        alert['id'] = [alert['id']]
        @existing_alerts[alert['name']] = alert
      else
        existing_alert['id'] << alert['id']
      end
    end

    # count how many are manually created
    @stats[:manually_created_alerts] = \
      @existing_alerts.reject { |_n, a| a['message'].include?(alert_key) }.length

    log.info(
      "datadog: found #{@existing_alerts.length} existing alerts; " \
      "#{@stats[:manually_created_alerts]} were manually created"
    )
  end

  @existing_alerts
end
fetch_existing_alerts() click to toggle source
# File lib/interferon/destinations/datadog.rb, line 83
def fetch_existing_alerts
  alerts = Queue.new
  has_more = true

  Parallel.map_with_index(-> { has_more || Parallel::Stop },
                          in_threads: @concurrency) do |_, page|
    successful = false
    @retries.downto(0) do
      resp = @dog.get_all_monitors(page: page, page_size: @page_size)
      code = resp[0].to_i
      if code != 200
        log.info("Failed to retrieve existing alerts from datadog. #{code}: #{resp[1].inspect}")
      else
        alerts_page = resp[1]
        has_more = false if alerts_page.length < @page_size
        alerts_page.map { |alert| alerts.push(alert) }
        successful = true
        break
      end
    end

    unless successful
      # Out of retries
      raise 'Retries exceeded for fetching data from datadog.'
    end
  end
  Array.new(alerts.size) { alerts.pop }
end
generate_message(message, people, options = {}) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 72
def generate_message(message, people, options = {})
  mentions = people.sort.map { |p| "@#{p}" }

  unless options[:notify_recovery]
    # Only mention on alert/warning
    mentions = "{{^is_recovery}}#{mentions}{{/is_recovery}}"
  end

  [message, alert_key, mentions].flatten.join("\n")
end
log_datadog_response_code(resp, code, action, alert = nil) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 412
def log_datadog_response_code(resp, code, action, alert = nil)
  # log whenever we've encountered errors
  if code != 200 && !alert.nil?
    api_errors << "#{code} on alert #{alert['name']}"
  end

  # client error
  if code == 400
    @stats[:api_client_errors] += 1
    unless alert.nil?
      statsd.gauge('datadog.api.unknown_error', 0, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.client_error', 1, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.success', 0, tags: ["alert:#{alert}"])
      log.error("client error while #{action} alert '#{alert['name']}';" \
                " query was '#{alert['metric']['datadog_query']}'" \
                " response was #{resp[0]}:'#{resp[1].inspect}'")
    end

  # unknown (prob. datadog) error:
  elsif code > 400 || code == -1
    @stats[:api_unknown_errors] += 1
    unless alert.nil?
      statsd.gauge('datadog.api.unknown_error', 1, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.client_error', 0, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.success', 0, tags: ["alert:#{alert}"])
      log.error("unknown error while #{action} alert '#{alert['name']}':" \
                " query was '#{alert['metric']['datadog_query']}'" \
                " response was #{resp[0]}:'#{resp[1].inspect}'")
    end
  else
    @stats[:api_successes] += 1
    unless alert.nil?
      statsd.gauge('datadog.api.unknown_error', 0, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.client_error', 0, tags: ["alert:#{alert}"])
      statsd.gauge('datadog.api.success', 1, tags: ["alert:#{alert}"])
    end
  end
end
need_update(alert_people_pair, existing_alerts_from_api) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 334
def need_update(alert_people_pair, existing_alerts_from_api)
  alert, people = alert_people_pair
  existing = existing_alerts_from_api[alert['name']]
  existing.nil? || !same_alerts(alert, people, existing)
end
remove_alert(alert) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 311
def remove_alert(alert)
  if alert['message'].include?(alert_key)
    @stats[:alerts_to_be_deleted] += 1
    log.info("deleting alert: #{alert['name']}")

    # Safety to protect aginst accident dry_run deletion
    unless @dry_run
      alert['id'].each do |alert_id|
        resp = @dog.delete_monitor(alert_id)
        code = resp[0].to_i
        log_datadog_response_code(resp, code, :deleting)

        unless code >= 300 || code == -1
          # assume this was a success
          @stats[:alerts_deleted] += 1
        end
      end
    end
  else
    log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end
end
report_stats() click to toggle source
# File lib/interferon/destinations/datadog.rb, line 395
def report_stats
  @stats.each do |k, v|
    statsd.gauge("datadog.#{k}", v)
  end

  log.info(
    'datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts' % [
      @stats[:alerts_created],
      @stats[:alerts_to_be_created],
      @stats[:alerts_updated],
      @stats[:alerts_to_be_updated],
      @stats[:alerts_deleted],
      @stats[:alerts_to_be_deleted],
    ]
  )
end
same_alerts(alert, people, alert_api_json) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 352
def same_alerts(alert, people, alert_api_json)
  prev_alert = {
    monitor_type: self.class.normalize_monitor_type(alert_api_json['type']),
    query: alert_api_json['query'].strip,
    message: alert_api_json['message'].strip,
    evaluation_delay: alert_api_json['options']['evaluation_delay'],
    new_host_delay: alert_api_json['options']['new_host_delay'],
    include_tags: alert_api_json['options']['include_tags'],
    notify_no_data: alert_api_json['options']['notify_no_data'],
    notify_audit: alert_api_json['options']['notify_audit'],
    no_data_timeframe: alert_api_json['options']['no_data_timeframe'],
    silenced: alert_api_json['options']['silenced'],
    thresholds: alert_api_json['options']['thresholds'],
    timeout_h: alert_api_json['options']['timeout_h'],
  }

  new_alert = {
    monitor_type: self.class.normalize_monitor_type(alert['monitor_type']),
    query: alert['metric']['datadog_query'],
    message: generate_message(
      alert['message'],
      people,
      notify_recovery: alert['notify']['recovery']
    ).strip,
    evaluation_delay: alert['evaluation_delay'],
    new_host_delay: alert['new_host_delay'],
    include_tags: alert['notify']['include_tags'],
    notify_no_data: alert['notify_no_data'],
    notify_audit: alert['notify']['audit'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    thresholds: alert['thresholds'],
    timeout_h: alert['timeout_h'],
  }

  unless alert['require_full_window'].nil?
    prev_alert[:require_full_window] = alert_api_json['options']['require_full_window']
    new_alert[:require_full_window] = alert['require_full_window']
  end

  prev_alert == new_alert
end
update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert) click to toggle source
# File lib/interferon/destinations/datadog.rb, line 243
    def update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
      @stats[:alerts_to_be_updated] += 1
      id = existing_alert['id'][0]

      new_alert_text = <<-EOM.strip
Query:
#{datadog_query.strip}
Message:
#{message.strip}
Options:
#{alert_options}
EOM
      existing_alert_text = <<-EOM.strip
Query:
#{existing_alert['query'].strip}
Message:
#{existing_alert['message'].strip}
Options:
#{alert_options}
EOM
      diff = Diffy::Diff.new(existing_alert_text, new_alert_text, context: 1)
      log.info("updating existing alert #{id} (#{alert['name']}):\n#{diff}")

      monitor_options = {
        name: alert['name'],
        message: message,
        options: alert_options,
      }

      if @dry_run
        resp = @dog.validate_monitor(
          alert['monitor_type'],
          datadog_query,
          monitor_options
        )
      elsif self.class.same_monitor_type(alert['monitor_type'], existing_alert['type'])
        resp = @dog.update_monitor(
          id,
          datadog_query,
          monitor_options
        )

        # Unmute existing alerts that exceed the max silenced time
        # Datadog does not allow updates to silencing via the update_alert API call.
        silenced = existing_alert['options']['silenced']
        if !@max_mute_minutes.nil?
          silenced = silenced.values.reject do |t|
            t.nil? || t == '*' || t > Time.now.to_i + @max_mute_minutes * 60
          end
          @dog.unmute_monitor(id) if alert_options[:silenced].empty? && silenced.empty?
        elsif alert_options[:silenced].empty? && !silenced.empty?
          @dog.unmute_monitor(id)
        end
      else
        # Need to recreate alert with new monitor type
        resp = @dog.delete_monitor(id)
        code = resp[0].to_i
        unless code >= 300 || code == -1
          resp = @dog.monitor(
            alert['monitor_type'],
            datadog_query,
            monitor_options
          )
        end
      end
      resp
    end