class Roundhouse::Middleware::Server::RetryJobs

Automatically retry jobs that fail in Roundhouse. Roundhouse’s retry support assumes a typical development lifecycle:

0. push some code changes that contain a bug
1. bug causes job processing to fail, roundhouse's middleware captures
   the job and pushes it onto a retry queue
2. roundhouse retries jobs in the retry queue multiple times with
   an exponential delay, the job continues to fail
3. after a few days, a developer deploys a fix.  the job is
   reprocessed successfully.
4. once retries are exhausted, roundhouse will give up and move the
   job to the Dead Job Queue (aka morgue) where it must be dealt with
   manually in the Web UI.
5. After 6 months on the DJQ, Roundhouse will discard the job.

A job looks like:

{ 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true }

The ‘retry’ option also accepts a number (in place of ‘true’):

{ 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => 5 }

The job will be retried this number of times before giving up. (If simply ‘true’, Roundhouse retries 25 times)

We’ll add a bit more data to the job to support retries:

* 'queue' - the queue to use
* 'retry_count' - number of times we've retried so far.
* 'error_message' - the message from the exception
* 'error_class' - the exception class
* 'failed_at' - the first time it failed
* 'retried_at' - the last time it was retried
* 'backtrace' - the number of lines of error backtrace to store

We don’t store the backtrace by default as that can add a lot of overhead to the job and everyone is using an error service, right?

The default number of retry attempts is 25 which works out to about 3 weeks of retries. You can pass a value for the max number of retry attempts when adding the middleware using the options hash:

Roundhouse.configure_server do |config|
  config.server_middleware do |chain|
    chain.add Roundhouse::Middleware::Server::RetryJobs, :max_retries => 7
  end
end

or limit the number of retries for a particular worker with:

class MyWorker
  include Roundhouse::Worker
  roundhouse_options :retry => 10
end

Constants

DEFAULT_MAX_RETRY_ATTEMPTS

Public Class Methods

new(options = {}) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 69
# Builds the middleware.
#
# options - Hash of settings; :max_retries overrides the retry ceiling and
#           falls back to DEFAULT_MAX_RETRY_ATTEMPTS when the key is absent.
def initialize(options = {})
  fallback = DEFAULT_MAX_RETRY_ATTEMPTS
  @max_retries = options.fetch(:max_retries, fallback)
end

Public Instance Methods

call(worker, msg, queue) { || ... } click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 73
# Runs the wrapped job (the given block) and routes failures to the retry
# machinery.
#
# worker - the worker instance processing the job
# msg    - the job Hash; its 'retry' entry decides whether a retry is attempted
# queue  - name of the queue the job was fetched from
#
# Always re-raises after handling so callers further up still see the failure.
def call(worker, msg, queue)
  yield
rescue Roundhouse::Shutdown
  # Nothing to do here: the job is pushed back onto its queue during
  # hard_shutdown, so just propagate.
  raise
rescue Exception => error
  # A failure that was itself triggered by a shutdown is reported as a
  # shutdown rather than as a job error.
  raise Roundhouse::Shutdown if exception_caused_by_shutdown?(error)

  if msg['retry']
    attempt_retry(worker, msg, queue, error)
  else
    # Retries are disabled for this job; surface the original exception.
    raise error
  end
end

Private Instance Methods

attempt_retry(worker, msg, queue, exception) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 88
# Records failure details on the job and either schedules another attempt or
# hands the job to retries_exhausted. The original exception is always
# re-raised at the end.
#
# worker    - the worker instance that failed
# msg       - the job Hash; mutated in place with the retry bookkeeping keys
# queue     - the queue the job came from, used unless msg['retry_queue'] is set
# exception - the exception raised by the job
def attempt_retry(worker, msg, queue, exception)
  attempt_limit = retry_attempts_from(msg['retry'], @max_retries)

  # A job-specific retry queue wins over the queue it was fetched from.
  msg['queue'] = msg['retry_queue'] || queue

  # App code can stuff all sorts of crazy binary data into the error message
  # that won't convert to JSON, so cap its length and scrub invalid bytes.
  text = exception.message[0..10_000]
  text.force_encoding("utf-8").scrub! if text.respond_to?(:scrub!)

  msg['error_message'] = text
  msg['error_class'] = exception.class.name

  # First failure starts the counter at zero; later failures bump it.
  if msg['retry_count']
    msg['retried_at'] = Time.now.to_f
    msg['retry_count'] += 1
  else
    msg['failed_at'] = Time.now.to_f
    msg['retry_count'] = 0
  end
  count = msg['retry_count']

  # 'backtrace' may be true (keep everything) or a line count to keep.
  depth = msg['backtrace']
  if depth == true
    msg['error_backtrace'] = exception.backtrace
  elsif depth && depth.to_i != 0
    msg['error_backtrace'] = exception.backtrace[0...depth.to_i]
  end

  if count < attempt_limit
    delay = delay_for(worker, count)
    logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
    run_at = Time.now.to_f + delay
    serialized = Roundhouse.dump_json(msg)
    Roundhouse.redis { |conn| conn.zadd('retry', run_at.to_s, serialized) }
  else
    # Goodbye dear message, you (re)tried your best I'm sure.
    retries_exhausted(worker, msg)
  end

  raise exception
end
delay_for(worker, count) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 173
# Picks the delay before the next retry: the worker's own retry-in block when
# it has one and that yields a truthy value, otherwise the default backoff.
def delay_for(worker, count)
  custom_delay = retry_in(worker, count) if worker.roundhouse_retry_in_block?
  custom_delay || seconds_to_delay(count)
end
exception_caused_by_shutdown?(e, checked_causes = []) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 191
# Walks the chain of exception causes and reports whether any cause is
# exactly a Roundhouse::Shutdown.
#
# e              - the exception to inspect
# checked_causes - object ids already visited, used to stop on circular chains
#
# Returns true or false.
def exception_caused_by_shutdown?(e, checked_causes = [])
  current = e

  # Objects without a #cause method (including the nil at the end of a
  # chain) terminate the walk.
  while defined?(current.cause)
    checked_causes << current.object_id
    parent = current.cause

    # Handle circular causes
    return false if checked_causes.include?(parent.object_id)
    return true if parent.instance_of?(Roundhouse::Shutdown)

    current = parent
  end

  false
end
retries_exhausted(worker, msg) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 139
# Called once a job has used up its retries: runs the worker's
# retries-exhausted block (if it has one) and then moves the job to the
# morgue unless the job sets 'dead' to false.
def retries_exhausted(worker, msg)
  logger.debug { "Dropping message after hitting the retry maximum: #{msg}" }

  begin
    worker.roundhouse_retries_exhausted_block.call(msg) if worker.roundhouse_retries_exhausted_block?
  rescue => error
    # A failing callback is reported but must not stop the job being buried.
    handle_exception(error, { context: "Error calling retries_exhausted for #{worker.class}", job: msg })
  end

  return if msg['dead'] == false

  send_to_morgue(msg)
end
retry_attempts_from(msg_retry, default) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 165
# Resolves the retry ceiling for a job.
#
# msg_retry - the job's 'retry' option; an integer sets a per-job limit,
#             anything else (e.g. true) means "use the default"
# default   - the limit to use when msg_retry is not an integer
#
# Returns msg_retry when it is an Integer, otherwise default.
def retry_attempts_from(msg_retry, default)
  # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and removed
  # in Ruby 3.2, where referencing it raises NameError.
  msg_retry.is_a?(Integer) ? msg_retry : default
end
retry_in(worker, count) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 182
# Asks the worker's retry-in block for a delay given the retry count.
#
# Returns whatever the block returns, or nil when the block raises (the
# error is passed to handle_exception).
def retry_in(worker, count)
  worker.roundhouse_retry_in_block.call(count)
rescue Exception => error
  handle_exception(error, { context: "Failure scheduling retry using the defined `roundhouse_retry_in` in #{worker.class.name}, falling back to default" })
  nil
end
seconds_to_delay(count) click to toggle source

delayed_job uses the same basic formula

# File lib/roundhouse/middleware/server/retry_jobs.rb, line 178
# Default backoff in seconds for the given retry count: count to the fourth
# power, plus a 15 second floor, plus random jitter that scales with count.
def seconds_to_delay(count)
  polynomial = count ** 4
  jitter = rand(30) * (count + 1)
  polynomial + 15 + jitter
end
send_to_morgue(msg) click to toggle source
# File lib/roundhouse/middleware/server/retry_jobs.rb, line 152
# Stores an exhausted job in the 'dead' sorted set, scored by the current
# time, and trims that set in the same MULTI using DeadSet.timeout (by score)
# and DeadSet.max_jobs (by rank).
def send_to_morgue(msg)
  Roundhouse.logger.info { "Adding dead #{msg['class']} job #{msg['jid']}" }

  serialized = Roundhouse.dump_json(msg)
  buried_at = Time.now.to_f

  Roundhouse.redis do |conn|
    conn.multi do
      conn.zadd('dead', buried_at, serialized)
      # Drop entries whose score is older than the timeout window.
      conn.zremrangebyscore('dead', '-inf', buried_at - DeadSet.timeout)
      # Trim by rank so the set stays bounded by max_jobs.
      conn.zremrangebyrank('dead', 0, -DeadSet.max_jobs)
    end
  end
end