class GovukMirrorer::Crawler

Constants

DEFAULT_SITE_ROOT
RETRY_RESP_CODES
USER_AGENT

Attributes

logger[RW]

Public Class Methods

new(attrs = {}) click to toggle source
Calls superclass method
# File lib/govuk_mirrorer/crawler.rb, line 12
# Builds a crawler and queues every start URL reported by the indexer.
#
# attrs - options Hash; the keys read here are:
#         :site_root - root URL of the site to mirror; falls back to
#                      DEFAULT_SITE_ROOT when missing or nil.
#         The same Hash is also handed to setup_logger.
def initialize(attrs = {})
  # Bare `super` forwards the original arguments to the superclass.
  super
  setup_agent
  # Populated by handle_error: url => error raised while fetching/handling it.
  @http_errors = {}

  # Must run before the loop below, which logs through @logger.
  setup_logger(attrs)

  @site_root = attrs[:site_root] || DEFAULT_SITE_ROOT
  @indexer = GovukMirrorer::Indexer.new(@site_root)

  # Seed the queue: each start URL is processed by process_govuk_page.
  @indexer.all_start_urls.each do |start_url|
    @logger.debug "Adding start url #{start_url}"
    handle start_url, :process_govuk_page
  end
end

Public Instance Methods

crawl(options = {}) click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 34
# Fetches every queued URL and dispatches the resulting page to its handler,
# timing the whole run under the "govuk.app.mirrorer.crawl_duration" metric.
#
# A URL is retried exactly once, after a one second pause, when the failure is
# a Mechanize::ResponseCodeError whose code is listed in RETRY_RESP_CODES.
# Any other failure (or a second failure) is passed to handle_error and the
# crawl moves on to the next URL.
#
# options - accepted for interface compatibility; not read by this method.
def crawl(options = {})
  GovukMirrorer.statsd.time("govuk.app.mirrorer.crawl_duration") do
    each_url do |url, handler, default_data|
      already_retried = false
      begin
        page = agent.get(url)
        logger.debug "Handling #{url.inspect}"
        send handler, page, default_data
      rescue => ex
        transient = ex.is_a?(Mechanize::ResponseCodeError) &&
                    RETRY_RESP_CODES.include?(ex.response_code.to_i)
        if transient && !already_retried
          # The flag bounds the retry to a single extra attempt per URL.
          already_retried = true
          sleep 1
          retry
        end
        handle_error url: url, handler: handler, error: ex, data: default_data
      end
      # Optional pause between requests.
      sleep request_interval if request_interval > 0
    end
    logger.info "Completed crawling the site"
  end
end
maybe_handle(url, handler, data = {}) click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 101
# Queues +url+ for +handler+ unless one of the skip rules applies.
#
# Skip rules, checked in this order (the first match wins):
#   1. the url is already present in @urls
#   2. the url previously produced an error (recorded in @http_errors)
#   3. the indexer reports the url as blacklisted
#   4. the url contains a query string ('?')
#
# url     - String URL to consider.
# handler - handler name passed through to +handle+.
# data    - Hash passed through to +handle+; data[:referrer] is only logged.
#
# Returns nil when the url is skipped, otherwise whatever +handle+ returns.
def maybe_handle(url, handler, data = {})
  logger.debug "Evaluating link #{url}"

  skip_message =
    if @urls.include?(url)
      "Skipping seen url #{url}"
    elsif @http_errors.has_key?(url)
      "Skipping previous erroring url #{url}"
    elsif @indexer.blacklisted_url?(url)
      "Skipping blacklisted url #{url}"
    elsif url.include? '?'
      "Skipping querystringed url #{url}"
    end

  if skip_message
    logger.debug skip_message
    return
  end

  logger.debug "Adding url #{url} from #{data[:referrer]}"
  handle url, handler, data
end
process_govuk_page(page, data = {}) click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 56
# Handler for fetched pages: mirrors the page to disk and follows its links,
# but only when the page actually lives on the site being mirrored.
#
# page - fetched page; must respond to +uri+.
# data - accepted for the handler interface; not read by this method.
#
# Returns nil (after logging a warning) when the page's host differs from
# site_hostname, e.g. after a redirect to another site; otherwise returns the
# result of extract_and_handle_links.
def process_govuk_page(page, data = {})
  if page.uri.host != site_hostname
    warning = "Ended up on non #{site_hostname} page #{page.uri}"
    # The second-to-last history entry, when present, is reported as the origin.
    previous = agent.history[-2]
    warning << " from #{previous.uri}" if previous
    logger.warn warning
    return
  end

  save_to_disk(page)
  extract_and_handle_links(page)
end
site_hostname() click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 28
# Returns the host component of the configured site root URL (@site_root).
def site_hostname
  root_uri = URI.parse(@site_root)
  root_uri.host
end

Private Instance Methods

handle_error(attrs) click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 145
# Logs a failure for a URL and remembers it in @http_errors, which
# maybe_handle consults so the URL is not queued again.
#
# attrs - Hash with the keys read here:
#         :url   - the URL that failed (used as the @http_errors key)
#         :error - the exception describing the failure
#         :data  - extra data associated with the URL (only logged)
#
# The backtrace is appended to the log message for anything that is not a
# Mechanize::Error. An exception that was never raised has a nil backtrace;
# in that case the message is logged without one instead of crashing the
# error handler itself.
def handle_error(attrs)
  error = attrs[:error]
  msg = "Error #{error.inspect} for #{attrs[:url]}, data: #{attrs[:data].inspect}"
  unless error.is_a?(Mechanize::Error)
    backtrace = error.backtrace if error.respond_to?(:backtrace)
    msg << "\n#{backtrace.join("\n")}" if backtrace
  end
  logger.warn msg
  @http_errors[attrs[:url]] = error
end
save_to_disk(page) click to toggle source

Saves the page to a file at ./hostname/path, adding a .html extension for HTML files.

# File lib/govuk_mirrorer/crawler.rb, line 154
# Writes the page body, in binary mode, to the path reported by
# page.extract_filename(true), creating parent directories as needed.
# An existing file at that path is overwritten.
#
# page - fetched page; must respond to +extract_filename+, +uri+ and +body+.
#
# Returns the number of bytes written.
def save_to_disk(page)
  destination = page.extract_filename(true)
  logger.debug "Saving #{page.uri} to #{destination}"
  FileUtils.mkdir_p(File.dirname(destination))
  File.binwrite(destination, page.body)
end
setup_agent() click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 163
# Configures the HTTP agent used for crawling: sets the User-Agent string,
# adds an "X-Govuk-Mirrorer: 1" header to every request and turns off SSL
# session reuse on the underlying HTTP client.
def setup_agent
  mech = agent
  mech.user_agent = USER_AGENT
  mech.request_headers["X-Govuk-Mirrorer"] = "1"
  # Force Mechanize to use Net::HTTP which we've monkey-patched above
  # NOTE(review): the monkey-patch referred to is not visible in this section.
  mech.agent.http.reuse_ssl_sessions = false
end
setup_logger(options) click to toggle source
# File lib/govuk_mirrorer/crawler.rb, line 125
# Creates @logger and sets its level.
#
# options - Hash; the keys read here are:
#           :syslog    - syslog facility name (e.g. "local3"); when present
#                        the logger is a Syslogger, otherwise a Logger.
#           :log_file  - Logger destination; defaults to STDOUT.
#           :log_level - name of a Logger level constant (e.g. "debug");
#                        defaults to Logger::INFO.
def setup_logger(options)
  facility_name = options[:syslog]
  @logger =
    if facility_name
      # Syslog settings
      # programname: govuk_mirrorer
      # options: Syslog::LOG_PID | Syslog::LOG_CONS
      # facility: from options
      # Syslog::LOG_PID - adds the process number to the message (just after the program name)
      # Syslog::LOG_CONS - writes the message on the console if an error occurs when sending the message
      facility = Syslog.const_get("LOG_#{facility_name.upcase}")
      Syslogger.new('govuk_mirrorer', Syslog::LOG_PID | Syslog::LOG_CONS, facility)
    else
      Logger.new(options[:log_file] || STDOUT)
    end

  level_name = options[:log_level]
  @logger.level = level_name ? Logger.const_get(level_name.upcase) : Logger::INFO
end