class GovukMirrorer::Indexer

Constants

ADDITIONAL_BLACKLIST_PATHS
ADDITIONAL_START_PATHS
BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
FORMATS_TO_503
WHITELIST_PATHS

Calendars currently register as custom-application

Attributes

all_start_urls[R]
blacklist_paths[R]

Public Class Methods

new(root)
# File lib/govuk_mirrorer/indexer.rb, line 42
# Sets up an indexer for the site rooted at +root+.
#
# The start URLs are seeded from ADDITIONAL_START_PATHS (each one prefixed
# with +root+) and the blacklist from ADDITIONAL_BLACKLIST_PATHS followed by
# BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS. process_artefacts is then
# called to finish populating the index.
#
# root - base URL of the site; it is joined to paths with `+`.
def initialize(root)
  @root = root
  @api_endpoint = @root + '/api/artefacts.json'
  @all_start_urls = ADDITIONAL_START_PATHS.map do |start_path|
    @root + start_path
  end
  # `+` builds a new collection, so appending to @blacklist_paths later
  # never touches either constant.
  @blacklist_paths =
    ADDITIONAL_BLACKLIST_PATHS + BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
  process_artefacts
end

Public Instance Methods

blacklisted_url?(url)
# File lib/govuk_mirrorer/indexer.rb, line 52
# Reports whether +url+ falls under any path in @blacklist_paths.
#
# Matching is done per path segment: an entry matches when its segments are
# the leading segments of the URL's path, so "/foo" covers "/foo" and
# "/foo/bar" but not "/foobar". A leading slash on either side is ignored.
#
# url - URL string, parsed with URI.parse.
#
# Returns true or false; false when the parsed URL has no path.
def blacklisted_url?(url)
  url_path = URI.parse(url).path
  return false if url_path.nil? # e.g. mailto: links...

  to_segments = ->(path) { path.sub(%r{\A/}, '').split('/') }
  wanted = to_segments.call(url_path)

  @blacklist_paths.any? do |entry|
    prefix = to_segments.call(entry)
    # Inclusive range on purpose: for an entry with no segments (e.g. "/")
    # this is 0..-1, i.e. the whole path, so it only matches an empty path.
    wanted[0..(prefix.size - 1)] == prefix
  end
end

Private Instance Methods

artefacts()
# File lib/govuk_mirrorer/indexer.rb, line 77
# Returns every artefact from the content API, memoised in @artefacts.
#
# The fetch (including the walk over subsequent pages) is timed under the
# "govuk.app.mirrorer.artefacts_duration" statsd key. If it raises
# GdsApi::HTTPErrorResponse or GdsApi::TimedOutException it is attempted
# once more after a one-second pause; a second failure is re-raised.
def artefacts
  return @artefacts if @artefacts

  already_retried = false
  begin
    client = GdsApi::ContentApi.new("#{@root}/api", :timeout => 10)
    @artefacts = GovukMirrorer.statsd.time("govuk.app.mirrorer.artefacts_duration") do
      client.artefacts.with_subsequent_pages.to_a
    end
  rescue GdsApi::HTTPErrorResponse, GdsApi::TimedOutException
    # Only one retry is allowed; the second failure propagates unchanged.
    raise if already_retried

    already_retried = true
    sleep 1
    retry
  end
end
process_artefacts()
# File lib/govuk_mirrorer/indexer.rb, line 64
# Sorts every artefact into either @all_start_urls or @blacklist_paths.
#
# An artefact whose path is listed in WHITELIST_PATHS always has its web URL
# added to the start URLs. Otherwise, an artefact whose format is listed in
# FORMATS_TO_503 has its path added to the blacklist; every remaining
# artefact has its web URL added to the start URLs.
def process_artefacts
  artefacts.each do |item|
    item_path = URI.parse(item.web_url).path
    # The whitelist is checked first so it overrides the format rule.
    blacklist = !WHITELIST_PATHS.include?(item_path) &&
                FORMATS_TO_503.include?(item.format)

    if blacklist
      @blacklist_paths << item_path
    else
      @all_start_urls << item.web_url
    end
  end
end