class GovukMirrorer::Indexer
Constants
- ADDITIONAL_BLACKLIST_PATHS
- ADDITIONAL_START_PATHS
- BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
- FORMATS_TO_503
- WHITELIST_PATHS
Calendars currently register as custom-application
Attributes
all_start_urls[R]
blacklist_paths[R]
Public Class Methods
new(root)
click to toggle source
# File lib/govuk_mirrorer/indexer.rb, line 42
#
# Sets up the start-URL list and the path blacklist for the site at +root+,
# then calls process_artefacts, which appends further entries to both lists.
#
# root - base URL of the site. Each entry of ADDITIONAL_START_PATHS is
#        appended to it verbatim, so it is expected to be a String without
#        a trailing slash (NOTE(review): confirm against callers).
def initialize(root)
  @root = root
  @api_endpoint = "#{@root}/api/artefacts.json"
  @all_start_urls = ADDITIONAL_START_PATHS.map { |path| "#{@root}#{path}" }
  # Array#+ already builds a fresh array, so later appends to
  # @blacklist_paths never touch either constant.
  @blacklist_paths = ADDITIONAL_BLACKLIST_PATHS + BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
  process_artefacts
end
Public Instance Methods
blacklisted_url?(url)
click to toggle source
# File lib/govuk_mirrorer/indexer.rb, line 52
#
# Reports whether +url+ lies at or below one of the blacklisted paths.
#
# Matching is done on whole path segments: the blacklist entry "/foo/bar"
# matches "/foo/bar" and "/foo/bar/baz" but not "/foo/barbaz". A single
# leading slash is ignored on both sides.
#
# url - String URL to test.
#
# Returns true if any blacklist entry is a segment-wise prefix of the URL's
# path, false otherwise (including when the URL has no path at all).
# Raises whatever URI.parse raises for input it cannot parse.
def blacklisted_url?(url)
  path = URI.parse(url).path
  return false if path.nil? # e.g. mailto: links...

  # Same normalisation for the URL and for every blacklist entry.
  to_segments = lambda { |raw| raw.sub(%r{\A/}, '').split('/') }

  url_segments = to_segments.call(path)
  @blacklist_paths.any? do |blacklist_path|
    bl_segments = to_segments.call(blacklist_path)
    # For an entry with no segments (e.g. "/") the range becomes 0..-1, i.e.
    # the whole URL path, so such an entry matches only a URL whose path is
    # itself empty or "/".
    url_segments[0..(bl_segments.length - 1)] == bl_segments
  end
end
Private Instance Methods
artefacts()
click to toggle source
# File lib/govuk_mirrorer/indexer.rb, line 77
#
# Returns the artefact list obtained from the content API under
# "#{@root}/api", fetching it on first use and memoising it in @artefacts.
#
# The fetch runs inside GovukMirrorer.statsd.time under the key
# "govuk.app.mirrorer.artefacts_duration". If it raises
# GdsApi::HTTPErrorResponse or GdsApi::TimedOutException, it is retried
# exactly once after a one-second pause; a second failure of either kind is
# re-raised to the caller. Any other exception propagates immediately.
def artefacts
  return @artefacts if @artefacts

  retried = false
  begin
    content_api = GdsApi::ContentApi.new("#{@root}/api", :timeout => 10)
    @artefacts = GovukMirrorer.statsd.time("govuk.app.mirrorer.artefacts_duration") do
      content_api.artefacts.with_subsequent_pages.to_a
    end
  rescue GdsApi::HTTPErrorResponse, GdsApi::TimedOutException
    # Already retried once: give up and let the error reach the caller.
    raise if retried

    retried = true
    sleep 1
    retry
  end
end
process_artefacts()
click to toggle source
# File lib/govuk_mirrorer/indexer.rb, line 64
#
# Sorts every artefact returned by #artefacts into one of the two lists:
#
# * its URL path is appended to @blacklist_paths when the path is not in
#   WHITELIST_PATHS and the artefact's format is in FORMATS_TO_503;
# * otherwise its full web_url is appended to @all_start_urls.
#
# A whitelisted path therefore always becomes a start URL, whatever its
# format.
def process_artefacts
  artefacts.each do |artefact|
    path = URI.parse(artefact.web_url).path
    # Whitelist is consulted first; the format only matters for paths that
    # are not whitelisted.
    blacklist = !WHITELIST_PATHS.include?(path) && FORMATS_TO_503.include?(artefact.format)

    if blacklist
      @blacklist_paths << path
    else
      @all_start_urls << artefact.web_url
    end
  end
end