class Sitemaps::Instance

Holder for methods that shouldn't be exposed as public API.

@private
@api private

Public Instance Methods

discover_roots(url, fetcher)

Interrogate a host for sitemaps declared in its robots.txt, or fall back to a set of well-known potential locations.

@return [Array<URI>]

# File lib/sitemaps.rb, line 169
def discover_roots(url, fetcher)
  # ask robots.txt first: it may declare sitemaps via `Sitemap:` directives
  robots = begin
    robotsurl      = url.clone
    robotsurl.path = '/robots.txt'
    robotstxt      = fetcher.call(robotsurl)

    discovered = robotstxt.scan(/^Sitemap: (\S+)/).flatten.map do |loc|
      URI.parse(loc.strip)
    end
    discovered.presence
  rescue StandardError
    nil
  end

  # otherwise, fall back to a handful of well-known locations
  known_locations = %w(/sitemap_index.xml.gz /sitemap_index.xml /sitemap.xml.gz /sitemap.xml)
  known_locations = known_locations.lazy.map do |path|
    pathurl      = url.clone
    pathurl.path = path
    pathurl
  end

  robots || known_locations.to_a
end
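
A minimal sketch of calling this helper with a hand-rolled fetcher. The stubbed robots.txt body, the example.com URLs, and the Instance.new receiver are assumptions for illustration; any callable that takes a URI and returns a response body works as a fetcher.

require "sitemaps"
require "uri"

# stub fetcher: pretends every host serves a robots.txt declaring two sitemaps
fetcher = lambda do |_uri|
  <<~ROBOTS
    User-agent: *
    Disallow: /private/
    Sitemap: https://example.com/sitemap.xml
    Sitemap: https://example.com/news/sitemap.xml.gz
  ROBOTS
end

helper = Sitemaps::Instance.new # assumed constructor; this class is private API
roots  = helper.discover_roots(URI.parse("https://example.com/"), fetcher)
roots.map(&:to_s)
# => ["https://example.com/sitemap.xml", "https://example.com/news/sitemap.xml.gz"]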
fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)

Recursively fetch sitemaps and sitemap indexes from the given URLs.

@return [Sitemap]

# File lib/sitemaps.rb, line 128
def fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
  queue = urls.is_a?(Array) ? urls : [urls]
  maps  = {}

  # walk the queue, fetching the sitemap requested and adding
  # new sitemaps to the queue as found
  loop do
    begin
      url = queue.pop
      break if url.nil?
      next  unless maps[url].nil?

      # fetch and parse this item in the queue
      source  = fetcher.call(url)
      sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block, filter_indexes: filter_indexes)

      # save the results and queue up any submaps it found
      maps[url] = sitemap
      queue.push(*sitemap.sitemaps.map(&:loc))

      # decrement max_entries (since it's max_entries total, not per map)
      unless max_entries.nil?
        max_entries -= maps[url].entries.length
        break if max_entries <= 0
      end
    rescue StandardError => ex
      # log and keep going: a single failed fetch shouldn't lose what we already have
      $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
      next
    end
  end

  # collapse the recovered maps into a single sitemap, deduplicated by location
  maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
    result.sitemaps.concat(map.sitemaps).uniq! { |e| e.loc.to_s }
    result.entries.concat(map.entries).uniq!   { |e| e.loc.to_s }
  end
end
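
And a sketch of driving fetch_recursive directly over HTTP. The Net::HTTP fetcher, the starting URL, and the Instance.new receiver are assumptions; the filter block and filter_indexes flag are simply forwarded to Sitemaps::Parser.parse, as the source above shows, and the entry-shaped block argument is inferred from the dedup on e.loc above.

require "sitemaps"
require "net/http"
require "uri"

# naive fetcher: no redirects, no gzip handling; an assumption for this sketch
fetcher = ->(uri) { Net::HTTP.get(uri) }

helper  = Sitemaps::Instance.new # assumed constructor; this class is private API
sitemap = helper.fetch_recursive(
  [URI.parse("https://example.com/sitemap_index.xml")], # hypothetical starting point
  fetcher,
  100,   # max_entries: a total budget across all fetched maps
  false  # filter_indexes, forwarded straight to the parser
) { |entry| entry.loc.path.start_with?("/blog") } # keep only blog URLs

sitemap.entries.each { |entry| puts entry.loc }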