class Sitemaps::Instance
Holder for methods that shouldn't be exposed as public API.

@private
@api private
Public Instance Methods
discover_roots(url, fetcher)
Interrogate a host for sitemaps listed in robots.txt, or fall back to a handful of well-known sitemap locations.

@return [Array<URI>]
# File lib/sitemaps.rb, line 169
def discover_roots(url, fetcher)
  # ask robots.txt for explicitly declared sitemaps
  robots = begin
    robotsurl      = url.clone
    robotsurl.path = '/robots.txt'
    robotstxt      = fetcher.call(robotsurl)

    discovered = robotstxt.scan(/^Sitemap: (\S+)/).flatten.map do |url|
      URI.parse(url.strip)
    end
    discovered.presence
  rescue
    nil
  end

  # try for files in a handful of known locations
  known_locations = %w(/sitemap_index.xml.gz /sitemap_index.xml /sitemap.xml.gz /sitemap.xml)
  known_locations = known_locations.lazy.map do |path|
    pathurl      = url.clone
    pathurl.path = path
    pathurl
  end

  robots || known_locations.to_a
end
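For illustration, a minimal sketch of driving discover_roots directly. The fetcher contract is inferred from the source above (any callable that takes a URI and returns the response body as a string); the Net::HTTP lambda, the example.com URL, and constructing Instance with new are assumptions made for this example, not documented usage.

  require 'net/http'
  require 'uri'

  # a fetcher is any callable taking a URI and returning the body string
  fetcher = ->(uri) { Net::HTTP.get(uri) }

  # assumes Instance can be constructed directly; inside the gem it is
  # normally driven through the top-level Sitemaps module
  instance = Sitemaps::Instance.new
  roots = instance.discover_roots(URI.parse("https://example.com/"), fetcher)
  # => URIs from "Sitemap:" lines in robots.txt, or the well-known fallbacks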
fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
Recursively fetch sitemaps and sitemap indexes from the given URLs, collapsing the results into a single sitemap.

@return [Sitemap]
# File lib/sitemaps.rb, line 128
def fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
  queue = urls.is_a?(Array) ? urls : [urls]
  maps  = {}

  # walk the queue, fetching the sitemap requested and adding
  # new sitemaps to the queue as found
  loop do
    begin
      url = queue.pop
      break if url.nil?
      next unless maps[url].nil?

      # fetch this item in the queue, and queue up any sub maps it found
      source  = fetcher.call(url)
      sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block, filter_indexes: filter_indexes)

      # save the results and queue up any submaps it found
      maps[url] = sitemap
      queue.push(*sitemap.sitemaps.map(&:loc))

      # decrement max_entries (since it's max_entries total, not per map)
      unless max_entries.nil?
        max_entries -= maps[url].entries.length
        break if max_entries <= 0
      end
    rescue => ex
      # otherwise keep on going, because we've got something at least
      $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
      next
    end
  end

  # collapse the recovered maps into a single one with everything
  maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
    result.sitemaps.concat(map.sitemaps).uniq! { |e| e.loc.to_s }
    result.entries.concat(map.entries).uniq! { |e| e.loc.to_s }
  end
end
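A hedged sketch of calling fetch_recursive with a block filter. The positional arguments follow the signature above; the filter-block semantics (return truthy to keep an entry), the meaning of filter_indexes, and the fetcher/Instance setup are assumptions for this example rather than documented behaviour of Sitemaps::Parser.parse.

  require 'net/http'
  require 'uri'

  fetcher  = ->(uri) { Net::HTTP.get(uri) }
  instance = Sitemaps::Instance.new

  # cap the walk at 500 entries total and keep only /blog/ URLs; passing
  # false for filter_indexes is assumed to leave index references unfiltered
  root    = URI.parse("https://example.com/sitemap.xml")
  sitemap = instance.fetch_recursive(root, fetcher, 500, false) do |entry|
    entry.loc.path.start_with?("/blog/")
  end

  sitemap.entries.map(&:loc)   # de-duplicated entry URLs across every fetched map
  sitemap.sitemaps.map(&:loc)  # sub-sitemap locations discovered while walking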