module Sitemaps::Parser

Parse XML Sitemaps

Constants

VALID_CHANGEFREQ

Public Class Methods

get_text(root, key) click to toggle source
# File lib/sitemaps/parser.rb, line 68
# Fetch the stripped text found under `key` inside `root`.
#
# @param root [#get_text] node queried via `root.get_text(key)`.
# @param key [String] path/name of the child whose text is wanted.
# @return [String, nil] the stripped text, or nil when the lookup (or its
#   value) is nil.
def self.get_text(root, key)
  node = root.get_text(key)
  return nil if node.nil?

  text = node.value
  text.nil? ? nil : text.strip
end
parse(source, max_entries: nil, filter: nil, filter_indexes: nil) click to toggle source

Given a source string, returns a sitemap containing all valid url entries, or all valid sub-sitemaps. See `sitemaps.org` for information on the spec.

@param source [String] an XML string to parse. @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap. @param filter [#call, nil] if provided, called per entry; the entry is kept in the sitemap only when the call returns a truthy value. @param filter_indexes [Boolean, nil] if truthy, filter is called per submap as well as entries. @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,

a sitemap will still be returned, but the entries and sitemaps keys will be empty.
# File lib/sitemaps/parser.rb, line 15
# Parse an XML string into a Sitemaps::Sitemap.
#
# URL entries are read from `/urlset/url` and sub-sitemaps from
# `/sitemapindex/sitemap`. Nodes for which `parse_loc` returns nothing are
# skipped. Entries are de-duplicated by `loc` before `max_entries` is applied.
#
# @param source [String] an XML string to parse.
# @param max_entries [Integer, nil] the maximum number of entries to keep.
# @param filter [#call, nil] if provided, called per entry; the entry is kept
#   only when the call returns a truthy value.
# @param filter_indexes [Boolean, nil] if truthy, `filter` is also applied to
#   each submap (otherwise submaps are never filtered).
# @return [Sitemap] the parsed sitemap. When the XML is malformed, a sitemap
#   with empty entries and empty sitemaps is returned.
def self.parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
  begin
    document = REXML::Document.new(source)
  rescue REXML::ParseException
    # Documented contract: invalid XML still yields a (empty) sitemap rather
    # than leaking the XML parser's exception to the caller.
    return Sitemaps::Sitemap.new([], [])
  end

  entries = document.elements.to_a("/urlset/url").map do |root|
    loc  = parse_loc(root) || next
    mod  = parse_lastmod(root)
    freq = parse_changefreq(root)
    pri  = parse_priority(root)

    entry = Sitemaps::Entry.new(loc, mod, freq, pri)
    entry if !filter || filter.call(entry)
  end
  entries = entries.compact.uniq(&:loc)
  entries = entries.take(max_entries) unless max_entries.nil?

  sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
    loc = parse_loc(root) || next
    mod = parse_lastmod(root)

    submap = Sitemaps::Submap.new(loc, mod)
    # The filter only applies to submaps when filter_indexes is truthy.
    submap if !filter || !filter_indexes || filter.call(submap)
  end.compact

  Sitemaps::Sitemap.new(entries, sitemaps)
end
parse_changefreq(root) click to toggle source

@api private @private

# File lib/sitemaps/parser.rb, line 56
# Read the `changefreq` text of `root` and convert it to a symbol.
#
# @api private
# @param root node handed to `get_text`.
# @return [Symbol, nil] the frequency as a symbol when the text is listed in
#   VALID_CHANGEFREQ; nil when it is absent or not listed.
def self.parse_changefreq(root)
  raw = get_text(root, 'changefreq')
  return nil unless raw && VALID_CHANGEFREQ.include?(raw)

  raw.to_sym
end
parse_lastmod(root) click to toggle source

@api private @private

# File lib/sitemaps/parser.rb, line 49
# Read the `lastmod` text of `root` and parse it with `Time.parse`.
#
# @api private
# @param root node handed to `get_text`.
# @return [Time, nil] the parsed time; nil when the text is absent or
#   `Time.parse` raises.
def self.parse_lastmod(root)
  raw = get_text(root, 'lastmod')
  return raw unless raw # nothing to parse; hand the falsy value straight back

  begin
    Time.parse(raw)
  rescue StandardError
    # Best effort: an unparseable timestamp is treated as "no lastmod".
    nil
  end
end
parse_loc(root) click to toggle source

@api private @private

# File lib/sitemaps/parser.rb, line 42
# Read the `loc` text of `root` and parse it with `URI.parse`.
#
# @api private
# @param root node handed to `get_text`.
# @return [URI, nil] the parsed URI; nil when the text is absent or
#   `URI.parse` raises.
def self.parse_loc(root)
  raw = get_text(root, 'loc')
  return raw unless raw # nothing to parse; hand the falsy value straight back

  begin
    URI.parse(raw)
  rescue StandardError
    # Best effort: an unparseable location is treated as "no loc".
    nil
  end
end
parse_priority(root) click to toggle source

@api private @private

# File lib/sitemaps/parser.rb, line 63
# Read the `priority` text of `root` and convert it with `Float()`.
#
# @api private
# @param root node handed to `get_text`.
# @return [Float] the numeric priority; 0.5 when the text is absent or the
#   conversion raises.
def self.parse_priority(root)
  raw = get_text(root, 'priority')
  raw ||= '0.5'

  begin
    Float(raw)
  rescue StandardError
    0.5 # default priority according to spec
  end
end