module Sitemaps::Parser
Parse XML Sitemaps
Constants
- VALID_CHANGEFREQ
Public Class Methods
get_text(root, key)
click to toggle source
# File lib/sitemaps/parser.rb, line 68
# Looks up +key+ under +root+ via +root.get_text+ and returns the stripped
# value of the node that call yields.
#
# @param root [#get_text] object that is asked for the text node at +key+
# @param key [String] lookup key handed to +root.get_text+ unchanged
# @return [String, nil] the stripped value, or nil when either the node or
#   its value is nil
def self.get_text(root, key)
  node = root.get_text(key)
  return nil if node.nil?

  content = node.value
  content.nil? ? nil : content.strip
end
parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
click to toggle source
Given a source string, returns a sitemap containing all valid url entries, or all valid sub-sitemaps. See `sitemaps.org` for information on the spec.
@param source [String] an XML string to parse.
@param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
@param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
@param filter_indexes [Boolean, nil] if truthy, filter is called per submap as well as entries.
@return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid, a sitemap will still be returned, but the entries and sitemaps keys will be empty.
# File lib/sitemaps/parser.rb, line 15
# Parses an XML sitemap string into a Sitemaps::Sitemap holding the url
# entries found under /urlset/url and the sub-sitemaps found under
# /sitemapindex/sitemap.
#
# @param source [String] an XML string to parse.
# @param max_entries [Integer, nil] upper bound on the number of url entries
#   kept; it is not applied to sub-sitemaps.
# @param filter [#call, nil] when given, called with each entry; the entry is
#   kept only if the call returns a truthy value.
# @param filter_indexes [Boolean, nil] when truthy, +filter+ is also applied
#   to each sub-sitemap.
# @return [Sitemaps::Sitemap] the parsed sitemap. When REXML rejects the
#   source as malformed XML, a sitemap with empty entries and sitemaps is
#   returned instead of raising.
def self.parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
  begin
    document = REXML::Document.new(source)
  rescue REXML::ParseException
    # Honour the documented contract: invalid XML yields an empty sitemap.
    return Sitemaps::Sitemap.new([], [])
  end

  entries = document.elements.to_a('/urlset/url').map do |root|
    # An entry without a usable loc is dropped (next yields nil to map).
    loc = parse_loc(root) || next
    mod = parse_lastmod(root)
    freq = parse_changefreq(root)
    pri = parse_priority(root)

    entry = Sitemaps::Entry.new(loc, mod, freq, pri)
    entry if !filter || filter.call(entry)
  end.compact

  # De-duplicate by location before applying the cap, so the cap counts
  # distinct entries.
  entries = entries.uniq(&:loc)
  entries = entries.take(max_entries) unless max_entries.nil?

  sitemaps = document.elements.to_a('/sitemapindex/sitemap').map do |root|
    loc = parse_loc(root) || next
    mod = parse_lastmod(root)

    submap = Sitemaps::Submap.new(loc, mod)
    # Sub-sitemaps are only filtered when the caller opted in.
    submap if !filter || !filter_indexes || filter.call(submap)
  end.compact

  Sitemaps::Sitemap.new(entries, sitemaps)
end
parse_changefreq(root)
click to toggle source
@api private @private
# File lib/sitemaps/parser.rb, line 56
# Reads the 'changefreq' text of +root+ and converts it to a symbol when it
# is one of the values listed in VALID_CHANGEFREQ.
#
# @api private
# @param root element handed to get_text
# @return [Symbol, nil] the frequency as a symbol; nil when the text is
#   absent or not listed in VALID_CHANGEFREQ
def self.parse_changefreq(root)
  value = get_text(root, 'changefreq')
  return nil unless value
  return nil unless VALID_CHANGEFREQ.include?(value)

  value.to_sym
end
parse_lastmod(root)
click to toggle source
@api private @private
# File lib/sitemaps/parser.rb, line 49
# Reads the 'lastmod' text of +root+ and parses it with Time.parse.
#
# @api private
# @param root element handed to get_text
# @return [Time, nil] the parsed time; nil when the text is absent or
#   Time.parse raises on it
def self.parse_lastmod(root)
  mod = get_text(root, 'lastmod')
  return nil unless mod

  begin
    Time.parse(mod)
  rescue StandardError
    # Deliberate best-effort: an unparsable lastmod leaves the field unset
    # rather than failing the whole sitemap.
    nil
  end
end
parse_loc(root)
click to toggle source
@api private @private
# File lib/sitemaps/parser.rb, line 42
# Reads the 'loc' text of +root+ and parses it with URI.parse.
#
# @api private
# @param root element handed to get_text
# @return [URI, nil] the parsed location; nil when the text is absent,
#   blank, or URI.parse raises on it
def self.parse_loc(root)
  loc = get_text(root, 'loc')
  # URI.parse('') succeeds and yields an empty URI, so a blank loc has to be
  # rejected explicitly to count as "no location".
  return nil if loc.nil? || loc.empty?

  begin
    URI.parse(loc)
  rescue StandardError
    # Deliberate best-effort: an invalid loc is reported as missing.
    nil
  end
end
parse_priority(root)
click to toggle source
@api private @private
# File lib/sitemaps/parser.rb, line 63
# Reads the 'priority' text of +root+ and converts it with Float().
#
# @api private
# @param root element handed to get_text
# @return [Float] the parsed priority; 0.5 (the default priority according
#   to the spec) when the text is absent or Float() rejects it. The value is
#   not range-checked here.
def self.parse_priority(root)
  priority = get_text(root, 'priority') || '0.5'
  Float(priority)
rescue ArgumentError, TypeError
  # Float() raises these for text it cannot convert; fall back to the default.
  0.5
end