class SitemapParser

Attributes

domain[R]
list_nested_sitemap[R]
robots_sitemap_path[R]

Public Class Methods

new(domain) click to toggle source
# File lib/sitemaps_parsers.rb, line 11
# Sets up a parser for a single domain.
#
# The raw domain is kept as-is and a plain-HTTP base URI is derived from it
# by prefixing "http://".
#
# domain - value interpolated after the "http://" prefix.
#
# Raises whatever URI.parse raises when the resulting address is malformed.
def initialize(domain)
  base_address = "http://#{domain}"
  @domain = domain
  @url = URI.parse(base_address)
end

Public Instance Methods

sitemap_path() click to toggle source
# File lib/sitemaps_parsers.rb, line 22
# Reports where the sitemap(s) were found, as a String.
#
# robots_sitemap is always invoked first, then the default locations are
# probed. A hit on a default location wins and is returned via #to_s;
# otherwise the entries exposed by robots_sitemap_path are joined with ",".
#
# Returns a String, or nil when no default sitemap exists and
# robots_sitemap_path is nil.
def sitemap_path
  robots_sitemap

  default_location = check_default_sitemap
  return default_location.to_s unless default_location.nil?

  declared_paths = robots_sitemap_path
  declared_paths.nil? ? nil : declared_paths.join(',')
end
urls() click to toggle source
# File lib/sitemaps_parsers.rb, line 16
# Collects every page URL advertised by the domain's sitemap(s).
#
# A sitemap at one of the default locations takes precedence; otherwise the
# sitemaps discovered through robots_sitemap are parsed.
#
# The default-location probe is performed exactly once and its result reused:
# probing performs network requests, and a second probe could answer
# differently from the first and hand nil to parse_sitemap.
#
# Returns whatever parse_sitemap returns for a default sitemap, or the Array
# produced by parse_nested_sitemaps (empty when robots_sitemap yields nil).
def urls
  default_sitemap = check_default_sitemap
  return parse_sitemap(default_sitemap) if default_sitemap

  # robots_sitemap returns nil on failure; treat that as "no sitemaps".
  parse_nested_sitemaps(robots_sitemap || [])
end

Private Instance Methods

check_default_sitemap() click to toggle source
# File lib/sitemaps_parsers.rb, line 29
# Probes the conventional sitemap locations and returns the first that
# answers with HTTP status 200.
#
# Candidates are tried in this order, each joined with "sitemap.xml":
# @url (plain HTTP), "https://<domain>", "http://www.<domain>" and
# "https://www.<domain>". Only an exact "200" counts as a hit; any other
# status (redirects included) is treated as a miss.
#
# Each candidate is guarded individually, so a failure on one (refused
# connection, TLS error, malformed address) does not stop the remaining
# candidates from being tried.
#
# Returns the URI of the first sitemap found, or nil when none is reachable.
def check_default_sitemap
  bases = [
    @url,
    "https://#{@domain}",
    "http://www.#{@domain}",
    "https://www.#{@domain}"
  ]

  bases.each do |base|
    begin
      candidate = URI.join(base, 'sitemap.xml')
      return candidate if Net::HTTP.get_response(candidate).code == '200'
    rescue StandardError
      # This candidate is unusable; move on to the next one.
      next
    end
  end

  nil
end
filter_sitemap_urls(sitemap_data) click to toggle source
# File lib/sitemaps_parsers.rb, line 79
# Extracts the page locations from a parsed sitemap.
#
# sitemap_data - object responding to #search; each element found for 'url'
#                must respond to #at, and the 'loc' element to #content.
#
# Entries that have no 'loc' child are skipped instead of raising.
#
# Returns an Array of Strings with surrounding whitespace removed.
def filter_sitemap_urls(sitemap_data)
  sitemap_data.search('url').map do |entry|
    location = entry.at('loc')
    location.content.strip if location
  end.compact
end
load_sitemap(url = nil) click to toggle source
# File lib/sitemaps_parsers.rb, line 67
# Opens a remote sitemap and returns a readable stream over its content.
#
# url - String or URI of the sitemap.
#
# The location is opened through the URI object's own #open (provided by
# open-uri) rather than Kernel#open. Sitemap locations come from remote
# robots.txt / XML content, and Kernel#open can treat a string as a local
# path or, on some Ruby versions, as a "|command" to execute.
#
# The stream is first tried as gzip. When that fails, the raw stream is
# rewound before being returned, because the gzip probe may already have
# consumed part of it.
#
# Returns a Zlib::GzipReader for gzip content, the raw IO otherwise, or nil
# when the location cannot be opened.
def load_sitemap(url = nil)
  sitemap_io = URI(url).open

  begin
    Zlib::GzipReader.new(sitemap_io)
  rescue StandardError
    # Not gzip: hand back the raw stream from its beginning.
    sitemap_io.rewind if sitemap_io.respond_to?(:rewind)
    sitemap_io
  end
rescue StandardError
  nil
end
nested_sitemaps(sitemap_list = []) click to toggle source
# File lib/sitemaps_parsers.rb, line 52
# Resolves sitemap index files into the child sitemap locations they list.
#
# sitemap_list - Array of sitemap locations (Strings or URIs); nil is treated
#                as an empty list.
#
# Each location is fetched through the URI object's #open (open-uri) instead
# of Kernel#open, since the locations originate from remote content and
# Kernel#open may interpret a string as a local path or a "|command". The
# block form is used so every handle is closed once it has been parsed.
#
# Returns a flat Array with the text of every //sitemapindex/sitemap/loc
# element. Locations that are not index files contribute nothing.
# Fetch and parse errors propagate to the caller.
def nested_sitemaps(sitemap_list = [])
  Array(sitemap_list).flat_map do |path|
    URI(path).open do |path_io|
      Nokogiri::HTML(path_io).xpath('//sitemapindex/sitemap/loc').map(&:text)
    end
  end
end
parse_nested_sitemaps(nested_sitemaps = []) click to toggle source
# File lib/sitemaps_parsers.rb, line 59
# Downloads each sitemap in the list and gathers the page URLs it contains.
#
# nested_sitemaps - Array of sitemap locations; nil is treated as an empty
#                   list.
#
# Locations that load_sitemap cannot open (it returns nil for those) are
# skipped before parsing, so nil never reaches the XML parser. Each stream is
# closed once its document has been parsed.
#
# Returns a flat Array of the URLs extracted by filter_sitemap_urls.
def parse_nested_sitemaps(nested_sitemaps = [])
  Array(nested_sitemaps).flat_map do |sitemap|
    sitemap_io = load_sitemap(sitemap)
    next [] unless sitemap_io

    begin
      filter_sitemap_urls(Nokogiri::XML(sitemap_io))
    ensure
      sitemap_io.close if sitemap_io.respond_to?(:close)
    end
  end
end
parse_sitemap(url) click to toggle source
# File lib/sitemaps_parsers.rb, line 83
# Fetches the sitemap at +url+ and returns the page URLs it advertises.
#
# url - String or URI of a sitemap (a <urlset>) or a sitemap index
#       (a <sitemapindex>).
#
# For an index, every child <sitemap> location is fetched and its URLs are
# collected; children without a <loc> element are skipped.
#
# All documents are fetched through the URI object's #open (open-uri) rather
# than Kernel#open: child locations are taken from remote XML, and
# Kernel#open may interpret a string as a local path or a "|command". The
# block form closes each handle after parsing.
#
# Returns an Array of URL Strings, or nil when the document contains neither
# a <urlset> nor a <sitemapindex>. Fetch errors propagate to the caller.
def parse_sitemap(url)
  sitemap_data = URI(url).open { |io| Nokogiri::XML(io) }

  urlset = sitemap_data.at('urlset')
  return filter_sitemap_urls(urlset) if urlset

  sitemap_index = sitemap_data.at('sitemapindex')
  return unless sitemap_index

  sitemap_index.search('sitemap').flat_map do |sitemap|
    location = sitemap.at('loc')
    next [] unless location

    URI(location.content.strip).open { |io| filter_sitemap_urls(Nokogiri::XML(io)) }
  end
end
robots_sitemap() click to toggle source
# File lib/sitemaps_parsers.rb, line 43
# Discovers sitemaps declared in the domain's robots.txt.
#
# The "Sitemap:" entries (matched case-insensitively) are stored, stripped
# and de-duplicated, in @robots_sitemap_path; the child sitemaps those
# entries resolve to via nested_sitemaps are stored in @list_nested_sitemap.
# Both are memoized, so robots.txt is fetched only until a fetch succeeds.
#
# A robots.txt without any "Sitemap:" entry yields an empty list. The
# non-destructive #flatten is used on purpose: #flatten! returns nil when it
# changes nothing, which would break the chain for an empty match list.
#
# robots.txt is read through the URI object's #open (open-uri) in block form
# so the handle is closed after reading.
#
# Returns the Array stored in @list_nested_sitemap, or nil if fetching or
# resolving fails.
def robots_sitemap
  @robots_sitemap_path ||= URI.join(@url, 'robots.txt')
                              .open(&:read)
                              .scan(/\s*sitemap:\s*([^\r\n]+)\s*$/i)
                              .flatten
                              .map(&:strip)
                              .uniq

  @list_nested_sitemap ||= nested_sitemaps(@robots_sitemap_path)
rescue StandardError
  nil
end