module TrueURL::Fetch

Public Instance Methods

execute(context)
# File lib/true_url/fetch.rb, line 8
# Fetches the context's working URL (following redirects) and records the
# best available canonical URL back on the context.
#
# Candidates are tried in order, stopping at the first truthy one:
# 1. a canonical target found in the response's headers,
# 2. a canonical URL found in the response body,
# 3. the response's own URI.
#
# @param context an object responding to +working_url+ and
#   +set_working_url(new_url, previous_url)+
# @return whatever +context.set_working_url+ returns
def execute(context)
  origin = context.working_url
  reply = HTTP.follow.get(origin)

  # Later lookups are skipped as soon as an earlier one yields a value.
  resolved = find_canonical_header(reply.headers)
  resolved ||= find_canonical_url(reply.to_s)
  resolved ||= reply.uri

  context.set_working_url(resolved, origin)
end

find_canonical_header(headers)
# File lib/true_url/fetch.rb, line 18
# Extracts the target of the first canonical link found in the +Link+
# response header(s).
#
# Each header value may carry several comma-separated link-values of the
# form +<target>; param=value; ...+. Every link-value is inspected on its
# own, so the canonical link is found regardless of its position in the
# header, the position of +rel+ among its parameters, whether the +rel+
# value is quoted, and its letter case. A +rel+ listing several
# space-separated relation types matches if one of them is +canonical+.
#
# @param headers an object whose +['Link']+ returns +nil+, a String, or an
#   Array of Strings
# @return [String, nil] the canonical link target, or +nil+ if there is none
def find_canonical_header(headers)
  Array(headers['Link']).each do |header|
    # Pair every <target> with the parameter text that follows it, up to the
    # start of the next link-value.
    header.scan(/<([^>]*)>([^<]*)/) do |target, params|
      rel = params.match(/;\s*rel\s*=\s*"?([^";,]*)"?/i)
      return target if rel && rel[1].downcase.split.include?('canonical')
    end
  end
  nil
end

find_canonical_url(html)
# File lib/true_url/fetch.rb, line 26
# Looks for a canonical URL declared inside an HTML document.
#
# Two sources are consulted, in order of preference:
# 1. the +href+ of the first +<link rel="canonical">+ element,
# 2. the +content+ of the first +<meta property="og:url">+ element.
#
# A missing, empty or whitespace-only attribute value is treated as absent,
# so the next source is tried instead of returning a blank URL. Surrounding
# whitespace is stripped from the returned value.
#
# @param html [String] the HTML document source
# @return [String, nil] the declared URL, or +nil+ if neither source
#   provides a non-blank one
def find_canonical_url(html)
  doc = Nokogiri::HTML(html)

  sources = [['link[rel="canonical"]', 'href'], ['meta[property="og:url"]', 'content']]
  sources.each do |selector, attribute|
    elem = doc.at(selector)
    next if elem.nil?

    # to_s covers an element that lacks the attribute altogether.
    url = elem[attribute].to_s.strip
    return url unless url.empty?
  end
  nil
end