class HTMLProofer::UrlValidator

Attributes

before_request[W]
external_urls[R]

Public Class Methods

new(logger, cache, external_urls, options) click to toggle source
# File lib/html-proofer/url_validator.rb, line 15
# Sets up the validator's state.
#
# logger        - kept in @logger.
# cache         - kept in @cache.
# external_urls - kept in @external_urls.
# options       - kept in @options; options[:hydra] is handed to
#                 Typhoeus::Hydra.new to build the request queue.
def initialize(logger, cache, external_urls, options)
  # Collaborators and configuration supplied by the caller.
  @options = options
  @logger = logger
  @cache = cache
  @external_urls = external_urls

  # Request queue, built from the :hydra entry of the options.
  @hydra = Typhoeus::Hydra.new(@options[:hydra])

  # Collections that start empty and are filled in later.
  @failed_tests = []
  @before_request = []
end

Public Instance Methods

add_external_issue(filenames, desc, status = nil) click to toggle source
# File lib/html-proofer/url_validator.rb, line 221
# Appends Issue objects describing a failed external link to @failed_tests.
#
# filenames - list of file names the link was found in, or nil.
# desc      - description passed to each Issue.
# status    - passed to each Issue as the status: keyword (default nil).
#
# With nil filenames a single Issue with an empty path is appended and
# @failed_tests is returned; otherwise one Issue per file name is appended
# and filenames is returned.
def add_external_issue(filenames, desc, status = nil)
  # possible if we're checking an array of links
  return @failed_tests.push(Issue.new('', desc, status: status)) if filenames.nil?

  filenames.each do |filename|
    @failed_tests.push(Issue.new(filename, desc, status: status))
  end
end
check_hash_in_2xx_response(href, effective_url, response, filenames) click to toggle source

Even though the response was a success, we may have been asked to check whether the hash on the URL exists on the page.

# File lib/html-proofer/url_validator.rb, line 177
# Checks whether the fragment ("hash") of +href+ exists in a successful
# response body, and records a failure when it does not.
#
# href          - the URL that was requested.
# effective_url - URL used in the failure message.
# response      - object responding to #body and #code.
# filenames     - file names the link was found in (may be nil).
#
# Returns true when the hash was looked for and NOT found (an issue is added
# and the failure is written to @cache). Returns false when the check is
# skipped (:only_4xx set, :check_external_hash unset, or no hash on the URL)
# or when a matching element was found.
def check_hash_in_2xx_response(href, effective_url, response, filenames)
  return false if @options[:only_4xx]
  return false unless @options[:check_external_hash]
  return false unless (hash = hash?(href))

  body_doc = create_nokogiri(response.body)

  # Look for the hash both as written and unescaped, on any element's
  # name or id attribute. Every clause uses `//*` so the whole document is
  # searched (a single `/` would only test the root element).
  # NOTE(review): the hash is interpolated into the XPath unescaped; a
  # fragment containing a double quote would produce a malformed expression.
  unencoded_hash = Addressable::URI.unescape(hash)
  xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
  # user-content is a special addition by GitHub.
  if URI.parse(href).host =~ /github\.com/i
    xpath << %(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
    # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
    # will be identified as a linkable portion; capture all of its digits ("L12")
    line_anchor = hash[/\AL\d+/]
    xpath << %(//td[@id="#{line_anchor}"]) if line_anchor
  end

  return false unless body_doc.xpath(xpath.join('|')).empty?

  msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
  add_external_issue(filenames, msg, response.code)
  @cache.add(href, filenames, response.code, msg)
  true
end
clean_url(href) click to toggle source
# File lib/html-proofer/url_validator.rb, line 120
# Returns +href+ untouched when it already consists only of URL-safe
# characters or %XX escapes; otherwise returns the normalized form of the
# parsed URL.
#
# href - the URL to check.
#
# Returns href itself, or the result of calling #normalize on
# Addressable::URI.parse(href). Errors raised by the parse are not rescued here.
def clean_url(href)
  # catch any obvious issues, like strings in port numbers
  parsed = Addressable::URI.parse(href)

  # Allowed literals: ! # $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z [ ] _ a-z ~
  # `\#` is escaped on purpose: an unescaped `#$&` inside a regexp literal is
  # interpolation of the last-match global, not the three characters.
  # \A and \z anchor the whole string rather than a single line.
  return href if /\A(?:[!\#$&-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+\z/.match?(href)

  parsed.normalize
end
establish_queue(external_urls) click to toggle source
# File lib/html-proofer/url_validator.rb, line 102
# Queues one request per entry of +external_urls+.
#
# external_urls - responds to #each_pair, yielding a URL and the file names
#                 it was found in.
#
# Each URL goes through clean_url; if that raises URI::Error or
# Addressable::URI::InvalidURIError an issue is recorded and the URL is
# skipped. A :get request is queued when the URL has a hash and
# :check_external_hash is set, otherwise a :head request.
def establish_queue(external_urls)
  external_urls.each_pair do |raw_url, filenames|
    begin
      cleaned = clean_url(raw_url)
    rescue URI::Error, Addressable::URI::InvalidURIError
      add_external_issue(filenames, "#{raw_url} is an invalid URL")
      next
    end

    # The body is only needed when the hash has to be looked up in it.
    needs_body = hash?(cleaned) && @options[:check_external_hash]
    queue_request(needs_body ? :get : :head, cleaned, filenames)
  end
end
extract_domain_path(uri) click to toggle source
# File lib/html-proofer/url_validator.rb, line 73
# Builds the "host + path" key for a parsed URI.
#
# uri - object responding to #host and #path.
#
# Returns the host immediately followed by the path. A nil host or path
# contributes an empty string instead of raising NoMethodError, so URLs
# without a host can still be keyed.
def extract_domain_path(uri)
  "#{uri.host}#{uri.path}"
end
handle_failure(href, filenames, response_code, return_message) click to toggle source
# File lib/html-proofer/url_validator.rb, line 210
# Records a request that failed outright.
#
# href           - the URL used in the message and as the cache key.
# filenames      - file names the link was found in.
# response_code  - code reported in the message and on the issue.
# return_message - message appended to the explanation.
#
# The failure is always written to @cache with a status of 0. An issue is
# added only when the :only_4xx option is not set.
def handle_failure(href, filenames, response_code, return_message)
  message = "External link #{href} failed: response code #{response_code} means something's wrong.
         It's possible libcurl couldn't connect to the server or perhaps the request timed out.
         Sometimes, making too many requests at once also breaks things.
         Either way, the return message (if any) from the server is: #{return_message}"
  @cache.add(href, filenames, 0, message)

  add_external_issue(filenames, message, response_code) unless @options[:only_4xx]
end
handle_timeout(href, filenames, response_code) click to toggle source
# File lib/html-proofer/url_validator.rb, line 202
# Records a request that timed out.
#
# href          - the URL used in the message and as the cache key.
# filenames     - file names the link was found in.
# response_code - code reported in the message and on the issue.
#
# The timeout is always written to @cache with a status of 0. An issue is
# added only when the :only_4xx option is not set.
def handle_timeout(href, filenames, response_code)
  message = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add(href, filenames, 0, message)

  add_external_issue(filenames, message, response_code) unless @options[:only_4xx]
end
hash?(url) click to toggle source

Does the URL have a hash?

# File lib/html-proofer/url_validator.rb, line 231
# Looks up the fragment ("hash") of a URL.
#
# url - the URL to inspect.
#
# Returns the fragment reported by URI.parse (nil when the URL has none),
# or false when URI.parse raises URI::InvalidURIError.
def hash?(url)
  begin
    parsed = URI.parse(url)
  rescue URI::InvalidURIError
    return false
  end

  parsed.fragment
end
new_url_query_values?(uri, paths_with_queries) click to toggle source

Remember the query keys we have already seen for each host and path, and ignore later URLs that repeat them.

# File lib/html-proofer/url_validator.rb, line 59
# Tells whether this URI's set of query keys is new for its host + path,
# remembering it if so.
#
# uri                - object responding to #query_values (with #keys) and
#                      accepted by extract_domain_path.
# paths_with_queries - Hash mapping a host + path string to the list of
#                      "-"-joined key signatures seen so far; mutated here.
#
# Returns true when the signature was not recorded yet (it is then added),
# false when it was already present.
def new_url_query_values?(uri, paths_with_queries)
  signature = uri.query_values.keys.join('-')
  key = extract_domain_path(uri)

  known = (paths_with_queries[key] ||= [])
  return false if known.include?(signature)

  known << signature
  true
end
queue_request(method, href, filenames) click to toggle source
# File lib/html-proofer/url_validator.rb, line 130
# Builds a Typhoeus request for +href+ and adds it to @hydra.
#
# method    - HTTP method, merged into the :typhoeus options as :method.
# href      - the URL to request.
# filenames - passed along to response_handler when the request completes.
#
# Every callback in @before_request is called with the request before it
# is queued. Returns whatever @hydra.queue returns.
def queue_request(method, href, filenames)
  request = Typhoeus::Request.new(href, @options[:typhoeus].merge(method: method))

  @before_request.each { |callback| callback.call(request) }

  request.on_complete do |response|
    response_handler(response, filenames)
  end
  @hydra.queue(request)
end
remove_query_values() click to toggle source
# File lib/html-proofer/url_validator.rb, line 39
# Returns a copy of @external_urls without URLs whose query keys repeat
# ones already seen for the same host + path.
#
# Returns nil when @external_urls is nil. URLs that fail to parse are
# logged at :error level and kept; URLs without a query are kept as well.
# @external_urls itself is not modified.
def remove_query_values
  return nil if @external_urls.nil?

  seen_queries = {}
  remaining = @external_urls.dup
  @external_urls.each_key do |url|
    begin
      parsed = Addressable::URI.parse(url)
    rescue URI::Error, Addressable::URI::InvalidURIError
      @logger.log :error, "#{url} is an invalid URL"
      next
    end
    next if parsed.nil? || parsed.query.nil?
    next if new_url_query_values?(parsed, seen_queries)

    # Same host + path with the same query keys was already kept.
    remaining.delete(url)
  end
  remaining
end
response_handler(response, filenames) click to toggle source
# File lib/html-proofer/url_validator.rb, line 140
# Handles a completed request (queue_request registers this as the
# on_complete callback).
#
# response  - object providing #options, #request, #code, #body,
#             #timed_out? and #return_message.
# filenames - file names the link was found in, or nil.
#
# Null bytes are stripped from the body in place and the status is logged
# at :debug level. Codes listed in :http_status_ignore are skipped. Then,
# in order:
#   2xx       - cache the success unless the hash check reports a failure
#   timed out - handle_timeout
#   code 0    - handle_failure (called with the effective URL)
#   HEAD      - retry the same URL with GET
#   otherwise - record and cache the failure, unless :only_4xx is set and
#               the code is outside 400-499
def response_handler(response, filenames)
  effective_url = response.options[:effective_url]
  href = response.request.base_url.to_s
  method = response.request.options[:method]
  status = response.code
  response.body.delete!("\x00")

  location = filenames.nil? ? '' : "  in #{filenames.join(' ')}"
  @logger.log :debug, "Received a #{status} for #{href}#{location}"

  return if @options[:http_status_ignore].include?(status)

  if status.between?(200, 299)
    hash_missing = check_hash_in_2xx_response(href, effective_url, response, filenames)
    @cache.add(href, filenames, status) unless hash_missing
  elsif response.timed_out?
    handle_timeout(href, filenames, status)
  elsif status.zero?
    handle_failure(effective_url, filenames, status, response.return_message)
  elsif method == :head
    queue_request(:get, href, filenames)
  elsif !@options[:only_4xx] || status.between?(400, 499)
    # Received a non-successful http response.
    failure = "External link #{href} failed: #{status} #{response.return_message}"
    add_external_issue(filenames, failure, status)
    @cache.add(href, filenames, status, failure)
  end
end
run() click to toggle source
# File lib/html-proofer/url_validator.rb, line 25
# Runs the external link check.
#
# @external_urls is first replaced by the result of remove_query_values.
# When the cache is in use, only the URLs returned by
# @cache.retrieve_urls(..., :external) are checked and the cache is written
# afterwards; otherwise every remaining URL is checked.
#
# Returns @failed_tests.
def run
  @external_urls = remove_query_values

  unless @cache.use_cache?
    external_link_checker(@external_urls)
    return @failed_tests
  end

  external_link_checker(@cache.retrieve_urls(@external_urls, :external))
  @cache.write

  @failed_tests
end