class Kudzu::Agent::UrlExtractor::ForHTML

Public Class Methods

new(config) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 49
# Stores the configuration object used by the extraction methods of this
# class.
#
# config - object kept in @config; elsewhere in this class it is asked for
#          +respect_nofollow+ and +find_filter(url)+.
def initialize(config)
  @config = config
end

Public Instance Methods

extract(response) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 53
# Collects the outgoing references found in a fetched HTML response.
#
# response - object responding to +parsed_doc+ (the parsed document,
#            presumably a Nokogiri document - confirm with caller) and +url+.
#
# Steps:
# 1. Returns an empty array right away when the config respects nofollow and
#    nofollow?(document) reports true.
# 2. If the config yields a filter for the response URL, narrows the document
#    to the filter's allow_element selectors and/or strips its deny_element
#    selectors (working on a +dup+ before removing anything).
# 3. Gathers references from from_html and from_meta and drops those whose
#    url is nil or empty.
#
# Returns an Array of reference objects.
def extract(response)
  document = response.parsed_doc
  return [] if @config.respect_nofollow && nofollow?(document)

  rule = @config.find_filter(response.url)
  if rule
    allowed = rule.allow_element
    document = document.search(*Array(allowed)) if allowed

    denied = rule.deny_element
    if denied
      # Remove denied elements from a copy, not from the object we were given.
      document = document.dup
      document.search(*Array(denied)).remove
    end
  end

  (from_html(document) + from_meta(document)).reject do |ref|
    link = ref.url
    link.nil? || link.empty?
  end
end

Private Instance Methods

from_html(doc) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 78
# Builds one Reference per element that carries an +href+ or +src+ attribute.
#
# doc - object responding to +xpath+; the query is relative ('.//'), so it
#       searches beneath whatever node(s) +doc+ represents.
#
# When the config respects nofollow, elements whose +rel+ attribute contains
# "nofollow" (case-insensitive) are skipped. Each Reference gets the +href+
# value (falling back to +src+) as url and node_to_title(element) as title.
#
# Returns an Array of Reference objects.
def from_html(doc)
  linked = doc.xpath('.//*[@href or @src]').to_a
  linked = linked.reject { |element| element[:rel] =~ /nofollow/i } if @config.respect_nofollow

  linked.map do |element|
    target = element[:href] || element[:src]
    Reference.new(url: target.to_s, title: node_to_title(element))
  end
end
from_meta(doc) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 99
# Builds References from <meta http-equiv="refresh"> redirect targets.
#
# doc - object responding to +xpath+ (relative query, see './/').
#
# Only meta elements whose +http-equiv+ matches "refresh" (case-insensitive)
# are considered. Their +content+ attribute is handed to
# Util::ContentTypeParser.parse, and the +:url+ entry of the last element of
# the parse result is taken as the target; metas without such an entry are
# skipped.
#
# Returns an Array of Reference objects (url only).
def from_meta(doc)
  refresh_metas = doc.xpath('.//meta[@http-equiv]').select do |meta|
    meta[:'http-equiv'] =~ /^refresh$/i
  end

  targets = refresh_metas.map do |meta|
    Util::ContentTypeParser.parse(meta[:content]).last[:url]
  end

  targets.compact.map { |target| Reference.new(url: target.to_s) }
end
node_to_title(node) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 91
# Picks a human-readable title for a link-like element.
#
# node - object responding to +inner_text+ and +[]+ (attribute lookup).
#
# Returns the element's inner text when it is non-empty; otherwise the
# +title+ attribute, then the +alt+ attribute, and finally an empty String
# when neither attribute is present.
def node_to_title(node)
  text = node.inner_text
  return text unless text.empty?

  (node[:title] || node[:alt]).to_s
end
nofollow?(doc) click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 73
# Reports whether the page as a whole asks crawlers not to follow its links.
#
# doc - object responding to +xpath+; the query is absolute ('//'), so it
#       looks at every <meta name=...> element.
#
# A page counts as nofollow when a meta element named "robots"
# (case-insensitive) has a +content+ value that either contains "nofollow"
# or carries the standalone directive "none", which is the documented
# shorthand for "noindex, nofollow". A missing +content+ attribute never
# matches.
#
# Returns true or false.
def nofollow?(doc)
  doc.xpath('//meta[@name]').any? do |meta|
    next false unless meta[:name] =~ /^robots$/i

    # \bnone\b keeps words that merely contain "none" from matching.
    meta[:content] =~ /nofollow|\bnone\b/i
  end
end