class Kudzu::Agent::UrlExtractor::ForHTML
Public Class Methods
new(config)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 49
# Creates an HTML URL extractor bound to a crawler configuration.
#
# @param config [Object] configuration object; the other methods of this
#   class call +respect_nofollow+ and +find_filter+ on it
def initialize(config)
  @config = config
end
Public Instance Methods
extract(response)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 53
# Collects the references found in an HTML response.
#
# Returns an empty array when the configuration respects nofollow and the
# page-level check (+nofollow?+) is true. Otherwise, when the configuration
# has a filter for the response URL, extraction is narrowed to the filter's
# +allow_element+ matches and its +deny_element+ matches are removed first.
#
# @param response [#parsed_doc, #url] the fetched response
# @return [Array<Reference>] references from +from_html+ and +from_meta+
#   whose URL is neither nil nor empty
def extract(response)
  doc = response.parsed_doc
  return [] if @config.respect_nofollow && nofollow?(doc)

  if (filter = @config.find_filter(response.url))
    allowed = filter.allow_element
    denied = filter.deny_element

    # Copy the document BEFORE narrowing it. Once +search+ has produced a
    # node set, +dup+ on that set only copies the set (assuming a Nokogiri
    # document, where NodeSet#dup is shallow), so removing denied elements
    # would unlink them from the response's own parsed document.
    doc = doc.dup if denied
    doc = doc.search(*Array(allowed)) if allowed
    doc.search(*Array(denied)).remove if denied
  end

  refs = from_html(doc) + from_meta(doc)
  refs.reject { |ref| ref.url.nil? || ref.url.empty? }
end
Private Instance Methods
from_html(doc)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 78
# Builds a reference for every element under +doc+ that carries an +href+
# or +src+ attribute.
#
# When the configuration respects nofollow, elements whose +rel+ attribute
# contains "nofollow" (case-insensitive) are skipped.
#
# @param doc [#xpath] document or node set to scan
# @return [Array<Reference>] one reference per remaining element; the URL is
#   the +href+ (falling back to +src+) and the title comes from
#   +node_to_title+
def from_html(doc)
  linked = doc.xpath('.//*[@href or @src]').to_a
  linked = linked.reject { |node| node[:rel] =~ /nofollow/i } if @config.respect_nofollow

  linked.map do |node|
    target = node[:href] || node[:src]
    Reference.new(url: target.to_s, title: node_to_title(node))
  end
end
from_meta(doc)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 99
# Builds references from <meta http-equiv="refresh"> tags under +doc+.
#
# Each matching tag's +content+ attribute is handed to
# Util::ContentTypeParser.parse and the +:url+ entry of the last element of
# the result is used as the target; tags that yield no URL are dropped.
# NOTE(review): presumably +parse+ returns a pair whose last element is a
# hash of parameters -- confirm against Util::ContentTypeParser.
#
# @param doc [#xpath] document or node set to scan
# @return [Array<Reference>] one reference (URL only) per refresh target
def from_meta(doc)
  refresh_tags = doc.xpath('.//meta[@http-equiv]').select do |meta|
    meta[:'http-equiv'] =~ /^refresh$/i
  end

  targets = refresh_tags.map do |meta|
    Util::ContentTypeParser.parse(meta[:content]).last[:url]
  end

  targets.compact.map { |target| Reference.new(url: target.to_s) }
end
node_to_title(node)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 91
# Picks a human-readable title for a link-bearing node.
#
# The node's inner text wins when it is non-empty; otherwise the +title+
# attribute is used, then the +alt+ attribute, then an empty string.
#
# @param node [#inner_text, #[]] element to describe
# @return [String] the chosen title (possibly empty)
def node_to_title(node)
  # Read the text once: it is needed for both the check and the result.
  text = node.inner_text
  return text unless text.empty?

  (node[:title] || node[:alt]).to_s
end
nofollow?(doc)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 73
# Tells whether the page as a whole asks crawlers not to follow its links.
#
# Looks at every <meta name="robots"> tag (name compared
# case-insensitively) and inspects the comma-separated directives in its
# +content+ attribute. A directive containing "nofollow" counts, and so does
# the directive "none", which is shorthand for "noindex, nofollow".
#
# @param doc [#xpath] parsed document
# @return [Boolean] true when any robots meta tag forbids following links
def nofollow?(doc)
  doc.xpath('//meta[@name]').any? do |node|
    next false unless node[:name] =~ /^robots$/i

    # to_s: a robots tag without a content attribute carries no directives.
    directives = node[:content].to_s.downcase.split(',').map(&:strip)
    directives.any? { |directive| directive.include?('nofollow') || directive == 'none' }
  end
end