class Kudzu::Agent::PageFilterer

Public Class Methods

new(config) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 4
# Builds a page filterer around the given configuration.
#
# config - object stored in @config; the other methods in this class call
#          find_filter(url) and respect_noindex on it.
def initialize(config)
  @config = config
end

Public Instance Methods

allowed?(response) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 8
# Decides whether a fetched page passes the filter configured for its URL.
#
# A page passes when the configuration has no filter for the URL, or when the
# mime type, size and noindex checks all succeed. The outcome is logged at
# :info level either way.
#
# response - object responding to url, mime_type and size (and whatever
#            allowed_index? needs from it).
#
# Returns true when the page passed, false when it was dropped.
def allowed?(response)
  filter = @config.find_filter(response.url)

  # The checks short-circuit in this order: mime type, size, then index.
  passed = filter.nil? ||
           (allowed_mime_type?(response.mime_type, filter) &&
            allowed_size?(response.size, filter) &&
            allowed_index?(response))

  unless passed
    Kudzu.log :info, "dropped page: #{response.url}"
    return false
  end

  Kudzu.log :info, "passed page: #{response.url}"
  true
end
allowed_response_header?(url, response_header) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 22
# Checks response headers against the filter configured for the URL, using
# only the 'content-type' and 'content-length' entries.
#
# url             - URL used to look up the filter.
# response_header - object indexed by lower-case header name.
#
# A header that is absent leaves the corresponding value nil, which the mime
# type and size checks treat as allowed.
#
# Returns true when no filter applies; otherwise the result of the mime type
# and size checks.
def allowed_response_header?(url, response_header)
  filter = @config.find_filter(url)

  content_type = response_header['content-type']
  mime_type = Util::ContentTypeParser.parse(content_type).first if content_type

  content_length = response_header['content-length']
  size = content_length.to_i if content_length

  return true if filter.nil?

  allowed_mime_type?(mime_type, filter) && allowed_size?(size, filter)
end

Private Instance Methods

allowed_index?(response) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 48
# Reports whether the page may be indexed according to its robots meta tags.
#
# Always true when the response has no body, is not HTML, or when the
# configuration does not ask to respect noindex. Otherwise the parsed
# document's html/head/meta[@name] elements are scanned, and the page is
# rejected if any element named "robots" (case-insensitive) has a content
# attribute containing "noindex" (case-insensitive).
#
# response - object responding to body, html? and parsed_doc.
#
# Returns true when indexing is allowed, false otherwise.
def allowed_index?(response)
  return true if response.body.nil? || !response.html?
  return true unless @config.respect_noindex

  response.parsed_doc.xpath('html/head/meta[@name]').none? do |meta|
    # \A and \z anchor the whole attribute value; ^ and $ would also match
    # "robots" on one line of a multi-line name attribute.
    meta[:name] =~ /\Arobots\z/i && meta[:content] =~ /noindex/i
  end
end
allowed_mime_type?(mime_type, filter) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 38
# Checks a mime type against the filter's allow and deny lists.
#
# mime_type - mime type value, or nil when unknown.
# filter    - object responding to allow_mime_type and deny_mime_type.
#
# Returns true when mime_type is nil; otherwise whatever
# Util::Matcher.match? returns for the filter's allow/deny lists.
def allowed_mime_type?(mime_type, filter)
  mime_type.nil? ||
    Util::Matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
end
allowed_size?(size, filter) click to toggle source
# File lib/kudzu/agent/page_filterer.rb, line 43
# Checks a page size against the filter's max_size.
#
# size   - page size, or nil when unknown; converted with to_i.
# filter - object responding to max_size; the limit is converted with to_i.
#
# Returns true when either the limit or the size is nil; otherwise true only
# when the size is strictly below the limit.
def allowed_size?(size, filter)
  limit = filter.max_size

  if limit.nil? || size.nil?
    true
  else
    size.to_i < limit.to_i
  end
end