class Kudzu::Agent

Public Class Methods

new(config, &block) click to toggle source
# File lib/kudzu/agent.rb, line 5
# Builds an agent and wires up its collaborators from a single config object.
#
# The Robots instance is created first because both the fetcher and the URL
# filterer receive it as their second argument.
#
# config - configuration object handed to every collaborator.
# block  - accepted for interface compatibility; not used in this method.
def initialize(config, &block)
  @config = config

  # Shared by the fetcher and the URL filterer below.
  @robots = Robots.new(config)

  @fetcher       = Fetcher.new(config, @robots)
  @url_extractor = UrlExtractor.new(config)
  @url_filterer  = UrlFilterer.new(config, @robots)
  @page_filterer = PageFilterer.new(config)
end

Public Instance Methods

extract_refs(response) click to toggle source
# File lib/kudzu/agent.rb, line 32
# Extracts the references found in a response and runs them through the
# URL filterer.
#
# response - object passed to the URL extractor; must also respond to +url+,
#            which is handed to the filterer as its second argument.
#
# Returns whatever the URL filterer's +filter+ returns.
def extract_refs(response)
  @url_filterer.filter(@url_extractor.extract(response), response.url)
end
fetch(url, request_header = {}) click to toggle source
# File lib/kudzu/agent.rb, line 20
# Fetches a URL and, when the fetch succeeded, annotates the response with
# derived metadata.
#
# url            - URL passed through to the fetcher.
# request_header - request headers forwarded to the fetcher as the
#                  +request_header:+ keyword (default: empty hash).
#
# Returns the response object produced by the fetcher. If +fetched?+ is
# false the response is returned untouched; otherwise +size+, +digest+,
# +mime_type+, +charset+ (text responses only) and +title+ are set on it.
def fetch(url, request_header = {})
  result = @fetcher.fetch(url, request_header: request_header)

  if result.fetched?
    result.size = result.body.size
    result.digest = Digest::MD5.hexdigest(result.body)
    # The MIME type is assigned before the text? check that gates charset detection.
    result.mime_type = Util::MimeTypeDetector.detect(result)
    result.charset = Util::CharsetDetector.detect(result) if result.text?
    result.title = Util::TitleParser.parse(result)
  end

  result
end
filter_response?(response) click to toggle source
# File lib/kudzu/agent.rb, line 37
# Tells whether a response should be filtered out (dropped).
#
# A response is filtered when either:
# * it was redirected (+redirect_from+ is set) and the URL filterer does not
#   allow the final URL, checked with +redirect_from+ as the second argument;
# * the page filterer does not allow the response.
#
# response - object responding to +url+ and +redirect_from+.
#
# Returns true if the response should be dropped, false if it should be kept.
def filter_response?(response)
  # A redirect that lands on a disallowed URL must be dropped, so this guard
  # answers true, consistent with the "true means filtered" result below.
  return true if response.redirect_from && !@url_filterer.allowed?(response.url, response.redirect_from)

  !@page_filterer.allowed?(response)
end
start() { || ... } click to toggle source
# File lib/kudzu/agent.rb, line 15
 # Runs the given block and then closes the fetcher's pool.
 #
 # The close happens in an +ensure+ clause so the pool is released even when
 # the block raises or exits non-locally.
 #
 # Yields with no arguments.
 #
 # Returns the value of the block. Any exception raised by the block
 # propagates to the caller after the pool has been closed.
 def start
   yield
 ensure
   @fetcher.pool.close
 end