class Kudzu::Agent
Public Class Methods
new(config, &block)
click to toggle source
# File lib/kudzu/agent.rb, line 5
# Creates an agent and builds its collaborators from +config+.
#
# Robots is constructed first because both Fetcher and UrlFilterer are
# handed the same instance. The block parameter is accepted but is not
# used anywhere in this method.
#
# @param config [Object] configuration passed to every collaborator
def initialize(config, &block)
  @config = config
  @robots = Robots.new(config)
  @fetcher = Fetcher.new(config, @robots)
  @url_extractor = UrlExtractor.new(config)
  @url_filterer = UrlFilterer.new(config, @robots)
  @page_filterer = PageFilterer.new(config)
end
Public Instance Methods
extract_refs(response)
click to toggle source
# File lib/kudzu/agent.rb, line 32
# Extracts references from +response+ with the URL extractor and returns
# whatever the URL filterer yields for them, using +response.url+ as the
# second argument to the filter.
#
# @param response [Object] a response that responds to +url+
# @return [Object] the result of +@url_filterer.filter+
def extract_refs(response)
  @url_filterer.filter(@url_extractor.extract(response), response.url)
end
fetch(url, request_header = {})
click to toggle source
# File lib/kudzu/agent.rb, line 20
# Fetches +url+ through the fetcher and, when the response reports that it
# was fetched, annotates it with size, MD5 digest, MIME type, charset (text
# responses only) and title before returning it.
#
# A response that was not fetched is returned untouched.
#
# @param url [Object] the URL handed to the fetcher
# @param request_header [Hash] forwarded as the +request_header:+ keyword
# @return [Object] the response object produced by the fetcher
def fetch(url, request_header = {})
  response = @fetcher.fetch(url, request_header: request_header)
  return response unless response.fetched?

  body = response.body
  # bytesize, not size: the digest below is computed over bytes, so the
  # recorded size should be a byte count regardless of the body's encoding.
  response.size = body.bytesize
  response.digest = Digest::MD5.hexdigest(body)
  response.mime_type = Util::MimeTypeDetector.detect(response)
  response.charset = Util::CharsetDetector.detect(response) if response.text?
  response.title = Util::TitleParser.parse(response)
  response
end
filter_response?(response)
click to toggle source
# File lib/kudzu/agent.rb, line 37
# Decides whether +response+ should be filtered.
#
# Returns +false+ when the response has a +redirect_from+ value and the URL
# filterer does not allow +response.url+ for that origin. Otherwise returns
# the negation of +@page_filterer.allowed?(response)+.
#
# NOTE(review): a redirect to a disallowed URL yields +false+ here, i.e.
# "not filtered" if +true+ means "filter this out" -- confirm with the
# callers that this polarity is intended.
#
# @param response [Object] responds to +url+ and +redirect_from+
# @return [Boolean]
def filter_response?(response)
  origin = response.redirect_from
  if origin
    return false unless @url_filterer.allowed?(response.url, origin)
  end

  !@page_filterer.allowed?(response)
end
start() { || ... }
click to toggle source
# File lib/kudzu/agent.rb, line 15
# Yields to the given block, then closes the fetcher's pool.
#
# The pool is closed even when the block raises or exits non-locally
# (+break+, +throw+); any exception still propagates to the caller. On the
# normal path the return value is that of +@fetcher.pool.close+, as before.
#
# @yield the work to run before the pool is closed
# @return [Object] the result of +@fetcher.pool.close+
def start
  close_attempted = false
  yield
  close_attempted = true
  @fetcher.pool.close
ensure
  # Only reached with the flag unset when the block did not complete; the
  # flag prevents a second close if the close above is what raised.
  @fetcher.pool.close unless close_attempted
end