class Kudzu::Crawler
Attributes
agent[R]
config[R]
frontier[R]
repository[R]
uuid[R]
Public Class Methods
new(options = {}, &block)
click to toggle source
# File lib/kudzu/crawler.rb, line 14
# Builds a crawler together with the collaborators it drives.
#
# @param options [Hash] passed through to Kudzu::Config.new; a truthy :uuid
#   entry is used as this crawler's identifier, otherwise one is generated
#   with SecureRandom.uuid
# @param block [Proc] optional block passed through to Kudzu::Config.new
def initialize(options = {}, &block)
  @uuid = options[:uuid]
  @uuid ||= SecureRandom.uuid

  @config = Kudzu::Config.new(options, &block)

  # The frontier is created with this crawler's uuid; the repository is not
  @frontier = Kudzu.adapter::Frontier.new(@uuid)
  @repository = Kudzu.adapter::Repository.new

  @agent = Kudzu.agent.new(@config)
end
Public Instance Methods
run(seed_url, &block)
click to toggle source
# File lib/kudzu/crawler.rb, line 23
# Crawls starting from one or more seed URLs.
#
# The seeds are wrapped in Kudzu::Agent::Reference objects, converted to links
# at depth 1 and enqueued. The crawl itself runs inside @agent.start, and the
# frontier is cleared once that block has returned.
#
# @param seed_url [String, Array<String>] a single seed URL or a list of them
#   (coerced with Array())
# @param block [Proc] optional block handed to Kudzu::Callback.new
# @return the value returned by @frontier.clear
def run(seed_url, &block)
  @callback = Kudzu::Callback.new(&block)

  seed_refs = Array(seed_url).map { |url| Kudzu::Agent::Reference.new(url: url) }
  enqueue_links(refs_to_links(seed_refs, 1))

  # Convert once so the branch decision and the value handed to multi_thread
  # agree; a non-Integer setting such as "4" would otherwise pass the
  # comparison and then be used raw in multi_thread's arithmetic.
  thread_num = @config.thread_num.to_i

  @agent.start do
    if thread_num <= 1
      single_thread
    else
      multi_thread(thread_num)
    end
  end

  @frontier.clear
end
Private Instance Methods
delete_page(page)
click to toggle source
# File lib/kudzu/crawler.rb, line 158
# Deletes +page+ from the repository inside the :delete callback wrapper.
#
# @param page the page record passed to both the callback and @repository.delete
# @return whatever @callback.around returns
def delete_page(page)
  @callback.around(:delete, page) { @repository.delete(page) }
end
enqueue_links(links, response = nil)
click to toggle source
# File lib/kudzu/crawler.rb, line 174
# Pushes +links+ onto the frontier inside the :enqueue callback wrapper.
#
# @param links the links handed to @frontier.enqueue
# @param response the response the links came from, if any; it is only
#   passed along to the callback
# @return whatever @callback.around returns
def enqueue_links(links, response = nil)
  @callback.around(:enqueue, links, response) { @frontier.enqueue(links) }
end
fetch(link, request_header)
click to toggle source
# File lib/kudzu/crawler.rb, line 108
# Fetches +link+ through the agent, making up to max_retry + 1 attempts.
#
# Each attempt runs inside the :fetch callback wrapper. The wrapper receives
# the response of the previous attempt (nil on the first one). Retrying stops
# as soon as the response was not fetched, or its status is a success or a
# redirection.
#
# @param link the link to fetch; only link.url is read here
# @param request_header the header object forwarded to @agent.fetch
# @return the last response, or nil when an error was raised while fetching
# @raise non-StandardError exceptions (Interrupt, SystemExit, NoMemoryError, ...)
#   are deliberately not rescued so the process can still be stopped
def fetch(link, request_header)
  response = nil
  attempts = @config.max_retry.to_i + 1

  attempts.times do
    @callback.around(:fetch, link, request_header, response) do
      response = @agent.fetch(link.url, request_header)
    end

    if response.fetched?
      Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
    else
      Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
    end

    break if !response.fetched? || response.status_success? || response.status_redirection?
  end

  response
rescue StandardError => e
  # Rescue StandardError rather than Exception: a failing page should be
  # reported and skipped, but signals and exit requests must keep propagating.
  Kudzu.log :warn, "failed to fetch page: #{link.url}", error: e
  @callback.on(:failure, link, e)
  nil
end
handle_success(page, link, response)
click to toggle source
# File lib/kudzu/crawler.rb, line 128
# Stores a successful response on +page+ and follows up on it.
#
# Copies the response payload onto the page, stamps revised_at when the
# digest differs from the one already on the page, enqueues the extracted
# references one level deeper (unless the depth limit is reached) and finally
# either deletes the page, when the agent filters the response, or registers it.
#
# @param page the page record being updated
# @param link the link that was visited; link.depth is read here
# @param response the fetched response
# @return the result of delete_page or register_page
def handle_success(page, link, response)
  # Payload fields are copied one-to-one from the response
  %i[response_header body size mime_type charset title redirect_from].each do |field|
    page.public_send("#{field}=", response.public_send(field))
  end

  # Compare against the digest currently on the page before overwriting it
  page.revised_at = Time.now if page.digest != response.digest
  page.digest = response.digest

  # A nil max_depth means no depth limit
  depth_limit = @config.max_depth
  if depth_limit.nil? || link.depth < depth_limit.to_i
    refs = @agent.extract_refs(response)
    enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
  end

  return register_page(page) unless @agent.filter_response?(response)

  page.filtered = true
  delete_page(page)
end
multi_thread(thread_num)
click to toggle source
# File lib/kudzu/crawler.rb, line 50
# Crawls using a Kudzu::ThreadPool of +thread_num+ workers.
#
# The block given to the pool refills the queue from the frontier, asking for
# at most thread_num minus the current queue size links (never a negative
# number), then pops one link and visits it.
#
# @param thread_num [Integer] pool size, also used as the refill target
# @return whatever @thread_pool.shutdown returns
def multi_thread(thread_num)
  @thread_pool = Kudzu::ThreadPool.new(thread_num)

  @thread_pool.start do |queue|
    vacancy = [thread_num - queue.size, 0].max
    @frontier.dequeue(limit: vacancy).each { |pending| queue.push(pending) }

    visit_link(queue.pop)
  end

  @thread_pool.wait
  @thread_pool.shutdown
end
refs_to_links(refs, depth)
click to toggle source
# File lib/kudzu/crawler.rb, line 164
# Converts references into adapter Link objects for this crawl.
#
# Every link carries this crawler's uuid, the reference's url and title,
# a state of 0 and the given depth.
#
# @param refs [Enumerable] objects responding to #url and #title
# @param depth [Integer] depth assigned to every created link
# @return [Array] one Kudzu.adapter::Link per reference, in the same order
def refs_to_links(refs, depth)
  refs.map do |ref|
    Kudzu.adapter::Link.new(
      uuid: @uuid,
      url: ref.url,
      title: ref.title,
      state: 0,
      depth: depth
    )
  end
end
register_page(page)
click to toggle source
# File lib/kudzu/crawler.rb, line 152
# Registers +page+ in the repository inside the :register callback wrapper.
#
# @param page the page record passed to both the callback and
#   @repository.register
# @return whatever @callback.around returns
def register_page(page)
  @callback.around(:register, page) { @repository.register(page) }
end
run_callback(page, link)
click to toggle source
# File lib/kudzu/crawler.rb, line 92
# Fires the callback event that matches the page's status.
#
# A successful page fires :filter when it was marked as filtered and :success
# otherwise; redirections, client errors and server errors fire :redirection,
# :client_error and :server_error. Any other status fires nothing.
#
# @param page the visited page record
# @param link the link that led to the page
# @return the result of @callback.on, or nil when no event applies
def run_callback(page, link)
  event =
    if page.status_success?
      page.filtered ? :filter : :success
    elsif page.status_redirection?
      :redirection
    elsif page.status_client_error?
      :client_error
    elsif page.status_server_error?
      :server_error
    end

  @callback.on(event, page, link) if event
end
single_thread()
click to toggle source
# File lib/kudzu/crawler.rb, line 42
# Crawls in the calling thread until the frontier has nothing left.
#
# Takes the first link of each @frontier.dequeue result and visits it; the
# loop ends the first time that result has no first element.
#
# @return [nil]
def single_thread
  loop do
    next_link = @frontier.dequeue.first
    break unless next_link

    visit_link(next_link)
  end
end
visit_link(link)
click to toggle source
# File lib/kudzu/crawler.rb, line 66
# Visits a single link: fetches it, updates its page record and dispatches
# on the outcome.
#
# Nothing happens when fetch yields no response. Otherwise the page looked up
# by the response URL receives the url, status, response time and fetch time,
# and then:
# * a response that was not fetched marks the page as filtered and deletes it;
# * a success is passed to handle_success;
# * "not modified" re-registers the page;
# * "not found" / "gone" deletes the page;
# * any other status leaves the repository untouched.
# The status callback runs last in every case.
#
# @param link the link to visit
# @return the result of run_callback, or nil when the fetch produced no response
def visit_link(link)
  # Hand fetch its own copy of the default headers
  request_header = @config.default_request_header.to_h.dup
  response = fetch(link, request_header)
  return unless response

  page = @repository.find_by_url(response.url)
  page.url = response.url
  page.status = response.status
  page.response_time = response.response_time
  page.fetched_at = Time.now

  if !response.fetched?
    page.filtered = true
    delete_page(page)
  elsif page.status_success?
    handle_success(page, link, response)
  elsif page.status_not_modified?
    register_page(page)
  elsif page.status_not_found? || page.status_gone?
    delete_page(page)
  end

  run_callback(page, link)
end