class Kudzu::Crawler

Attributes

agent[R]
config[R]
frontier[R]
repository[R]
uuid[R]

Public Class Methods

new(options = {}, &block) click to toggle source
# File lib/kudzu/crawler.rb, line 14
# Sets up a crawler instance.
#
# options - Hash passed on to Kudzu::Config.new (together with the block).
#           Its :uuid entry, when truthy, becomes the crawler's uuid;
#           otherwise a random UUID is generated.
# block   - optional block forwarded to Kudzu::Config.new.
#
# The frontier and repository come from Kudzu.adapter, the agent from
# Kudzu.agent.
def initialize(options = {}, &block)
  crawler_id = options[:uuid]
  crawler_id ||= SecureRandom.uuid
  @uuid = crawler_id

  @config = Kudzu::Config.new(options, &block)

  # The frontier is created with this crawler's uuid; the repository takes
  # no arguments.
  @frontier = Kudzu.adapter::Frontier.new(crawler_id)
  @repository = Kudzu.adapter::Repository.new
  @agent = Kudzu.agent.new(@config)
end

Public Instance Methods

run(seed_url, &block) click to toggle source
# File lib/kudzu/crawler.rb, line 23
# Crawls starting from +seed_url+.
#
# seed_url - a single URL or an array of URLs; each is wrapped in a
#            Kudzu::Agent::Reference and enqueued at depth 1.
# block    - handed to Kudzu::Callback.new to build the callback object.
#
# The crawl runs inside @agent.start, single-threaded unless the configured
# thread_num (as an integer) is greater than 1. Afterwards the frontier is
# cleared, and the result of that call is returned.
def run(seed_url, &block)
  @callback = Kudzu::Callback.new(&block)

  seeds = Array(seed_url).map do |url|
    Kudzu::Agent::Reference.new(url: url)
  end
  enqueue_links(refs_to_links(seeds, 1))

  @agent.start do
    if @config.thread_num.to_i > 1
      multi_thread(@config.thread_num)
    else
      single_thread
    end
  end

  @frontier.clear
end

Private Instance Methods

delete_page(page) click to toggle source
# File lib/kudzu/crawler.rb, line 158
# Removes +page+ from the repository, wrapped in the :delete callback so
# that @callback.around decides how the deletion block is run.
# Returns whatever @callback.around returns.
def delete_page(page)
  @callback.around(:delete, page) { @repository.delete(page) }
end
fetch(link, request_header) click to toggle source
# File lib/kudzu/crawler.rb, line 108
# Fetches +link+ through the agent, retrying failed attempts.
#
# link           - object responding to #url.
# request_header - header value handed to @agent.fetch unchanged.
#
# Up to +max_retry + 1+ attempts are made. Each attempt is wrapped in the
# :fetch callback; the loop stops as soon as a response was not fetched
# (skipped) or has a success or redirection status.
#
# Returns the last response, or nil when a StandardError was raised (the
# error is logged and reported through the :failure callback).
#
# Only StandardError is rescued: Interrupt, SystemExit, NoMemoryError and
# other non-standard exceptions propagate, so that Ctrl-C or +exit+ is not
# silently turned into a "failed to fetch" warning.
def fetch(link, request_header)
  response = nil
  attempts = @config.max_retry.to_i + 1

  attempts.times do
    # +response+ is nil on the first attempt and the previous attempt's
    # response on retries.
    @callback.around(:fetch, link, request_header, response) do
      response = @agent.fetch(link.url, request_header)
    end

    outcome = response.fetched? ? 'fetched' : 'skipped'
    Kudzu.log :info, "#{outcome} page: #{response.status} #{response.url}"

    break if !response.fetched? || response.status_success? || response.status_redirection?
  end

  response
rescue StandardError => e
  Kudzu.log :warn, "failed to fetch page: #{link.url}", error: e
  @callback.on(:failure, link, e)
  nil
end
handle_success(page, link, response) click to toggle source
# File lib/kudzu/crawler.rb, line 128
# Copies a successful +response+ onto +page+, follows its links and stores
# or removes the page.
#
# page     - page record to update.
# link     - link that was visited; its #depth drives the depth limit.
# response - agent response whose attributes are copied to the page.
#
# Returns the result of register_page, or of delete_page when the agent
# filters the response.
def handle_success(page, link, response)
  %i[response_header body size mime_type charset title redirect_from].each do |attr|
    page.public_send("#{attr}=", response.public_send(attr))
  end

  # revised_at must be decided before the digest is overwritten below.
  if page.digest != response.digest
    page.revised_at = Time.now
  end
  page.digest = response.digest

  # Links are only extracted while below the configured depth limit; a nil
  # max_depth means no limit.
  within_depth = @config.max_depth.nil? || link.depth < @config.max_depth.to_i
  if within_depth
    refs = @agent.extract_refs(response)
    enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
  end

  return register_page(page) unless @agent.filter_response?(response)

  page.filtered = true
  delete_page(page)
end
multi_thread(thread_num) click to toggle source
# File lib/kudzu/crawler.rb, line 50
# Runs the crawl on a Kudzu::ThreadPool built with +thread_num+.
#
# Inside the block given to the pool's #start, the queue is topped up from
# the frontier with at most (thread_num - queue.size) links (never fewer
# than 0), then one link is popped and visited. After #start returns the
# pool is waited on and shut down; the result of #shutdown is returned.
def multi_thread(thread_num)
  @thread_pool = Kudzu::ThreadPool.new(thread_num)

  @thread_pool.start do |queue|
    free_slots = thread_num - queue.size
    free_slots = 0 if free_slots < 0

    @frontier.dequeue(limit: free_slots).each { |pending| queue.push(pending) }

    visit_link(queue.pop)
  end

  @thread_pool.wait
  @thread_pool.shutdown
end
register_page(page) click to toggle source
# File lib/kudzu/crawler.rb, line 152
# Stores +page+ in the repository, wrapped in the :register callback so
# that @callback.around decides how the registration block is run.
# Returns whatever @callback.around returns.
def register_page(page)
  @callback.around(:register, page) { @repository.register(page) }
end
run_callback(page, link) click to toggle source
# File lib/kudzu/crawler.rb, line 92
# Fires the callback that matches the status of +page+.
#
# A successful page triggers :filter when page.filtered is truthy and
# :success otherwise; the other status classes map to :redirection,
# :client_error and :server_error. When no status predicate matches,
# nothing is fired and nil is returned.
def run_callback(page, link)
  event =
    if page.status_success?
      page.filtered ? :filter : :success
    elsif page.status_redirection?
      :redirection
    elsif page.status_client_error?
      :client_error
    elsif page.status_server_error?
      :server_error
    end

  @callback.on(event, page, link) if event
end
single_thread() click to toggle source
# File lib/kudzu/crawler.rb, line 42
# Drains the frontier on the calling thread: takes the first link of each
# @frontier.dequeue result and visits it, stopping once no link is left.
def single_thread
  loop do
    next_link = @frontier.dequeue.first
    next_link ? visit_link(next_link) : break
  end
end