class SiteDiff::Crawler

SiteDiff Crawler.

Constants

DEFAULT_DEPTH

Public Class Methods

new(hydra, base, interval, include_regex, exclude_regex, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug = true, &block)

Create a crawler with a base URL

# File lib/sitediff/crawler.rb, line 18
# Build a crawler rooted at +base+ and immediately queue the root page.
#
# hydra         - request queue handed to UriWrapper#queue for each fetch
# base          - base URL as a string; relative URIs are appended to it
# interval      - delay applied before each callback, in milliseconds
#                 (0 disables the delay)
# include_regex - stored as-is; not used by the methods shown here
# exclude_regex - stored as-is; not used by the methods shown here
# depth         - how many levels of links to follow from the root
# curl_opts     - options forwarded to every UriWrapper
# debug         - flag forwarded to every UriWrapper
# block         - callback invoked with an Info object per fetched page
def initialize(hydra, base,
               interval,
               include_regex,
               exclude_regex,
               depth = DEFAULT_DEPTH,
               curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
               debug = true,
               &block)
  # Where the crawl starts, kept both raw and parsed.
  @base = base
  @base_uri = Addressable::URI.parse(base)

  # How pages are fetched and reported.
  @hydra = hydra
  @curl_opts = curl_opts
  @debug = debug
  @interval = interval
  @callback = block

  # Which links are followed, and which have been seen already.
  @include_regex = include_regex
  @exclude_regex = exclude_regex
  @found = Set.new

  # Kick off the crawl with the root page (empty relative URI).
  add_uri('', depth)
end

Public Instance Methods

add_uri(rel, depth)

Handle a newly found relative URI

# File lib/sitediff/crawler.rb, line 41
# Register a relative URI and schedule it for fetching, unless it has
# already been seen.
#
# rel   - URI relative to the crawl base (appended to @base as-is)
# depth - remaining link depth, passed through to fetched_uri
def add_uri(rel, depth)
  # Set#add? returns nil when the element is already present, so this
  # both records the URI and bails out on repeats.
  return unless @found.add?(rel)

  UriWrapper.new(@base + rel, @curl_opts, @debug).queue(@hydra) do |response|
    fetched_uri(rel, depth, response)
  end
end
fetched_uri(rel, depth, res)

Handle the fetch of a URI

# File lib/sitediff/crawler.rb, line 53
# Process the response for a fetched URI: report the page to the callback
# and, while depth remains, queue the links found in it.
#
# rel   - relative URI that was fetched
# depth - remaining depth; links are followed only when it is >= 1
# res   - read result exposing #error and #content
def fetched_uri(rel, depth, res)
  # Failed fetches are logged and dropped.
  if res.error
    SiteDiff.log(res.error, :error)
    return
  end

  # A response without a body is handled the same way.
  unless res.content
    SiteDiff.log('Response is missing content. Treating as an error.', :error)
    return
  end

  page_uri = Addressable::URI.parse(@base + rel)
  page_doc = Nokogiri::HTML(res.content)
  page_info = Info.new(relative: rel, uri: page_uri, read_result: res, document: page_doc)

  # Throttle before reporting: @interval is in milliseconds, sleep wants seconds.
  unless @interval == 0
    SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
    sleep(@interval / 1000.0)
  end
  @callback.call(page_info)

  # Depth exhausted: report the page but do not follow its links.
  return if depth < 1

  # Resolve each link against this page, drop the unresolvable ones (nil),
  # apply the link filter, then convert back to relative form.
  resolved = find_links(page_doc).map { |link| resolve_link(page_uri, link) }.compact
  targets = filter_links(resolved).map { |uri| relativize_link(uri) }

  # Queue every target not seen yet, one level shallower.
  targets.each do |target|
    add_uri(target, depth - 1) unless @found.include?(target)
  end
end