class SiteDiff::Crawler
Constants
- DEFAULT_DEPTH
Public Class Methods
new(hydra, base, interval, include_regex, exclude_regex, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug = true, &block)
click to toggle source
Create a crawler with a base URL
# File lib/sitediff/crawler.rb, line 18
# Create a crawler rooted at +base+ and immediately seed the crawl with
# the base URL itself (the empty relative path).
#
# hydra          - Typhoeus::Hydra-style queue the fetches run on.
# base           - base URL string; all crawled URLs are joined onto it.
# interval       - delay between fetches, in milliseconds (0 = none).
# include_regex  - regex of paths to force-include, or nil.
# exclude_regex  - regex of paths to skip, or nil.
# depth          - how many link levels to follow from the base.
# curl_opts      - options forwarded to each UriWrapper request.
# debug          - forwarded to UriWrapper for verbose output.
# block          - called with an Info struct for every fetched page.
def initialize(hydra, base, interval, include_regex, exclude_regex,
               depth = DEFAULT_DEPTH,
               curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
               debug = true, &block)
  @hydra = hydra
  @base = base
  @base_uri = Addressable::URI.parse(base)
  @interval = interval
  @include_regex = include_regex
  @exclude_regex = exclude_regex
  @curl_opts = curl_opts
  @debug = debug
  @found = Set.new    # relative URIs already queued, to avoid re-fetching
  @callback = block
  add_uri('', depth)  # kick off the crawl at the base URL
end
Public Instance Methods
add_uri(rel, depth)
click to toggle source
Handle a newly found relative URI
# File lib/sitediff/crawler.rb, line 41
# Queue a newly discovered relative URI for fetching, unless it has
# already been seen. The fetch result is handed to #fetched_uri.
def add_uri(rel, depth)
  return if @found.include?(rel)

  @found << rel
  UriWrapper.new(@base + rel, @curl_opts, @debug).queue(@hydra) do |res|
    fetched_uri(rel, depth, res)
  end
end
fetched_uri(rel, depth, res)
click to toggle source
Handle the result of fetching a URI
# File lib/sitediff/crawler.rb, line 53
# Process the response for a fetched URI: log and bail on errors,
# report the page via the callback, then (while depth remains)
# discover and queue the links it contains.
def fetched_uri(rel, depth, res)
  if res.error
    SiteDiff.log(res.error, :error)
    return
  end
  unless res.content
    SiteDiff.log('Response is missing content. Treating as an error.', :error)
    return
  end

  base = Addressable::URI.parse(@base + rel)
  doc = Nokogiri::HTML(res.content)

  info = Info.new(
    relative: rel,
    uri: base,
    read_result: res,
    document: doc
  )

  # Rate limiting: pause between fetches when an interval was configured.
  unless @interval == 0
    SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
    sleep(@interval / 1000.0)
  end

  @callback[info]

  return if depth < 1

  # Resolve, filter and relativize the page's links, then queue each
  # unseen one at one level less depth.
  absolute = filter_links(find_links(doc).map { |link| resolve_link(base, link) }.compact)
  absolute.map { |uri| relativize_link(uri) }.each do |child|
    add_uri(child, depth - 1) unless @found.include?(child)
  end
end
filter_links(uris)
click to toggle source
Filter out links we don't want. Links passed in are absolute URIs.
# File lib/sitediff/crawler.rb, line 116
# Keep only those absolute URIs that live under the base URI, honouring
# the include/exclude regexes. An explicit include match overrides an
# exclude match; excluded-and-not-included URLs are logged and dropped.
def filter_links(uris)
  uris.find_all do |uri|
    sub_uri = uri.host == @base_uri.host &&
              uri.path.start_with?(@base_uri.path)
    next unless sub_uri

    included = !@include_regex.nil? && @include_regex.match(uri.path)
    excluded = !@exclude_regex.nil? && @exclude_regex.match(uri.path)
    SiteDiff.log "Ignoring excluded URL #{uri.path}", :info if excluded && !included
    included || !excluded
  end
end
find_links(doc)
click to toggle source
Return a list of string links found on a page.
# File lib/sitediff/crawler.rb, line 111
# Collect the href attribute of every anchor element on the page,
# returned as a list of strings.
def find_links(doc)
  doc.xpath('//a[@href]').map do |anchor|
    anchor['href']
  end
end
relativize_link(uri)
click to toggle source
Make a link relative to @base_uri
# File lib/sitediff/crawler.rb, line 106
# Strip the base URI's path prefix from +uri+, yielding a path
# relative to @base_uri.
def relativize_link(uri)
  prefix_length = @base_uri.path.length
  uri.path.slice(prefix_length..-1)
end
resolve_link(base, rel)
click to toggle source
Resolve a potentially-relative link. Return nil on error.
# File lib/sitediff/crawler.rb, line 98
# Join a potentially-relative link onto +base+. Returns the resolved
# URI, or nil (after logging a warning) when the link is invalid.
def resolve_link(base, rel)
  begin
    base + rel
  rescue Addressable::URI::InvalidURIError
    SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
    nil
  end
end