class Snapshotify::Scraper

Attributes

emitter[RW]
invalid[RW]
processed[RW]
unprocessed[RW]
unprocessed_urls[RW]
valid_domains[RW]

Public Class Methods

new(initial_url) click to toggle source
# File lib/snapshotify/scraper.rb, line 5
# Sets up a scraper whose crawl starts from a single seed URL.
#
# The seed is wrapped in a Snapshotify::Document; its host becomes the only
# entry in valid_domains and the document itself becomes the first queued
# item. The emitter attribute is not assigned here, so it has to be set by
# the caller before any method that logs is invoked.
#
# @param initial_url the value passed straight to Snapshotify::Document.new
def initialize(initial_url)
  root = Snapshotify::Document.new(initial_url)
  root_url = root.url

  # Links are only followed when their host matches the seed's host.
  self.valid_domains = [root_url.host]

  # The queue and its URL index start out holding just the seed.
  self.unprocessed = [root]
  self.unprocessed_urls = [root_url.canonical_url]

  # Nothing has been crawled or rejected yet.
  self.processed = []
  self.invalid = []
end

Public Instance Methods

run() click to toggle source
# File lib/snapshotify/scraper.rb, line 17
# Works through the queue of unprocessed documents, front to back, passing
# each one to #process until the queue is empty.
#
# The queue is re-checked before every iteration, so documents appended while
# an earlier one is being processed are handled in the same call.
#
# @return [self] the scraper, to allow chaining
def run
  process(unprocessed.shift) until unprocessed.empty?
  self
end

Private Instance Methods

enqueue(links) click to toggle source
# File lib/snapshotify/scraper.rb, line 39
# Adds the documents in +links+ that have not been seen before to the queue.
#
# A document is skipped when any of the following holds:
# * #valid? rejects it,
# * its host is not listed in valid_domains,
# * its canonical URL is already in processed,
# * its canonical URL is already in unprocessed_urls (this also catches
#   duplicates inside the same +links+ batch, because the URL is recorded as
#   soon as a document is queued).
#
# Every queued document is reported through emitter.log.
#
# @param links a collection of documents responding to #url
# @return whatever links.each returns (the collection itself for an Array)
def enqueue(links)
  links.each do |document|
    next unless valid?(document)

    url = document.url
    next unless valid_domains.include?(url.host)

    # Read the canonical URL once, and only after the cheaper checks above,
    # so rejected documents never have it computed.
    canonical = url.canonical_url
    next if processed.include?(canonical)
    next if unprocessed_urls.include?(canonical)

    emitter.log("> Enqueued: #{canonical}")

    unprocessed << document
    unprocessed_urls << canonical
  end
end
process(document) click to toggle source
# File lib/snapshotify/scraper.rb, line 27
# Handles a single document taken from the queue.
#
# Steps, in order:
# 1. log the document's canonical URL (colorized green),
# 2. hand the scraper's emitter to the document,
# 3. queue the document's links via #enqueue,
# 4. call rewrite and then write! on the document,
# 5. record the canonical URL in processed.
#
# The canonical URL is read again in step 5 rather than reused from step 1,
# so it reflects the document's state after rewrite and write!.
#
# @param document the document to process
# @return the result of appending to processed
def process(document)
  banner = document.url.canonical_url.colorize(:green)
  emitter.log(banner)

  document.emitter = emitter

  # Links are collected before the document is rewritten.
  enqueue(document.links)

  document.rewrite
  document.write!

  processed << document.url.canonical_url
end
valid?(document) click to toggle source
# File lib/snapshotify/scraper.rb, line 53
# Reports whether a document's URL is valid, warning about each invalid raw
# URL the first time it is seen.
#
# When the URL is invalid and its raw_url is not yet listed in invalid, a
# warning is sent to the emitter and the raw_url is remembered so the same
# URL is not reported again.
#
# @param document a document responding to #url
# @return the value of document.url.valid
def valid?(document)
  url = document.url

  # Equivalent to "not valid and not already reported".
  first_sighting = !(url.valid || invalid.include?(url.raw_url))

  if first_sighting
    emitter.warning("> Invalid URL: #{url.raw_url}")
    invalid << url.raw_url
  end

  url.valid
end