class Elsmore::Scraper
Attributes
emitter[RW]
invalid[RW]
processed[RW]
unprocessed[RW]
unprocessed_urls[RW]
valid_domains[RW]
Public Class Methods
new(initial_url)
click to toggle source
# File lib/elsmore/scraper.rb, line 5
# Builds a scraper whose crawl starts from +initial_url+.
#
# The URL is wrapped in an Elsmore::Document, which becomes the only entry
# in the work queue. The host of that document's URL becomes the only
# entry in +valid_domains+.
#
# NOTE(review): +emitter+ is not assigned here, yet other methods of this
# class call +emitter.log+ / +emitter.warning+ — presumably the caller sets
# it before calling #run; confirm.
def initialize(initial_url)
  first_document = Elsmore::Document.new(initial_url)

  # Crawl scope and pending work, both derived from the first document.
  self.valid_domains    = [first_document.url.host]
  self.unprocessed      = [first_document]
  self.unprocessed_urls = [first_document.url.canonical_url]

  # Nothing has been processed or rejected yet.
  self.processed = []
  self.invalid   = []
end
Public Instance Methods
run()
click to toggle source
# File lib/elsmore/scraper.rb, line 17
# Works through the +unprocessed+ queue front to back, handing each
# document to #process, until the queue is empty.
#
# Returns the scraper itself.
def run
  process(unprocessed.shift) until unprocessed.empty?
  self
end
Private Instance Methods
enqueue(links)
click to toggle source
# File lib/elsmore/scraper.rb, line 39
# Appends every eligible document in +links+ to the work queue.
#
# A document is skipped when any of the following holds (checked in order):
# * #valid? returns a falsy value for it,
# * its URL host is not listed in +valid_domains+,
# * its canonical URL is already in +processed+,
# * its canonical URL is already in +unprocessed_urls+.
#
# Each accepted document is logged through +emitter+, pushed onto
# +unprocessed+, and its canonical URL is recorded in +unprocessed_urls+.
#
# Returns +links+.
def enqueue(links)
  links.each do |document|
    next unless valid?(document)
    next unless valid_domains.include?(document.url.host)

    canonical_url = document.url.canonical_url
    next if processed.include?(canonical_url)
    next if unprocessed_urls.include?(canonical_url)

    emitter.log("> Enqueued: #{canonical_url}")
    unprocessed << document
    unprocessed_urls << canonical_url
  end
end
process(document)
click to toggle source
# File lib/elsmore/scraper.rb, line 27
# Handles a single document taken from the work queue.
#
# Steps, in order: log the document's canonical URL (colorized green),
# share this scraper's emitter with the document, enqueue the document's
# links, then call +rewrite+ and +write!+ on the document. Finally the
# canonical URL is recorded in +processed+.
#
# Returns the +processed+ collection.
def process(document)
  emitter.log(document.url.canonical_url.colorize(:green))
  document.emitter = emitter

  # Links are queued before the document is rewritten and written out.
  enqueue(document.links)

  document.rewrite
  document.write!

  processed.push(document.url.canonical_url)
end
valid?(document)
click to toggle source
# File lib/elsmore/scraper.rb, line 53
# Reports whether the document's URL is valid, warning once per distinct
# invalid raw URL.
#
# When the URL is not valid and its raw form is not yet in +invalid+, a
# warning is sent to +emitter+ and the raw URL is added to +invalid+.
#
# Returns whatever +document.url.valid+ returns.
def valid?(document)
  # True when the URL is fine, or when it was already warned about.
  nothing_to_report = document.url.valid || invalid.include?(document.url.raw_url)

  unless nothing_to_report
    emitter.warning("> Invalid URL: #{document.url.raw_url}")
    invalid.push(document.url.raw_url)
  end

  document.url.valid
end