module Spiderman

Turn any class into a crawler by including this module.

Example:

 # Example spider: a job class that mixes in Spiderman and declares its
 # starting URL and its processors at class level.
 class MySpider < ApplicationJob # Yup, you can define this in a job
   queue_as :crawler

   include Spiderman

   # Starting URL; the block receives the response for that URL.
   crawl "https://example.com/" do |response|
     response.css('.selector a').each do |a|
       # Hand each link over to the :listing processor below.
       process! a["href"], :listing
     end
   end

   # Handler registered under the :listing type.
   process :listing do |response|
     # NOTE(review): this passes the result of `css('img')` rather than a
     # single URL string -- confirm that is what `process!` expects.
     process! response.css('img'), :image
     save_the_thing response.css('.some_selector')
   end

   # Handler registered under the :image type.
   process :image do |response|
     # Do something with the image file
   end

   # Plain instance method; processor blocks can call it directly.
   def save_the_thing(thing)
     # logic here for saving the thing
   end
 end

Constants

VERSION

Public Instance Methods

add(clazz) click to toggle source
# File lib/spiderman.rb, line 118
# Appends +clazz+ to the registry returned by +list+.
#
# @param clazz [Object] the crawler (class) to register
# @return [Array] the registry after the append
def add(clazz)
  list << clazz
end
crawl(*urls, &block) click to toggle source

Use `crawl` to specify URLs to start with. `crawl` accepts one or more URLs, and will call the block for each URL requested. You can also define multiple `crawl` blocks with different behavior for each starting URL. All `crawl` blocks will be called when calling `SpiderName.crawl!`.

`response` is an enhanced `HTTP::Response` object that also acts like a `Nokogiri::HTML` document, e.g. `response.css(…)`

# File lib/spiderman.rb, line 62
# Declares one or more starting URLs that share a single handler block.
#
# Each URL is first registered on +crawler+ with +block+ as its handler;
# afterwards the complete URL list is passed to +crawler.start_at+.
#
# @param urls [Array] the starting URLs
# @param block [Proc] handler registered for every given URL
# @return whatever +crawler.start_at+ returns
def crawl(*urls, &block)
  urls.each do |start_url|
    crawler.register(start_url, &block)
  end

  crawler.start_at(*urls)
end
crawl!() click to toggle source
# File lib/spiderman.rb, line 79
# Kicks off processing for every URL currently held in +crawler.urls+ by
# passing each one to +process!+ (without an explicit handler name).
#
# @return the collection returned by +crawler.urls+
def crawl!
  crawler.urls.each { |start_url| process!(start_url) }
end
find(name) click to toggle source
# File lib/spiderman.rb, line 114
# Looks up a registered crawler by its short, underscored name.
#
# Each entry's +name+ is demodulized and underscored before comparison, so
# an entry named "Crawlers::NewsSpider" is found as "news_spider".
#
# @param name [String, Symbol] underscored crawler name; a Symbol is
#   accepted and compared by its string form
# @return the first matching entry of +list+, or nil when none matches
def find(name)
  # Normalise once so that :news_spider and "news_spider" are equivalent.
  wanted = name.to_s
  list.detect { |spider| spider.name.demodulize.underscore == wanted }
end
inherited(subclass) click to toggle source
# File lib/spiderman.rb, line 73
# Subclass hook: gives +subclass+ a +dup+ of this class's crawler and
# registers the subclass through +Spiderman.add+.
#
# @param subclass [Class] the newly created subclass
def inherited(subclass)
  # Keep the rest of the inherited-hook chain intact; without this, hooks
  # defined further up the ancestry would never run for the subclass.
  super
  subclass.crawler = crawler.dup
  Spiderman.add(subclass)
end
list() click to toggle source
# File lib/spiderman.rb, line 105
# Memoized registry array; created empty on first access and reused on
# every later call.
#
# @return [Array]
def list
  return @list if @list

  @list = []
end
name() click to toggle source
# File lib/spiderman.rb, line 99
# Class name of the receiver with any enclosing module namespace removed
# (via +demodulize+).
#
# @return [String]
def name
  qualified = self.class.name
  qualified.demodulize
end
perform(url, with = nil) click to toggle source
# File lib/spiderman.rb, line 93
# Requests +url+ and runs the matching handler against the response.
#
# The handler is looked up under +with+ when given, otherwise under the
# URL itself. It is resolved before the request is issued, and is then
# evaluated with +instance_exec+, so +self+ inside the handler is the
# receiver of this method.
#
# @param url the URL to request
# @param with the handler key to use instead of the URL (optional)
# @return the handler's return value
def perform(url, with = nil)
  handler_key = with || url
  callback = crawler.handler_for(handler_key)
  page = crawler.request(url)
  instance_exec(page, &callback)
end
process(type, &block) click to toggle source

Processors are called from `crawl` and can be used to handle different types of responses.

# File lib/spiderman.rb, line 69
# Registers +block+ on +crawler+ as the handler for the given +type+.
#
# @param type the key the handler is stored under (e.g. a Symbol)
# @param block [Proc] the handler
# @return whatever +crawler.register+ returns
def process(type, &block)
  crawler.register type, &block
end
process!(url, with = nil) click to toggle source
# File lib/spiderman.rb, line 85
# Processes +url+, either inline or through a background job.
#
# When ActiveJob is loaded and the receiver is an ActiveJob::Base, the
# work is enqueued with +perform_later+ on the receiver's class (the URL
# is converted with +to_s+ first). Otherwise +perform+ is called directly.
#
# @param url the URL to process
# @param with handler key forwarded unchanged (optional)
# @return the result of +perform+ or of +perform_later+
def process!(url, with = nil)
  return perform(url, with) unless defined?(ActiveJob) && is_a?(ActiveJob::Base)

  self.class.perform_later(url.to_s, with)
end
run(crawler = nil) click to toggle source
# File lib/spiderman.rb, line 109
# Runs +crawl!+ on one named crawler, or on every registered crawler.
#
# @param crawler name handed to +find+; when nil (the default) every entry
#   of +list+ is run
# @return [Array] the crawlers that were run
# @raise [ArgumentError] if a name is given but +find+ returns nothing for it
def run(crawler = nil)
  return list.each(&:crawl!) unless crawler

  spider = find(crawler)
  # Fail with a descriptive error instead of a NoMethodError on nil.
  raise ArgumentError, "Unknown crawler: #{crawler.inspect}" unless spider

  [spider].each(&:crawl!)
end