class StupidCrawler::Crawler
Constants
- NotAbsoluteURI: raised by build_uri! when the given site is not an absolute http(s) URL.
Attributes
ignore_links[R]
  Array of Regexp patterns for links the crawler skips; empty when none were given.
max_urls[R]
  Cap on the number of URLs to collect before the crawl stops early.
robots[R]
  Whether Spidr should honor robots.txt.
sleep_time[R]
  Seconds to pause after each visited URL.
uri[R]
  Parsed URI of the site being crawled.
Public Class Methods
new(site, max_urls:, sleep_time:, robots:, ignore_links:)

Builds a crawler for site. Raises NotAbsoluteURI unless site is an absolute http(s) URL; a non-nil ignore_links string is compiled into a one-element array of Regexp.
# File lib/stupid_crawler/crawler.rb, line 11
def initialize(site, max_urls:, sleep_time:, robots:, ignore_links:)
  @uri          = build_uri!(site)
  @max_urls     = max_urls
  @sleep_time   = sleep_time
  @robots       = robots
  @ignore_links = ignore_links.nil? ? [] : [Regexp.new(ignore_links)]
end
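
A minimal construction sketch, assuming the gem is required as stupid_crawler; every keyword value below is illustrative, not a default from the source:

  require 'stupid_crawler'

  crawler = StupidCrawler::Crawler.new(
    'https://example.com',  # must be absolute, or NotAbsoluteURI is raised
    max_urls:     500,      # stop collecting once this cap is exceeded
    sleep_time:   1,        # seconds to pause after each visited URL
    robots:       true,     # have Spidr honor robots.txt
    ignore_links: '\.pdf$'  # compiled into [Regexp.new('\.pdf$')]
  )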
Public Instance Methods
call()

Runs the crawl and returns a Hash with :found and :failed arrays of URLs.
# File lib/stupid_crawler/crawler.rb, line 19
def call
  crawl
end
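
Since call simply delegates to crawl, a sketch of consuming its return value (the Hash shape is taken from the crawl source below; crawler is the instance from the construction example above):

  results = crawler.call
  results[:found].each  { |url| puts "OK:     #{url}" }
  results[:failed].each { |url| puts "FAILED: #{url}" }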
Private Instance Methods
build_uri!(site)

Parses site and returns the resulting URI; raises NotAbsoluteURI when the URI carries no scheme.
# File lib/stupid_crawler/crawler.rb, line 48
def build_uri!(site)
  uri = URI.parse(site)
  unless uri.absolute
    raise(NotAbsoluteURI, 'must be an absolute url with http(s) protocol')
  end
  uri
end
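
For illustration, the guard hinges on URI#absolute?: a bare host parses without error but has no scheme, so it fails the check:

  require 'uri'

  URI.parse('example.com').absolute?         # => false, so build_uri! raises
  URI.parse('https://example.com').absolute? # => true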
crawl()

Drives Spidr over the site, collecting visited and failed URLs into Sets; stops early once more than max_urls URLs have been found.
# File lib/stupid_crawler/crawler.rb, line 25
def crawl
  found_urls  = Set.new
  failed_urls = Set.new

  Spidr.site(uri.to_s, ignore_links: ignore_links, robots: robots) do |spider|
    spider.every_url do |url|
      puts url
      found_urls << url
      sleep sleep_time
      # A `return` inside the block exits crawl itself; return the same
      # Hash shape as the normal path so callers always get one type.
      if found_urls.length > max_urls
        return { found: found_urls.to_a, failed: failed_urls.to_a }
      end
    end

    spider.every_failed_url do |url|
      puts "FAILED: #{url}"
      failed_urls << url
    end
  end

  { found: found_urls.to_a, failed: failed_urls.to_a }
end
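
A small sketch of how the compiled ignore_links patterns behave once handed to Spidr, which skips any link matching one of them (the pattern and URLs here are illustrative):

  patterns = [Regexp.new('\.pdf$')]

  patterns.any? { |re| re.match?('https://example.com/report.pdf') } # => true  (skipped)
  patterns.any? { |re| re.match?('https://example.com/about') }      # => false (crawled)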