class Tjcrawler::Crawler
Public Class Methods
new(css_selector_for_link_tags)
click to toggle source
# File lib/tjcrawler/crawler.rb, line 8
# Creates a crawler and remembers the CSS selector in @css.
#
# @param css_selector_for_link_tags the CSS selector used to locate link
#   elements on a page (presumably a String such as "a[href]" — confirm
#   against callers)
def initialize(css_selector_for_link_tags)
  @css = css_selector_for_link_tags
end
Public Instance Methods
crawl(url)
click to toggle source
# File lib/tjcrawler/crawler.rb, line 12
# Downloads the page at +url+, parses it as HTML and resolves every link
# matched by the @css selector against the page's own address.
#
# @param url the address to fetch; anything Kernel#URI accepts
# @return [Result] built from the normalized URL string, the raw body and
#   the list of absolute link URLs (as Strings)
def crawl(url)
  uri = URI(url).tap(&:normalize!)

  # Use URI#open (provided by open-uri) rather than Kernel#open: opening a
  # URI through Kernel#open was deprecated in Ruby 2.7 and removed in 3.0.
  # The block form also closes the underlying handle once the body is read.
  content = uri.open(&:read)
  doc = Nokogiri::HTML(content)

  links = doc.css(@css).each_with_object([]) do |link, urls|
    href = link[:href]
    # A matched element may carry no href attribute at all; nothing to follow.
    next if href.nil?

    begin
      urls << uri.merge(href).to_s
    rescue URI::InvalidURIError
      # A single malformed href must not discard the rest of the page.
      next
    end
  end

  Result.new url: uri.to_s, content: content, links: links
end
start()
click to toggle source
# File lib/tjcrawler/crawler.rb, line 20
# Runs the crawl loop and never returns on its own.
#
# Each iteration takes the next page from Page.dequeue; when nothing is
# available it sleeps for one second and tries again. For a dequeued page it
# prints a progress dot, crawls the page's URL, stores the fetched content on
# the page, touches its :crawled_at attribute and enqueues every discovered
# link.
def start
  loop do
    page = Page.dequeue
    unless page
      # Queue is empty: wait a second before polling again.
      sleep 1
      next
    end

    print :'.'
    fetched = crawl page.url
    page.update(content: fetched.content)
    page.touch(:crawled_at)
    fetched.links.each { |link_url| Page.enqueue link_url }
  end
end