class Tjcrawler::Crawler

Public Class Methods

new(css_selector_for_link_tags)
# File lib/tjcrawler/crawler.rb, line 8
# Builds a crawler that remembers which elements count as links.
#
# @param css_selector_for_link_tags CSS selector later handed to
#   Nokogiri's +css+ by #crawl (presumably a String - confirm with callers)
def initialize(css_selector_for_link_tags)
  @css = css_selector_for_link_tags
end

Public Instance Methods

crawl(url)
# File lib/tjcrawler/crawler.rb, line 12
# Downloads +url+ and collects the links matched by the CSS selector in @css.
#
# Errors raised while fetching the page (for example OpenURI::HTTPError)
# propagate to the caller.
#
# @param url [String, URI] address of the page to download
# @return [Result] built from the normalized URL string, the raw response
#   body, and the matched links resolved against the page URL as absolute
#   URL strings; matched elements without a usable href are left out
def crawl(url)
  uri = URI(url).normalize

  # URI#open comes from open-uri. The bare Kernel#open form stopped accepting
  # URIs in Ruby 3.0, and the block form closes the handle after reading.
  content = uri.open(&:read)
  doc = Nokogiri::HTML(content)

  links = doc.css(@css).each_with_object([]) do |link, found|
    href = link[:href]
    next if href.nil? # selector matched an element that has no href attribute

    begin
      found << uri.merge(href).to_s
    rescue URI::InvalidURIError
      # One malformed href must not discard the rest of the page's links.
      next
    end
  end

  Result.new url: uri.to_s, content: content, links: links
end
start()
# File lib/tjcrawler/crawler.rb, line 20
# Runs the crawl loop indefinitely: take the next page from the queue, crawl
# it, store the fetched content, stamp +crawled_at+, and enqueue every link
# found on the page.
#
# While Page.dequeue returns nothing, the loop waits one second and tries
# again. Any exception raised by #crawl or by Page is not rescued here.
def start
  loop do
    page = Page.dequeue
    unless page
      sleep 1
      next
    end

    print :'.' # progress marker, one dot per dequeued page
    crawled = crawl(page.url)
    page.update(content: crawled.content)
    page.touch(:crawled_at)
    crawled.links.each do |link_url|
      Page.enqueue(link_url)
    end
  end
end