class SpiderMech
Attributes
crawled[R]
data[R]
queue[R]
Public Class Methods
new(start_page)
click to toggle source
# Builds a spider rooted at +start_page+.
#
# start_page - String URL where the crawl begins; it is seeded as the
#              first entry of the work queue.
#
# Side effects: opens 'spidermech.log' for logging and constructs the
# Mechanize agent used for all HTTP fetches.
def initialize(start_page)
  @logger = Logger.new 'spidermech.log'
  @start_page = start_page
  @crawled = []
  @data = []
  # Seed the queue with the start page directly rather than
  # appending after construction.
  @queue = [@start_page]
  @bot = Mechanize.new
end
Public Instance Methods
crawl()
click to toggle source
# Fetches the next URL from the queue, records its assets (scripts,
# images, css) into @data, and enqueues every link that stays on the
# start domain. No-op when the URL was already crawled; skips
# responses that are not HTML pages (Mechanize::File etc.).
def crawl
  url = @queue.shift
  if @crawled.include? url
    # @logger.warn "Already crawled #{url}"
    return
  else
    @logger.info "Crawling #{url}"
    @logger.info "Left in Queue: #{left_in_queue}"
  end

  page = @bot.get url

  # Mechanize returns non-Page objects (e.g. Mechanize::File) for
  # downloads we cannot parse for links.
  if page.class != Mechanize::Page
    @logger.info "File crawling is not supported."
    return
  end

  @crawled << url

  # get all the assets
  data = {
    :url => url,
    :assets => {
      :scripts => find_scripts(page),
      :images => find_images(page),
      :css => find_css(page)
    },
    :links => []
  }

  page.links.each do |link|
    begin
      if link.href[0] == '/'
        # Relative link: always part of this domain.
        @queue << link.href
        data[:links] << link.href
      elsif link.href.start_with?(@start_page)
        # BUG FIX: the old check compared
        # link.href[0..@start_page.length] (length + 1 characters)
        # against @start_page, so it could only ever match when the
        # href equalled @start_page exactly — absolute same-domain
        # links were silently dropped.
        @queue << link.href
        data[:links] << link.href
      else
        # @logger.info "This link did not fall under our jurisdiction: #{link.href}"
      end
    rescue StandardError => e
      # A link with a nil href raises NoMethodError; catch
      # StandardError only — rescuing Exception would also swallow
      # SignalException/SystemExit.
      # @logger.error e
    end
  end

  @data << data
end
find_css(page)
click to toggle source
# Collects stylesheet URLs referenced by the page's <link> tags.
#
# page - a Mechanize::Page (anything responding to #search).
#
# Returns an Array of href strings; entries are nil for <link> tags
# without an href attribute (same shape the old rescue produced).
def find_css(page)
  page.search('link').map do |css|
    # Safe navigation replaces the old `rescue Exception` (which
    # swallowed every error just to turn a missing attribute into nil).
    css.attributes['href']&.value
  end
end
find_images(page)
click to toggle source
# Collects image URLs referenced by the page's <img> tags.
#
# page - a Mechanize::Page (anything responding to #search).
#
# Returns an Array of src strings; entries are nil for <img> tags
# without a src attribute (same shape the old rescue produced).
def find_images(page)
  page.search('img').map do |img|
    # Safe navigation replaces the old `rescue Exception` (which
    # swallowed every error just to turn a missing attribute into nil).
    img.attributes['src']&.value
  end
end
find_scripts(page)
click to toggle source
# Collects script URLs referenced by the page's <script> tags.
#
# page - a Mechanize::Page (anything responding to #search).
#
# Returns an Array of src strings; entries are nil for inline
# <script> tags without a src attribute (same shape the old rescue
# produced).
def find_scripts(page)
  page.search('script').map do |script|
    # Safe navigation replaces the old `rescue Exception` (which
    # swallowed every error just to turn a missing attribute into nil).
    script.attributes['src']&.value
  end
end
left_in_queue()
click to toggle source
# Number of queued URLs that have not been crawled yet.
#
# Returns an Integer (duplicates in the queue are each counted, as
# in the original manual loop).
def left_in_queue
  @queue.count { |link| !@crawled.include?(link) }
end
run()
click to toggle source
# Drains the work queue, crawling one page per iteration.
#
# Returns the accumulated Array of per-page data hashes (@data).
def run
  crawl until @queue.empty?
  @data
end
save_json()
click to toggle source
# Serialises the crawl data to "<host>.json" (host taken from the
# start page URL) in the current working directory.
#
# Returns the result of File.open's block (bytes written).
def save_json
  filename = "#{URI.parse(@start_page).host}.json"
  # BUG FIX: the log message interpolation was corrupted to
  # "#(unknown)"; it now reports the actual output filename.
  @logger.info "Writing sitemap data to #{filename}"
  json = @data.to_json
  File.open(filename, 'w') { |f| f.write json }
end