class SpiderMech

A small crawler built on Mechanize. Starting from a single URL, it walks every same-domain link it finds, records each page's scripts, images, and stylesheets, and can write the collected sitemap data to a JSON file.

Attributes

crawled[R]
  URLs that have already been fetched.
data[R]
  One hash per crawled page: its URL, discovered assets, and outbound links.
queue[R]
  URLs waiting to be crawled.

Public Class Methods

new(start_page)
Creates a crawler seeded with start_page; the queue holds only that URL and nothing is fetched until crawl or run is called.
# File lib/spidermech.rb, line 10
def initialize(start_page)
  @logger = Logger.new 'spidermech.log'
  @start_page = start_page

  @queue   = []
  @crawled = []
  @data    = []

  @queue << @start_page

  @bot = Mechanize.new
end
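
A minimal usage sketch (the URL is illustrative): the constructor only seeds the queue, so nothing is fetched yet.

require 'spidermech'

spider = SpiderMech.new 'http://example.com/'
spider.queue   # => ["http://example.com/"]
spider.crawled # => []
spider.data    # => []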

Public Instance Methods

crawl()
Fetches the next URL from the queue, records its scripts, images, stylesheets, and outbound links, and queues any links that stay on the start page's domain.
# File lib/spidermech.rb, line 52
def crawl
  url = @queue.shift

  if @crawled.include? url
    # @logger.warn "Already crawled #{url}"
    return
  else
    @logger.info "Crawling #{url}"
    @logger.info "Left in Queue: #{left_in_queue}"
  end

  page = @bot.get url

  if page.class != Mechanize::Page
    @logger.info "File crawling is not supported."
    return
  end

  @crawled << url

  # get all the assets
  data = {
    :url => url,
    :assets => {
      :scripts => find_scripts(page),
      :images => find_images(page),
      :css => find_css(page)
    },

    :links => []
  }

  page.links.each do |link|

    begin
      if link.href[0] == '/' # this is a relative link
        @queue << link.href
        data[:links] << link.href
      elsif link.href.start_with?(@start_page) # still part of this domain
        @queue << link.href
        data[:links] << link.href
      else
        # @logger.info "This link did not fall under our jurisdiction: #{link.href}"
      end
    rescue Exception => e
      # @logger.error e
    end
  end

  @data << data
end
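
Each successful call appends one hash to data. Its shape, with illustrative values:

{
  :url    => "http://example.com/",
  :assets => {
    :scripts => ["/js/app.js"],
    :images  => ["/img/logo.png"],
    :css     => ["/css/main.css"]
  },
  :links  => ["/about", "http://example.com/contact"]
}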
find_css(page)
Returns the href of every <link> tag on the page; tags without an href yield nil.
# File lib/spidermech.rb, line 124
def find_css(page)
  page.search('link').map do |css|
    begin
      css.attributes['href'].value
    rescue Exception => e
      # @logger.error e
    end
  end
end
find_images(page)
Returns the src of every <img> tag on the page; tags without a src yield nil.
# File lib/spidermech.rb, line 114
def find_images(page)
  page.search('img').map do |img|
    begin
      img.attributes['src'].value
    rescue Exception => e
      # @logger.error e
    end
  end
end
find_scripts(page)
Returns the src of every <script> tag on the page; tags without a src yield nil.
# File lib/spidermech.rb, line 104
def find_scripts(page)
  page.search('script').map do |script|
    begin
      script.attributes['src'].value
    rescue Exception => e
      # @logger.error e
    end
  end
end
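
The three finders behave the same way: they map over every matching tag and yield nil for a tag that lacks the attribute (the rescue swallows the error), so callers may want to compact the result. A sketch, assuming a page already fetched with Mechanize (URL illustrative):

page   = Mechanize.new.get 'http://example.com/'
spider = SpiderMech.new 'http://example.com/'

spider.find_scripts(page).compact # src of every <script> that has one
spider.find_images(page).compact  # src of every <img> that has one
spider.find_css(page).compact     # href of every <link>, stylesheets or not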
left_in_queue()
Counts the queued URLs that have not been crawled yet.
# File lib/spidermech.rb, line 23
def left_in_queue
  i = 0

  @queue.each do |link|
    if @crawled.include? link
      # we don't need to crawl this one
    else
      i += 1
    end
  end

  i
end
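
Right after construction only the seeded start page is pending (URL illustrative):

spider = SpiderMech.new 'http://example.com/'
spider.left_in_queue # => 1

Duplicate entries in the queue are each counted, so the figure can overstate the pages actually left to fetch.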
run()
Crawls until the queue is empty, then returns the collected data.
# File lib/spidermech.rb, line 44
def run
  while !@queue.empty?
    crawl
  end

  @data
end
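
run drains the queue and returns the accumulated per-page hashes, so the usual flow is (URL illustrative):

spider  = SpiderMech.new 'http://example.com/'
sitemap = spider.run   # loops until the queue is empty
sitemap.length         # number of pages that were actually crawled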
save_json()
Writes the collected data as JSON to a file named after the start page's host.
# File lib/spidermech.rb, line 37
def save_json
  filename = "#{URI.parse(@start_page).host}.json"
  @logger.info "Writing sitemap data to #{filename}"
  json = @data.to_json
  File.open(filename, 'w') { |f| f.write json }
end