class Pantopoda::Pantopoda

Public Class Methods

new(url, options = {}) click to toggle source
# File lib/pantopoda.rb, line 10
# Builds a crawler rooted at +url+.
#
# url     - the URL crawling starts from; it is stored in @start_url and also
#           passed to parse_domain to populate @domain.
# options - Hash of optional settings; every one falls back to false:
#   :split_url_at_hash            - when truthy, split_url_at_hash drops the '#' and what follows.
#   :exclude_urls_with_hash       - when truthy, no_hash_in_url? rejects URLs containing '#'.
#   :exclude_urls_with_extensions - collection of extensions that ignore_extensions rejects.
#   :debug                        - when truthy, diagnostic messages are printed with puts.
def initialize(url, options = {})
  # @debug must be assigned before parse_domain runs: parse_domain consults it
  # to decide whether to print its diagnostics.
  @debug = options[:debug] || false

  @split_url_at_hash = options[:split_url_at_hash] || false
  @exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
  @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] || false

  @start_url = url
  @domain = parse_domain(url)
end

Public Instance Methods

crawl(options = {}) { |response| ... } click to toggle source
# File lib/pantopoda.rb, line 20
# Crawls outward from @start_url, handing every completed response to the
# caller's block and queueing the internal links found in each response body.
#
# options - Hash of optional settings:
#   :threads  - value passed to Typhoeus::Hydra as :max_concurrency (default: 1).
#   :max_urls - no new batch is started once @global_visited.size reaches this
#               value (default: nil, meaning no limit).
#
# Yields each response delivered to a request's on_complete callback. The block
# is optional; without one the crawl still runs.
def crawl(options = {})
  # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
  threads = options[:threads] || 1

  # Defaults to nil so it will always keep running until it runs out of urls
  max_urls = options[:max_urls] || nil

  @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
  @global_queue = [@start_url]

  # NOTE(review): the limit is compared against @global_visited.size; confirm that
  # BloomFilter::Native#size reports inserted entries rather than set bits.
  while !@global_queue.empty? && (max_urls.nil? || @global_visited.size.to_i < max_urls)
    # Detach the current batch and start a fresh queue. Iterating the live queue
    # while deleting from it skipped every other URL in a pass. The callbacks
    # below read @global_queue when they fire (during @hydra.run), so they
    # append to the new array.
    batch = @global_queue.uniq
    @global_queue = []

    batch.each do |url|
      begin
        request = Typhoeus::Request.new(url, :timeout => 100, :follow_location => true)
        request.on_complete do |response|
          yield response if block_given?

          links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
          links.each do |link|
            # Same checks, in the same order, as a single && chain.
            next unless internal_link?(link, response.effective_url)
            next if @global_visited.include?(make_absolute(link, response.effective_url))
            next unless no_hash_in_url?(link) && ignore_extensions(link)

            sanitized = sanitize_link(split_url_at_hash(link))
            next unless sanitized

            absolute_link = make_absolute(sanitized, response.effective_url)
            @global_queue << absolute_link if absolute_link
          end
        end

        @hydra.queue request
      rescue URI::InvalidURIError, NoMethodError => e
        puts "Exception caught: #{e}" if @debug
      end

      # Mark the URL as visited even when building the request failed, so it is
      # not retried.
      @global_visited.insert(url)
    end

    # Executes the queued requests; the on_complete callbacks run here.
    @hydra.run
  end
end
ignore_extensions(url) click to toggle source
# File lib/pantopoda.rb, line 112
# Tells whether +url+ may be crawled with respect to the extension filter.
#
# Returns true when +url+ is empty, when no extension filter is configured, or
# when +url+ ends with none of the configured extensions (compared
# case-insensitively). Returns false as soon as any configured extension is a
# proper suffix of +url+; each match is reported through puts in debug mode.
def ignore_extensions(url)
  return true if url.to_s.length == 0
  return true unless @exclude_urls_with_extensions

  allowed = true

  @exclude_urls_with_extensions.each do |ext|
    # The URL must be strictly longer than the extension to count as a match.
    next unless url.to_s.length > ext.size
    next unless url.to_s[-ext.size..-1].downcase == ext.to_s.downcase

    allowed = false
    puts "#{ext} Found At URL: #{url}" if @debug
  end

  allowed
end
make_absolute(href, root) click to toggle source
# File lib/pantopoda.rb, line 136
# Resolves +href+ against +root+ and returns the absolute URL as a String.
#
# Each run of whitespace in +href+ is replaced with "%20" and the result is
# passed through split_url_at_hash before parsing. Returns false when either
# URL cannot be parsed or merged (URI::InvalidURIError,
# URI::InvalidComponentError or NoMethodError).
def make_absolute(href, root)
  base = URI.parse(root)
  escaped_href = href.to_s.gsub(/\s+/, "%20")
  target = URI.parse(split_url_at_hash(escaped_href))
  base.merge(target).to_s
rescue URI::InvalidURIError, URI::InvalidComponentError, NoMethodError
  false
end
no_hash_in_url?(url) click to toggle source
# File lib/pantopoda.rb, line 102
# Tells whether +url+ passes the "no fragment" filter.
#
# Always true when @exclude_urls_with_hash is not set; otherwise true only if
# the string form of +url+ contains no '#' character.
def no_hash_in_url?(url)
  return true unless @exclude_urls_with_hash

  !url.to_s.include?('#')
end
parse_domain(url) click to toggle source
# File lib/pantopoda.rb, line 70
# Extracts the host name of +url+ using Domainatrix.
#
# Returns "subdomain.domain.public_suffix" when the parsed subdomain is not the
# empty string, otherwise "domain.public_suffix". Returns nil when parsing
# raises NoMethodError or Addressable::URI::InvalidURIError. Progress and
# failures are printed with puts in debug mode.
def parse_domain(url)
  puts "Parsing URL: #{url}" if @debug

  parts = Domainatrix.parse(url)
  subdomain = parts.subdomain
  return parts.domain + '.' + parts.public_suffix if subdomain == ""

  subdomain + '.' + parts.domain + '.' + parts.public_suffix
rescue NoMethodError, Addressable::URI::InvalidURIError => e
  puts "URL Parsing Exception (#{url}) : #{e}" if @debug
  nil
end
split_url_at_hash(url) click to toggle source
# File lib/pantopoda.rb, line 97
# Returns the string form of +url+, without its fragment when
# @split_url_at_hash is set.
#
# With splitting enabled, everything from the first '#' onwards is dropped.
# The result is always a String: inputs such as "#" or "" yield "" (a plain
# split('#')[0] would yield nil for them).
def split_url_at_hash(url)
  return url.to_s unless @split_url_at_hash

  # partition always returns three strings, so .first is never nil.
  url.to_s.partition('#').first
end