class Pantopoda::Pantopoda
Public Class Methods
new(url, options = {})
click to toggle source
# File lib/pantopoda.rb, line 10
#
# Builds a crawler rooted at +url+.
#
# url     - the URL the crawl starts from. It is stored in @start_url and its
#           domain (as computed by parse_domain) is stored in @domain.
# options - Hash of optional settings. Each one falls back to false when it
#           is missing or falsy:
#           :split_url_at_hash            -> @split_url_at_hash
#           :exclude_urls_with_hash       -> @exclude_urls_with_hash
#           :exclude_urls_with_extensions -> @exclude_urls_with_extensions
#           :debug                        -> @debug
def initialize(url, options = {})
  # @debug must be assigned before parse_domain runs: parse_domain reads it
  # to decide whether to print, so setting it last silenced the debug output
  # for the start URL.
  @debug = options[:debug] || false
  @start_url = url
  @domain = parse_domain(url)
  @split_url_at_hash = options[:split_url_at_hash] || false
  @exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
  @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] || false
end
Public Instance Methods
crawl(options = {}) { |response| ... }
click to toggle source
# File lib/pantopoda.rb, line 20
#
# Crawls outward from @start_url in batches, following internal links found
# in each fetched page.
#
# options - Hash of optional settings:
#           :threads  - value passed to Typhoeus::Hydra as :max_concurrency
#                       (default 1).
#           :max_urls - no new batch is started once @global_visited.size
#                       reaches this number; nil (the default) means keep
#                       going until the queue is empty.
#                       NOTE(review): @global_visited is a Bloom filter, so
#                       what its #size actually counts should be confirmed
#                       against the bloomfilter-rb documentation.
#
# Yields each completed response to the caller's block, when one is given.
def crawl(options = {})
  # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
  threads = options[:threads] || 1
  # Defaults to nil, which means: keep running until we run out of urls
  max_urls = options[:max_urls] || nil

  @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
  @global_queue = [@start_url]

  while !@global_queue.empty? && (max_urls.nil? || @global_visited.size.to_i < max_urls)
    # Detach the current batch and start a fresh queue. The previous code
    # iterated over @global_queue itself while deleting from it, which made
    # Array#each skip every other entry. uniq keeps the old "one request per
    # distinct URL" effect that delete(q) provided.
    batch = @global_queue.uniq
    @global_queue = []

    batch.each do |q|
      begin
        request = Typhoeus::Request.new(q, :timeout => 100, :follow_location => true)

        # NOTE(review): this callback is only registered here; it presumably
        # runs later, while @hydra.run is executing, so the rescue below does
        # not cover errors raised inside it.
        request.on_complete do |response|
          yield response if block_given?

          links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
          links.each do |link|
            next unless internal_link?(link, response.effective_url) &&
                        !@global_visited.include?(make_absolute(link, response.effective_url)) &&
                        no_hash_in_url?(link) &&
                        ignore_extensions(link)

            clean_link = sanitize_link(split_url_at_hash(link))
            next unless clean_link

            absolute_link = make_absolute(clean_link, response.effective_url)
            @global_queue << absolute_link if absolute_link
          end
        end

        @hydra.queue request
      rescue URI::InvalidURIError, NoMethodError => e
        puts "Exception caught: #{e}" if @debug
      end

      # Mark the URL as visited even when queueing failed, so it is not retried.
      @global_visited.insert(q)
    end

    @hydra.run
  end
end
ignore_extensions(url)
click to toggle source
# File lib/pantopoda.rb, line 112
#
# Tells whether +url+ is free of every excluded extension.
#
# url - anything responding to to_s (a String or a parsed href node).
#
# Returns true when the URL may be crawled: it is empty, no extensions are
# configured in @exclude_urls_with_extensions, or it ends with none of them.
# Returns false when the URL ends (case-insensitively) with a configured
# extension. A URL that is no longer than the extension never matches.
# When @debug is set, every matching extension is reported on stdout.
def ignore_extensions(url)
  target = url.to_s
  return true if target.empty?
  return true unless @exclude_urls_with_extensions

  lowered = target.downcase
  # Array() lets a single extension String work as well as a list of them.
  matches = Array(@exclude_urls_with_extensions).select do |ext|
    suffix = ext.to_s.downcase
    # An empty suffix would "match" every URL via end_with?, so rule it out.
    !suffix.empty? && target.length > suffix.length && lowered.end_with?(suffix)
  end

  matches.each { |ext| puts "#{ext} Found At URL: #{url}" } if @debug
  matches.empty?
end
internal_link?(url, effective_url)
click to toggle source
# File lib/pantopoda.rb, line 87
#
# Tells whether +url+, resolved against +effective_url+, belongs to the
# domain this crawler was created for (@domain).
#
# url           - the link to test, possibly relative.
# effective_url - the URL of the page the link was found on.
#
# Returns true only when the link resolves to an absolute URL whose parsed
# domain equals @domain. Returns false when the link cannot be resolved or
# its domain cannot be parsed.
def internal_link?(url, effective_url)
  absolute_url = make_absolute(url, effective_url)
  return false unless absolute_url

  link_domain = parse_domain(absolute_url)
  # Without the nil check, an unparseable link would compare equal to an
  # unparseable start domain (nil == nil) and be reported as internal.
  !link_domain.nil? && link_domain == @domain
end
make_absolute(href, root)
click to toggle source
# File lib/pantopoda.rb, line 136
#
# Resolves +href+ against +root+ and returns the absolute URL.
#
# href - the link to resolve; converted with to_s, runs of whitespace are
#        replaced by "%20", then passed through split_url_at_hash.
# root - the URL the link is relative to.
#
# Returns the merged URL as a String, or false when either part cannot be
# parsed or the two cannot be merged.
def make_absolute(href, root)
  encoded_href = split_url_at_hash(href.to_s.gsub(/\s+/, "%20"))
  URI.parse(root).merge(URI.parse(encoded_href)).to_s
rescue URI::Error, NoMethodError
  # URI::Error is the common ancestor of InvalidURIError and
  # InvalidComponentError, and also of BadURIError, which merge raises when
  # both URIs are relative and which previously escaped this method.
  false
end
no_hash_in_url?(url)
click to toggle source
# File lib/pantopoda.rb, line 102
#
# Tells whether +url+ passes the "no fragment" filter.
#
# url - anything responding to to_s.
#
# Returns true when @exclude_urls_with_hash is not set, or when the URL
# contains no '#' character; false otherwise.
def no_hash_in_url?(url)
  return true unless @exclude_urls_with_hash

  !url.to_s.include?('#')
end
parse_domain(url)
click to toggle source
# File lib/pantopoda.rb, line 70
#
# Extracts the host portion of +url+ using Domainatrix.
#
# url - the URL to inspect.
#
# Returns "subdomain.domain.suffix" when Domainatrix reports a non-empty
# subdomain, "domain.suffix" otherwise, and nil when parsing raises
# NoMethodError or Addressable::URI::InvalidURIError.
# Progress and failures are printed to stdout when @debug is set.
def parse_domain(url)
  puts "Parsing URL: #{url}" if @debug

  begin
    host_parts = Domainatrix.parse(url)

    if host_parts.subdomain == ""
      host_parts.domain + '.' + host_parts.public_suffix
    else
      host_parts.subdomain + '.' + host_parts.domain + '.' + host_parts.public_suffix
    end
  rescue NoMethodError, Addressable::URI::InvalidURIError => error
    puts "URL Parsing Exception (#{url}) : #{error}" if @debug
    nil
  end
end
sanitize_link(url)
click to toggle source
# File lib/pantopoda.rb, line 128
#
# Replaces every run of whitespace in +url+ with "%20".
#
# url - the link text; expected to be a String.
#
# Returns the rewritten String, or false when +url+ is not string-like
# (e.g. nil) or its bytes cannot be matched against the pattern.
def sanitize_link(url)
  # Check explicitly instead of relying on a blanket rescue to catch the
  # NoMethodError a nil argument would cause.
  return false unless url.respond_to?(:gsub)

  url.gsub(/\s+/, "%20")
rescue ArgumentError, EncodingError
  # Strings with invalid or incompatible encodings cannot be scanned; keep
  # the original best-effort contract of answering false for them.
  false
end
split_url_at_hash(url)
click to toggle source
# File lib/pantopoda.rb, line 97
#
# Drops the fragment ("#..." part) from +url+ when @split_url_at_hash is set.
#
# url - anything responding to to_s.
#
# Returns a String in every case: the whole URL when the option is off,
# otherwise everything before the first '#'. A URL that is empty or starts
# with '#' yields "" (the old split('#')[0] form returned nil for "" and "#").
def split_url_at_hash(url)
  text = url.to_s
  return text unless @split_url_at_hash

  text.partition('#').first
end