module Arachnid2::Exoskeleton
Public Instance Methods
bound_time()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 62
#
# Computes the point in time at which the crawl should stop.
#
# The `:time_box` option is stringified and parsed with `to_i`, so `nil` or
# non-numeric values come out as 0. Values at or below zero are replaced by
# BASE_CRAWL_TIME, and anything above MAX_CRAWL_TIME is capped at that
# maximum. The surrounding documentation lists `time_limit` as an alias.
#
# @return [Time] the current time plus the bounded number of seconds
def bound_time
  requested = @options[:time_box].to_s.to_i
  seconds = requested <= 0 ? BASE_CRAWL_TIME : requested
  seconds = seconds > MAX_CRAWL_TIME ? MAX_CRAWL_TIME : seconds

  Time.now + seconds
end
Also aliased as: time_limit
bound_urls()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 70
#
# Computes the bounded number of URLs for the crawl.
#
# The `:max_urls` option is stringified and parsed with `to_i`, so `nil` or
# non-numeric values come out as 0. Values at or below zero are replaced by
# BASE_URLS, and anything above MAX_URLS is capped at that maximum.
# The surrounding documentation lists `max_urls` as an alias.
#
# @return [Integer] the bounded URL count
def bound_urls
  requested = @options[:max_urls].to_s.to_i
  count = requested <= 0 ? BASE_URLS : requested

  count > MAX_URLS ? MAX_URLS : count
end
Also aliased as: max_urls
browser_type()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 3
#
# Returns the browser type as a symbol, memoized in @browser_type.
#
# When the `:browser_type` option is set, its string form is converted to a
# symbol; otherwise the result is `:firefox`.
#
# @return [Symbol] the configured browser type, or :firefox
def browser_type
  return @browser_type if @browser_type

  requested = @options[:browser_type]
  @browser_type = requested ? requested.to_s.to_sym : :firefox
end
crawl_options()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 88
#
# Builds the crawl limits once and caches them in @crawl_options.
#
# @return [Hash] a hash with `:max_urls` and `:time_limit` keys, filled from
#   the `max_urls` and `time_limit` methods
def crawl_options
  return @crawl_options if @crawl_options

  @crawl_options = {
    max_urls: max_urls,
    time_limit: time_limit
  }
end
extension_ignored?(url)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 104
#
# Tells whether a URL ends with one of the configured non-HTML extensions.
#
# The comparison is case-insensitive on both sides. Extensions come from the
# values of `non_html_extensions`, flattened into a single list.
#
# @param url [String, nil] the URL to inspect
# @return [Boolean] true when the URL ends with a listed extension; false
#   for a nil or empty URL
def extension_ignored?(url)
  return false if url.nil? || url.empty?

  # Downcase once instead of once per candidate extension.
  lowered_url = url.downcase
  non_html_extensions.values.flatten.any? do |extension|
    lowered_url.end_with?(extension.downcase)
  end
end
extract_hrefs(body)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 18
#
# Collects the `href` values of all `<a>` elements in an HTML document.
#
# @param body [String] the HTML to parse with Nokogiri
# @return [Array<String>] unique, sorted href strings with empty values
#   removed
def extract_hrefs(body)
  anchors = Nokogiri::HTML.parse(body).css('a')
  hrefs = anchors.map { |anchor| anchor.attribute('href').to_s }

  hrefs.uniq.sort.reject(&:empty?)
end
in_docker?()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 121
#
# Checks whether MEMORY_USE_FILE exists as a regular file.
# NOTE(review): the method name suggests this is used as a "running inside
# Docker" probe — confirm against the constant's definition.
#
# @return [Boolean] true when MEMORY_USE_FILE is a regular file
def in_docker?
  FileTest.file?(MEMORY_USE_FILE)
end
internal_link?(absolute_url)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 100
#
# Tells whether the domain of a URL contains the crawl's @domain.
#
# The result of `Adomain[]` is converted to a string first, so a nil result
# is treated as an empty domain. The check is a substring match.
#
# @param absolute_url [String] the absolute URL to test
# @return [Boolean] true when the URL's domain string includes @domain
def internal_link?(absolute_url)
  link_domain = Adomain[absolute_url].to_s
  link_domain.include?(@domain)
end
make_absolute(href, root)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 96
#
# Resolves an href against a root URL using Addressable.
#
# @param href [String] the link to resolve
# @param root [String] the URL the link is joined onto
# @return [String] the joined URI as a string
def make_absolute(href, root)
  base_uri = Addressable::URI.parse(root)
  link_uri = Addressable::URI.parse(href)

  base_uri.join(link_uri).to_s
end
maximum_load_rate()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 125
#
# Returns the memory load threshold, memoized in @maximum_load_rate.
#
# The `:memory_limit` option is stringified and parsed with `to_f`. The
# parsed value is used only when it lies strictly between 0.0 and 100.0;
# otherwise DEFAULT_MAXIMUM_LOAD_RATE applies.
#
# @return [Float] the threshold that `memory_danger?` compares against
def maximum_load_rate
  @maximum_load_rate ||= begin
    requested = @options[:memory_limit].to_s.to_f
    within_bounds = requested > 0.0 && requested < 100.0
    within_bounds ? requested : DEFAULT_MAXIMUM_LOAD_RATE
  end
end
memory_danger?()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 110
#
# Reports whether memory use has reached the configured load threshold.
#
# Reads the current usage from MEMORY_USE_FILE and the limit from
# MEMORY_LIMIT_FILE, parsing both with `to_f`. The limit is read once and
# cached in @limit; usage is re-read on every call. Files are read with
# `File.binread`, which closes the handle once the read completes.
#
# @return [Boolean] false when `in_docker?` is false or when either figure
#   is not positive; otherwise whether usage as a percentage of the limit is
#   at or above `maximum_load_rate`
def memory_danger?
  return false unless in_docker?

  use = File.binread(MEMORY_USE_FILE).to_f
  @limit ||= File.binread(MEMORY_LIMIT_FILE).to_f
  return false unless use > 0.0 && @limit > 0.0

  (use / @limit) * 100.0 >= maximum_load_rate
end
non_html_extensions()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 55
#
# Returns the extensions to treat as non-HTML, memoized in
# @non_html_extensions.
#
# Uses the `:non_html_extensions` option when it is set and falls back to
# DEFAULT_NON_HTML_EXTENSIONS otherwise.
#
# @return [Object] the configured value or the default;
#   `extension_ignored?` calls `values` on it, so it is presumably a Hash
def non_html_extensions
  @non_html_extensions ||=
    @options[:non_html_extensions] || DEFAULT_NON_HTML_EXTENSIONS
end
preflight(opts)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 45
#
# Sets up the crawl state: stores the options, creates a fresh visited-URL
# bloom filter, and seeds the queue with @url.
#
# @param opts [Hash] crawl options, stored in @options
# @return [Array] the new queue, containing only @url
def preflight(opts)
  @options = opts
  @global_visited = BloomFilter::Native.new(
    size: 1000000,
    hashes: 5,
    seed: 1,
    bucket: 8,
    raise: true
  )
  @global_queue = [@url]
end
process(url, html)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 12
#
# Extracts the links of a fetched page, provided the page's domain includes
# the crawl's @domain.
#
# @param url [#to_s] the URL the HTML was fetched from
# @param html [String] the page body
# @return [Array<String>, false] the result of `extract_hrefs`, or false
#   when `Adomain[]` yields nil or a domain that does not include @domain
def process(url, html)
  page_domain = Adomain[url.to_s]
  return false unless page_domain&.include?(@domain)

  extract_hrefs(html)
end
proxy()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 51
#
# Returns the value stored under the `:proxy` key of the crawl options.
#
# @return [Object, nil] whatever was stored under `:proxy`
def proxy
  settings = @options
  settings[:proxy]
end
skip_link?(absolute_link)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 38
#
# Decides whether a link should be left out of the crawl queue.
#
# A link is skipped when it is not internal, is already in the visited
# filter, has an ignored extension, or is already queued. The checks run in
# that order and stop at the first one that applies.
#
# @param absolute_link [String] the absolute URL to evaluate
# @return [Boolean] true when the link should be skipped
def skip_link?(absolute_link)
  return true unless internal_link?(absolute_link)

  @global_visited.include?(absolute_link) ||
    extension_ignored?(absolute_link) ||
    @global_queue.include?(absolute_link)
end
timeout()
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 78
#
# Returns the timeout, memoized in @timeout.
#
# The `:timeout` option is used only when it is an Integer between
# MINIMUM_TIMEOUT and MAXIMUM_TIMEOUT inclusive; any other value yields
# DEFAULT_TIMEOUT.
#
# @return [Integer] the validated timeout or DEFAULT_TIMEOUT
def timeout
  @timeout ||= begin
    requested = @options[:timeout]
    usable = requested.is_a?(Integer) &&
             requested >= MINIMUM_TIMEOUT &&
             requested <= MAXIMUM_TIMEOUT
    usable ? requested : DEFAULT_TIMEOUT
  end
end
vacuum(links, url)
click to toggle source
# File lib/arachnid2/exoskeleton.rb, line 23
#
# Adds the crawlable links of a page to the queue.
#
# Links starting with "(", "javascript:", "mailto:", "#" or "about:", and
# links that are blank, are ignored. Each remaining link is made absolute
# against `url` and appended to @global_queue unless `skip_link?` rejects it.
#
# @param links [Array<String>] hrefs found on the page
# @param url [String] the page URL the hrefs are resolved against
# @return [Array<String>] the `links` argument, as returned by `each`
def vacuum(links, url)
  links.each do |href|
    next if href.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)

    begin
      candidate = make_absolute(href, url)
      @global_queue << candidate unless skip_link?(candidate)
    rescue Addressable::URI::InvalidURIError
      # A link that cannot be parsed as a URI is dropped without failing
      # the rest of the batch.
    end
  end
end