class SuperCrawler::Crawl
Crawl a whole website. For each new link detected, scrape the corresponding page.
Attributes
crawl_results[R]
The crawl results (links and assets), as an array of hashes.
links[R]
The list of all links found.
Public Class Methods
new(start_url, options = {})
# File lib/super_crawler/crawl.rb, line 16
def initialize start_url, options = {}
  @start_url = URI(URI.encode start_url).normalize().to_s # Normalize the given URL
  @links = [@start_url] # Will contain the list of all links found
  @crawl_results = []   # Will contain the crawl results (links and assets), as array of hashes
  @option_debug = options[:debug].nil? ? true : !!(options[:debug]) # Debug by default
end
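A minimal usage sketch (the URL is hypothetical):

  crawler = SuperCrawler::Crawl.new('https://example.com', debug: false)
  crawler.links          #=> ['https://example.com/'] (the normalized start URL)
  crawler.crawl_results  #=> [] (empty until start runs)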
Public Instance Methods
get_assets(asset)
Get assets of a specific type (images, stylesheets, or scripts) across all crawled pages.
# File lib/super_crawler/crawl.rb, line 72
def get_assets asset
  return [] if @crawl_results.empty? # No crawling yet? Return empty search

  # The asset parameter can only be images, stylesheets or scripts
  unless %w(images stylesheets scripts).include? asset.to_s
    # Display error message in this case.
    SuperCrawler::Render.error "`asset` parameter can only be `images`, `stylesheets` or `scripts`"
    return [] # Return empty array
  end

  # Good! Return flatten array of unique assets
  return @crawl_results.map{ |cr| cr[:assets][asset.to_sym] }.flatten.uniq
end
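For example, after a crawl has completed (a sketch, reusing the hypothetical crawler above):

  crawler.get_assets(:images)   # flattened, de-duplicated image URLs from every crawled page
  crawler.get_assets('scripts') # a string works too, since the parameter is compared via asset.to_s
  crawler.get_assets(:fonts)    # unsupported type: prints an error and returns []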
render(max_pages = 10)
Render the crawling result as a sitemap in the console.
# File lib/super_crawler/crawl.rb, line 65
def render max_pages = 10
  SuperCrawler::Render.console( @crawl_results, max_pages )
end
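For example (a sketch; assumes a crawl has already run):

  crawler.render      # sitemap limited to 10 pages (the default)
  crawler.render(50)  # sitemap limited to 50 pages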
start(threads_count = 10)
Start crawling the site. This could take a while! Uses threads to speed up crawling, and logs progress to inform the user.
# File lib/super_crawler/crawl.rb, line 28
def start threads_count = 10
  SuperCrawler::Render.crawling_start_notice( @start_url, threads_count ) if @option_debug # Show message on what will happen

  threads = []             # Will contain our n-threads
  @links_queue = Queue.new # Will contain the links queue that the threads will use
  @links = [@start_url]    # Re-init the links list
  @crawl_results = []      # Re-init the crawling results

  start_time = Time.now if @option_debug # Start the timer

  # Let's populate our queue with links and resources from source url
  process_page( @start_url )

  # Create threads to handle new links
  threads_count.times do # Create threads_count threads
    threads << Thread.new do # Instantiate a new threads
      begin
        while current_link = @links_queue.pop(true) # Pop one link after another
          process_page( current_link ) # Get links and assets of the popped link
        end
      rescue ThreadError # Stop when empty links queue
      end
    end
  end

  threads.map(&:join) # Activate the threads

  SuperCrawler::Render.crawling_summary_notice(Time.now - start_time, threads_count, @links.count) if @option_debug # Display crawling summary

  return true
end
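Note the non-blocking pop: Queue#pop(true) raises ThreadError when the queue is empty, and each thread rescues that error as its signal to stop. Typical calls (a sketch, reusing the hypothetical crawler):

  crawler.start      # crawl with the default 10 threads
  crawler.start(20)  # crawl with 20 threads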
Private Instance Methods
process_page(page_url)
Process a page: scrape its links and assets, then update the links queue, the links list, and the crawl results.
# File lib/super_crawler/crawl.rb, line 92
def process_page page_url
  page = SuperCrawler::Scrap.new(page_url) # Scrap the current page

  current_page_links = page.get_links      # Get current page internal links
  new_links = current_page_links - @links  # Select new links

  new_links.each { |link| @links_queue.push(link) } # Add new links to the queue
  @links += new_links # Add new links to the links list

  @crawl_results << {           # Provide current page crawl result as a hash
    url: page.url,              # The crawled page
    links: current_page_links,  # Its internal links
    assets: page.get_assets     # Its assets
  }

  SuperCrawler::Render.log_status( page_url, @crawl_results.length, @links.length ) if @option_debug # Display site crawling status
end
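Each hash appended to @crawl_results has this shape (the values are hypothetical; the assets keys are inferred from get_assets above):

  {
    url:    'https://example.com/about',
    links:  ['https://example.com/', 'https://example.com/contact'],
    assets: {
      images:      ['https://example.com/logo.png'],
      stylesheets: ['https://example.com/app.css'],
      scripts:     ['https://example.com/app.js']
    }
  }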