class BrowserCrawler::EngineUtilities::CrawlManager

The main orchestration class, which controls the queue of unvisited links and dispatches each one for page inspection.

Attributes

deep_visit[R]
host_name[R]
logger[R]
max_pages[R]
page_inspector[R]
report_store[R]
target_url[R]

Public Class Methods

new(report_store:, max_pages: 0, deep_visit: false, logger: nil)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 20
def initialize(report_store:,
               max_pages: 0,
               deep_visit: false,
               logger: nil)
  @report_store     = report_store
  @max_pages        = max_pages
  @deep_visit       = deep_visit
  @logger           = logger || Logger.new(STDOUT)
end
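
For example, a manager limited to 50 pages that logs to a file might be set up as follows. This is a minimal sketch: the report store class name is an assumption, so substitute whichever store object your version of the gem provides (any object exposing the recording methods used by the crawler will do).

  require 'logger'
  require 'browser_crawler'

  store   = BrowserCrawler::Reports::Store.new   # assumed store class name
  manager = BrowserCrawler::EngineUtilities::CrawlManager.new(
    report_store: store,
    max_pages: 50,        # 0 (the default) means no page limit
    deep_visit: false,    # stay on the target host; true also follows external links
    logger: Logger.new('crawler.log')
  )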

Public Instance Methods

crawl(target_url:, capybara_session:, screenshot_operator: nil)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 30
def crawl(target_url:, capybara_session:, screenshot_operator: nil)
  @host_name             = UrlTools.uri!(url: target_url).host
  @unvisited_links_queue = [target_url]

  loop do
    break if unvisited_links_queue.empty? || limit_reached?

    unvisited_link = unvisited_links_queue.shift

    link_inspector = LinkInspector.new(raw_link: unvisited_link,
                                       host_name: host_name)

    unless link_valid?(link_inspector)
      @logger.info("Skipped visited #{unvisited_link}")
      report_store.record_unrecognized_link(unvisited_link)
      next
    end

    inspect_page(link_inspector: link_inspector,
                 capybara_session: capybara_session,
                 screenshot_operator: screenshot_operator)
  end
end
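
A minimal usage sketch, assuming the manager and store from the constructor example above and a registered Capybara driver (the driver name here is only illustrative; screenshot_operator can be omitted, since it defaults to nil):

  require 'capybara'

  session = Capybara::Session.new(:selenium_chrome_headless)   # any registered driver

  manager.crawl(
    target_url: 'https://example.com',
    capybara_session: session
  )

  store.visited_pages   # pages recorded on the report store during the crawl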

Private Instance Methods

error_handler(link:, error:)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 91
def error_handler(link:, error:)
  error_link = "visiting link - #{link};\n"
  error_message = "error message: #{error.message};\n"
  error_backtrace = "error backtrace: #{error.backtrace.join("\n")};\n"
  logger.error("Error: #{error_link} #{error_message} #{error_backtrace}")
  report_store.record_crawler_error(link: link, error: error)
end

inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 62
def inspect_page(link_inspector:, capybara_session:, screenshot_operator:)
  InspectPageProcess.new(link_inspector: link_inspector,
                         capybara_session: capybara_session,
                         screenshot_operator: screenshot_operator,
                         report_store: report_store,
                         logger: logger)
                    .call(unvisited_links_queue: unvisited_links_queue)
rescue StandardError => error
  error_handler(link: link_inspector.raw_link, error: error)
end

internal_resource?(link_inspector)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 73
def internal_resource?(link_inspector)
  link_inspector.internal_url? || deep_visit
end
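
Note: internal_resource? accepts every link when deep_visit is enabled; otherwise it accepts only links that the LinkInspector reports as internal to host_name.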

limit_reached?()
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 81
def limit_reached?
  return false if max_pages.zero?

  visited_pages.count >= max_pages
end
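
Note: a max_pages of 0 (the default) disables the limit, so the crawl stops only once the queue of unvisited links is exhausted.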

page_unvisited?(link_inspector)
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 77
def page_unvisited?(link_inspector)
  !visited_pages.include?(link_inspector.full_url)
end

visited_pages()
# File lib/browser_crawler/engine_utilities/crawl_manager.rb, line 87
def visited_pages
  report_store.visited_pages
end