class BrowserCrawler::Engine
Constants
- CUPRITE_OPTIONS
- REPORT_SAVE_FOLDER_PATH
- SCREENSHOT_OPERATOR_OPTIONS
Attributes
crawl_manager[R]
logger[R]
report_store[R]
screenshot_operator[R]
Public Class Methods
new(browser_options: {}, screenshots_options: {}, max_pages: nil, deep_visit: false, logger: nil)
click to toggle source
# File lib/browser_crawler/engine.rb, line 44
# Builds an engine: creates the screenshot operator, picks a logger,
# registers the Capybara driver and prepares the report store and the
# crawl manager.
#
# @param browser_options [Hash] overrides merged on top of CUPRITE_OPTIONS
# @param screenshots_options [Hash] overrides merged on top of
#   SCREENSHOT_OPERATOR_OPTIONS
# @param max_pages [Integer, nil] forwarded to initialize_crawl_manager
# @param deep_visit [Boolean] forwarded to initialize_crawl_manager
# @param logger [Logger, nil] custom logger; a STDOUT logger is created
#   when none is given
def initialize(browser_options: {}, screenshots_options: {}, max_pages: nil, deep_visit: false, logger: nil)
  operator_settings = SCREENSHOT_OPERATOR_OPTIONS.merge(screenshots_options)
  @screenshot_operator = ScreenshotOperator.new(**operator_settings)

  driver_settings = CUPRITE_OPTIONS.merge(browser_options)

  @logger = logger
  @logger ||= Logger.new(STDOUT)

  # The report store reads screenshot_operator, and the crawl manager
  # reads report_store and @logger, so this order matters.
  register_chrome_driver(driver_settings)
  initialize_report_store(driver_settings)
  initialize_crawl_manager(max_pages, deep_visit)
end
Public Instance Methods
after(type: :all, &hook)
click to toggle source
# File lib/browser_crawler/engine.rb, line 99
# Adds the given block to the shared HooksContainer as an :after hook.
#
# @param type [Symbol] hook type passed through to the container
#   (defaults to :all)
# @return whatever HooksContainer#add_hook returns
def after(type: :all, &hook)
  hooks = HooksContainer.instance
  hooks.add_hook(method: :after, type: type, hook: hook)
end
before(type: :all, &hook)
click to toggle source
# File lib/browser_crawler/engine.rb, line 95
# Adds the given block to the shared HooksContainer as a :before hook.
#
# @param type [Symbol] hook type passed through to the container
#   (defaults to :all)
# @return whatever HooksContainer#add_hook returns
def before(type: :all, &hook)
  hooks = HooksContainer.instance
  hooks.add_hook(method: :before, type: type, hook: hook)
end
change_page_scan_rules(&hook)
click to toggle source
# File lib/browser_crawler/engine.rb, line 107
# Adds the given block to the shared HooksContainer under the
# :scan_rules hook type.
#
# @return whatever HooksContainer#add_hook returns
def change_page_scan_rules(&hook)
  hooks = HooksContainer.instance
  hooks.add_hook(type: :scan_rules, hook: hook)
end
extract_links(url:)
click to toggle source
# File lib/browser_crawler/engine.rb, line 68
# Crawls +url+ and returns the engine itself so calls can be chained.
#
# The crawler is prepared first (outside the rescue, so errors raised by
# initialize_crawler propagate to the caller). Any StandardError raised
# while crawling is logged at fatal level and swallowed. The report store
# is always finished, whether or not the crawl failed.
#
# @param url [String] address the crawl starts from
# @return [self]
def extract_links(url:)
  initialize_crawler(url)

  begin
    with_hooks_for(type: :all) do
      session = Capybara.current_session
      crawl_manager.crawl(target_url: url,
                          capybara_session: session,
                          screenshot_operator: screenshot_operator)
    end
  rescue StandardError => e
    trace = e.backtrace.join("\n")
    logger.fatal("#{e.message} \n #{trace}")
  ensure
    @report_store.finish
  end

  self
end
js_before_run(javascript: '')
click to toggle source
# File lib/browser_crawler/engine.rb, line 62
# Remembers a JavaScript snippet in @javascript_before_run; initialize_crawler
# later reads that variable and, when it is set, sends it to the browser.
#
# A nil or empty snippet is ignored and leaves any previously stored
# script untouched.
#
# @param javascript [String, nil] source code to remember
# @return [String, nil] the stored snippet, or nil when nothing was stored
def js_before_run(javascript: '')
  # nil is treated like "no script" instead of raising NoMethodError.
  return if javascript.nil? || javascript.empty?

  @javascript_before_run = javascript
end
report_save(folder_path: '', type: :yaml)
click to toggle source
# File lib/browser_crawler/engine.rb, line 88
# Saves the collected report through ReportFactory.
#
# @param folder_path [String, Pathname, nil] destination folder; when nil
#   or empty, REPORT_SAVE_FOLDER_PATH is used instead
# @param type [Symbol, String] report format, converted with #to_sym
#   before being handed to ReportFactory
# @return whatever ReportFactory.save returns
def report_save(folder_path: '', type: :yaml)
  # Compare the string form so nil falls back to the default, and a
  # Pathname is judged by its text rather than by Pathname#empty?, which
  # inspects the filesystem.
  no_folder_given = folder_path.to_s.empty?
  save_folder_path = no_folder_given ? REPORT_SAVE_FOLDER_PATH : folder_path

  ReportFactory.save(store: @report_store,
                     type: type.to_sym,
                     save_folder_path: save_folder_path)
end
unvisited_links(&hook)
click to toggle source
# File lib/browser_crawler/engine.rb, line 103
# Adds the given block to the shared HooksContainer under the
# :unvisited_links hook type.
#
# @return whatever HooksContainer#add_hook returns
def unvisited_links(&hook)
  hooks = HooksContainer.instance
  hooks.add_hook(type: :unvisited_links, hook: hook)
end
Private Instance Methods
initialize_crawl_manager(max_pages, deep_visit)
click to toggle source
# File lib/browser_crawler/engine.rb, line 147
# Creates the crawl manager and stores it in @crawl_manager.
#
# @param max_pages [#to_i] page limit; converted with #to_i, so nil
#   becomes 0
# @param deep_visit [Boolean] passed to the manager unchanged
def initialize_crawl_manager(max_pages, deep_visit)
  page_limit = max_pages.to_i

  @crawl_manager = EngineUtilities::CrawlManager.new(report_store: report_store,
                                                     max_pages: page_limit,
                                                     deep_visit: deep_visit,
                                                     logger: @logger)
end
initialize_crawler(url)
click to toggle source
# File lib/browser_crawler/engine.rb, line 113
# Prepares Capybara and the report store for a crawl of +url+.
#
# Quits the current Capybara session, points Capybara.app_host at the
# scheme, host and port of +url+, and starts the report store. When a
# script was stored in @javascript_before_run, it is sent to the browser
# page with the 'Page.addScriptToEvaluateOnNewDocument' command.
#
# @param url [String] address the crawl starts from
def initialize_crawler(url)
  Capybara.current_session.quit

  target = UrlTools.uri!(url: url)
  Capybara.app_host = "#{target.scheme}://#{target.host}:#{target.port}"

  @report_store.start(url: url)

  unless @javascript_before_run.nil?
    page = Capybara.current_session.driver.browser.page
    page.command('Page.addScriptToEvaluateOnNewDocument', source: @javascript_before_run)
  end
end
initialize_report_store(cuprite_options)
click to toggle source
# File lib/browser_crawler/engine.rb, line 131
# Creates the report store and records the screenshots folder and the
# browser window dimensions in its metadata.
#
# @param cuprite_options [Hash] driver options; :window_size is indexed
#   at 0 (width) and 1 (height), so it must be present
def initialize_report_store(cuprite_options)
  @report_store = Reports::Store.new

  screenshots_folder = screenshot_operator.screenshots_folder
  window_size = cuprite_options[:window_size]

  @report_store.metadata[:screenshots_path] = screenshots_folder
  @report_store.metadata[:window_width] = window_size[0]
  @report_store.metadata[:window_height] = window_size[1]
end
register_chrome_driver(cuprite_options)
click to toggle source
# File lib/browser_crawler/engine.rb, line 139
# Registers the :cuprite_chrome driver with Capybara and makes it the
# default driver.
#
# @param cuprite_options [Hash] options handed to the driver registration
def register_chrome_driver(cuprite_options)
  driver_name = :cuprite_chrome

  Capybara.register_chrome_driver(driver_name, options: cuprite_options)
  Capybara.run_server = false
  Capybara.default_driver = driver_name

  # Workaround: allows extracting data from inactive tabs, dialogs, etc.
  Capybara.ignore_hidden_elements = false
end