module Arachni::Framework::Parts::Data

Provides access to {Arachni::Data::Framework} and helpers.

@author Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>

Public Instance Methods

data() click to toggle source

@return [Data::Framework]

# File lib/arachni/framework/parts/data.rb, line 19
# Shortcut to the framework's data store.
#
# @return   [Data::Framework]
#   Whatever `Arachni::Data.framework` returns.
def data
    Arachni::Data.framework
end
page_queue_total_size() click to toggle source

@return [Integer]

Total number of pages added to the {#push_to_page_queue page audit queue}.
# File lib/arachni/framework/parts/data.rb, line 74
# Delegates to {#data}.
#
# @return   [Integer]
#   The `page_queue_total_size` reported by the data store.
def page_queue_total_size
    data.page_queue_total_size
end
push_to_page_queue( page, force = false ) click to toggle source

@param [Page] page

Page to push to the page audit queue -- increases {#page_queue_total_size}

@return [Bool]

`true` if the push was successful; `false` if (without `force`) no more
pages are accepted, the `page` has already been seen, or the `page` is out of scope or redundant.
# File lib/arachni/framework/parts/data.rb, line 29
# Pushes `page` to the page audit queue and marks it as seen.
#
# @param    [Page]  page
#   Page to push.
# @param    [Bool]  force
#   When `true`, none of the acceptance checks below are performed.
#
# @return   [Bool]
#   `false` if the push was refused by one of the checks, `true` otherwise.
def push_to_page_queue( page, force = false )
    unless force
        # Checks run in this order and stop at the first refusal.
        return false unless accepts_more_pages?
        return false if state.page_seen?( page )
        return false if page.scope.out?
        return false if page.scope.redundant?( true )
    end

    # Feed the element filter from the page's already loaded cache (if any)
    # before dropping that cache: the page has to be stored anyway (it still
    # needs to go through Browser analysis) and parsing its elements just for
    # this isn't worth the resources.
    #
    # The point is to give the Browser and Trainer a better view of which
    # elements have already been seen, so that they don't hand us pages whose
    # elements only look new to them. Missing some wouldn't be the end of
    # the world.
    ElementFilter.update_from_page_cache( page )
    page.clear_cache

    data.push_to_page_queue( page )
    state.page_seen( page )

    true
end
push_to_url_queue( url, force = false ) click to toggle source

@param [String] url

URL to push to the audit queue -- increases {#url_queue_total_size}

@return [Bool]

`true` if push was successful, `false` if the `url` matched any
exclusion criteria or has already been seen.
# File lib/arachni/framework/parts/data.rb, line 58
# Pushes `url` to the URL audit queue and marks it as seen.
#
# The URL is first normalized via `to_absolute`; if that yields nothing the
# URL is used as given.
#
# @param    [String]    url
#   URL to push.
# @param    [Bool]  force
#   When `true`, the `accepts_more_pages?` check is skipped. The seen,
#   skip and redundancy checks still apply.
#
# @return   [Bool]
#   `true` if the URL was queued, `false` otherwise.
def push_to_url_queue( url, force = false )
    # Return an explicit `false` (a bare `return` would yield `nil`) so that
    # the result is always a boolean, as documented.
    return false if !force && !accepts_more_pages?

    url = to_absolute( url ) || url
    if state.url_seen?( url ) || skip_path?( url ) || redundant_path?( url, true )
        return false
    end

    data.push_to_url_queue url
    state.url_seen url

    true
end
sitemap() click to toggle source

@return [Hash<String, Integer>]

List of crawled URLs with their HTTP codes.
# File lib/arachni/framework/parts/data.rb, line 86
# Delegates to {#data}.
#
# @return   [Hash<String, Integer>]
#   The `sitemap` held by the data store.
def sitemap
    data.sitemap
end
url_queue_total_size() click to toggle source

@return [Integer]

Total number of URLs added to the {#push_to_url_queue URL audit queue}.
# File lib/arachni/framework/parts/data.rb, line 80
# Delegates to {#data}.
#
# @return   [Integer]
#   The `url_queue_total_size` reported by the data store.
def url_queue_total_size
    data.url_queue_total_size
end

Private Instance Methods

add_to_sitemap( page ) click to toggle source
# File lib/arachni/framework/parts/data.rb, line 198
# Hands `page` over to the data store's `add_page_to_sitemap`.
#
# @param    [Page]  page
#
# @return   [Object]
#   Whatever the data store returns.
def add_to_sitemap( page )
    data.add_page_to_sitemap page
end
has_audit_workload?() click to toggle source
# File lib/arachni/framework/parts/data.rb, line 100
# @return   [Bool]
#   `true` if the URL queue or the page queue has at least one entry,
#   `false` if both are empty.
def has_audit_workload?
    !(url_queue.empty? && page_queue.empty?)
end
page_queue() click to toggle source
# File lib/arachni/framework/parts/data.rb, line 92
# Delegates to {#data}.
#
# @return   [Object]
#   The `page_queue` held by the data store.
def page_queue
    data.page_queue
end
pop_page() click to toggle source

@return [Page, nil]

A page if the queues aren't empty, `nil` otherwise.
# File lib/arachni/framework/parts/data.rb, line 106
# Tries the page queue first and only falls back to the URL queue when the
# page queue yielded nothing.
#
# @return   [Page, nil]
#   The popped page, or whatever {#pop_page_from_url_queue} returns.
def pop_page
    queued = pop_page_from_queue
    return queued if queued

    pop_page_from_url_queue
end
pop_page_from_queue() click to toggle source

@return [Page, nil]

A page if the queue wasn't empty, `nil` otherwise.
# File lib/arachni/framework/parts/data.rb, line 166
# Pops pages off the page queue until one is found that is not out of scope.
#
# Out-of-scope pages are discarded, since the scope may have changed after
# they were pushed.
#
# @return   [Page, nil]
#   The first in-scope page, or `nil` once the queue runs empty.
def pop_page_from_queue
    until page_queue.empty?
        candidate = page_queue.pop
        return candidate unless candidate.scope.out?
    end

    nil
end
pop_page_from_url_queue( &block ) click to toggle source

@return [Page, nil]

A page if the queue wasn't empty, `nil` otherwise.
# File lib/arachni/framework/parts/data.rb, line 112
# Pops the next URL that passes `skip_path?` off the URL queue and requests
# it via `Page.from_url`.
#
# Inside the response callback:
#
# * Every `Location` header value is pushed back via `push_to_url_queue`.
# * A page whose code is not `0` is accepted.
# * Otherwise the URL is re-queued, up to `AUDIT_PAGE_MAX_TRIES` retries,
#   after which it is recorded in `@failures`.
#
# @param    [Block] block
#   Optional; called with the accepted page, or with `nil` when the request
#   yielded code `0`. When given, `http.run` is NOT called from here.
#   NOTE(review): presumably the caller is then responsible for running the
#   HTTP client -- confirm against callers.
#
# @return   [Page, nil]
#   `nil` if the URL queue ran empty before a usable URL was found; otherwise
#   the value of `grabbed_page` at return time.
#   NOTE(review): when a block is given the callback may not have fired yet,
#   in which case this would still be `nil` -- depends on `Page.from_url`.
def pop_page_from_url_queue( &block )
    url = nil

    # Scope may have changed since the URL was pushed.
    loop do
        return if url_queue.empty?

        url = url_queue.pop
        break if !skip_path?( url )
    end

    grabbed_page = nil
    Page.from_url( url, http: {
           update_cookies: true,
           performer:      self
       }
    ) do |page|
        # Retry counters are keyed by the hash of the page URL.
        @retries[page.url.hash] ||= 0

        # Schedule redirection targets; `location` is flattened so that both a
        # single value and a list of values are handled.
        if (location = page.response.headers.location)
            [location].flatten.each do |l|
                print_info "Scheduled #{page.code} redirection: #{page.url} => #{l}"
                push_to_url_queue to_absolute( l, page.url )
            end
        end

        # NOTE(review): code 0 presumably means that no HTTP response was
        # received (see the error message below) -- confirm.
        if page.code != 0
            grabbed_page = page
            block.call grabbed_page if block_given?
            # Leave the callback, skipping the failure handling below.
            next
        end

        if @retries[page.url.hash] >= AUDIT_PAGE_MAX_TRIES
            @failures << page.url

            print_error "Giving up trying to audit: #{page.url}"
            print_error "Couldn't get a response after #{AUDIT_PAGE_MAX_TRIES}" +
                            " tries: #{page.response.return_message}."
        else
            print_bad "Retrying for: #{page.url} [#{page.response.return_message}]"
            @retries[page.url.hash] += 1
            # Put the URL straight back on the queue for another attempt.
            url_queue << page.url
        end

        # Failure path: the block (if any) is notified with `nil`.
        grabbed_page = nil
        block.call grabbed_page if block_given?
    end

    # Without a block the request is run right here, before returning.
    http.run if !block_given?
    grabbed_page
end
push_paths_from_page( page ) click to toggle source
# File lib/arachni/framework/parts/data.rb, line 206
# Pushes every path of `page` to the URL queue.
#
# @param    [Page]  page
#
# @return   [Array<String>]
#   The paths for which {#push_to_url_queue} returned a truthy value.
def push_paths_from_page( page )
    page.paths.select do |candidate|
        push_to_url_queue candidate
    end
end
replenish_page_queue_from_url_queue() click to toggle source
# File lib/arachni/framework/parts/data.rb, line 180
# Refills an empty page queue with pages fetched from the URL queue.
#
# Does nothing unless the page queue is empty.
#
# @return   [Bool, nil]
#   `nil` if the page queue wasn't empty or the URL queue ran dry while
#   replenishing; otherwise whether the URL queue still has entries.
def replenish_page_queue_from_url_queue
    return unless page_queue.empty?

    # The cap of 10 is an arbitrary pick: low enough to not add noticeable
    # stress, hopefully high enough to get at least one page with some
    # workload, whose HTTP requests will then mask the next replenishing
    # operation.
    batch_size = [10, page_queue.free_buffer_size].min

    batch_size.times do
        return if url_queue.empty?

        # Pages go straight into the queue, bypassing #push_to_page_queue,
        # because it's too early to deduplicate.
        pop_page_from_url_queue do |fetched|
            page_queue << fetched if fetched
        end
    end

    !url_queue.empty?
end
update_sitemap( entries ) click to toggle source
# File lib/arachni/framework/parts/data.rb, line 202
# Hands `entries` over to the data store's `update_sitemap`.
#
# @param    [Object]    entries
#   Passed through untouched.
#
# @return   [Object]
#   Whatever the data store returns.
def update_sitemap( entries )
    data.update_sitemap entries
end
url_queue() click to toggle source
# File lib/arachni/framework/parts/data.rb, line 96
# Delegates to {#data}.
#
# @return   [Object]
#   The `url_queue` held by the data store.
def url_queue
    data.url_queue
end