class Arachni::Page

It holds page data like elements, cookies, headers, etc…

@author Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>

Constants

ELEMENTS
METADATA

Attributes

cache[R]

@return [Hash]

@private

dom[RW]

@return [DOM]

DOM snapshot.
element_audit_whitelist[R]

@return [Set<Integer>]

Audit whitelist based on {Element::Capabilities::Auditable#coverage_hash}.

@see update_element_audit_whitelist @see audit_element? @see Check::Auditor#skip?

metadata[R]

@return [Hash]

Holds page data that will need to persist between {#clear_cache} calls
and other utility data.
response[R]

@return [HTTP::Response]

HTTP response.

Public Class Methods

_load( data ) click to toggle source
# File lib/arachni/page.rb, line 605
# Marshal hook: rebuilds a page from the string produced by {#_dump}.
#
# @param [String] data
#   Marshalled initialization options.
#
# @return [Page]
def self._load( data )
    init_options = Marshal.load( data )
    new( init_options )
end
from_data( data ) click to toggle source

@option options [String] :url

URL of the page.

@option options [String] :body

Body of the page.

@option options [Array<Link>] :links

{Link} elements.

@option options [Array<Form>] :forms

{Form} elements.

@option options [Array<Cookie>] :cookies

{Cookie} elements.

@option options [Array<Header>] :headers

{Header} elements.

@option options [Array<Cookie>] :cookie_jar

{Cookie} elements with which to update the HTTP cookiejar before
auditing.

@option options [Array<String>] :paths

Paths contained in the page.

@option options [Array<String>] :request

{Request#initialize} options.
# File lib/arachni/page.rb, line 82
# Builds a page from user-provided data, filling in defaults for the HTTP
# response (code `200`, empty body) and its request.
#
# Neither the given Hash nor a nested `:response` / `:request` Hash inside
# it is modified; the defaults are applied to copies.
#
# @param [Hash] data
#   Page data; `:url` and `:body` are moved into the response options when
#   the response options do not already provide them.
#
# @return [Page]
def self.from_data( data )
    data = data.dup

    # Copy the nested option hashes too, the top-level #dup is shallow and
    # the defaults below would otherwise be written into the caller's data.
    response = (data[:response] || {}).dup
    response[:code] ||= 200
    response[:url]  ||= data.delete( :url )
    response[:body] ||= data.delete( :body ) || ''

    request = (response[:request] || {}).dup
    request[:url] ||= response[:url]

    data[:cookie_jar] ||= []

    response[:request] = Arachni::HTTP::Request.new( request )
    data[:response]    = Arachni::HTTP::Response.new( response )

    new data
end
from_response( response ) click to toggle source

@param [HTTP::Response] response

HTTP response to parse.

@return [Page]

# File lib/arachni/page.rb, line 59
# Hands the response to a new {Parser} and returns that parser's page.
#
# @param [HTTP::Response] response
def self.from_response( response )
    Parser.new( response ).page
end
from_rpc_data( data ) click to toggle source

@param [Hash] data

{#to_rpc_data}

@return [Page]

# File lib/arachni/page.rb, line 573
# Restores a page from its RPC representation.
#
# The given Hash is left untouched; a shallow copy is used internally.
#
# @param [Hash] data
#   String-keyed data, as produced by {#to_rpc_data}.
#
# @return [Page]
def self.from_rpc_data( data )
    # Copy first: the 'dom' entry is removed below and must not disappear
    # from the caller's Hash.
    data = data.dup
    dom  = data.delete( 'dom' )

    element_types = ELEMENTS.map(&:to_s)

    normalized_data = {}
    data.each do |name, value|

        value = case name
                    when 'response'
                        HTTP::Response.from_rpc_data( value )

                    when *element_types
                        # The element type is the key without its last
                        # character, e.g. 'links' => :link.
                        value.map do |e|
                            Element.type_to_class( name[0...-1].to_sym ).from_rpc_data( e )
                        end

                    else
                        value
                end

        normalized_data[name.to_sym] = value
    end

    instance = new( normalized_data )

    # The DOM needs a reference to the page, so it is restored last and
    # assigned directly.
    instance.instance_variable_set(
        '@dom', DOM.from_rpc_data( dom.merge( page: instance ) )
    )
    instance
end
from_url( url, opts = {}, &block ) click to toggle source

@param [String] url

URL to fetch.

@param [Hash] opts @option opts [Integer] :precision (2)

How many times to request the page and examine changes between requests.
Used to identify nonce tokens etc.

@option opts [Hash] :http

HTTP {HTTP::Client#get request} options.

@param [Block] block

Block to which to pass the page object. If given, the request will be
performed asynchronously. If no block is given, the page will be fetched
synchronously and be returned by this method.

@return [Page]

# File lib/arachni/page.rb, line 37
# Requests the URL `:precision` times and builds a page from the collected
# responses.
#
# The `opts` Hash is only read, never modified.
#
# @param [String] url
# @param [Hash] opts
# @option opts [Integer] :precision (2)
#   How many times to request the page.
# @option opts [Hash] :http
#   Options passed to `HTTP::Client.get`.
# @param [Block] block
#   When given, it is called with the page once all responses have arrived
#   and `HTTP::Client.run` is not called by this method.
#
# @return [Page, nil]
#   The page when no block is given, `nil` otherwise.
def self.from_url( url, opts = {}, &block )
    responses = []

    # Kept in a local rather than written back, so the caller's options stay
    # as they were passed in.
    precision = opts[:precision] || 2

    precision.times do
        HTTP::Client.get( url, opts[:http] || {} ) do |res|
            responses << res
            next if responses.size != precision
            block.call( from_response( responses ) ) if block_given?
        end
    end

    return if block_given?

    HTTP::Client.run
    from_response( responses )
end
new( options ) click to toggle source

Needs either a `:parser` or a `:response` or user-provided data.

@param [Hash] options

Hash from which to set instance attributes.

@option options [Array<HTTP::Response>, HTTP::Response] :response

HTTP response of the page -- or array of responses for the page for
content refinement.

@option options [Parser] :parser

An instantiated {Parser}.
# File lib/arachni/page.rb, line 143
# Sets up the page from the given options.
#
# `:do_not_audit_elements` and `:parser` are handled explicitly; every other
# option is assigned through its `<name>=` writer.
#
# @param [Hash] options
#
# @raise [ArgumentError]
#   If `options` is empty or no URL could be determined.
def initialize( options )
    fail ArgumentError, 'Options cannot be empty.' if options.empty?
    # Shallow copy, so the deletions below don't affect the caller's Hash.
    options = options.dup

    @cache = {}

    @do_not_audit_elements = options.delete(:do_not_audit_elements)

    # A supplied parser is cached and also provides the response.
    @cache[:parser] = options.delete(:parser)
    @response = @cache[:parser].response if @cache[:parser]

    # We need to know whether or not the page has been dynamically updated
    # with elements, in order to optimize #dup and #hash operations.
    @has_custom_elements = Set.new

    @metadata ||= {}

    # Each remaining option goes through its writer, receiving a copy of the
    # value whenever the value can be duplicated.
    options.each do |k, v|
        send( "#{k}=", try_dup( v ) )
    end

    # Built from the raw `:dom` option and bound to this page; this
    # assignment happens after the writer loop above.
    @dom = DOM.new( (options[:dom] || {}).merge( page: self ) )

    fail ArgumentError, 'No URL given!' if !url

    Platform::Manager.fingerprint( self )

    # Whatever was provided (or nothing) ends up as a Set.
    @element_audit_whitelist ||= []
    @element_audit_whitelist   = Set.new( @element_audit_whitelist )
end

Public Instance Methods

==( other ) click to toggle source
# File lib/arachni/page.rb, line 467
# Equality is decided solely by comparing {#hash} values.
#
# @param [Object] other
def ==( other )
    hash == other.hash
end
_dump( _ ) click to toggle source
# File lib/arachni/page.rb, line 601
# Marshal hook: serializes the result of `to_initialization_options( false )`,
# i.e. the initialization options without the extra duplication step.
#
# @return [String]
def _dump( _ )
    Marshal.dump( to_initialization_options( false ) )
end
audit_element?( element ) click to toggle source

@param [Element::Capabilities::Auditable, Integer] element

Element or {Element::Capabilities::Auditable#coverage_hash}.

@return [Bool]

`true` if the element should be audited, `false` otherwise.

@see element_audit_whitelist @see Check::Auditor#skip?

# File lib/arachni/page.rb, line 229
# Checks whether the given element should be audited.
#
# @param [Element::Capabilities::Auditable, Integer] element
#   Element, or its coverage hash.
#
# @return [Bool]
#   `false` if auditing has been disabled via {#do_not_audit_elements},
#   `true` if the whitelist is empty, otherwise whether the whitelist
#   includes the element's coverage hash.
def audit_element?( element )
    # Explicit `false` (rather than a bare `return`) so that the method
    # always answers with a boolean.
    return false if @do_not_audit_elements
    return true  if @element_audit_whitelist.empty?

    coverage_hash = element.is_a?( Integer ) ? element : element.coverage_hash
    @element_audit_whitelist.include?( coverage_hash )
end
body() click to toggle source

@return [String]

HTTP response body.
# File lib/arachni/page.rb, line 269
# @return [String]
#   The explicitly set body if there is one, otherwise the (memoized) body
#   of the response -- or an empty string when neither is available.
def body
    if @body || @response
        @body ||= response.body
    else
        ''
    end
end
body=( string ) click to toggle source

@param [String] string

Page body.
# File lib/arachni/page.rb, line 276
# Replaces the page body.
#
# @param [String] string
#   New body; it is converted with `#to_s` and stored frozen.
def body=( string )
    # The memoized #has_script? answer and the caches were derived from the
    # previous body.
    @has_javascript = nil
    clear_cache

    # NOTE(review): String#to_s returns the receiver, so a String argument
    # is itself frozen here rather than a copy of it -- confirm intended.
    @body = string.to_s.freeze
end
clear_cache() click to toggle source

@note Will preserve caches for elements which have been externally modified.

@return [Page]

`self` with caches cleared.
# File lib/arachni/page.rb, line 352
# Empties the cache, except for the entries of element types that are listed
# in `@has_custom_elements`.
#
# @return [Page]
#   `self`
def clear_cache
    ELEMENTS.each do |type|
        next if @has_custom_elements.include?( type )

        cached = @cache[type]
        next if !cached

        # Detach the elements from this page before dropping them from the
        # cache, to make it easier on the GC.
        cached.each { |element| element.page = nil }
    end

    @cache.keep_if { |key, _| @has_custom_elements.include?( key ) }
    self
end
code() click to toggle source

@return [Integer]

HTTP response code, or `0` if no code has been set and there is no response.
# File lib/arachni/page.rb, line 256
# @return [Integer]
#   The explicitly set code if there is one, otherwise the (memoized) code
#   of the response -- or `0` when neither is available.
def code
    if @code || response
        @code ||= response.code
    else
        0
    end
end
do_not_audit_elements() click to toggle source

It forces {#audit_element?} to always return false.

# File lib/arachni/page.rb, line 238
# Sets the flag which {#audit_element?} checks before anything else.
def do_not_audit_elements
    @do_not_audit_elements = true
end
document() click to toggle source

@return [Arachni::Parser::Document]

Parsed {#body HTML} document.
# File lib/arachni/page.rb, line 342
# Parsed document, memoized in the cache under `:document`.
#
# Taken from the {#parser} when there is one, otherwise the {#body} is
# parsed directly.
def document
    return @cache[:document] if @cache[:document]

    @cache[:document] =
        if parser.nil?
            Arachni::Parser.parse( body )
        else
            parser.document
        end
end
dup() click to toggle source
# File lib/arachni/page.rb, line 475
# Creates a new instance of the same class from
# {#to_initialization_options}, called with its default argument.
def dup
    self.class.new to_initialization_options
end
elements() click to toggle source

@return [Array<Element::Base>]

All page elements.
# File lib/arachni/page.rb, line 320
# @return [Array]
#   The results of calling every method named in `ELEMENTS`, flattened into
#   a single list.
def elements
    per_type = []
    ELEMENTS.each { |type| per_type << send( type ) }
    per_type.flatten
end
elements_within_scope() click to toggle source

@return [Array<Element::Base>]

All page elements that are within the scope of the scan.
# File lib/arachni/page.rb, line 326
# @return [Array]
#   Elements whose `scope.in?` is true, limited to the element types for
#   which `Options.audit.element?` returns true.
def elements_within_scope
    in_scope = []

    ELEMENTS.each do |type|
        next unless Options.audit.element?( type )
        in_scope << send( type ).select { |element| element.scope.in? }
    end

    in_scope.flatten.compact
end
eql?( other ) click to toggle source
# File lib/arachni/page.rb, line 471
# Delegates to {#==}.
def eql?( other )
    self == other
end
has_elements?( *tags ) click to toggle source

@param [String, Symbol,Array<String, Symbol>] tags

Element tag names.

@return [Boolean]

`true` if the page contains any of the given elements, `false` otherwise.
# File lib/arachni/page.rb, line 412
# Checks whether the page contains any of the given HTML elements.
#
# @param [String, Symbol, Array<String, Symbol>] tags
#   Element tag names.
#
# @return [Boolean]
#   `true` if the document has a node for any of the tags, `false`
#   otherwise -- including when the page is not text.
def has_elements?( *tags )
    # Explicit `false` (rather than a bare `return`) so that the method
    # always answers with a boolean.
    return false if !text?

    tags.flatten.each do |tag|
        tag = tag.to_s

        # Check the raw body first and only consult the document when the
        # body reports having the tag.
        next if !body.has_html_tag?( tag )

        return false if !document
        return true  if document.nodes_by_name( tag ).any?
    end

    false
end
has_script?() click to toggle source

@return [Boolean]

`true` if the page contains client-side code, `false` otherwise.
# File lib/arachni/page.rb, line 384
# Checks (and memoizes) whether the page contains client-side code.
#
# @return [Boolean]
#   `true` if the lowercased body includes `<script`, `javascript:` or an
#   `<event>=` attribute for any of `Browser::Javascript.events`; `false`
#   otherwise, or when the page is not text or not `text/html`.
def has_script?
    return @has_javascript if !@has_javascript.nil?

    # #text? goes first: it returns false when there is no response, which
    # keeps the content-type lookup from being attempted on nil.
    if !text? || !response.headers.content_type.to_s.start_with?( 'text/html' )
        return @has_javascript = false
    end

    dbody = body.downcase

    # First check, quick and simple.
    if dbody.include?( '<script' ) || dbody.include?( 'javascript:' )
        return @has_javascript = true
    end

    # Check for event attributes, if there are any then there's JS to be
    # executed.
    @has_javascript = Browser::Javascript.events.flatten.any? do |event|
        dbody.include?( "#{event}=" )
    end
end
hash() click to toggle source
# File lib/arachni/page.rb, line 463
# @return [Integer]
#   Hash of the string returned by the private `digest` method.
def hash
    digest.hash
end
import_metadata( other, metas = METADATA ) click to toggle source
# File lib/arachni/page.rb, line 495
# Merges metadata of another page into this one and then re-applies the
# metadata to the cached elements.
#
# @param [Page] other
#   Page whose {#metadata} to import.
# @param [Array, Object] metas
#   Metadata name(s) to import; each is looked up by its `#to_s` form.
#
# @return [Page]
#   `self`
def import_metadata( other, metas = METADATA )
    [metas].flatten.each do |meta|
        meta = meta.to_s

        other.metadata.each do |element_type, data|
            @metadata[element_type] ||= {}
            @metadata[element_type][meta] ||= {}

            # The other page may hold no data for this particular meta under
            # this element type, in which case there is nothing to merge.
            @metadata[element_type][meta].merge!( data[meta] || {} )
        end
    end

    reload_metadata

    self
end
inspect()
Alias for: to_s
method( *args ) click to toggle source

@return [String]

The request method that returned the page
Calls superclass method
# File lib/arachni/page.rb, line 335
# Without arguments it returns the method of the request behind the
# response; when `args.any?` is true the call is forwarded to the
# superclass implementation.
def method( *args )
    if args.any?
        super( *args )
    else
        response.request.method
    end
end
parsed_url() click to toggle source

@return [Arachni::URI]

# File lib/arachni/page.rb, line 186
# Passes {#url} to `Arachni::URI()`; nothing is memoized in this method.
def parsed_url
    Arachni::URI( url )
end
parser() click to toggle source

@return [Parser]

# File lib/arachni/page.rb, line 191
# Parser for the page's response, created on first use and kept in the
# cache under `:parser`.
#
# @return [Parser, nil]
#   `nil` when the page has no response.
def parser
    return if !@response

    cached = @cache[:parser]
    return cached if cached

    fresh = @cache[:parser] = Parser.new( @response )

    # The page may have a browser-assigned body, set it as the one to parse.
    fresh.body = body
    fresh
end
parser=( p ) click to toggle source
# File lib/arachni/page.rb, line 202
# Stores the given parser in the cache under `:parser`.
def parser=( p )
    @cache[:parser] = p
end
paths() click to toggle source

@return [Array<String>]

Paths contained in this page.

@see Parser#paths

# File lib/arachni/page.rb, line 308
# @return [Array<String>]
#   Paths reported by the {#parser}, or an empty list when there is no
#   parser; memoized in the cache under `:paths`.
def paths
    return @cache[:paths] if @cache[:paths]

    @cache[:paths] =
        if parser
            parser.paths
        else
            []
        end
end
performer() click to toggle source

@return [Object]

Object which performed the {#request} which led to this page.
# File lib/arachni/page.rb, line 181
# Delegates to the `performer` of the {#request}.
def performer
    request.performer
end
persistent_hash() click to toggle source
# File lib/arachni/page.rb, line 459
# Returns `persistent_hash` of the private `digest` string.
#
# NOTE(review): `persistent_hash` is not a core String method; presumably a
# project extension -- confirm where it is defined.
def persistent_hash
    digest.persistent_hash
end
platforms() click to toggle source

@return [Platform]

Applicable platforms for the page.
# File lib/arachni/page.rb, line 314
# Looks up the entry of `Platform::Manager` for the page's {#url}.
def platforms
    Platform::Manager[url]
end
prepare_for_report() click to toggle source
# File lib/arachni/page.rb, line 364
# Strips data from the page: the whole cache, non-text bodies, the cookie
# jar's contents and the DOM's digest and skip-states.
#
# @return [Page]
#   `self`
def prepare_for_report
    # We want a hard clear, that's why we don't call #clear_cache.
    @cache.clear

    # If we're dealing with binary data remove it before storing.
    if !text?
        response.body = nil
        self.body     = nil
    end

    # Emptied in place, when there is one.
    @cookie_jar.clear if @cookie_jar

    @dom.digest      = nil
    @dom.skip_states = nil

    self
end
query_vars() click to toggle source

@return [Hash]

{#url URL} query parameters.
# File lib/arachni/page.rb, line 263
# Result of `uri_parse_query( url )`, memoized in the cache under
# `:query_vars`.
def query_vars
    @cache[:query_vars] ||= uri_parse_query( url )
end
reload_metadata() click to toggle source
# File lib/arachni/page.rb, line 487
# Passes every cached element, of every type in `ELEMENTS`, to
# `restore_from_metadata`. Types without a cache entry are skipped.
def reload_metadata
    ELEMENTS.each do |type|
        cached = @cache[type]
        cached.each { |element| restore_from_metadata( element ) } if cached
    end
end
request() click to toggle source

@return [HTTP::Request]

HTTP request.
# File lib/arachni/page.rb, line 244
# Delegates to the `request` of the {#response}.
def request
    response.request
end
scope() click to toggle source

@return [Scope]

# File lib/arachni/page.rb, line 175
# Scope of this page, created on first use and reused afterwards instead of
# being rebuilt on every call.
#
# @return [Scope]
def scope
    @scope ||= Scope.new( self )
end
text?() click to toggle source

@return [Boolean]

`true` if the body of the page is text-based, `false` otherwise.
# File lib/arachni/page.rb, line 429
# @return [Boolean]
#   `false` when there is no response, otherwise whatever the response's
#   own `text?` returns.
def text?
    response ? response.text? : false
end
title() click to toggle source

@return [String]

Title of the page.
# File lib/arachni/page.rb, line 436
# @return [String, nil]
#   Text of the first `title` node of the {#document}; `nil` if looking it
#   up raises a `StandardError` (e.g. because there is no such node).
def title
    begin
        document.nodes_by_name( 'title' ).first.text
    rescue StandardError
        nil
    end
end
to_h() click to toggle source

@return [Hash]

Converts the page data to a hash.
# File lib/arachni/page.rb, line 442
# Converts the page to a Hash: one entry per instance variable (minus a
# few internal ones), merged with the contents of the cache, without the
# `:parser` entry.
#
# @return [Hash]
def to_h
    skip = [:@document, :@do_not_audit_elements, :@has_custom_elements, :@scope]

    ivars = {}
    instance_variables.each do |iv|
        next if skip.include?( iv )

        key = iv.to_s.gsub( '@', '').to_sym
        ivars[key] = try_dup( instance_variable_get( iv ) )
    end

    merged = ivars.merge( @cache )
    merged.delete( :parser )
    merged
end
Also aliased as: to_hash
to_hash()
Alias for: to_h
to_initialization_options( deep = true ) click to toggle source
# File lib/arachni/page.rb, line 509
# Builds the options Hash that {#dup} passes to `new` and that {#_dump}
# serializes.
#
# @param [Bool] deep
#   When true, the cookie-jar, whitelist, metadata and DOM values are
#   passed through `try_dup` before being stored.
#
# @return [Hash]
def to_initialization_options( deep = true )
    h = {}
    # Only present when a body has been set or memoized on the page itself.
    h[:body] = @body if @body

    [:cookie_jar, :element_audit_whitelist, :metadata].each do |m|
        h[m] = instance_variable_get( "@#{m}".to_sym )

        if deep
            h[m] = try_dup( h[m] )
        end

        # Leave out entries that have no value.
        h.delete( m ) if !h[m]
    end

    ELEMENTS.each do |type|
        # Only element types listed in @has_custom_elements are included.
        next if !@has_custom_elements.include?( type )
        h[type] = @cache[type]

        if !h[type] || h[type].empty?
            h.delete( type )
            next
        end

        # Copies of the elements, without their reference to this page.
        h[type] = h[type].map { |e| c = e.dup; c.page = nil; c }
    end

    h[:response] = response
    h[:do_not_audit_elements] = @do_not_audit_elements

    # For every key of the DOM's Hash form, read the value through the DOM
    # method of the same name.
    h[:dom] = dom.to_h.keys.inject({}) do |dh, k|
        dh[k] = dom.send( k )

        if deep
            dh[k] = try_dup( dh[k] )
        end

        dh
    end

    h
end
to_rpc_data() click to toggle source

@return [Hash]

Data representing this instance that are suitable for RPC transmission.
# File lib/arachni/page.rb, line 553
# Builds the RPC representation from `to_initialization_options( false )`.
#
# @return [Hash]
#   NOTE(review): keys are presumably Strings after `my_stringify_keys`
#   (a project extension not shown here) -- the String lookups below rely
#   on that.
def to_rpc_data
    data        = to_initialization_options( false ).my_stringify_keys(false)
    data['dom'] = dom.to_rpc_data
    data['element_audit_whitelist'] = element_audit_whitelist.to_a
    data['response'] = data['response'].to_rpc_data

    # Every element type except :headers is converted, but only when the
    # initialization options included that type.
    (ELEMENTS - [:headers]).map(&:to_s).each do |e|
        next if !data[e]
        data[e] = send(e).map(&:to_rpc_data)
    end

    # The cookie jar is not part of the RPC data.
    data.delete 'cookie_jar'

    data
end
to_s() click to toggle source
# File lib/arachni/page.rb, line 454
# @return [String]
#   Class name and object id, followed by `@url` (inspected) and `@dom`.
def to_s
    "#<#{self.class}:#{object_id} @url=#{@url.inspect} @dom=#{@dom}>"
end
Also aliased as: inspect
update_element_audit_whitelist( list ) click to toggle source

@param [Array<Element::Capabilities::Auditable, Integer>] list

Audit whitelist based on {Element::Capabilities::Auditable elements} or
{Element::Capabilities::Auditable#coverage_hash}s.

@return [Set] {#element_audit_whitelist}

@see element_audit_whitelist @see Check::Auditor#skip?

# File lib/arachni/page.rb, line 214
# Adds the given elements, or coverage hashes, to the audit whitelist.
#
# @param [Array<Element::Capabilities::Auditable, Integer>] list
#   Integers are added as they are; for anything else its `coverage_hash`
#   is added.
#
# @return [Set]
#   The updated whitelist.
def update_element_audit_whitelist( list )
    [list].flatten.each do |e|
        @element_audit_whitelist << (e.is_a?( Integer ) ? e : e.coverage_hash)
    end

    # Return the whitelist itself rather than the list that was iterated.
    @element_audit_whitelist
end
update_metadata() click to toggle source
# File lib/arachni/page.rb, line 479
# Passes every cached element, of every type in `ELEMENTS`, to
# `store_to_metadata`. Types without a cache entry are skipped.
def update_metadata
    ELEMENTS.each do |type|
        cached = @cache[type]
        cached.each { |element| store_to_metadata( element ) } if cached
    end
end
url() click to toggle source

@return [String]

URL of the page.
# File lib/arachni/page.rb, line 250
# @return [String, nil]
#   The explicitly set URL if there is one, otherwise the (memoized) URL
#   of the response -- or `nil` when neither is available.
def url
    # Without this guard a page with neither a URL nor a response would
    # raise NoMethodError here, instead of letting the caller see `nil`.
    @url ||= (@response ? @response.url : nil)
end

Private Instance Methods

assign_page_to_elements( list ) click to toggle source
# File lib/arachni/page.rb, line 629
# Points every element of the list to this page and runs it through
# `store_to_metadata` followed by `restore_from_metadata`.
#
# @return [Array]
#   New, frozen list holding the same elements.
def assign_page_to_elements( list )
    assigned = list.each_with_object( [] ) do |element, acc|
        element.page = self

        store_to_metadata( element )
        restore_from_metadata( element )

        acc << element
    end

    assigned.freeze
end
digest() click to toggle source
# File lib/arachni/page.rb, line 611
# String from which {#hash} and {#persistent_hash} are derived.
#
# Made of the hash of the DOM's playable transitions, the hash of the body
# and the sorted, unique hashes of the cached elements whose type is listed
# in `@has_custom_elements`.
#
# @return [String]
def digest
    element_hashes = ELEMENTS.flat_map do |type|
        list = @cache[type] if @has_custom_elements.include?( type )
        list ? list.map(&:hash) : []
    end.uniq

    "#{dom.playable_transitions.hash}:#{body.hash}#{element_hashes.sort}"
end
ensure_metadata( element, meta ) click to toggle source
# File lib/arachni/page.rb, line 662
# Makes sure `@metadata` has a Hash for the element's type and, inside it,
# a Hash for the given meta (both keyed by their `#to_s` form).
#
# @return [Hash]
#   The Hash stored for the meta.
def ensure_metadata( element, meta )
    by_type = (@metadata[element.type.to_s] ||= {})
    by_type[meta.to_s] ||= {}
end
paths=( paths ) click to toggle source
# File lib/arachni/page.rb, line 625
# Stores the given paths in the cache under `:paths`, which is where
# {#paths} looks first.
def paths=( paths )
    @cache[:paths] = paths
end
restore_from_metadata( element ) click to toggle source
# File lib/arachni/page.rb, line 650
# For every meta in `METADATA` that the element has a writer for, assigns
# the value stored under the element's type, the meta and the element's
# coverage hash (`nil` when nothing is stored).
def restore_from_metadata( element )
    METADATA.each do |meta|
        writer = "#{meta}="
        next unless element.respond_to?( writer )

        ensure_metadata( element, meta )

        stored = @metadata[element.type.to_s][meta.to_s][element.coverage_hash]
        element.send( writer, stored )
    end
end
store_to_metadata( element ) click to toggle source
# File lib/arachni/page.rb, line 640
# For every meta in `METADATA` that the element responds to, stores the
# element's value under its type, the meta and its coverage hash -- unless
# a value is already stored there.
def store_to_metadata( element )
    METADATA.each do |meta|
        next unless element.respond_to?( meta )

        ensure_metadata( element, meta )

        bucket = @metadata[element.type.to_s][meta.to_s]
        bucket[element.coverage_hash] ||= element.send( meta )
    end
end
try_dup( v ) click to toggle source
# File lib/arachni/page.rb, line 667
# @return [Object]
#   A `#dup` of the value, or the value itself if duplicating it raises a
#   `StandardError`.
def try_dup( v )
    begin
        v.dup
    rescue StandardError
        v
    end
end