class Arachni::Parser

Analyzes HTML code, extracting input vectors and supporting information.

@author Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>

Constants

CACHE
CACHE_SIZES
IGNORE_REQUEST_HEADERS
WHITELIST

Attributes

response[RW]

@return [HTTP::Response]

url[R]

@return [String]

Public Class Methods

html?( string ) click to toggle source
# File lib/arachni/parser.rb, line 117
# Checks whether `string` contains HTML, going through `CACHE[:html?]`.
#
# @param [String] string
#
# @return [Boolean]
#   Result of `_html?`, or `false` when that check raises a `StandardError`.
def html?( string )
    CACHE[__method__].fetch( string ) do
        begin
            _html?( string )
        rescue
            # Anything that cannot be analyzed is treated as "not HTML".
            false
        end
    end
end
markup?( string ) click to toggle source
# File lib/arachni/parser.rb, line 109
# Checks whether `string` is parsed by Ox into an `Ox::Element`.
#
# @param [String] string
#
# @return [Boolean]
#   `false` when `Ox.parse` raises a `StandardError` or returns anything
#   other than an `Ox::Element`.
def markup?( string )
    Ox.parse( string ).is_a?( Ox::Element )
rescue
    false
end
new( resource ) click to toggle source

@param [Document, HTTP::Response, Array<HTTP::Response>] resource

Response(s) to analyze and parse. By providing multiple responses the
parser will be able to perform some preliminary differential analysis
and identify nonce tokens in inputs.
# File lib/arachni/parser.rb, line 176
# @param [Document, HTTP::Response, Array<HTTP::Response>] resource
#   Response(s) to analyze and parse. When given an Array, the first entry
#   becomes the primary {#response} and the remaining non-nil entries are
#   kept as secondary responses (used by {#forms} to spot nonce inputs).
#   The Array itself is not modified.
def initialize( resource )
    case resource

        when Document
            @resource = :document
            @document = resource

        when HTTP::Response
            @resource = :response

            @response = resource
            self.url  = @response.url

        when Array
            # Destructure instead of using #shift, so the caller's array is
            # left intact.
            primary, *secondary = resource
            @secondary_responses = secondary.compact

            @resource = :response

            @response = primary
            self.url  = primary.url
    end
end
parse( html, options = {} ) click to toggle source
# File lib/arachni/parser.rb, line 66
# Parses `html` with `Ox.sax_html`, going through `CACHE[:parse]` keyed by
# `[html, options]`.
#
# @param [String] html
# @param [Hash] options
#   Forwarded to `prepare_ox_options`.
#
# @return [Object]
#   The handler's `document`.
def parse( html, options = {} )
    CACHE[__method__].fetch( [html, options] ) do
        handler, sax_options = prepare_ox_options( options )
        input = StringIO.new( html )

        begin
            Ox.sax_html( handler, input, sax_options )
        rescue SAX::Stop
            # Parsing was cut short on purpose; use what the handler has.
        end

        handler.document
    end
end
parse_fragment( html ) click to toggle source
# File lib/arachni/parser.rb, line 94
# Parses `html` and returns the first child of the resulting document,
# detached from its parent and document, going through
# `CACHE[:parse_fragment]`.
#
# @param [String] html
#
# @return [Object, nil]
#   The first child node, or `nil` when the parsed document has no children
#   (previously this raised `NoMethodError`).
def parse_fragment( html )
    CACHE[__method__].fetch html do
        fragment = parse( html ).children.first

        if fragment
            fragment.parent   = nil
            fragment.document = nil
        end

        fragment
    end
end
parse_xml( xml ) click to toggle source
# File lib/arachni/parser.rb, line 103
# Parses `xml` with `Nokogiri::XML`, going through `CACHE[:parse_xml]`.
#
# @param [String] xml
#
# @return [Object]
#   Whatever `Nokogiri::XML` returns for `xml`.
def parse_xml( xml )
    CACHE[__method__].fetch( xml ) { Nokogiri::XML( xml ) }
end
push_parse( options = {} ) click to toggle source
# File lib/arachni/parser.rb, line 79
# Starts parsing from a pipe: `Ox.sax_html` is posted to `push_parse_pool`
# to read from the pipe's read end, while the write end is handed back to
# the caller.
#
# @param [Hash] options
#   Forwarded to `prepare_ox_options`.
#
# @return [Array]
#   `[write_end, handler]` -- the write end of the pipe and the handler
#   returned by `prepare_ox_options`.
def push_parse( options = {} )
    reader, writer = IO.pipe

    handler, sax_options = prepare_ox_options( options )

    push_parse_pool.post do
        begin
            Ox.sax_html( handler, reader, sax_options )
        rescue SAX::Stop
            # Parsing was cut short on purpose.
        end
    end

    [writer, handler]
end

Private Class Methods

_html?( string ) click to toggle source
# File lib/arachni/parser.rb, line 129
# Uncached check behind `html?`.
#
# @param [String] string
#
# @return [Boolean]
#   `true` as soon as traversing the parsed `string` yields a
#   `Nodes::Element`, `false` if none is found.
def _html?( string )
    parse( string ).traverse { |node| return true if node.is_a?( Nodes::Element ) }

    false
end
extractors() click to toggle source
# File lib/arachni/parser.rb, line 498
# @return [Component::Manager]
#   Memoized manager built from `Options.paths.path_extractors` and the
#   `Extractors` namespace.
def self.extractors
    return @manager if @manager

    @manager = Component::Manager.new( Options.paths.path_extractors, Extractors )
end
prepare_ox_options( options ) click to toggle source
# File lib/arachni/parser.rb, line 141
# Builds the handler and the option Hash handed to `Ox.sax_html`.
#
# @param [Hash] options
# @option options [Object] :handler
#   Handler to use; when missing, `SAX.new( options )` is created.
# @option options [Array] :whitelist
#   When present with at least one truthy entry, an `:overlay` is built from
#   a copy of `Ox.sax_html_overlay` where every entry is `:off` except the
#   whitelisted ones, which are `:active`.
#
# @return [Array]
#   `[handler, sax_options]`
def prepare_ox_options( options )
    handler     = options[:handler] || SAX.new( options )
    whitelist   = options[:whitelist]
    sax_options = {}

    if whitelist && whitelist.any?
        overlay = Ox.sax_html_overlay.dup

        # Switch everything off, then re-enable only what was asked for.
        overlay.each_key { |element| overlay[element] = :off }
        whitelist.each { |element| overlay[element] = :active }

        sax_options[:overlay] = overlay
    end

    [handler, sax_options]
end
push_parse_pool() click to toggle source
# File lib/arachni/parser.rb, line 137
# @return [Concurrent::CachedThreadPool]
#   Memoized pool that `push_parse` posts its parsing jobs to.
def push_parse_pool
    return @push_parse_pool if @push_parse_pool

    @push_parse_pool = Concurrent::CachedThreadPool.new
end

Public Instance Methods

base() click to toggle source

@return [String]

Base `href`, if there is one.
# File lib/arachni/parser.rb, line 459
# @return [String]
#   `href` of the first `base` node of the {#document}. Falls back to `@url`
#   when there is no such value or when {#document} is `nil` (previously a
#   `nil` document raised `NoMethodError`). The result is memoized.
def base
    return @base if @base

    doc  = document
    href = doc.nodes_by_name( :base ).map { |b| b['href'] }.first if doc

    @base = href || @url
end
body() click to toggle source
# File lib/arachni/parser.rb, line 252
# @return [String, nil]
#   The body set through {#body=} if any; otherwise the response body when
#   the parser was built from a response; otherwise `nil`.
def body
    return @body if @body
    return unless from_response?

    @response.body
end
body=( string ) click to toggle source

@return [String]

Override the {#response} body for the parsing process.
# File lib/arachni/parser.rb, line 247
# Overrides the {#response} body for the parsing process.
#
# @param [String] string
#
# @return [String]
def body=( string )
    # Forget what was derived from the previous body.
    @links    = nil
    @forms    = nil
    @cookies  = nil
    @document = nil

    @body = string
end
cookies() click to toggle source

@return [Array<Element::Cookie>]

Cookies from HTTP headers and response body.
# File lib/arachni/parser.rb, line 389
# @return [Array<Element::Cookie>]
#   Cookies from HTTP headers and, for text responses whose body passes
#   `Cookie.in_html?`, from the response body as well. Once memoized, the
#   stored array is returned frozen.
def cookies
    return @cookies.freeze if @cookies

    @cookies = Cookie.from_headers( @url, @response.headers )

    if text? && Cookie.in_html?( body )
        @cookies |= Cookie.from_parser( self )
    end

    @cookies
end
cookies_to_be_audited() click to toggle source

@return [Array<Element::Cookie>]

Cookies to be audited.
# File lib/arachni/parser.rb, line 408
# @return [Array<Element::Cookie>]
#   Cookies to be audited: copies of the page's {#cookies} plus every cookie
#   of the HTTP cookie-jar whose name the page does not already provide,
#   each with its `action` set to `@url`. Empty for non-text responses.
def cookies_to_be_audited
    return @cookies_to_be_audited.freeze if @cookies_to_be_audited
    return [] unless text?

    # Names of the cookies set by the current page.
    page_cookie_names = Set.new( cookies.map( &:name ) )

    # Take the rest from the cookie-jar, giving preference to the ones
    # specified by the current page.
    jar_only = HTTP::Client.cookie_jar.cookies.reject do |cookie|
        page_cookie_names.include?( cookie.name )
    end

    # Audited cookies get modified, so work on copies; the jar part means
    # cookies irrelevant to the current page are included too.
    @cookies_to_be_audited = (cookies | jar_only).map do |cookie|
        cookie.dup.tap { |copy| copy.action = @url }
    end
end
document() click to toggle source

@return [Arachni::Parser::Document, nil]

Returns a parsed HTML document from the body of the HTTP response or
`nil` if the response data wasn't {#text? text-based} or the response
couldn't be parsed.
# File lib/arachni/parser.rb, line 260
# @return [Arachni::Parser::Document, nil]
#   Memoized document: the one given at construction or the result of
#   parsing {#body}. `nil` when nothing is memoized and the data is not
#   {#text? text-based}.
def document
    if @document
        @document
    elsif text?
        @document = self.class.parse( body, filter: true )
    end
end
forms() click to toggle source

@return [Array<Element::Form>]

Forms from {#document}.
# File lib/arachni/parser.rb, line 291
# @return [Array<Element::Form>]
#   Forms from {#document}. When secondary responses were given, each form
#   is compared against its counterpart (same `coverage_id` and
#   `name_or_id`) in those responses, and a hidden input whose value differs
#   is recorded as the form's `nonce_name`.
def forms
    return @forms.freeze if @forms
    return [] if !text? || (body && !Form.in_html?( body ))

    primary_forms = Form.from_parser( self )
    return primary_forms unless @secondary_responses

    @secondary_responses.each do |secondary|
        next if secondary.body.to_s.empty?

        Form.from_parser( Parser.new( secondary ) ).each do |other|
            primary_forms.each do |form|
                # Only compare forms that are the same form in both pages.
                next if "#{form.coverage_id}:#{form.name_or_id}" !=
                    "#{other.coverage_id}:#{other.name_or_id}"

                form.inputs.each do |name, value|
                    # A nonce is a hidden input whose value changed.
                    next if value == other.inputs[name]
                    next if form.field_type_for( name ) != :hidden

                    form.nonce_name = name
                end
            end
        end
    end

    @forms = primary_forms
end
from_document?() click to toggle source
# File lib/arachni/parser.rb, line 241
# @return [Boolean]
#   `true` if the parser was initialized with a `Document`.
def from_document?
    :document == @resource
end
from_response?() click to toggle source
# File lib/arachni/parser.rb, line 237
# @return [Boolean]
#   `true` if the parser was initialized with one or more HTTP responses.
def from_response?
    :response == @resource
end
headers() click to toggle source

@note It will include common request headers as well as headers from the HTTP

request.

@return [Hash]

List of valid auditable HTTP header fields.
# File lib/arachni/parser.rb, line 272
# @note Includes common request headers as well as the headers of the HTTP
#   request, minus the ones listed in `IGNORE_REQUEST_HEADERS`.
#
# @return [Array<Header>]
#   Frozen, memoized list with one `Header` element per header field.
def headers
    return @headers if @headers

    defaults = {
        'Accept'          => 'text/html,application/xhtml+xml,application' +
            '/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'From'            => Options.authorized_by  || '',
        'User-Agent'      => Options.http.user_agent || '',
        'Referer'         => @url,
        'Pragma'          => 'no-cache'
    }

    # Work on a copy so the request's own headers stay untouched.
    request_headers = response.request.headers.dup
    IGNORE_REQUEST_HEADERS.each { |name| request_headers.delete( name ) }

    @headers = defaults.merge( request_headers ).map do |name, value|
        Header.new( url: @url, inputs: { name => value } )
    end.freeze
end
jsons() click to toggle source

@return [Array<Element::JSON>]

# File lib/arachni/parser.rb, line 360
# @return [Array<Element::JSON>]
#   Memoized list holding the element `JSON.from_request` builds from the
#   response's request, or an empty list when it returns `nil`.
def jsons
    return @jsons if @jsons

    element = JSON.from_request( @url, response.request )
    @jsons  = element.nil? ? [] : [element]
end
nested_cookies() click to toggle source

@return [Array<Element::NestedCookie>]

Nested cookies from {#cookies_to_be_audited}.
# File lib/arachni/parser.rb, line 400
# @return [Array<Element::NestedCookie>]
#   Nested cookies from {#cookies_to_be_audited}. Once memoized, the stored
#   array is returned frozen.
def nested_cookies
    if @nested_cookies
        @nested_cookies.freeze
    else
        @nested_cookies = NestedCookie.from_cookies( cookies_to_be_audited )
    end
end
page() click to toggle source

@return [Page]

# File lib/arachni/parser.rb, line 227
# @return [Page]
#   Memoized page built with this parser.
def page
    return @page if @page

    @page = Page.new( parser: self )
end
paths() click to toggle source

@return [Array<String>]

Distinct links to follow.
# File lib/arachni/parser.rb, line 449
# @return [Array<String>]
#   Distinct links to follow: the frozen, memoized result of
#   `run_extractors`, or a frozen empty array when there is no {#document}.
def paths
  return @paths if @paths

  # Seed with an empty list before touching the document or the extractors.
  # NOTE(review): presumably so that a nested call made while extraction is
  # running gets an array back -- confirm.
  @paths = []
  @paths = document ? run_extractors.freeze : @paths.freeze
end
text?() click to toggle source

@return [Boolean]

`true` if the given HTTP response data are text based, `false` otherwise.
# File lib/arachni/parser.rb, line 233
# @return [Boolean]
#   For a parser built from a response, whatever `@response.text?` reports;
#   always `true` otherwise.
def text?
    return true unless from_response?

    @response.text?
end
to_absolute( relative_url ) click to toggle source

Converts a relative URL to an absolute one.

@param [String] relative_url

URL to convert to absolute.

@return [String]

Absolute URL.
Calls superclass method Arachni::Utilities#to_absolute
# File lib/arachni/parser.rb, line 216
# Converts a relative URL to an absolute one, resolving against {#base} when
# it has a value and against `@url` otherwise.
#
# @param [String] relative_url
#   URL to convert to absolute.
#
# @return [String]
#   Absolute URL, as returned by the superclass method.
def to_absolute( relative_url )
    super( relative_url, base || @url )
end
ui_forms() click to toggle source

Dummy method, only the {Browser#to_page browser} can fill this in.

# File lib/arachni/parser.rb, line 383
# Dummy method, only the {Browser#to_page browser} can fill this in.
#
# @return [Array]
#   Always a new empty array.
def ui_forms
    []
end
ui_inputs() click to toggle source

Dummy method, only the {Browser#to_page browser} can fill this in.

# File lib/arachni/parser.rb, line 378
# Dummy method, only the {Browser#to_page browser} can fill this in.
#
# @return [Array]
#   Always a new empty array.
def ui_inputs
    []
end
url=( str ) click to toggle source
# File lib/arachni/parser.rb, line 201
# Sets the parser's URL.
#
# @param [String, nil] str
#   URL; it is normalized after being URI-decoded and, should that yield
#   nothing, normalized as given. A falsy value clears the URL.
#
# @return [String, nil]
#   The stored (frozen) URL.
def url=( str )
    return @url = nil unless str

    @url = normalize_url( uri_decode( str ) )
    @url ||= normalize_url( str )
    @url.freeze
end
xmls() click to toggle source

@return [Array<Element::XML>]

# File lib/arachni/parser.rb, line 365
# @return [Array<Element::XML>]
#   Memoized list holding the element `XML.from_request` builds from the
#   response's request, or an empty list when it returns `nil`.
def xmls
    return @xmls if @xmls

    element = XML.from_request( @url, response.request )
    @xmls   = element.nil? ? [] : [element]
end

Private Instance Methods

run_extractors() click to toggle source

Runs all path extraction components and returns an array of paths.

@return [Array<String>]

Paths.
# File lib/arachni/parser.rb, line 469
# Runs all path extraction components and returns an array of paths.
#
# Every available extractor is instantiated with this parser and {#body}
# and run inside `exception_jail`. Falsy results and `mailto:` links are
# dropped, the rest are made absolute and filtered through `skip?`.
#
# @return [Array<String>]
#   Distinct absolute paths; an empty array if a `StandardError` escapes,
#   in which case the error is passed to `print_exception`.
def run_extractors
    raw_paths = Set.new
    self.class.extractors.available.each do |name|
        exception_jail false do
            extractor = self.class.extractors[name].new(
                parser: self,
                html:   body
            )
            raw_paths.merge extractor.run.flatten
        end
    end

    absolute_paths = Set.new
    raw_paths.each do |path|
        next if !path || path =~ /^mailto:/i

        absolute = to_absolute( path )
        next if !absolute || skip?( absolute )

        absolute_paths << absolute
    end

    absolute_paths.to_a
rescue => e
    print_exception e
    []
end