class Arachni::Parser
Analyzes HTML code, extracting input vectors and supporting information.
@author Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>
Constants
- CACHE
- CACHE_SIZES
- IGNORE_REQUEST_HEADERS
- WHITELIST
Attributes
@return [HTTP::Response]
@return [String]
Public Class Methods
# File lib/arachni/parser.rb, line 117
# Tells whether the given string looks like HTML, going through the
# `CACHE` entry registered under this method's name.
#
# @param [String] string
# @return [Boolean]
#   Whatever {_html?} returns, or `false` if it raised a `StandardError`.
def html?( string )
    CACHE[__method__].fetch( string ) do
        begin
            _html?( string )
        rescue
            false
        end
    end
end
# File lib/arachni/parser.rb, line 109
# Tells whether `Ox.parse` turns the given string into an `Ox::Element`.
#
# @param [String] string
# @return [Boolean]
#   `false` when the parse result is not an `Ox::Element` or when parsing
#   raised a `StandardError`.
def markup?( string )
    Ox.parse( string ).is_a?( Ox::Element )
rescue
    false
end
@param [Document, HTTP::Response, Array<HTTP::Response>] resource
Response(s) to analyze and parse. By providing multiple responses the parser will be able to perform some preliminary differential analysis and identify nonce tokens in inputs.
# File lib/arachni/parser.rb, line 176
# Sets up the parser from a pre-parsed document, a single response, or a
# list of responses.
#
# @param [Document, HTTP::Response, Array<HTTP::Response>] resource
#   * `Document`: stored as the parsed document.
#   * `HTTP::Response`: stored as the response; its URL becomes {#url=}.
#   * `Array`: the first element becomes the primary response, the
#     remaining non-`nil` elements are kept as secondary responses.
#     The array passed by the caller is NOT modified.
def initialize( resource )
    case resource
    when Document
        @resource = :document
        @document = resource

    when HTTP::Response
        @resource = :response
        @response = resource
        self.url  = @response.url

    when Array
        # Read the head with #first rather than #shift, so that the caller's
        # array keeps all of its elements.
        primary = resource.first

        # The slice is a new array, compacting it does not affect the caller.
        @secondary_responses = resource[1..-1]
        @secondary_responses.compact! if @secondary_responses

        @resource = :response
        @response = primary
        self.url  = primary.url
    end
end
# File lib/arachni/parser.rb, line 66
# Parses HTML with `Ox.sax_html` and returns the handler's document.
# Results go through the `CACHE` entry for this method, keyed by the
# `[html, options]` pair.
#
# @param [String] html
# @param [Hash] options
#   Forwarded to {prepare_ox_options}.
# @return [Object]
#   The `document` of the SAX handler.
def parse( html, options = {} )
    CACHE[__method__].fetch( [html, options] ) do
        sax_handler, ox_options = prepare_ox_options( options )

        begin
            Ox.sax_html( sax_handler, StringIO.new( html ), ox_options )
        rescue SAX::Stop
            # Deliberately ignored -- presumably the handler's way of ending
            # the parse early (TODO confirm against SAX).
        end

        sax_handler.document
    end
end
# File lib/arachni/parser.rb, line 94
# Parses the HTML and returns the first child of the resulting document,
# with its `parent` and `document` references cleared. Results go through
# the `CACHE` entry for this method, keyed by the HTML string.
#
# @param [String] html
# @return [Object]
#   First child node of the parsed document.
def parse_fragment( html )
    CACHE[__method__].fetch( html ) do
        fragment = parse( html ).children.first
        fragment.parent   = nil
        fragment.document = nil
        fragment
    end
end
# File lib/arachni/parser.rb, line 103
# Parses XML with `Nokogiri::XML`, going through the `CACHE` entry for this
# method, keyed by the XML string.
#
# @param [String] xml
# @return [Object]
#   Whatever `Nokogiri::XML` returns for the input.
def parse_xml( xml )
    CACHE[__method__].fetch( xml ) { Nokogiri::XML( xml ) }
end
# File lib/arachni/parser.rb, line 79
# Starts an `Ox.sax_html` parse on the read end of a fresh `IO.pipe`,
# posted to {push_parse_pool}.
#
# @param [Hash] options
#   Forwarded to {prepare_ox_options}.
# @return [Array]
#   `[write_end_of_the_pipe, handler]`, where the handler is the first
#   value returned by {prepare_ox_options}.
def push_parse( options = {} )
    reader, writer      = IO.pipe
    handler, ox_options = prepare_ox_options( options )

    push_parse_pool.post do
        begin
            Ox.sax_html( handler, reader, ox_options )
        rescue SAX::Stop
            # Deliberately ignored.
        end
    end

    [writer, handler]
end
Private Class Methods
# File lib/arachni/parser.rb, line 129
# Parses the string and looks for at least one `Nodes::Element` while
# traversing the resulting document.
#
# @param [String] string
# @return [Boolean]
#   `true` as soon as an element node is met, `false` if none is found.
def _html?( string )
    parsed = parse( string )

    parsed.traverse do |node|
        return true if node.is_a?( Nodes::Element )
    end

    false
end
# File lib/arachni/parser.rb, line 498
# Memoized component manager for the path extractors.
#
# @return [Component::Manager]
#   Built once from `Options.paths.path_extractors` and `Extractors`.
def self.extractors
    return @manager if @manager

    @manager = Component::Manager.new( Options.paths.path_extractors, Extractors )
end
# File lib/arachni/parser.rb, line 141
# Builds the handler and the option hash for an `Ox.sax_html` call.
#
# @param [Hash] options
# @option options [Object] :handler
#   Handler to use; a `SAX.new( options )` is created when missing.
# @option options [Array] :whitelist
#   When non-empty, a copy of `Ox.sax_html_overlay` is used in which every
#   entry is `:off`, except for the whitelisted ones which are `:active`.
# @return [Array]
#   `[handler, sax_options]`
def prepare_ox_options( options )
    handler     = options[:handler] || SAX.new( options )
    sax_options = {}

    whitelist = options[:whitelist]
    if whitelist && whitelist.any?
        overlay = Ox.sax_html_overlay.dup

        # Turn everything off first, then re-enable the whitelisted entries.
        overlay.each_key { |element| overlay[element] = :off }
        whitelist.each { |element| overlay[element] = :active }

        sax_options[:overlay] = overlay
    end

    [handler, sax_options]
end
# File lib/arachni/parser.rb, line 137
# Memoized thread pool.
#
# @return [Concurrent::CachedThreadPool]
def push_parse_pool
    return @push_parse_pool if @push_parse_pool

    @push_parse_pool = Concurrent::CachedThreadPool.new
end
Public Instance Methods
@return [String]
Base `href`, if there is one.
# File lib/arachni/parser.rb, line 459
# Memoized base URL for the page.
#
# @return [String]
#   `href` of the first `base` node of {#document}; falls back to the
#   parser URL when that is missing or when there is no document at all.
def base
    @base ||= begin
        doc = document

        # #document can be nil, in which case there is nothing to look up.
        href = doc.nodes_by_name( :base ).map { |node| node['href'] }.first if doc

        href || @url
    end
end
# File lib/arachni/parser.rb, line 252
# Body used for parsing.
#
# @return [String, nil]
#   The overridden body if one has been set, else the response body when
#   the parser was created from a response, else `nil`.
def body
    return @body if @body

    @response.body if from_response?
end
@return [String]
Override the {#response} body for the parsing process.
# File lib/arachni/parser.rb, line 247
# Overrides the body used for parsing and drops the cached links, forms,
# cookies and document.
#
# @param [String] string
#   New body.
def body=( string )
    @links    = nil
    @forms    = nil
    @cookies  = nil
    @document = nil

    @body = string
end
@return [Arachni::Parser::Document, nil]
Returns a parsed HTML document from the body of the HTTP response or `nil` if the response data wasn't {#text? text-based} or the response couldn't be parsed.
# File lib/arachni/parser.rb, line 260
# Memoized parsed document of {#body}.
#
# @return [Object, nil]
#   Result of the class-level `parse` called with `filter: true`, or `nil`
#   when {#text?} is false.
def document
    return @document if @document
    return unless text?

    @document = self.class.parse( body, filter: true )
end
@return [Array<Element::Form>]
Forms from {#document}.
# File lib/arachni/parser.rb, line 291
# Forms of the page.
#
# When secondary responses are available, forms are also extracted from
# each of them and matched against the primary ones by
# `coverage_id:name_or_id`; a hidden input whose value differs between two
# matching forms is recorded as the form's `nonce_name`.
#
# @return [Array]
#   Forms from `Form.from_parser`; empty when the body is not text or
#   `Form.in_html?` rejects it. The result is only memoized when there are
#   secondary responses.
def forms
    return @forms.freeze if @forms
    return [] if !text? || (body && !Form.in_html?( body ))

    primary_forms = Form.from_parser( self )
    return primary_forms unless @secondary_responses

    @secondary_responses.each do |other_response|
        next if other_response.body.to_s.empty?

        Form.from_parser( Parser.new( other_response ) ).each do |other_form|
            primary_forms.each do |form|
                # Only compare forms that identify as the same form.
                next unless "#{form.coverage_id}:#{form.name_or_id}" ==
                    "#{other_form.coverage_id}:#{other_form.name_or_id}"

                form.inputs.each do |name, value|
                    next if value == other_form.inputs[name]
                    next unless form.field_type_for( name ) == :hidden

                    form.nonce_name = name
                end
            end
        end
    end

    @forms = primary_forms
end
# File lib/arachni/parser.rb, line 241
# @return [Boolean]
#   `true` if the resource kind recorded for this parser is `:document`.
def from_document?
    @resource == :document
end
# File lib/arachni/parser.rb, line 237
# @return [Boolean]
#   `true` if the resource kind recorded for this parser is `:response`.
def from_response?
    @resource == :response
end
@note It will include common request headers as well as headers from the HTTP
request.
@return [Hash]
List of valid auditable HTTP header fields.
# File lib/arachni/parser.rb, line 272
# Memoized list of header elements.
#
# A set of default request headers is merged with the headers of the
# response's request (minus the ones in `IGNORE_REQUEST_HEADERS`), request
# values taking precedence, and each pair becomes its own `Header`.
#
# @return [Array<Header>]
#   Frozen array.
def headers
    return @headers if @headers

    defaults = {
        'Accept'          => 'text/html,application/xhtml+xml,application' +
            '/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'From'            => Options.authorized_by  || '',
        'User-Agent'      => Options.http.user_agent || '',
        'Referer'         => @url,
        'Pragma'          => 'no-cache'
    }

    # Work on a copy so that the request's own headers are left alone.
    request_headers = response.request.headers.dup
    IGNORE_REQUEST_HEADERS.each { |name| request_headers.delete( name ) }

    @headers = defaults.merge( request_headers ).map do |name, value|
        Header.new( url: @url, inputs: { name => value } )
    end.freeze
end
@return [Array<Element::JSON>]
# File lib/arachni/parser.rb, line 360
# Memoized JSON elements of the page.
#
# @return [Array]
#   The result of `JSON.from_request` for the response's request, wrapped
#   in an array; empty when that result is `nil`.
def jsons
    return @jsons if @jsons

    @jsons = [JSON.from_request( @url, response.request )].compact
end
@return [Element::Link]
Link to the page.
# File lib/arachni/parser.rb, line 321
# Link element for the page URL.
#
# @return [Link, nil]
#   `nil` when the URL has no parameters and there is a response which is
#   not a redirection; otherwise a `Link` for the URL with {#link_vars} as
#   its inputs.
def link
    if link_vars.empty?
        return if @response && !@response.redirection?
    end

    Link.new( url: @url, inputs: link_vars )
end
@return [Element::LinkTemplate]
LinkTemplate for the current page.
# File lib/arachni/parser.rb, line 328
# Link template element for the page URL.
#
# @return [LinkTemplate, nil]
#   `nil` when `LinkTemplate.extract_inputs` yields no template for the
#   URL. Note that the URL gets frozen when a template is found.
def link_template
    template, inputs = LinkTemplate.extract_inputs( @url )
    return unless template

    frozen_url = @url.freeze

    LinkTemplate.new(
        url:      frozen_url,
        action:   frozen_url,
        inputs:   inputs,
        template: template
    )
end
@return [Array<Element::LinkTemplate>]
Links matching {Arachni::OptionsGroups::Audit#link_templates} in {#document}.
# File lib/arachni/parser.rb, line 351
# Memoized link templates of the page.
#
# @return [Array]
#   The page's own {#link_template} (if any), joined with the ones from
#   `LinkTemplate.from_parser` when the body is text. The memoized value is
#   returned frozen.
def link_templates
    return @link_templates.freeze if @link_templates

    @link_templates =
        if text?
            [link_template].compact | LinkTemplate.from_parser( self )
        else
            [link_template].compact
        end
end
@return [Hash]
Parameters found in {#url}.
# File lib/arachni/parser.rb, line 371
# Query parameters of the page URL.
#
# @return [Hash]
#   Empty hash when `uri_parse` cannot handle the URL; otherwise the
#   memoized, frozen `query_parameters` of the parsed and rewritten URL.
def link_vars
    parsed_url = uri_parse( @url )
    return {} unless parsed_url

    @link_vars ||= parsed_url.rewrite.query_parameters.freeze
end
@return [Array<Element::Link>]
Links in {#document}.
# File lib/arachni/parser.rb, line 342
# Memoized links of the page.
#
# @return [Array]
#   The page's own {#link} (if any), joined with the ones from
#   `Link.from_parser` when the body is text and either missing or accepted
#   by `Link.in_html?`. The memoized value is returned frozen.
def links
    return @links.freeze if @links

    if text? && (!body || Link.in_html?( body ))
        @links = [link].compact | Link.from_parser( self )
    else
        @links = [link].compact
    end
end
@return [Page]
# File lib/arachni/parser.rb, line 227
# Memoized page built from this parser.
#
# @return [Page]
def page
    return @page if @page

    @page = Page.new( parser: self )
end
@return [Array<String>]
Distinct links to follow.
# File lib/arachni/parser.rb, line 449
# Memoized paths found by the extractors.
#
# @return [Array<String>]
#   Frozen result of {run_extractors}, or a frozen empty array when there
#   is no {#document}.
def paths
    return @paths if @paths

    # Assigned before #document is consulted, as in the original flow.
    @paths = []
    @paths = document ? run_extractors.freeze : @paths.freeze
end
@return [Boolean]
`true` if the given HTTP response data are text based, `false` otherwise.
# File lib/arachni/parser.rb, line 233
# @return [Boolean]
#   The response's own `text?` when the parser was created from a response,
#   `true` otherwise.
def text?
    return true unless from_response?

    @response.text?
end
Converts a relative URL to an absolute one.
@param [String] relative_url
URL to convert to absolute.
@return [String]
Absolute URL.
Arachni::Utilities#to_absolute
# File lib/arachni/parser.rb, line 216
# Converts a relative URL to an absolute one by delegating to the inherited
# `to_absolute`, using {#base} as the reference URL (or the parser URL when
# there is no base).
#
# @param [String] relative_url
# @return [String]
def to_absolute( relative_url )
    super( relative_url, base || @url )
end
Dummy method, only the {Browser#to_page browser} can fill this in.
# File lib/arachni/parser.rb, line 383
# Placeholder, the parser itself never has UI forms to report.
#
# @return [Array]
#   Always a new, empty array.
def ui_forms
    []
end
Dummy method, only the {Browser#to_page browser} can fill this in.
# File lib/arachni/parser.rb, line 378
# Placeholder, the parser itself never has UI inputs to report.
#
# @return [Array]
#   Always a new, empty array.
def ui_inputs
    []
end
# File lib/arachni/parser.rb, line 201
# Sets the URL of the page.
#
# The URL is decoded and then normalized; if that yields nothing, the raw
# string is normalized instead. The stored value is frozen.
#
# @param [String, nil] str
#   A falsy value clears the URL.
def url=( str )
    unless str
        return @url = nil
    end

    @url = normalize_url( uri_decode( str ) ) || normalize_url( str )
    @url.freeze
end
@return [Array<Element::XML>]
# File lib/arachni/parser.rb, line 365
# Memoized XML elements of the page.
#
# @return [Array]
#   The result of `XML.from_request` for the response's request, wrapped
#   in an array; empty when that result is `nil`.
def xmls
    return @xmls if @xmls

    @xmls = [XML.from_request( @url, response.request )].compact
end
Private Instance Methods
Runs all path extraction components and returns an array of paths.
@return [Array<String>]
Paths.
# File lib/arachni/parser.rb, line 469
# Runs every available path extractor against {#body} and cleans up the
# results.
#
# `nil` and `mailto:` paths are dropped, the rest are made absolute, and
# the ones which cannot be converted or for which `skip?` is true are
# dropped as well.
#
# @return [Array<String>]
#   Distinct absolute paths; an empty array if anything raised a
#   `StandardError` (the error goes to `print_exception`).
def run_extractors
    raw_paths = Set.new

    self.class.extractors.available.each do |name|
        exception_jail( false ) do
            extractor = self.class.extractors[name].new( parser: self, html: body )
            raw_paths.merge( extractor.run.flatten )
        end
    end

    absolute_paths = Set.new

    raw_paths.each do |path|
        next if !path || path =~ /^mailto:/i

        absolute = to_absolute( path )
        next if !absolute || skip?( absolute )

        absolute_paths << absolute
    end

    absolute_paths.to_a
rescue => e
    print_exception e
    []
end