class Arachni::Support::Signature

Represents a signature, used to maintain a lightweight representation of a {String} and refine it using similar {String}s to remove noise.

@author Tasos “Zapotek” Laskos <tasos.laskos@arachni-scanner.com>

Constants

CACHE

Attributes

tokens[R]

Public Class Methods

new( data, options = {} ) click to toggle source

@note The string will be tokenized on non-word characters (it is split on the regular expression `/\W/`), not only on whitespace.

@param [String, Signature] data

Seed data to use to initialize the signature.

@param [Hash] options @option options :threshold [Float]

Sets the maximum allowed {#differences} when performing
{#similar? similarity} comparisons.
# File lib/arachni/support/signature.rb, line 31
# Builds a signature from the given seed data.
#
# @param  [String, Signature] data
#   Seed data; converted to tokens via `tokenize`.
# @param  [Hash]  options
# @option options [Numeric] :threshold
#   Default threshold for `similar?` when none is passed explicitly.
#
# @raise  [ArgumentError]
#   If a truthy `:threshold` is supplied that is not a `Numeric`.
def initialize( data, options = {} )
    @tokens  = tokenize( data )
    @options = options

    # A missing (nil/false) threshold is allowed; only a present,
    # non-numeric one is rejected.
    threshold = @options[:threshold]
    return if !threshold || threshold.is_a?( Numeric )

    fail ArgumentError, 'Option :threshold must be a number.'
end

Public Instance Methods

<<( data ) click to toggle source
# File lib/arachni/support/signature.rb, line 53
# Adds the tokens of `data` to this signature, in place.
#
# @param  [String, Signature] data
#   Data whose tokens (as produced by `tokenize`) get merged in.
#
# @return [Signature]
#   `self`, to allow chaining.
def <<( data )
    tap do
        # The token set is about to change, so the memoized hash is stale.
        @hash_cache = nil
        @tokens.merge( tokenize( data ) )
    end
end
==( other ) click to toggle source

@param [Signature] other

# File lib/arachni/support/signature.rb, line 107
# Equality check based solely on the `hash` values of both operands.
#
# @param  [Signature] other
#
# @return [Bool]
#   `true` when `other.hash` equals this object's `hash`.
def ==( other )
    mine   = hash
    theirs = other.hash
    mine == theirs
end
differences( other ) click to toggle source

@param [Signature] other

@return [Float]

Ratio of difference between signatures.
# File lib/arachni/support/signature.rb, line 74
# Ratio of tokens that are not shared between the two signatures.
#
# @param  [Signature, nil] other
#
# @return [Numeric]
#   `1` when `other` is `nil`, `0` when both signatures are `==`,
#   otherwise the size of the symmetric difference of the token sets
#   divided by the size of their union, as a `Float`.
def differences( other )
    return 1 if other.nil?
    return 0 if self == other

    # Tokens present in exactly one of the two sets.
    unshared = tokens ^ other.tokens
    combined = other.tokens | tokens

    unshared.size.fdiv( combined.size )
end
dup() click to toggle source

@return [Signature]

Copy of `self`.
# File lib/arachni/support/signature.rb, line 98
# Creates an independent copy of this signature.
#
# A blank instance is built first and then populated through `copy` with
# this object's cached hash, tokens and options.
#
# @return [Signature]
#   Copy of `self`.
def dup
    twin = self.class.new( '' )
    twin.copy( @hash_cache, tokens, @options )
    twin
end
empty?() click to toggle source
# File lib/arachni/support/signature.rb, line 92
# @return [Bool]
#   `true` when the signature holds no tokens at all.
def empty?
    @tokens.size.zero?
end
hash() click to toggle source
# File lib/arachni/support/signature.rb, line 102
# Hash of the token collection, memoized in `@hash_cache`.
#
# The memoized value is reused until `@hash_cache` is reset (which `<<`
# and `refine!` do whenever they change the tokens).
#
# @return [Integer]
def hash
    return @hash_cache if @hash_cache

    @hash_cache = tokens.hash
end
refine( data ) click to toggle source

@note The string will be tokenized on non-word characters (it is split on the regular expression `/\W/`), not only on whitespace.

@param [String, Signature] data

Data to use to refine the signature.

@return [Signature]

New, refined signature.
# File lib/arachni/support/signature.rb, line 66
# Non-destructive counterpart of `refine!`: works on a copy and leaves
# the receiver untouched.
#
# @param  [String, Signature] data
#   Data to use to refine the signature.
#
# @return [Signature]
#   New, refined signature.
def refine( data )
    refined = dup
    refined.refine!( data )
end
refine!( data ) click to toggle source

@note The string will be tokenized on non-word characters (it is split on the regular expression `/\W/`), not only on whitespace.

@param [String, Signature] data

Data to use to refine the signature.

@return [Signature]

`self`
# File lib/arachni/support/signature.rb, line 47
# Narrows this signature, in place, down to the tokens it has in common
# with `data`.
#
# @param  [String, Signature] data
#   Data to use to refine the signature.
#
# @return [Signature]
#   `self`
def refine!( data )
    # The token set is about to change, so the memoized hash is stale.
    @hash_cache = nil

    shared  = @tokens & tokenize( data )
    @tokens = shared

    self
end
similar?( other, threshold = @options[:threshold] ) click to toggle source

@param [Signature] other @param [Float] threshold

Threshold of {#differences differences}.

@return [Bool]

# File lib/arachni/support/signature.rb, line 87
# Checks whether `other` is close enough to this signature.
#
# @param  [Signature] other
# @param  [Numeric] threshold
#   Upper bound (exclusive) for the {#differences differences} ratio;
#   defaults to the `:threshold` option given at construction.
#
# @return [Bool]
#   `true` if both signatures are `==` or if their differences are below
#   the `threshold`.
#
# @raise  [RuntimeError]
#   If no threshold is available.
def similar?( other, threshold = @options[:threshold] )
    fail 'No threshold given.' unless threshold

    # Equal signatures are similar regardless of the threshold value.
    return true if self == other

    differences( other ) < threshold
end

Protected Instance Methods

copy( hash, tokens, options ) click to toggle source
# File lib/arachni/support/signature.rb, line 113
# Overwrites this object's state with the given values; used by `dup` to
# populate a freshly created instance.
#
# @param  [Integer, nil] hash
#   Value to store as the memoized hash.
# @param  [#dup] tokens
#   Tokens; a duplicate is stored.
# @param  [#dup] options
#   Options; a duplicate is stored.
#
# @return [Object]
#   The stored duplicate of `options`.
def copy( hash, tokens, options )
    @hash_cache, @tokens, @options = hash, tokens.dup, options.dup
    @options
end

Private Instance Methods

compress( tokens ) click to toggle source

Compresses the tokens by only storing unique hash values. Seems kinda silly but this can actually save us GB of RAM when comparing large signatures, not to mention CPU cycles.

# File lib/arachni/support/signature.rb, line 138
# Reduces a list of string tokens to the set of their unique hash values.
#
# @param  [Enumerable<String>] tokens
#
# @return [Set<Integer>]
#   Hashes of all non-empty tokens.
def compress( tokens )
    tokens.each_with_object( Set.new ) do |token, hashes|
        # Splitting leaves empty strings where non-word characters were;
        # skipping them here is a cheap way to get rid of that residue.
        hashes << token.hash unless token.empty?
    end
end
tokenize( data ) click to toggle source

@param [Signature, String] data

@return [Set<Integer>]

Hash values of the words, used as tokens.
# File lib/arachni/support/signature.rb, line 125
# Converts `data` into a set of tokens.
#
# Strings are split on non-word characters (`/\W/`) and the pieces are
# passed through `compress`; the result is stored in `CACHE[:tokens]`
# keyed by the string. Signatures contribute their existing tokens.
#
# The returned set is always a private copy: callers such as `<<` mutate
# their token set in place, so handing out the cached set itself (or
# another signature's set) would let that mutation leak into the cache
# or into the other signature.
#
# @param  [Signature, String] data
#
# @return [Set<Integer>]
#   Token set owned by the caller.
def tokenize( data )
    return data.tokens.dup if data.is_a? self.class

    cached = CACHE[:tokens][data]
    if !cached
        cached = compress( data.split( /\W/ ) )
        CACHE[:tokens][data] = cached
    end

    cached.dup
end