class HomographDetector

Constants

APPROVED_SCRIPT_COMBINATIONS

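Combinations of Unicode Scripts that are considered safe: a domain whose scripts all fall within one of these combinations is never reported as a homograph attack.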

CHINESE_SCRIPTS
JAPANESE_SCRIPTS
KOREAN_SCRIPTS
SCRIPT_BOPOMOFO

Unicode Script names returned by the 'unicode-scripts' gem

SCRIPT_COMMON
SCRIPT_CYRILLIC
SCRIPT_GREEK
SCRIPT_HAN
SCRIPT_HANGUL
SCRIPT_HIRAGANA
SCRIPT_INHERITED
SCRIPT_KATAKANA
SCRIPT_LATIN
SPECIAL_SCRIPTS

Groups of Unicode Scripts

Attributes

address[R]

Public Class Methods

homograph_attack?(address) click to toggle source
# File lib/homograph_detector.rb, line 40
# Class-level shortcut: builds a detector for +address+ and runs the
# instance-level check on it.
#
# @param address the value forwarded unchanged to the constructor
# @return whatever the instance method #homograph_attack? returns
def self.homograph_attack?(address)
  detector = new(address)
  detector.homograph_attack?
end
new(address) click to toggle source
# File lib/homograph_detector.rb, line 36
# Remembers the address to inspect. Nothing is parsed or validated here;
# the value is only stored in @address.
#
# @param address presumably a URI string — it is later handed to
#   Addressable::URI.parse by #addressable_uri
def initialize(address)
  @address = address
end

Public Instance Methods

homograph_attack?() click to toggle source
# File lib/homograph_detector.rb, line 44
# Decides whether the domain of the stored address looks like a homograph
# attack.
#
# The answer is false when the domain's Unicode Scripts cannot be determined
# or when they form an approved combination. Otherwise it is true if the
# scripts form a sketchy combination, or if every character of the domain is
# a Cyrillic character that is confusable with another character.
#
# @return [Boolean] always strictly true or false
def homograph_attack?
  # Unknown scripts and approved combinations are both treated as harmless.
  # The approved check is only reached when the scripts are known.
  return false if domain_scripts.nil? || domain_has_approved_combination_of_scripts?

  # The Cyrillic check is only evaluated when the script combination itself
  # was not already flagged.
  suspicious = domain_has_sketchy_combination_of_scripts? ||
               domain_has_confusable_cyrillic_chars?
  suspicious ? true : false
end

Private Instance Methods

addressable_uri() click to toggle source
# File lib/homograph_detector.rb, line 119
        # Parses the stored address with Addressable::URI.parse and caches the
        # result in @addressable_uri so later calls reuse it. A nil/false result is
        # not cached and triggers a fresh parse on the next call.
        def addressable_uri
  return @addressable_uri if @addressable_uri

  @addressable_uri = Addressable::URI.parse(address)
end
domain_has_approved_combination_of_scripts?() click to toggle source
# File lib/homograph_detector.rb, line 88
        # True when the set of scripts used in the domain is a subset of at least
        # one entry in APPROVED_SCRIPT_COMBINATIONS.
        def domain_has_approved_combination_of_scripts?
  scripts_in_domain = domain_scripts
  APPROVED_SCRIPT_COMBINATIONS.any? { |allowed| scripts_in_domain.subset?(allowed) }
end
domain_has_confusable_cyrillic_chars?() click to toggle source
# File lib/homograph_detector.rb, line 81
        # True when every character of the domain (TLD removed) is reported as
        # Cyrillic by Unicode::Scripts and has a Unicode::Confusable skeleton that
        # differs from the character itself.
        #
        # NOTE(review): an empty domain string satisfies this vacuously and yields
        # true.
        def domain_has_confusable_cyrillic_chars?
  domain_without_tld.each_char.all? do |letter|
    is_cyrillic = Unicode::Scripts.scripts(letter).include?(SCRIPT_CYRILLIC)
    # The skeleton lookup is skipped for non-Cyrillic characters.
    is_cyrillic && Unicode::Confusable.skeleton(letter) != letter
  end
end
domain_has_sketchy_combination_of_scripts?() click to toggle source

Returns true if one of the following is satisfied:

  • Two Unicode Scripts are used in the domain and neither is 'Latin'

  • More than two Unicode Scripts are used in the domain

  • Two Unicode Scripts are used in the domain, one is 'Latin' and the other is either 'Cyrillic' or 'Greek'

# File lib/homograph_detector.rb, line 69
        # Flags suspicious mixes of Unicode Scripts in the domain:
        #
        # * more than two scripts is always sketchy;
        # * exactly two scripts is sketchy when 'Latin' is not one of them, or when
        #   one of them is 'Cyrillic' or 'Greek';
        # * fewer than two scripts is never sketchy.
        def domain_has_sketchy_combination_of_scripts?
  scripts = domain_scripts
  return true if scripts.length > 2
  return false unless scripts.length == 2

  !scripts.include?(SCRIPT_LATIN) ||
    scripts.include?(SCRIPT_CYRILLIC) ||
    scripts.include?(SCRIPT_GREEK)
end
domain_scripts() click to toggle source

Retrieves the set of Unicode Scripts used in the domain name. Returns nil if the domain name can't be parsed.

# File lib/homograph_detector.rb, line 96
        # Set of Unicode Scripts found in the domain (TLD removed), with
        # SPECIAL_SCRIPTS subtracted. Returns nil when the domain itself is nil. The
        # computed set is cached in @domain_scripts.
        def domain_scripts
  name = domain_without_tld
  return nil if name.nil?

  @domain_scripts ||= Set.new(Unicode::Scripts.scripts(name)) - SPECIAL_SCRIPTS
end
domain_without_tld() click to toggle source

Retrieves the domain without the TLD. Returns nil if there's a parsing error.

# File lib/homograph_detector.rb, line 106
        # Returns the domain of the address with its TLD (and the separating dot)
        # stripped, e.g. a domain of "example.co.uk" with TLD "co.uk" yields
        # "example". The result is cached in @domain_without_tld.
        #
        # Returns nil when no URI object is available or when looking up the TLD
        # raises Addressable::URI::InvalidURIError or PublicSuffix::Error. A nil
        # result is not cached, so the lookup is retried on the next call.
        def domain_without_tld
  @domain_without_tld ||=
    begin
      uri = addressable_uri
      # A nil URI (presumably from a nil address) is handled like an
      # unparsable one, so nil.tld never raises NoMethodError here.
      tld = uri.tld unless uri.nil?
    rescue Addressable::URI::InvalidURIError, PublicSuffix::Error
      # The `tld` can raise a couple different errors when called if the URI
      # is invalid.
      nil
    else
      # Drop the TLD suffix first, then the dot that separated it.
      uri.domain.chomp(tld).chomp('.') unless uri.nil?
    end
end