class Domainatrix::DomainParser
Constants
- VALID_SCHEMA
Attributes
approved_sections[R]
found_sections[R]
public_suffixes[R]
Public Class Methods
new(file_name, approved_sections = (Array.new << "*"))
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 15
# Sets up an empty parser and then loads suffix rules from +file_name+.
#
# file_name::         path handed to read_dat_file.
# approved_sections:: section names stored for read_dat_file to filter on;
#                     defaults to a one-element array holding "*".
def initialize(file_name, approved_sections = (Array.new << "*"))
  @approved_sections = approved_sections
  @found_sections = []
  @public_suffixes = {}

  # All state must exist before loading: read_dat_file fills the collections above.
  read_dat_file(file_name)
end
parse(url)
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 11
# Convenience entry point: builds a parser from the effective_tld_names.dat
# file located one directory above this source file and parses +url+ with it.
# A new parser (and therefore a fresh read of the DAT file) is created on
# every call.
def self.parse(url)
  dat_path = "#{File.dirname(__FILE__)}/../effective_tld_names.dat"
  parser = new(dat_path)
  parser.parse(url)
end
Public Instance Methods
parse(url)
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 53
# Parses +url+ into a hash describing its parts.
#
# Returns an empty hash when +url+ is nil or blank. Otherwise returns the
# hash produced for the host (:public_suffix, :domain, :subdomain and, for
# non-localhost hosts, whatever else parse_domains_from_host supplies)
# merged with :scheme, :host, :path (path plus any query and fragment) and
# :url (the normalized URL string).
#
# Raises ParseError when the URL cannot be parsed by Addressable::URI, when
# its scheme does not match VALID_SCHEMA, or when it has no host.
def parse(url)
  return {} unless url && url.strip != ''

  # Bare hosts such as "example.com" get a default scheme so they parse as URLs.
  url = "http://#{url}" unless url[/:\/\//]
  url = url.downcase

  uri = begin
    Addressable::URI.parse(url)
  rescue Addressable::URI::InvalidURIError
    nil
  end

  raise ParseError, "URL is not parsable by Addressable::URI" if not uri
  url = uri.normalize.to_s
  raise ParseError, "URL does not have valid scheme" unless uri.scheme =~ VALID_SCHEMA
  raise ParseError, "URL does not have a valid host" if uri.host.nil?

  # Work on a copy: appending to the string returned by uri.path would
  # modify the URI object's own path in place.
  path = uri.path.dup
  path << "?#{uri.query}" if uri.query
  path << "##{uri.fragment}" if uri.fragment

  if uri.host == 'localhost'
    # NOTE(review): unlike parse_domains_from_host, this hash carries no
    # :ip_address key - confirm whether callers rely on that difference.
    uri_hash = { :public_suffix => '', :domain => 'localhost', :subdomain => '' }
  else
    # The host is known to be non-nil here (checked above).
    uri_hash = parse_domains_from_host(uri.host)
  end

  uri_hash.merge({
    :scheme => uri.scheme,
    :host   => uri.host,
    :path   => path,
    :url    => url
  })
end
parse_domains_from_host(host)
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 108
# Breaks +host+ into public suffix, domain and subdomain using the rule
# tree held in @public_suffixes.
#
# Returns {} when +host+ is nil. Otherwise returns a hash with the keys
# :public_suffix, :domain, :subdomain and :ip_address. A host whose
# dot-separated labels are all 1-3 digit numbers is reported as an IP
# address: the whole host becomes :domain and the other strings are empty.
#
# Raises ParseError for a host with fewer than two labels (other than the
# bare "*") and for a top-level label missing from @public_suffixes.
def parse_domains_from_host(host)
  return {} unless host

  # Labels are kept right-to-left, e.g. "www.pauldix.com" => com, pauldix, www.
  labels = host.split(".").reverse
  bare_wildcard = (host == '*')

  if !bare_wildcard && labels.all? { |label| label =~ /^\d{1,3}$/ }
    # host is an ip address
    return {:public_suffix => '', :domain => host, :subdomain => '', :ip_address => true}
  end

  suffix_size = 0
  unless bare_wildcard
    suffix_size = 1
    raise ParseError, "Invalid URL" if labels.size < 2

    top_level = labels.first
    if top_level != '*'
      # PunyCode and new arbitrary TLDs make character checks unreliable,
      # so the DAT-derived rule tree is the only authority on valid TLDs.
      rules = @public_suffixes[top_level]
      raise ParseError, "Invalid main TLD: #{top_level}" unless rules

      # Walk leftwards one label at a time; position labels.size is past the
      # end, so the final candidate is nil, exactly as a lookahead would give.
      1.upto(labels.size) do |position|
        # no extra rules found (eg domain.net)
        break if rules.empty?

        candidate = labels[position]

        # exception tld domain found (eg metro.tokyo.jp)
        break if rules.has_key?("!#{candidate}")

        if rules.has_key?(candidate)
          # valid extra domain level found (eg co.uk)
          suffix_size += 1
          rules = rules[candidate]
        else
          # wildcard domain level (eg *.jp) extends the suffix by one label;
          # otherwise there are no further rules. Either way the walk ends.
          suffix_size += 1 if rules.has_key?('*')
          break
        end
      end
    end
  end

  subdomain, domain, tld = split_domain(labels, suffix_size)
  {:public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => false}
end
read_dat_file(file_name)
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 22
# Reads the suffix list at +file_name+ and loads its rules into
# @public_suffixes as a nested hash keyed by label, right-most label first
# (so the rule "co.uk" becomes {"uk" => {"co" => {}}}).
#
# Every "// ===BEGIN <name>===" marker starts a new section whose name is
# appended to @found_sections. Rules are only loaded while the current
# section is listed in @approved_sections, or always when that list
# contains "*". Comment lines (starting with "//") and blank lines carry no
# rules. A "// ===BEGIN" line without the closing "===" is treated as a
# plain comment.
#
# The file is opened with the block form of File.open so the handle is
# closed even if reading raises.
def read_dat_file(file_name)
  # If we're in 1.9, make sure we're opening it in UTF-8
  mode = RUBY_VERSION >= '1.9' ? "r:UTF-8" : "r"
  accept_all = @approved_sections.include?("*")
  section = ""

  File.open(file_name, mode) do |dat_file|
    dat_file.each_line do |line|
      line = line.strip

      #// ===BEGIN ICANN DOMAINS===
      if (marker = /^\/\/ ===BEGIN(.*)===/.match(line))
        section = marker[1].strip
        @found_sections << section
      end

      next if (line =~ /^\/\//) || line.empty?
      next unless accept_all || @approved_sections.include?(section)

      # Descend (creating levels as needed) from the TLD towards the left-most label.
      sub_hash = @public_suffixes
      line.split(".").reverse.each do |part|
        sub_hash = (sub_hash[part] ||= {})
      end
    end
  end
end
split_domain(parts, tld_size)
click to toggle source
# File lib/domainatrix/domain_parser.rb, line 88
# Splits reversed host labels into [subdomain, domain, tld].
#
# parts::    host labels split on "." and reversed, eg com, pauldix, www.
# tld_size:: number of labels that make up the public suffix.
#
# A single label with a zero-sized suffix yields ['', '*', '']. Raises
# ParseError when the suffix would consume every label, leaving no domain.
def split_domain(parts, tld_size)
  return ['', '*', ''] if parts.size == 1 && tld_size == 0

  # Back to natural order (eg www, pauldix, com); reverse returns a new
  # array, so the caller's +parts+ is left untouched by the removals below.
  labels = parts.reverse
  unless labels.size > tld_size
    raise ParseError, "Invalid TLD size found for #{labels.join('.')}: #{tld_size}"
  end

  # Peel the suffix off the right-hand end, then the registered domain;
  # whatever remains on the left is the subdomain.
  suffix_labels = labels.slice!(-tld_size, tld_size)
  registered = labels.pop
  [labels.join('.'), registered, suffix_labels.join('.')]
end