class Kudzu::Agent::Util::CharsetDetector

Constants

CORRECTION

Public Class Methods

detect(response) click to toggle source
# File lib/kudzu/agent/util/charset_detector.rb, line 13
def detect(response)
  if response.html?
    from_html(response.body) || from_text(response.body)
  elsif response.xml?
    from_xml(response.body) || from_text(response.body)
  elsif response.text?
    from_text(response.body)
  end
rescue => e
  Kudzu.log :warn, "failed to detect charset: #{response.url}", error: e
  nil
end

Private Class Methods

correct(charset) click to toggle source
# File lib/kudzu/agent/util/charset_detector.rb, line 69
def correct(charset)
  charset = charset.downcase
  charset = CORRECTION[charset] if CORRECTION.key?(charset)

  begin
    Encoding.find(charset)
  rescue
    charset = nil
  end
  charset
end
from_html(body) click to toggle source
# File lib/kudzu/agent/util/charset_detector.rb, line 28
def from_html(body)
  doc = Nokogiri::HTML(body.encode('utf-8', undef: :replace, invalid: :replace))

  if (node = doc.xpath('//meta/@charset').first)
    charset = correct(node.to_s)
    return charset if charset
  end

  doc.xpath('//meta[@http-equiv]').each do |meta|
    if meta['http-equiv'] =~ /content-type/i
      charset = ContentTypeParser.parse(meta[:content].to_s)[1][:charset]
      charset = correct(node.to_s)
      return charset if charset
    end
  end

  return nil
end
from_text(text) click to toggle source
# File lib/kudzu/agent/util/charset_detector.rb, line 56
def from_text(text)
  if text.ascii_only?
    'ascii'
  else
    detection = CharlockHolmes::EncodingDetector.detect(text)
    if detection && detection.key?(:encoding)
      detection[:encoding].downcase
    else
      nil
    end
  end
end
from_xml(body) click to toggle source
# File lib/kudzu/agent/util/charset_detector.rb, line 47
def from_xml(body)
  doc = Nokogiri::XML(body.encode('utf-8', undef: :replace, invalid: :replace))
  if doc.encoding
    correct(doc.encoding)
  else
    nil
  end
end