class SiteDiff::Sanitizer

SiteDiff Sanitizer.

Constants

DOM_TRANSFORMS
TOOLS

Public Class Methods

domify(str, force_doc = false) click to toggle source

Parse HTML into a node

# File lib/sitediff/sanitize.rb, line 213
# Parse an HTML string into a Nokogiri node.
#
# str       - the HTML source to parse.
# force_doc - when true, always parse as a full document.
#
# Returns the result of Nokogiri::HTML(str) when force_doc is set or a
# DOCTYPE declaration appears within the first 512 characters; otherwise
# returns the result of Nokogiri::HTML.fragment(str).
def self.domify(str, force_doc = false)
  # HTML matches the DOCTYPE keyword case-insensitively, and the lowercase
  # "<!doctype html>" spelling is common, so the sniff must ignore case.
  if force_doc || /<!DOCTYPE/i.match?(str[0, 512])
    Nokogiri::HTML(str)
  else
    Nokogiri::HTML.fragment(str)
  end
end
new(html, config, opts = {}) click to toggle source

Creates a Sanitizer.

# File lib/sitediff/sanitize.rb, line 23
# Build a sanitizer for one piece of HTML.
#
# html   - the HTML to sanitize.
# config - rule configuration, looked up by string keys elsewhere in
#          this class.
# opts   - extra options (this class reads :path and :output).
def initialize(html, config, opts = {})
  @html, @config, @opts = html, config, opts
end
prettify(obj) click to toggle source

Pretty-print some HTML

# File lib/sitediff/sanitize.rb, line 170
# Pretty-print some HTML.
#
# obj - anything to_document accepts.
#
# The object is run through the pretty_print.xsl stylesheet (loaded once
# and memoized in @stylesheet), then the serialized text is tidied up:
# the XML declaration and <html> wrapper lines are removed, the leading
# indentation is stripped, and blank lines and DOS line endings are dropped.
#
# Returns the cleaned-up String.
def self.prettify(obj)
  @stylesheet ||= begin
    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
    Nokogiri::XSLT(File.read(stylesheet_path))
  end

  str = @stylesheet.apply_to(to_document(obj))

  # Prevent potential UTF-8 encoding errors by removing invalid bytes.
  # Only sequences that are not valid UTF-8 are dropped; transcoding from
  # 'binary' instead would silently delete every non-ASCII character.
  str = str.dup.force_encoding(Encoding::UTF_8).scrub('')

  # Remove xml declaration and <html> tags
  str.sub!(/\A<\?xml.*$\n/, '')
  str.sub!(/\A^<html>$\n/, '')
  str.sub!(%r{</html>\n\Z}, '')

  # Remove top-level indentation: measure the leading whitespace, then
  # strip up to that many whitespace characters from the start of each line.
  indent = /\A(\s*)/.match(str)[1].size
  str.gsub!(/^\s{,#{indent}}/, '')

  # Remove blank lines
  str.gsub!(/^\s*$\n/, '')

  # Remove DOS newlines, both literal and as a character reference
  str.gsub!(/\x0D$/, '')
  str.gsub!(/&#13;$/, '')

  str
end
remove_node_spacing(node) click to toggle source

Remove double-spacing inside text nodes

# File lib/sitediff/sanitize.rb, line 138
# Collapse each run of consecutive spaces to a single space.
#
# Only the text nodes returned by the '//text()' XPath query are
# rewritten, so attribute values are left alone.
#
# node - an object responding to #xpath.
def self.remove_node_spacing(node)
  node.xpath('//text()').each do |text_node|
    text_node.content = text_node.content.squeeze(' ')
  end
end
select_fragments(node, sel) click to toggle source

Get a fragment consisting of the elements matching the selector(s)

# File lib/sitediff/sanitize.rb, line 160
# Build a fragment whose children are the elements matching a selector.
#
# node - the node to search; if it is already a fragment it is reused
#        (and modified), otherwise a fresh empty HTML fragment is created,
#        which means any DOCTYPE and the like is lost.
# sel  - the CSS selector(s) passed to node.css.
#
# Returns the fragment holding the matches.
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end
to_document(obj) click to toggle source

Force this object to be a document, so we can apply a stylesheet

# File lib/sitediff/sanitize.rb, line 222
# Force this object to be a document, so we can apply a stylesheet.
#
# obj - a document, a node or fragment, or anything else domify accepts
#       (such as an HTML string).
#
# Documents are returned as-is. Nodes and fragments are serialized with
# to_s and re-parsed as a full document. Anything else is parsed with
# domify and the result is converted again.
#
# Types are matched with case/when (is-a semantics) rather than exact class
# equality, so subclasses such as elements take the node branch instead
# of being handed to domify as though they were strings.
def self.to_document(obj)
  case obj
  when Nokogiri::XML::Document, Nokogiri::HTML::Document
    obj
  when Nokogiri::XML::Node, Nokogiri::HTML::DocumentFragment
    # node or fragment
    domify(obj.to_s, true)
  else
    to_document(domify(obj, false))
  end
end

Public Instance Methods

canonicalize_rule(name) click to toggle source

Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'. It may be a simple value, or a hash, or an array of hashes. Turn it into an array of hashes.

# File lib/sitediff/sanitize.rb, line 61
# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
#
# name - the key to look up in @config.
#
# The configured value may be a scalar, a hash, or an array of hashes; it is
# normalized to an array of hashes and filtered through want_rule.
#
# Returns the single wanted rule hash, or nil when the rule is not
# configured or no rule is wanted.
# Raises RuntimeError when more than one rule is wanted.
def canonicalize_rule(name)
  rules = @config[name]
  return nil unless rules

  # Dispatch on the value's type. Probing with rules[0] / rules['value']
  # breaks on scalars: `true` has no #[], and a selector string such as
  # '.field-value' answers rules['value'] with a substring.
  candidates =
    case rules
    when Array
      # Already an array? Do nothing.
      rules
    when Hash
      # A rule hash goes into an array; a hash without a 'value' key is
      # treated as the value itself.
      rules.key?('value') ? [rules] : [{ 'value' => rules }]
    else
      # If it is a scalar value, wrap it in a rule hash.
      [{ 'value' => rules }]
    end

  want = candidates.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end
dom_transforms() click to toggle source

Perform DOM transforms

# File lib/sitediff/sanitize.rb, line 125
# Perform DOM transforms.
#
# Each rule under @config['dom_transform'] that want_rule accepts is
# turned into a transform with DomTransform.create and applied to @node,
# in configuration order.
#
# Returns nil when no transforms are configured, otherwise the array of
# rules that were applied.
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  configured.select { |candidate| want_rule(candidate) }.each do |wanted|
    DomTransform.create(wanted).apply(@node)
  end
end
regexps() click to toggle source

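Applies regexps. Selector-scoped rules are applied to the DOM first; the DOM is then pretty-printed to an HTML string, and the remaining (global) rules are applied to that string.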

# File lib/sitediff/sanitize.rb, line 101
# Applies regexps.
#
# Rules come from @config['sanitization'], filtered through want_rule and
# built with Regexp.create. Rules that answer true to selector? are
# applied to the DOM (@node). The DOM is then pretty-printed into @html
# and @node is cleared. The remaining rules are applied to the @html string.
#
# Returns nil when no sanitization rules are configured, otherwise the
# array of string-level (global) rules.
def regexps
  (configured = @config['sanitization']) || return

  rules = configured.select { |r| want_rule(r) }.map { |r| Regexp.create(r) }
  # Named so the locals do not shadow this class's `selector` method.
  node_rules, string_rules = rules.partition(&:selector?)

  node_rules.each { |r| r.apply(@node) }
  @html = Sanitizer.prettify(@node)
  @node = nil

  # Prevent potential UTF-8 encoding errors by removing invalid bytes.
  # Only sequences that are not valid UTF-8 are dropped; transcoding from
  # 'binary' instead would silently delete every non-ASCII character.
  @html = @html.dup.force_encoding(Encoding::UTF_8).scrub('')

  string_rules.each { |r| r.apply(@html) }
end
regions() click to toggle source

Perform 'regions' action, don't perform 'selector' if regions exist.

# File lib/sitediff/sanitize.rb, line 88
# Perform the 'regions' action.
#
# When validate_regions passes, @node is replaced by the result of
# select_regions and that new node is returned; otherwise nil is returned
# and @node is left alone. The caller uses the return value to decide
# whether the 'selector' action should still run.
def regions
  @node = select_regions(@node, @config['regions'], @opts[:output]) if validate_regions
end
remove_spacing() click to toggle source

Perform 'remove_spacing' action

# File lib/sitediff/sanitize.rb, line 82
# Perform the 'remove_spacing' action.
#
# Does nothing unless a 'remove_spacing' rule is wanted and its 'value'
# is truthy; in that case the spacing inside @node's text is collapsed.
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule && rule['value']

  Sanitizer.remove_node_spacing(@node)
end
sanitize() click to toggle source

Performs sanitization.

# File lib/sitediff/sanitize.rb, line 31
# Performs sanitization.
#
# Parses @html into @node, then runs the actions in order:
# remove_spacing, regions (falling back to selector when regions returns
# a falsy value), dom_transforms and regexps.
#
# Returns '' for empty input. Otherwise returns @html when a step has set
# it, or else the pretty-printed @node.
def sanitize
  # Quick return on empty input
  return '' if @html == ''

  @node = Sanitizer.domify(@html)
  @html = nil

  remove_spacing
  selector unless regions
  dom_transforms
  regexps

  return @html if @html

  Sanitizer.prettify(@node)
end
select_regions(node, regions, output) click to toggle source

Restructure the node into regions.

# File lib/sitediff/sanitize.rb, line 146
# Restructure the node into regions.
#
# node    - the node to search with CSS selectors.
# regions - region definitions, each with 'name' and 'selector' entries.
# output  - the region names to emit, in order.
#
# For every name in output a <region id="NAME"> element is created and
# the nodes matching that region's selector are added to it as children.
#
# Returns a new HTML fragment containing the region elements.
def select_regions(node, regions, output)
  wrappers = output.map do |name|
    css = get_named_region(regions, name)['selector']
    holder = Nokogiri::XML.fragment('<region id="' + name + '"></region>').at_css('region')
    node.css(css).each { |hit| holder.add_child hit }
    holder
  end

  root = Nokogiri::HTML.fragment('')
  wrappers.each { |wrapper| root.add_child wrapper }
  root
end
selector() click to toggle source

Perform 'selector' action, to choose a new root

# File lib/sitediff/sanitize.rb, line 95
# Perform the 'selector' action, to choose a new root.
#
# When a 'selector' rule is wanted, @node is replaced by the fragment of
# elements matching the rule's 'value' and that fragment is returned;
# otherwise nil is returned.
def selector
  rule = canonicalize_rule('selector')
  @node = Sanitizer.select_fragments(@node, rule['value']) if rule
end
want_rule(rule) click to toggle source

Return whether or not we want to keep a rule

# File lib/sitediff/sanitize.rb, line 46
# Return whether or not we want to keep a rule.
#
# rule - a rule hash, or nil.
#
# Returns false for a missing rule or one marked 'disabled'. When the
# rule has a 'path' and @opts[:path] is set, the rule's path is compiled
# as a regular expression and the match result (MatchData or nil) is
# returned. In every other case returns true.
def want_rule(rule)
  return false if !rule || rule['disabled']

  # Only filter by path when both the rule and the options supply one.
  pattern = rule['path']
  current_path = pattern && @opts[:path]
  return true unless current_path

  ::Regexp.new(pattern).match(current_path)
end

Private Instance Methods

get_named_region(regions, name) click to toggle source

Return the region with the given name, or nil if there is none.

# File lib/sitediff/sanitize.rb, line 256
# Look up a region definition by name.
#
# regions - a collection of region hashes.
# name    - the value to compare against each region's 'name' entry.
#
# Returns the first region whose 'name' equals name, or nil if none does.
def get_named_region(regions, name)
  regions.each { |candidate| return candidate if candidate['name'] == name }
  nil
end
validate_regions() click to toggle source

Validate `regions` and `output` from config.

# File lib/sitediff/sanitize.rb, line 236
# Validate `regions` and `output` from config.
#
# Returns true only when @config['regions'] and @opts[:output] are both
# arrays, every region has 'name' and 'selector' keys, and every name in
# the output list has an associated region. Returns false otherwise.
def validate_regions
  regions = @config['regions']
  return false unless regions.is_a?(Array)

  output = @opts[:output]
  return false unless output.is_a?(Array)

  well_formed = regions.all? { |region| region.key?('name') && region.key?('selector') }
  return false unless well_formed

  # Check that each named output has an associated region.
  output.all? { |name| get_named_region(regions, name) }
end