class TruncatedSaxDocument

Constants

IGNORABLE_TAGS
VOID_TAGS

Void elements: these tags never need a closing tag, which also affects the running length calculation. See www.456bereastreet.com/archive/201005/void_empty_elements_and_self-closing_start_tags_in_html/

Attributes

ignored_levels[R]
max_length[R]
tail[R]
truncated[R]
truncated_string[R]

Public Class Methods

new(options) click to toggle source

FIXME: Call super to initialize state of the parent class.

# File lib/abbreviato/truncated_sax_document.rb, line 20
# Builds a fresh document handler from an options hash.
#
# options - Hash read with the following keys:
#           :max_length - stored as the length budget; the rest of the class
#                         compares it against byte sizes
#           :tail       - stored as the tail string; falls back to '' when the
#                         key is missing or its value is falsy
#           :fragment   - stored as the fragment-mode flag
#
# Note: this constructor intentionally does not call `super` (see the
# RuboCop directive on the `def` line).
def initialize(options) # rubocop:disable Lint/MissingSuper
  # Collaborator used to entity-encode text and attribute values
  @html_coder = HTMLEntities.new

  # Caller-supplied configuration
  @fragment_mode = options[:fragment]
  @tail = options[:tail] || ''
  @max_length = options[:max_length]

  # Mutable parsing state, all starting out "empty"
  @truncated = false
  @ignored_levels = 0
  @estimated_length = 0
  @closing_tags = []
  @truncated_string = ''
end

Public Instance Methods

cdata_block(string) click to toggle source

This method is called when the parser encounters a CDATA block. In practice, it is also called for this style of comment nested inside an element:

<style><!--
  /* Font Definitions */
  @font-face
    {font-family:Wingdings;
    panose-1:5 0 0 0 0 0 0 0 0 0;}
--></style>
# File lib/abbreviato/truncated_sax_document.rb, line 103
# SAX callback invoked when the parser encounters a CDATA block.
#
# The block is copied to the output verbatim, but only when it fits entirely
# within the remaining byte budget; otherwise it is dropped and the document
# is flagged as truncated.
#
# CDATA found while an enclosing element is being skipped (ignore mode) is
# dropped as well. Without this guard the content of a skipped element could
# be emitted on its own, detached from the tag that was supposed to wrap it.
# This matches how `characters` behaves in ignore mode.
#
# string - the raw CDATA content handed over by the parser
def cdata_block(string)
  if ignore_mode? || string.bytesize > remaining_length
    @truncated = true
    return
  end

  append_to_truncated_string(string)
end
characters(decoded_string) click to toggle source

This method is called when the parser encounters characters between tags

# File lib/abbreviato/truncated_sax_document.rb, line 64
# SAX callback invoked for character data found between tags.
#
# decoded_string - the text as delivered by the parser (entities already
#                  decoded)
#
# Text is measured in its entity-encoded form, so that e.g. `&gt;` counts as
# four bytes rather than the single byte of '>'. Text that fits is appended
# encoded; text that does not fit is shortened by `truncate_string`, and the
# tail is added when there is still room for it.
def characters(decoded_string)
  # Budget exhausted, or we are inside an element that is being skipped
  if max_length_reached? || ignore_mode?
    @truncated = true
    return
  end

  encoded = @html_coder.encode(decoded_string, :named)
  return append_to_truncated_string(encoded) if encoded.bytesize <= remaining_length

  # Shorten from the decoded text so that an entity is kept or dropped as a
  # whole and never cut in the middle
  shortened = truncate_string(decoded_string)
  tail_fits = remaining_length - shortened.bytesize >= tail.bytesize
  shortened << tail if tail_fits
  append_to_truncated_string(shortened)
end
comment(string) click to toggle source

This method is called when the parser encounters a comment

# File lib/abbreviato/truncated_sax_document.rb, line 84
# SAX callback invoked when the parser encounters a comment.
#
# The comment is re-wrapped in its `<!-- -->` delimiters and copied to the
# output only when the whole rendered comment fits in the remaining byte
# budget; otherwise it is dropped and the document is flagged as truncated.
#
# Comments found while an enclosing element is being skipped (ignore mode)
# are dropped too, so that a comment never appears in the output without the
# element that contained it. This matches how `characters` behaves in ignore
# mode.
#
# string - the comment text, without the surrounding delimiters
def comment(string)
  if ignore_mode?
    @truncated = true
    return
  end

  rendered = comment_tag(string)
  if rendered.bytesize <= remaining_length
    append_to_truncated_string(rendered)
  else
    @truncated = true
  end
end
end_document() click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 130
# SAX callback invoked once the parser reaches the end of the document.
#
# Emits a closing tag for every element that is still recorded as open,
# innermost first. Each one is appended with a cost of zero, so the running
# length estimate is left untouched.
def end_document
  zero_cost = 0
  @closing_tags.reverse_each do |open_tag|
    markup = closing_tag(open_tag)
    append_to_truncated_string(markup, zero_cost)
  end
end
end_element(name) click to toggle source

This method is called when the parser encounters a closing tag

# File lib/abbreviato/truncated_sax_document.rb, line 112
# SAX callback invoked when the parser encounters a closing tag.
#
# name - the tag name being closed
#
# While elements are being skipped this only unwinds one ignored level.
# Otherwise the closing tag is written out, unless the budget is already
# used up, the tag is one that is ignored, or the tag is a void element.
def end_element(name)
  if ignore_mode?
    exit_ignored_level(name)
    return
  end

  # Once the limit is hit, outstanding end tags are left for `end_document`,
  # which adds them as the document is closed
  return if max_length_reached?
  return if ignorable_tag?(name)
  return if single_tag_element?(name)

  @closing_tags.pop
  # The closing tag costs nothing here: its length was already accounted for
  # when the tag was opened
  append_to_truncated_string(closing_tag(name), 0)
end
start_element(name, attributes) click to toggle source

This method is called when the parser encounters an open tag

# File lib/abbreviato/truncated_sax_document.rb, line 35
# SAX callback invoked when the parser encounters an opening tag.
#
# name       - the tag name
# attributes - the tag's attributes, passed on to `opening_tag`
#
# The tag is written out only if the opening tag together with its eventual
# closing tag still fits in the remaining budget. If it does not, the element
# is skipped and ignore mode is entered for it.
def start_element(name, attributes)
  # Out of budget: note the truncation and drop the tag
  if max_length_reached?
    @truncated = true
    return
  end

  # Ignored tags are dropped silently
  return if ignorable_tag?(name)

  # Already skipping an enclosing element, so descend one level further
  if ignore_mode?
    enter_ignored_level(name)
    return
  end

  markup = opening_tag(name, attributes)

  # Reserve room for the opening tag plus (where applicable) the closing tag
  required_bytes = overridden_tag_length(name, markup)
  if required_bytes > remaining_length
    @truncated = true
    enter_ignored_level(name)
    return
  end

  # Remember the tag so that it can be closed later
  @closing_tags.push(name) unless single_tag_element?(name)

  append_to_truncated_string(markup, required_bytes)
end

Private Instance Methods

append_to_truncated_string(string, overridden_length = nil) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 161
# Adds text to the output buffer and charges it against the length estimate.
#
# string            - text to append to the output
# overridden_length - when given (and truthy), this amount is added to the
#                     estimate instead of the byte size of `string`
#
# Returns the updated length estimate.
def append_to_truncated_string(string, overridden_length = nil)
  @truncated_string << string
  byte_cost = overridden_length || string.bytesize
  @estimated_length += byte_cost
end
attributes_to_string(attributes) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 166
# Renders attribute pairs as they appear inside an opening tag.
#
# attributes - enumerable of key/value pairs
#
# Every value is passed through the HTML coder and wrapped in single quotes.
# The result starts with a single space and has no trailing space, e.g.
# " a='1' b='2'". An empty collection yields an empty string.
def attributes_to_string(attributes)
  pairs = attributes.map do |key, value|
    "#{key}='#{@html_coder.encode value}'"
  end
  return '' if pairs.empty?

  " #{pairs.join(' ')}"
end
closing_tag(name) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 149
# Renders the closing tag for an element.
#
# name - the tag name
#
# Returns a string of the form "</name>".
def closing_tag(name)
  format('</%s>', name)
end
comment_tag(comment) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 145
# Wraps comment text in HTML comment delimiters.
#
# comment - the comment body
#
# Returns a string of the form "<!--comment-->".
def comment_tag(comment)
  format('<!--%s-->', comment)
end
enter_ignored_level(name) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 204
# Goes one level deeper into ignore mode for the given tag.
#
# name - the tag name being skipped
#
# Void elements leave the depth unchanged.
def enter_ignored_level(name)
  return if single_tag_element?(name)

  @ignored_levels += 1
end
exit_ignored_level(name) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 208
# Comes back up one level of ignore mode for the given tag.
#
# name - the tag name whose end was reached
#
# Void elements leave the depth unchanged.
def exit_ignored_level(name)
  return if single_tag_element?(name)

  @ignored_levels -= 1
end
ignorable_tag?(name) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 200
# Tells whether a tag should be left out of the output.
#
# name - the tag name; compared case-insensitively
#
# Only applies in fragment mode. Outside fragment mode the (falsy) fragment
# flag itself is returned; otherwise the result says whether the lower-cased
# name is listed in IGNORABLE_TAGS.
def ignorable_tag?(name)
  return @fragment_mode unless @fragment_mode

  IGNORABLE_TAGS.include?(name.downcase)
end
ignore_mode?() click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 212
# Tells whether parsing is currently inside at least one skipped element,
# i.e. whether the ignored-level counter is above zero.
def ignore_mode?
  @ignored_levels > 0
end
max_length_reached?() click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 173
# Tells whether the length budget is used up, i.e. whether the running
# estimate has reached or passed the maximum length so that no budget remains.
def max_length_reached?
  remaining_length <= 0
end
opening_tag(name, attributes) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 136
# Renders the opening tag for an element, including its attributes.
#
# name       - the tag name
# attributes - attribute pairs, rendered by `attributes_to_string`
#
# Void elements are rendered self-closed ("<br/>"); everything else is
# rendered as a plain opening tag ("<p>").
def opening_tag(name, attributes)
  rendered_attributes = attributes_to_string(attributes)
  terminator = single_tag_element?(name) ? '/>' : '>'
  "<#{name}#{rendered_attributes}#{terminator}"
end
overridden_tag_length(tag_name, rendered_tag_with_attributes) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 191
def overridden_tag_length(tag_name, rendered_tag_with_attributes)
  # Start with the opening tag
  length = rendered_tag_with_attributes.bytesize

  # Add on closing tag if necessary
  length += closing_tag(tag_name).bytesize unless single_tag_element?(tag_name)
  length
end
remaining_length() click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 153
# Returns how much of the length budget is left: the maximum length minus
# the running estimate. The result is negative when the estimate has
# overshot the maximum.
def remaining_length
  budget = max_length
  budget - @estimated_length
end
single_tag_element?(name) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 157
# Tells whether the tag is a void element, i.e. whether its name is listed
# in VOID_TAGS. The name is compared exactly as given.
#
# name - the tag name
def single_tag_element?(name)
  VOID_TAGS.include?(name)
end
truncate_string(decoded_string) click to toggle source
# File lib/abbreviato/truncated_sax_document.rb, line 177
# Shortens text so that it fits in the remaining budget with room left for
# the tail, and flags the document as truncated.
#
# decoded_string - the text to shorten, as delivered by the parser
#
# The text is consumed one character at a time. Each character is
# entity-encoded on its own, so an entity is either kept whole or not at all.
# Collection stops at the first character that no longer fits.
#
# Returns the encoded, shortened text with invalid byte sequences removed
# (possibly an empty string).
def truncate_string(decoded_string)
  @truncated = true

  # Bytes available for text once the tail has been set aside
  budget = remaining_length - tail.bytesize
  kept = +''

  decoded_string.each_char do |char|
    entity = @html_coder.encode(char)
    break if entity.bytesize > budget

    kept << entity
    budget -= entity.bytesize
  end

  kept.scrub('')
end