class TextUtils::Sanitizier

Public Class Methods

new( ht ) click to toggle source
# File lib/textutils/sanitizier.rb, line 14
# Wraps a piece of hypertext so it can later be converted with
# to_plain_text.
#
# ht - the hypertext (HTML source) to keep for later processing
def initialize( ht )
  @ht = ht
end

Public Instance Methods

handle_block_tags( ht ) click to toggle source
# File lib/textutils/sanitizier.rb, line 62
# Replaces every element listed in @@block_tags with its content
# wrapped in newlines, i.e. <tag ...>content</tag> becomes
# "\ncontent\n".
#
# ht - hypertext string; it is edited in place (gsub!)
#
# Returns the same string object that was passed in.
def handle_block_tags( ht )
  @@block_tags.each do |tag|
    # The backslash must be doubled: inside a double-quoted string a
    # bare \1 is the octal escape for the control character U+0001,
    # not a backreference, so the element content would be lost.
    ht.gsub!( tag_regex(tag), "\n\\1\n" )
  end
  ht
end
handle_entities( ht ) click to toggle source
# File lib/textutils/sanitizier.rb, line 34
# Unescapes HTML entities in the given text via CGI.unescapeHTML.
#
# NOTE(review): per the CGI documentation only &amp; &lt; &gt; &quot;
# &#39; and numeric character references are decoded; other named
# entities such as &nbsp; appear to be left as-is - confirm whether
# that is sufficient here.
#
# ht - hypertext string (not modified)
#
# Returns a new string with the entities decoded.
def handle_entities( ht )
  CGI.unescapeHTML( ht )
end
handle_ignore_tags( ht ) click to toggle source
# File lib/textutils/sanitizier.rb, line 47
# Deletes every element listed in @@ignore_tags - opening tag,
# content and closing tag - from the given text.
#
# ht - hypertext string; it is edited in place (gsub!)
#
# Returns the same string object that was passed in.
def handle_ignore_tags( ht )
  @@ignore_tags.each { |name| ht.gsub!( tag_regex(name), '' ) }
  ht
end
handle_inline_tags( ht ) click to toggle source
# File lib/textutils/sanitizier.rb, line 54
# Unwraps every element listed in @@inline_tags: the tags are dropped
# and the element content is kept, followed by a single space.
#
# ht - hypertext string; it is edited in place (gsub!)
#
# Returns the same string object that was passed in.
def handle_inline_tags( ht )
  @@inline_tags.each do |name|
    # group 1 of tag_regex is the element content; append a space
    ht.gsub!( tag_regex(name) ) { "#{Regexp.last_match(1)} " }
  end
  ht
end
tag_regex( tag ) click to toggle source
# File lib/textutils/sanitizier.rb, line 41
# Builds a regex that matches one complete <tag ...>content</tag>
# element and captures the content as group 1.
#
# The pattern is case-insensitive (i) and lets the content span
# several lines (m).
#
# tag - element name; interpolated into the pattern unescaped, so it
#       is treated as regex source
#
# Returns a Regexp.
def tag_regex( tag )
  # The lookahead requires whitespace, "/" or ">" right after the tag
  # name, so that e.g. 'b' does not also match <br> or <body>.
  # Content uses non-greedy .*? so the match stops at the first
  # closing tag.
  %r{<#{tag}(?=[\s/>])[^>]*>(.*?)</#{tag}>}mi
end
to_plain_text() click to toggle source
# File lib/textutils/sanitizier.rb, line 18
  # Converts the hypertext given to the constructor into plain text.
  #
  # The text is passed through the handlers in a fixed order: ignored
  # elements, inline tags, block tags, remaining tags
  # (handle_other_tags) and finally entity unescaping.
  #
  # Returns the converted string; the string stored in @ht is left
  # untouched.
  def to_plain_text
    # Work on a copy: the tag handlers edit their argument in place
    # (gsub!), which would otherwise rewrite the string handed to the
    # constructor - or raise if that string is frozen.
    ht = @ht.dup
    ht = handle_ignore_tags( ht )

    ## handle_pre_tags ??  - special rule for preformatted (keep whitespace)

    ht = handle_inline_tags( ht )
    ht = handle_block_tags( ht )
    ht = handle_other_tags( ht )  # rules for remaining/left-over tags

    handle_entities( ht )
  end