class REHTML::Tokenizer

Public Class Methods

new(html) click to toggle source

Create a new Tokenizer for the given text.

# File lib/rehtml/tokenizer.rb, line 16
# Build a tokenizer over +html+.
#
# html:: the markup to tokenize; it is handed unchanged to Scanner.new.
def initialize(html)
  # Offset passed to set_token_info for the next token (see add_parse_info).
  @bpos = 0
  @scanner = Scanner.new(html)
end

Public Instance Methods

next() click to toggle source

Return the next token in the sequence, or nil if there are no more tokens in the stream.

# File lib/rehtml/tokenizer.rb, line 23
# Produce the next token from the scanner.
#
# Returns nil once the scanner reports end of input. Otherwise input that
# starts with "<" followed by a non-whitespace character is handed to
# scan_element and anything else to scan_text; the resulting token is passed
# through add_parse_info before being returned.
def next
  return if @scanner.eos?

  token = if @scanner.check(/<\S/)
            scan_element
          else
            scan_text
          end
  add_parse_info(token)
end

Private Instance Methods

add_parse_info(node) click to toggle source
# File lib/rehtml/tokenizer.rb, line 29
# Attach position information to a freshly scanned token.
#
# The node is extended with TokenInfo and given the offset where the token
# began (@bpos) together with the scanner. @bpos is then moved to the
# scanner's current position so the following token starts from there.
#
# Returns the same node object that was passed in.
def add_parse_info(node)
  node.extend(TokenInfo).tap do |token|
    token.set_token_info(@bpos, @scanner)
    @bpos = @scanner.pos
  end
end
decode(html) click to toggle source

Decode the HTML entities (named and numeric character references) found in the given string.

# File lib/rehtml/tokenizer.rb, line 41
# Decode HTML character references in +html+ and return the new string.
#
# Every match of ENTITIES::REGEXP is replaced as follows:
# * capture 1 (an entity name) is looked up in ENTITIES::MAP; unknown names
#   are left exactly as written;
# * capture 2 is read as a decimal code point, capture 3 as a hexadecimal one;
# * a match with none of these captures is left untouched.
#
# Numeric references that are not Unicode scalar values (greater than
# U+10FFFF, or in the surrogate range U+D800..U+DFFF) become U+FFFD instead
# of being packed blindly, which would yield an invalid UTF-8 string or raise
# RangeError for very large numbers.
def decode(html)
  html.gsub(ENTITIES::REGEXP) do
    match = Regexp.last_match
    if match[1]
      ENTITIES::MAP[match[1]] || match[0]
    elsif match[2] || match[3]
      code = match[2] ? match[2].to_i(10) : match[3].to_i(16)
      if code > 0x10FFFF || (0xD800..0xDFFF).cover?(code)
        "\uFFFD" # replacement character for unrepresentable code points
      else
        [code].pack('U')
      end
    else
      match[0]
    end
  end
end
scan_doctype() click to toggle source
# File lib/rehtml/tokenizer.rb, line 118
# Consume the remainder of a DOCTYPE declaration and return a DocType token.
#
# scan_element calls this after it has consumed "<!DOCTYPE" and the
# whitespace that follows. The declaration text up to ">" (or to the end of
# input - presumably, judging by the helper's name) is scanned and discarded;
# the returned DocType carries none of it.
def scan_doctype
  # TODO: parse complex doctypes, e.g. with REXML's base parser:
  # https://github.com/ruby/ruby/blob/master/lib/rexml/parsers/baseparser.rb#L258
  # source = REXML::Source.new(doctype)
  # parser = REXML::Parsers::BaseParser.new(source)
  # while parser.document_status == in_doctype
  #   parser.pull_event
  @scanner.scan_before_or_eos(/>/, true)
  DocType.new
end
scan_element() click to toggle source
# File lib/rehtml/tokenizer.rb, line 59
# Scan one markup construct that starts with "<" and return its token.
#
# The opening delimiter decides what is produced, checked in this order:
# * "<!--"            -> Comment holding the text before "-->"
# * "<![CDATA["       -> CData holding the text before "]]>"
# * "<!DOCTYPE" + whitespace (case-insensitive) -> result of scan_doctype
# * any other "<!"    -> Comment holding the text before ">"
# * "<?"              -> result of scan_pi (processing instruction or XML declaration)
# * everything else   -> result of scan_tag
def scan_element
  return Comment.new(@scanner.scan_before_or_eos(/-->/, true)) if @scanner.scan(/<!--/)
  return CData.new(@scanner.scan_before_or_eos(/\]\]>/, true)) if @scanner.scan(/<!\[CDATA\[/)
  return scan_doctype if @scanner.scan(/<!DOCTYPE[\x20\x09\x0A\x0C\x0D]+/i)
  return Comment.new(@scanner.scan_before_or_eos(/>/, true)) if @scanner.scan(/<!/)
  return scan_pi if @scanner.scan(/<\?/)

  scan_tag
end
scan_pi() click to toggle source
# File lib/rehtml/tokenizer.rb, line 112
# Scan a processing instruction (or XML declaration) and return an Instruction.
#
# scan_element calls this after consuming "<?". The target is the longest run
# of XML name characters at the current position (an empty string when there
# is none); the body is the text before the closing "?>".
def scan_pi
  # Character set taken from http://www.w3.org/TR/REC-xml/#NT-Name
  target = @scanner.scan(/([-:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD0-9\u00B7\u0300-\u036F\u203F-\u2040]+)/)
  target ||= ""
  content = @scanner.scan_before_or_eos(/\?>/, true)
  Instruction.new(target, content)
end
scan_tag() click to toggle source
# File lib/rehtml/tokenizer.rb, line 77
# Scan a start or end tag at the current position and return a Tag or EndTag.
#
# The tag name ends at whitespace, ">" or "/", so "<br/>" is read as the
# self-closing tag "br" rather than a tag literally named "br/". Runs of
# whitespace between attributes and around "=" are skipped as a whole, so
# 'href  =  "x"' yields href => "x".
#
# Tag and attribute names are downcased. Attribute values are passed through
# decode; an attribute without "=" gets the empty string. The third argument
# given to Tag/EndTag tells whether the tag was closed with "/>".
def scan_tag
  whitespace = /[\x20\x09\x0A\x0C\x0D]+/

  @scanner.scan(/<(\/)?([^\x20\x09\x0A\x0C\x0D>\/]*)/)
  is_end = !@scanner[1].nil?
  name = @scanner[2].downcase
  attrs = {}

  loop do
    @scanner.skip(whitespace)
    attr = @scanner.scan_before_or_eos(/[=>\x20\x09\x0A\x0C\x0D]|\/>/)
    matched = @scanner.matched
    # A nil delimiter means the input ended inside the tag.
    if matched.nil? || matched == '>' || matched == '/>'
      attrs[attr.downcase] = "" unless attr.empty?
      break
    end

    @scanner.skip(whitespace)
    if @scanner.scan(/=/)
      @scanner.skip(whitespace)
      if @scanner.scan(/['"]/)
        # Quoted value: read up to the matching quote character.
        closing_quote = Regexp.compile(Regexp.quote(@scanner.matched))
        value = @scanner.scan_before_or_eos(closing_quote, true)
      else
        value = @scanner.scan_before_or_eos(/[>\x20\x09\x0A\x0C\x0D]|\/>/)
      end
    else
      value = ""
    end
    # A stray "=" without a name still has its value consumed, but is dropped.
    attrs[attr.downcase] = decode(value) unless attr.empty?
  end

  empty = !@scanner.scan(/\//).nil?
  @scanner.skip(/>/)
  (is_end ? EndTag : Tag).new(name, attrs, empty)
end
scan_text() click to toggle source
# File lib/rehtml/tokenizer.rb, line 36
# Scan a run of character data and return it as a Text token.
#
# The first character is always taken as-is (it may be a "<" that next did
# not treat as markup); after that everything up to the next "<" is consumed.
# The combined text is passed through decode before being wrapped in Text.
def scan_text
  first_char = @scanner.getch
  remainder = @scanner.scan(/[^<]*/)
  Text.new(decode("#{first_char}#{remainder}"))
end