class REHTML::Tokenizer
Public Class Methods
new(html)
click to toggle source
Create a new Tokenizer
for the given text.
# File lib/rehtml/tokenizer.rb, line 16
# Builds a tokenizer that reads from +html+.
#
# html:: the markup text; it is passed unchanged to Scanner.new.
#
# @bpos starts at 0. add_parse_info hands it to each token and then moves it
# to the scanner's current position.
def initialize(html)
  @bpos = 0
  @scanner = Scanner.new(html)
end
Public Instance Methods
next()
click to toggle source
Return the next token in the sequence, or nil
if there are no more tokens in the stream.
# File lib/rehtml/tokenizer.rb, line 23
# Produces the next token, or nil once the scanner reports end of input.
#
# Input that starts with "<" followed by a non-whitespace character is handed
# to scan_element; everything else is handed to scan_text. The resulting token
# is passed through add_parse_info before it is returned.
def next
  return if @scanner.eos?

  markup_ahead = @scanner.check(/<\S/)
  token = markup_ahead ? scan_element : scan_text
  add_parse_info(token)
end
Private Instance Methods
add_parse_info(node)
click to toggle source
# File lib/rehtml/tokenizer.rb, line 29
# Mixes TokenInfo into +node+ and records parse information on it.
#
# node:: the token object that was just scanned.
#
# set_token_info receives the position saved in @bpos together with the
# scanner; afterwards @bpos is moved to the scanner's current position.
# Returns +node+ itself.
def add_parse_info(node)
  node.tap do |token|
    token.extend(TokenInfo)
    token.set_token_info(@bpos, @scanner)
    @bpos = @scanner.pos
  end
end
decode(html)
click to toggle source
Decode HTML entities in the given string.
# File lib/rehtml/tokenizer.rb, line 41
# Replaces every match of ENTITIES::REGEXP in +html+ with its decoded text.
#
# Capture group 1 (presumably the entity name) is looked up in ENTITIES::MAP;
# capture groups 2 and 3 are parsed as decimal and hexadecimal code points.
# Anything that cannot be decoded is left in the output exactly as written:
# a name missing from the map, or a number that is not a Unicode scalar value
# (above U+10FFFF, or in the surrogate range U+D800..U+DFFF).
#
# html:: String to decode.
#
# Returns a new String; +html+ itself is not modified.
def decode(html)
  # Array#pack('U') raises RangeError for oversized values and emits invalid
  # UTF-8 for surrogates and values beyond U+10FFFF, so such code points are
  # rejected up front and the reference text is kept instead.
  to_text = lambda do |code, raw|
    scalar = code <= 0x10FFFF && !code.between?(0xD800, 0xDFFF)
    scalar ? [code].pack('U') : raw
  end

  html.gsub(ENTITIES::REGEXP) do
    ref = Regexp.last_match
    raw = ref[0]
    if ref[1]
      ENTITIES::MAP[ref[1]] || raw
    elsif ref[2]
      to_text.call(ref[2].to_i(10), raw)
    elsif ref[3]
      to_text.call(ref[3].to_i(16), raw)
    else
      raw
    end
  end
end
scan_doctype()
click to toggle source
# File lib/rehtml/tokenizer.rb, line 118
# Scans the remainder of a doctype declaration and returns an empty DocType.
#
# scan_element calls this after it has matched the "<!DOCTYPE" prefix and the
# whitespace that follows it. The text returned by the scanner is currently
# discarded; DocType.new is built without arguments.
def scan_doctype
  # TODO complex doctype
  # https://github.com/ruby/ruby/blob/master/lib/rexml/parsers/baseparser.rb#L258
  #   source = REXML::Source.new(doctype)
  #   parser = REXML::Parsers::BaseParser.new(source)
  #   while parser.document_status == in_doctype
  #     parser.pull_event
  # (+doctype+ in the sketch above would be the value returned by the
  # scan_before_or_eos call below.)

  # NOTE(review): the second argument presumably makes the scanner also
  # consume the closing ">" -- confirm against Scanner#scan_before_or_eos.
  @scanner.scan_before_or_eos(/>/,true)
  DocType.new
end
scan_element()
click to toggle source
# File lib/rehtml/tokenizer.rb, line 59
# Scans one markup construct that begins with "<" and returns its token.
#
# The prefixes are tried in this order, and the first that matches wins:
#   "<!--"               -> Comment built from the text scanned against "-->"
#   "<![CDATA["          -> CData built from the text scanned against "]]>"
#   "<!DOCTYPE" + space  -> scan_doctype (prefix matched case-insensitively)
#   "<!"                 -> Comment built from the text scanned against ">"
#   "<?"                 -> scan_pi (processing instruction or XML declaration)
#   anything else        -> scan_tag
def scan_element
  return Comment.new(@scanner.scan_before_or_eos(/-->/,true)) if @scanner.scan(/<!--/)
  return CData.new(@scanner.scan_before_or_eos(/\]\]>/,true)) if @scanner.scan(/<!\[CDATA\[/)
  return scan_doctype if @scanner.scan(/<!DOCTYPE[\x20\x09\x0A\x0C\x0D]+/i)
  return Comment.new(@scanner.scan_before_or_eos(/>/,true)) if @scanner.scan(/<!/)
  return scan_pi if @scanner.scan(/<\?/)

  scan_tag
end
scan_pi()
click to toggle source
# File lib/rehtml/tokenizer.rb, line 112
# Scans a processing instruction (or XML declaration) and returns an
# Instruction built from its name and body.
#
# scan_element calls this after it has matched the "<?" prefix. The name is
# the run of XML name characters at the current position
# (http://www.w3.org/TR/REC-xml/#NT-Name), or "" when there is none. The body
# is whatever the scanner returns when scanning against "?>".
def scan_pi
  name_chars = /([-:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD0-9\u00B7\u0300-\u036F\u203F-\u2040]+)/
  target = @scanner.scan(name_chars)
  target = "" unless target
  content = @scanner.scan_before_or_eos(/\?>/,true)
  Instruction.new(target, content)
end
scan_tag()
click to toggle source
# File lib/rehtml/tokenizer.rb, line 77 def scan_tag @scanner.scan(/<(\/)?([^\x20\x09\x0A\x0C\x0D>]*)/) is_end = @scanner[1] ? true : false name = @scanner[2] attrs = {} loop do @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) attr = @scanner.scan_before_or_eos(/[=>\x20\x09\x0A\x0C\x0D]|\/>/) matched = @scanner.matched if matched == '>' || matched.nil? || matched == '/>' attrs[attr.downcase]="" unless attr.empty? break end @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) if @scanner.scan(/=/) @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/) if @scanner.scan(/['"]/) m = Regexp.compile(Regexp.quote(@scanner.matched)) value = @scanner.scan_before_or_eos(m, true) else value = @scanner.scan_before_or_eos(/[>\x20\x09\x0A\x0C\x0D]|\/>/) end else value = "" end attrs[attr.downcase]=decode(value) unless attr.empty? end empty = !@scanner.scan(/\//).nil? @scanner.skip(/>/) if is_end EndTag.new(name.downcase,attrs,empty) else Tag.new(name.downcase,attrs,empty) end end
scan_text()
click to toggle source
# File lib/rehtml/tokenizer.rb, line 36
# Scans a run of character data and returns it as a Text token with its
# entities decoded.
#
# The first character is taken unconditionally with getch. This method is
# reached whenever the input does not start with "<" plus a non-whitespace
# character, so that first character may itself be a "<". The rest of the run
# extends up to, but not including, the next "<".
def scan_text
  first_char = @scanner.getch
  remainder = @scanner.scan(/[^<]*/)
  Text.new(decode("#{first_char}#{remainder}"))
end