class Gammo::Tokenizer
Class implementing the HTML5 tokenization algorithm.
Constants
- CommentToken
- DoctypeToken
- EOS
Represents end-of-string.
- EOSError
Raised if an unexpected EOS is found. (@!visibility private)
- EndTagToken
- ErrorToken
- RAW_TAGS
- RAW_TAGS_UNION
- SelfClosingTagToken
- StartTagToken
- TextToken
Attributes
convert_null[RW]
raw[RW]
raw_tag[RW]
scanner[RW]
Public Class Methods
new(text, context: nil)
click to toggle source
# File lib/gammo/tokenizer.rb, line 26
# Builds a tokenizer over +text+.
#
# @param text [String] source to tokenize; it is kept as-is in @text and is
#   never mutated by the constructor.
# @param context [String, nil] optional context element name. When its
#   downcased form satisfies raw_tag?, the tokenizer starts with that name as
#   the current raw tag; otherwise the raw tag starts out empty.
def initialize(text, context: nil)
  @text = text
  # force_encoding mutates its receiver and raises FrozenError on frozen
  # strings, so re-tag a private copy instead of the caller's object.
  @scanner = StringScanner.new(text.dup.force_encoding(Encoding::UTF_8))
  context_name = context && context.downcase
  @raw_tag = (context_name && raw_tag?(context_name)) ? context_name : ''
  @convert_null = false
  @cdata_allowed = false
  @raw = false
end
Public Instance Methods
allow_cdata!(b)
click to toggle source
# File lib/gammo/tokenizer.rb, line 35
# Sets the flag reported by allow_cdata?.
#
# @param b [Object] any value; only its truthiness is stored.
# @return [Boolean] the stored flag.
def allow_cdata!(b)
  @cdata_allowed = b ? true : false
end
allow_cdata?()
click to toggle source
# File lib/gammo/tokenizer.rb, line 39
# Reports the flag last stored by allow_cdata! (false right after
# construction).
#
# @return [Boolean]
def allow_cdata?
  @cdata_allowed
end
next_is_not_raw_text!()
click to toggle source
# File lib/gammo/tokenizer.rb, line 96
# Clears the remembered raw tag name, so previous_token_is_raw_tag? becomes
# false and the next call to next_token tokenizes normally.
#
# @return [String] the empty string now stored in @raw_tag.
def next_is_not_raw_text!
  @raw_tag = ''
end
next_token()
click to toggle source
# File lib/gammo/tokenizer.rb, line 47
# Scans and returns the next token.
#
# Returns EOS when the scanner is exhausted. If the previous token opened a
# raw tag, the raw-text scanner gets the first chance to produce a token.
# Otherwise bytes are accumulated as text until a "<" followed by a byte that
# starts markup is found; pending text is returned first (rewinding the two
# markup bytes), and the markup is scanned on the following call.
def next_token
  return EOS if scanner.eos?
  if previous_token_is_raw_tag?
    raw_token = next_token_for_raw_tag
    return raw_token if raw_token
  end
  @raw = false
  @convert_null = false
  start = scanner.pos
  text = ''.force_encoding(Encoding::ASCII)
  loop do
    ch = scanner.get_byte
    break unless ch
    text << ch
    next unless ch == ?<
    ch = scanner.get_byte
    break unless ch
    text << ch
    if start < (scanner.pos - 2)
      # Text precedes this "<x": emit the text now and rewind over "<x".
      scanner.pos -= 2
      return text_token(text.slice(0, text.length - 2))
    end
    case ch
    when %r{[a-zA-Z]}
      step_back
      return scan_start_tag
    when ?!
      return scan_markup_declaration
    when ?? then return comment_token(?? + scan_until_close_angle)
    when ?/
      return text_token(text) if scanner.eos?
      # "</>" does not generate a token at all. treat this as empty comment token.
      return comment_token('') if scan(/>/)
      # Expects chars like "</a"
      return comment_token(scan_until_close_angle) unless check(/[a-zA-Z]/)
      begin
        tag = scan_tag(need_attribute: false)
      rescue EOSError
        return EOS
      end
      return error_token(start) if tag.nil?
      return end_tag_token(tag)
    else
      # Not markup after all: un-read the byte and keep "<" as plain text.
      step_back
      text = text.slice(0, text.length - 1)
    end
  end
  return text_token(text) if start < scanner.pos
  EOS
end
previous_token_is_raw_tag?()
click to toggle source
# File lib/gammo/tokenizer.rb, line 43
# Tells whether a raw tag name is currently remembered.
#
# @return [Boolean] true unless raw_tag is empty.
def previous_token_is_raw_tag?
  raw_tag.empty? ? false : true
end
Private Instance Methods
comment_token(text)
click to toggle source
# File lib/gammo/tokenizer.rb, line 365
# Wraps +text+ in a CommentToken, passing along the tokenizer's current
# raw and convert_null settings.
def comment_token(text)
  options = { raw: raw, convert_null: convert_null }
  CommentToken.new(text, **options)
end
doctype_token(text)
click to toggle source
# File lib/gammo/tokenizer.rb, line 369
# Wraps +text+ in a DoctypeToken, passing along the tokenizer's current
# raw and convert_null settings.
def doctype_token(text)
  options = { raw: raw, convert_null: convert_null }
  DoctypeToken.new(text, **options)
end
end_tag_token(tag)
click to toggle source
# File lib/gammo/tokenizer.rb, line 361
# Builds an EndTagToken from a scanned tag: its name, the result of
# Tags.lookup for that name, and its attributes.
def end_tag_token(tag)
  name = tag.name
  EndTagToken.new(
    name,
    tag: Tags.lookup(name),
    attributes: tag.attributes,
  )
end
error_token(pos)
click to toggle source
# File lib/gammo/tokenizer.rb, line 357
# Builds an ErrorToken whose message quotes the input from +pos+ up to and
# including the byte at the scanner's current position.
#
# @param pos [Integer] byte offset (as reported by scanner.pos) where the
#   offending input starts.
def error_token(pos)
  # StringScanner#pos counts bytes, while String#slice counts characters;
  # byteslice keeps the excerpt aligned after multibyte text. The range may
  # end inside a multibyte character, so scrub makes the excerpt valid.
  snippet = scanner.string.byteslice(pos..scanner.pos).to_s.scrub
  ErrorToken.new("unexpected token, #{snippet}")
end
next_token_for_raw_tag()
click to toggle source
# File lib/gammo/tokenizer.rb, line 102
# Scans the content that follows a raw tag.
#
# For "plaintext" everything up to the end of input becomes one raw text
# token; any other raw tag is delegated to scan_raw_or_rcdata. The token is
# returned (with convert_null switched on and its data reloaded) only when
# the scanner actually advanced; otherwise nil is returned.
def next_token_for_raw_tag
  start = scanner.pos
  if raw_tag == 'plaintext'
    @raw = true
    token = text_token(scan_until(/\z/) || '')
  else
    token = scan_raw_or_rcdata
  end
  return unless token && scanner.pos > start

  @convert_null = true
  token.convert_null = true
  token.load_data(token.data)
  token
end
peek(length:, target: matched)
click to toggle source
# File lib/gammo/tokenizer.rb, line 170
# Returns the leading +length+ elements of +target+ without touching the
# scanner position.
#
# @param length [Integer] how much of +target+ to return.
# @param target [#slice] defaults to the value of +matched+.
def peek(length:, target: matched)
  head = target.slice(0, length)
  head
end
raw_tag?(name)
click to toggle source
# File lib/gammo/tokenizer.rb, line 349
# Tells whether +name+ is listed in RAW_TAGS.
#
# @param name [String] tag name; it is compared as given, without case
#   folding.
def raw_tag?(name)
  known_raw_tags = RAW_TAGS
  known_raw_tags.include?(name)
end
scan_cdata()
click to toggle source
# File lib/gammo/tokenizer.rb, line 322
# Scans a CDATA section body.
#
# Returns nil when the input at the current position is not "[CDATA[".
# Otherwise returns a text token holding everything up to (not including) the
# closing "]]>", or everything up to the end of input when the section is
# never closed.
def scan_cdata
  return unless scan(/\[CDATA\[/)
  data = ''
  closing = 0 # number of consecutive "]" bytes seen just before this byte
  while (ch = scanner.get_byte)
    data << ch
    if ch == ?]
      closing += 1
    elsif ch == ?> && closing >= 2
      data = data.slice(0, data.length - ']]>'.length)
      break
    else
      closing = 0
    end
  end
  text_token(data)
end
scan_comment()
click to toggle source
# File lib/gammo/tokenizer.rb, line 282
# Scans a comment body; the scanner is expected to sit right after "<!--".
#
# The comment ends at "-->" or "--!>", or at the end of input (in which case
# up to two trailing dashes are dropped). The dash counter starts at 2 so the
# opening "<!--" itself can serve as the dashes of an immediate ">" or "->".
#
# @return the comment token built from the comment text; the text is never
#   nil, even for degenerate comments such as "<!-->" or an unterminated
#   "<!--".
def scan_comment
  dashes = 2
  buffer = ''
  loop do
    byte = scanner.get_byte
    unless byte
      trailing = [dashes, 2].min
      # Clamp at zero: a negative length would make String#slice return nil.
      buffer = buffer.slice(0, [buffer.length - trailing, 0].max)
      break
    end
    buffer << byte
    case byte
    when ?-
      dashes += 1
      next
    when ?>
      if dashes >= 2
        # "-->" (the dashes may belong to the opening "<!--", hence the clamp)
        buffer = buffer.slice(0, [buffer.length - 3, 0].max)
        break
      end
    when ?!
      if dashes >= 2
        break unless (byte = scanner.get_byte)
        # "--!>"
        if byte == ?>
          # no need to count ">" as it's not appended to the buffer.
          buffer = buffer.slice(0, [buffer.length - 3, 0].max)
          break
        end
        # Not a terminator: the byte just read is comment text, keep it.
        buffer << byte
      end
    end
    dashes = 0
  end
  comment_token(buffer)
end
scan_doctype()
click to toggle source
# File lib/gammo/tokenizer.rb, line 276
# Scans the remainder of a DOCTYPE declaration.
#
# Leading whitespace is skipped first. If nothing is left, an error token is
# returned; otherwise the text up to the next ">" becomes a doctype token.
def scan_doctype
  scan_whitespace
  if scanner.eos?
    error_token(scanner.pos)
  else
    doctype_token(scan_until_close_angle)
  end
end
scan_markup_declaration()
click to toggle source
# File lib/gammo/tokenizer.rb, line 260
# Scans what follows "<!" and dispatches on its kind:
# "--" starts a comment, "DOCTYPE" (any case) a doctype, and, when CDATA is
# allowed, "[CDATA[" a CDATA section (convert_null is switched on for it).
# Anything else is consumed up to the next ">" and returned as a comment.
def scan_markup_declaration
  if scan(/--/)
    scan_comment
  elsif scan(/DOCTYPE/i)
    scan_doctype
  elsif allow_cdata? && (section = scan_cdata)
    self.convert_null = true
    section
  else
    comment_token(scan_until_close_angle)
  end
end
scan_raw_or_rcdata()
click to toggle source
# File lib/gammo/tokenizer.rb, line 119
# Scans the text content that follows a raw tag, up to (not including) the
# matching end tag or the end of input.
#
# "script" content is delegated to scan_script. For every other raw tag the
# bytes are accumulated until "</" is followed by the raw tag name (compared
# ASCII case-insensitively) and one of ">", whitespace or "/". Afterwards the
# remembered raw tag is cleared, and @raw is set unless the tag was
# "textarea" or "title".
#
# @return the text token for the scanned content, or nil when the content is
#   empty.
def scan_raw_or_rcdata
  if raw_tag == 'script'
    token = scan_script
    @raw = true
    @raw_tag = ''
    return token
  end
  # End tag names match regardless of letter case ("</TEXTAREA>" closes
  # "<textarea>"), hence the /i flag. Built once, outside the loop.
  end_tag_name = %r{#{Regexp.escape(raw_tag)}[>\s/]}i
  buffer = ''
  until scanner.eos?
    ch = scanner.get_byte
    buffer << ch
    break if scanner.eos?
    next if ch != ?<
    ch = scanner.get_byte
    buffer << ch
    break if scanner.eos?
    if ch != ?/
      # "<" was not the start of an end tag: un-read the byte after it so it
      # is examined again on the next iteration.
      buffer = buffer.slice(0, buffer.length - 1)
      scanner.unscan
      next
    end
    if scanner.check(end_tag_name) || scanner.eos?
      # Leave "</" for the next token and drop it from the text.
      buffer = buffer.slice(0..-3)
      scanner.pos -= 2
      break
    end
  end
  @raw = raw_tag != 'textarea' && raw_tag != 'title'
  @raw_tag = ''
  text_token(buffer) unless buffer.empty?
end
scan_script()
click to toggle source
# File lib/gammo/tokenizer.rb, line 151
# Scans script content with a ScriptScanner that shares this tokenizer's
# scanner, and wraps the result in a text token.
def scan_script
  script_scanner = ScriptScanner.new(scanner, raw_tag: raw_tag)
  text_token(script_scanner.scan)
end
scan_start_tag()
click to toggle source
# File lib/gammo/tokenizer.rb, line 155
# Scans a start tag (with attributes) and returns its token.
#
# Returns EOS when the tag is cut off by the end of input. When the tag name
# satisfies raw_tag?, it is remembered as the current raw tag. The result is
# a SelfClosingTagToken when the scanned tag reports self_closing? or when
# the byte two positions before the scanner position is "/", and a
# StartTagToken otherwise.
def scan_start_tag
  begin
    tag = scan_tag(need_attribute: true)
  rescue EOSError
    return EOS
  end
  name = tag.name
  @raw_tag = name.downcase if raw_tag?(name)
  # StringScanner#pos is a byte offset while String#slice indexes characters,
  # so the "/" check must use byteslice to stay correct after multibyte text.
  slash_before_close = scanner.string.byteslice(scanner.pos - 2, 1) == ?/
  token_class = (tag.self_closing? || slash_before_close) ? SelfClosingTagToken : StartTagToken
  token_class.new(
    name,
    tag: Tags.lookup(tag.name),
    attributes: tag.attributes,
  )
end
scan_tag(need_attribute: false)
click to toggle source
# File lib/gammo/tokenizer.rb, line 224
# Scans a tag name and its attributes, starting at the first byte of the
# name, and returns a Tag.
#
# @param need_attribute [Boolean] when false, attributes are still consumed
#   but not collected.
# @raise [EOSError] when the input ends before the tag name is delimited.
def scan_tag(need_attribute: false)
  raw_name = scan_until(%r{[\s/>]})
  if raw_name
    # Leave "/" or ">" in the input for the attribute loop below.
    step_back if raw_name.end_with?(?/, ?>)
    tag_name = raw_name.slice(0, raw_name.length - 1)
  elsif (upto_space = scan_until(/\s/))
    step_back
    tag_name = upto_space.slice(0, upto_space.length - 1)
  else
    scan_until(/\z/)
    raise EOSError, "Couldn't find a token for representing end of the tag"
  end
  tag_name = tag_name.downcase
  scan_whitespace
  return Tag.new(name: tag_name) if scanner.eos?

  attributes = []
  until scan(/>/)
    key = scan_tag_attribute_key
    if key == ?/ && scan(/>/)
      return Tag.new(name: tag_name, attributes: attributes, self_closing: true)
    end
    next unless key
    break if scanner.eos?
    scan_whitespace
    break if scanner.eos?
    value = scan_tag_attribute_value
    value = unescape(value, in_attribute: true) if value
    break if scanner.eos?
    attributes << Attribute.new(key: key, value: value) if need_attribute
    scan_whitespace
    break if scanner.eos?
  end
  Tag.new(name: tag_name, attributes: attributes, self_closing: false)
end
scan_tag_attribute_key()
click to toggle source
# File lib/gammo/tokenizer.rb, line 178
# Scans an attribute name and returns it downcased.
#
# The name ends at "=", ">", whitespace or "/"; a terminating "=" or ">" is
# left in the input. When no terminator exists, the rest of the input is the
# name. Returns nil when only the terminator itself was consumed.
def scan_tag_attribute_key
  chunk = scan_until(%r{[=>\s/]})
  return scan_until(/\z/).downcase unless chunk
  return if chunk.length < 2

  step_back if chunk.end_with?(?=, ?>)
  chunk.slice(0, chunk.length - 1).downcase
end
scan_tag_attribute_value()
click to toggle source
# File lib/gammo/tokenizer.rb, line 187
# Scans an attribute value, if any, and returns it without surrounding
# quotes.
#
# Returns nil when the next byte is not "=" (the byte is put back), when the
# input ends right after "=" or right after the first value byte, or when
# "=" is directly followed by ">" (which is put back).
#
# @raise [EOSError] when a quoted value is never closed.
def scan_tag_attribute_value
  first = scanner.get_byte
  if first != ?=
    step_back
    return
  end
  scan_whitespace
  opener = scanner.get_byte
  return unless opener
  return if scanner.eos?

  case opener
  when ?>
    step_back
    nil
  when ?', ?"
    quoted = scan_until(/#{opener}/)
    unless quoted
      scan_until(/\z/)
      raise EOSError, "Couldn't find a token for representing end of the tag" unless scanner.get_byte
    end
    quoted.slice(0, quoted.length - 1)
  else
    # Unquoted value: +opener+ is already its first byte.
    return opener if scanner.eos?
    rest = scan_until(/[\s>]/)
    return opener + scan_until(/\z/) unless rest
    step_back if rest.end_with?(?>)
    opener + rest.slice(0, rest.length - 1)
  end
end
scan_until_close_angle()
click to toggle source
# File lib/gammo/tokenizer.rb, line 271
# Consumes input through the next ">" and returns the text before it. When
# there is no ">", consumes and returns the rest of the input instead.
def scan_until_close_angle
  upto_angle = scan_until(/>/)
  return scan_until(/\z/) unless upto_angle

  upto_angle.slice(0, upto_angle.length - 1)
end
scan_whitespace()
click to toggle source
# File lib/gammo/tokenizer.rb, line 318
# Consumes a run of whitespace at the current position, if there is one, and
# returns whatever +scan+ returns for it.
def scan_whitespace
  whitespace_run = /[\s]+/
  scan(whitespace_run)
end
step_back()
click to toggle source
# File lib/gammo/tokenizer.rb, line 174
# Moves the scanner position one byte backwards.
#
# @return [Integer] the new scanner position.
def step_back
  scanner.pos = scanner.pos - 1
end
text_token(text)
click to toggle source
# File lib/gammo/tokenizer.rb, line 353
# Wraps +text+ in a TextToken, passing along the tokenizer's current
# raw and convert_null settings.
def text_token(text)
  options = { raw: raw, convert_null: convert_null }
  TextToken.new(text, **options)
end