class SheepAst::Tokenizer
Handles the tokenize process.
@api private
rubocop: disable all
Attributes
last_word_check[RW]
Public Class Methods
new()
click to toggle source
Calls superclass method
SheepAst::Log::new
# File lib/sheep_ast/tokenizer.rb, line 23 def initialize @tokenize_stage = [] super() end
Public Instance Methods
<<(expr)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 167 def <<(expr) tokenize_expr(expr) end
add(&blk)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 73 def add(&blk) @tokenize_stage << blk end
add_token(blk, token = nil, **options)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 94 def add_token(blk, token = nil, **options) add { |line, num| args, t_token = blk.call(line, num) t_token = token if !token.nil? && t_token [args, t_token, options, token] } end
cmp(*args)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 42 def cmp(*args) lambda { |array, num| res = T.let(true, T::Boolean) if !array.nil? args.each_with_index do |elem, idx| t_res = array[num + idx] == elem if elem.instance_of? String t_res = array[num + idx] =~ elem if elem.instance_of? Regexp res = false if t_res.nil? || !t_res break if !res end else res = false end [args, res] } end
Also aliased as: cmb
dump(logs)
click to toggle source
@api private
# File lib/sheep_ast/tokenizer.rb, line 172 def dump(logs) logf = method(logs) logf.call('') logf.call('## Tokenizer information start ##') @tokenize_stage.each_with_index do |blk, idx| args, _, options, token = blk.call(nil, 0) token = args.join if !token dump_part(idx, args, token, options, logf) logf.call(' |_______|', :cyan) if options[:recursive] end logf.call('') end
dump_part(idx, args, token, options, logf)
click to toggle source
@api private
# File lib/sheep_ast/tokenizer.rb, line 186 def dump_part(idx, args, token, options, logf) logf.call "stage#{idx + 1} :___\\ #{args.inspect} is combined to "\ "#{token.inspect} with options = #{options.inspect}", :cyan end
split_space_only()
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 203 def split_space_only / |([\t\r\n\f])/ end
token_rule(*par) { || ... }
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 219 def token_rule(*par) if block_given? add_token T.unsafe(self).cmb(*par), yield else add_token T.unsafe(self).cmb(*par) end end
tokenize(fpath)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 134 def tokenize(fpath) line_count = 0 file_buf = [] raw_buf = [] if !File.exist?(fpath) application_error "#{fpath} is not found" end File.open(fpath) do |f| f.each_line do |line| line_count += 1 file_buf.push(shaping(scan(line))) raw_buf << line end end return file_buf, line_count, raw_buf end
tokenize_expr(expr)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 155 def tokenize_expr(expr) line_count = 0 file_buf = [] expr.each_line do |line| line_count += 1 file_buf.push(shaping(scan(line))) end return file_buf, line_count end
use_split_rule(&blk)
click to toggle source
Give the split rule for the given strings. Currently assuming to be used with split_space_only.
Given that the tokenizer got the input 'Hello, world. Now 2020/12/14 1:43', then using the default separator it returns
```
- ["Hello", ",", " ", "world", ".", " ", "Now", " ", "2020", "/", "12", "/", "14", " ", "1", ":", "43"]
-
```
With
use_split_rule
, it returns
```
- ["Hello,", "world.", "Now", "2020/12/14", "1:43"]
-
```
So, the split-based tokenizer is simpler than the default tokenizer, but the default tokenizer allows more fine-grained control.
@example
core.config_tok do |tok| tok.use_split_rule { tok.split_space_only } end
@api public
# File lib/sheep_ast/tokenizer.rb, line 128 def use_split_rule(&blk) @split = blk end
Private Instance Methods
basic_shaping(line, blk)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 295 def basic_shaping(line, blk) num = 0 ret_array = [] options = T.let(nil, T.nilable(T::Hash[Symbol, T::Boolean])) while num <= (line.size - 1) inc_count = 1 store_str = line[num] inc_count, store_str, options = basic_shaping_part(blk, line, num) num += inc_count ret_array << store_str end return ret_array, options end
basic_shaping_part(blk, line, num)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 319 def basic_shaping_part(blk, line, num) args, store_str, options = blk.call(line, num) inc_count = T.must(args).size if inc_count.nil? inc_count = 1 end if !store_str inc_count = 1 store_str = line[num] end if !store_str.is_a? String index1 = num index2 = num + T.must(inc_count) - 1 store_str = T.must(line[index1..index2]).join end return inc_count, store_str, options end
scan(line)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 230 def scan(line) ldebug? and ldebug "scan line = #{line.inspect}" if @split.nil? test = T.must(line).scan(/\w+|\W/) else test = T.must(line).split(@split.call) end if !@last_word_check.nil? if @last_word_check != line[-1] ldebug? and ldebug "last_word_check failed; drop last word" test = test[0..-2] end end T.must(test).reject!(&:empty?) if test.respond_to? :each # no process elsif test.nil? test = [] else test = [test] end return T.cast(test, T::Array[String]) end
shaping(line)
click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 258 def shaping(line) buf = line ldebug? and ldebug2 "#{line} will be combined process" prev = T.let(nil, T.nilable(T::Array[String])) @tokenize_stage.each do |blk| loop do buf, options = basic_shaping(buf, blk) if T.must(options)[:recursive] ldebug? and ldebug 'recursiv option enable' ldebug? and ldebug "buf => #{buf.inspect}" ldebug? and ldebug "prev => #{prev.inspect}" if buf != prev prev = buf.dup else prev = nil break end else break end end end buf = [] if buf.nil? return buf end