class SheepAst::Tokenizer

Handles the tokenizing process.

@api private

rubocop: disable all

Attributes

last_word_check[RW]

Public Class Methods

new() click to toggle source
Calls superclass method SheepAst::Log::new
# File lib/sheep_ast/tokenizer.rb, line 23
# Creates a tokenizer with no combine stages registered.
#
# Every instance variable that other methods read is set up here so that
# nothing is read before it has been assigned: +scan+ checks both +@split+
# and +@last_word_check+ with +nil?+.
def initialize
  @tokenize_stage = []
  # No custom split rule by default; see use_split_rule.
  @split = nil
  # Last-character check is disabled until a value is assigned.
  @last_word_check = nil
  super()
end

Public Instance Methods

<<(expr) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 167
# Shorthand for +tokenize_expr+: forwards +expr+ unchanged and returns
# whatever +tokenize_expr+ returns.
#
# @param expr the expression text handed to +tokenize_expr+
def <<(expr)
  tokenize_expr(expr)
end
add(&blk) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 73
# Registers +blk+ as the next combine stage by appending it to
# +@tokenize_stage+.
#
# Stages are later invoked as +blk.call(line, num)+ (and as
# +blk.call(nil, 0)+ by +dump+).
#
# @return the stage list after the append
def add(&blk)
  @tokenize_stage << blk
end
add_token(blk, token = nil, **options) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 94
# Registers a combine stage built from a matcher.
#
# The stage calls +blk+ with the token array and the current index and
# expects +[args, matched]+ back. When the matcher reports a match and an
# explicit +token+ was supplied, the match flag is replaced by that token so
# the combined tokens are emitted as +token+.
#
# @param blk     callable taking (line, num) and returning [args, matched]
# @param token   replacement for the matched tokens, or nil for none
# @param options extra options carried along with every stage result
# @return whatever +add+ returns
def add_token(blk, token = nil, **options)
  add do |line, num|
    args, matched = blk.call(line, num)
    matched = token if matched && !token.nil?
    [args, matched, options, token]
  end
end
cmp(*args) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 42
# Builds a matcher for a sequence of consecutive tokens.
#
# The returned lambda takes the token array and a start index. Each pattern
# in +args+ is compared with the token at the corresponding offset: a String
# must be equal, a Regexp must match. A pattern of any other class never
# matches. A nil token array never matches.
#
# @param args [Array<String, Regexp>] patterns, one per consecutive token
# @return [Proc] lambda returning +[args, matched]+ with +matched+ true/false
def cmp(*args)
  lambda do |array, num|
    matched = T.let(!array.nil?, T::Boolean)
    if matched
      matched = args.each_with_index.all? do |elem, idx|
        hit = nil
        if elem.instance_of?(String)
          hit = array[num + idx] == elem
        elsif elem.instance_of?(Regexp)
          hit = array[num + idx] =~ elem
        end
        # Regexp#=~ gives an index (0 is truthy in Ruby) or nil.
        hit ? true : false
      end
    end
    [args, matched]
  end
end
Also aliased as: cmb
dump(logs) click to toggle source

@api private

# File lib/sheep_ast/tokenizer.rb, line 172
# Logs a description of every registered combine stage.
#
# Each stage is invoked with +(nil, 0)+ purely to read back its patterns,
# options and replacement token. When a stage has no explicit token, the
# joined patterns are shown instead.
#
# @api private
# @param logs [Symbol] name of the logging method to send the lines to
def dump(logs)
  logf = method(logs)
  logf.call('')
  logf.call('## Tokenizer information start ##')
  @tokenize_stage.each.with_index do |stage, position|
    args, _matched, options, token = stage.call(nil, 0)
    token ||= args.join
    dump_part(position, args, token, options, logf)
    next unless options[:recursive]

    logf.call('        |_______|', :cyan)
  end
  logf.call('')
end
dump_part(idx, args, token, options, logf) click to toggle source

@api private

# File lib/sheep_ast/tokenizer.rb, line 186
# Logs a one-line description of a single combine stage in cyan.
#
# @api private
# @param idx     [Integer] zero-based stage index (printed one-based)
# @param args    patterns of the stage
# @param token   token the patterns are combined into
# @param options options of the stage
# @param logf    callable receiving the message and the colour
def dump_part(idx, args, token, options, logf)
  stage_no = idx + 1
  message = "stage#{stage_no} :___\\ #{args.inspect} is combined to " \
            "#{token.inspect} with options = #{options.inspect}"
  logf.call(message, :cyan)
end
split_space_only() click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 203
# Returns the separator pattern for whitespace-only splitting.
#
# The pattern matches either a single space or one of tab, carriage return,
# line feed or form feed. Only the second alternative is inside a capture
# group, so when the pattern is given to String#split the plain spaces are
# discarded while the other whitespace characters are kept as tokens.
#
# @return [Regexp]
def split_space_only
  / |([\t\r\n\f])/
end
token_rule(*par) { || ... } click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 219
# Registers a combine rule for the consecutive patterns in +par+.
#
# When a block is given, its return value is used as the token the matched
# sequence is combined into; otherwise no explicit token is passed on.
#
# @param par patterns forwarded to +cmb+
# @yieldreturn the replacement token
# @return whatever +add_token+ returns
def token_rule(*par)
  matcher = T.unsafe(self).cmb(*par)
  return add_token(matcher, yield) if block_given?

  add_token(matcher)
end
tokenize(fpath) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 134
# Tokenizes a file line by line.
#
# Every line is split by +scan+ and then run through the combine stages by
# +shaping+. A missing file is reported through +application_error+ before
# the file is opened.
#
# @param fpath [String] path of the file to read
# @return [Array] three elements: the per-line token arrays, the number of
#   lines read, and the raw lines
def tokenize(fpath)
  application_error "#{fpath} is not found" unless File.exist?(fpath)

  tokens_per_line = []
  raw_lines = []
  File.open(fpath) do |io|
    io.each_line do |text|
      tokens_per_line << shaping(scan(text))
      raw_lines << text
    end
  end
  # One raw line is stored per line read, so its size is the line count.
  [tokens_per_line, raw_lines.size, raw_lines]
end
tokenize_expr(expr) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 155
# Tokenizes an in-memory expression line by line.
#
# Every line is split by +scan+ and then run through the combine stages by
# +shaping+.
#
# @param expr text to tokenize; it is iterated with +each_line+
# @return [Array] two elements: the per-line token arrays and the line count
def tokenize_expr(expr)
  tokens_per_line = expr.each_line.map { |text| shaping(scan(text)) }
  [tokens_per_line, tokens_per_line.size]
end
use_split_rule(&blk) click to toggle source

Sets the rule used to split the given strings into tokens. It is currently assumed to be used together with split_space_only.

Given that the tokenizer receives the input 'Hello, world. Now 2020/12/14 1:43', then with the default separator it returns

```

["Hello", ",", " ", "world", ".", " ", "Now", " ", "2020", "/", "12", "/", "14", " ", "1", ":", "43"]

```

With use_split_rule, it returns

```

["Hello,", "world.", "Now", "2020/12/14", "1:43"]

```

So the split-based tokenizer is simpler than the default tokenizer, but the default tokenizer offers more fine-grained control.

@example

core.config_tok do |tok|
  tok.use_split_rule { tok.split_space_only }
end

@api public

# File lib/sheep_ast/tokenizer.rb, line 128
# Stores +blk+ as the split rule.
#
# While a rule is set, +scan+ calls the block and passes its result to
# String#split instead of using the default word/non-word scanning.
#
# @yieldreturn the separator handed to String#split
def use_split_rule(&blk)
  @split = blk
end

Private Instance Methods

basic_shaping(line, blk) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 295
# Runs one combine stage over a whole token array.
#
# Walks the tokens from left to right. At each position
# +basic_shaping_part+ decides how many tokens are consumed and which
# string replaces them.
#
# @param line token array to process
# @param blk  the combine stage
# @return [Array] two elements: the resulting token array and the options
#   reported for the last position visited (nil when +line+ is empty)
def basic_shaping(line, blk)
  combined = []
  options = T.let(nil, T.nilable(T::Hash[Symbol, T::Boolean]))
  cursor = 0
  while cursor < line.size
    step, piece, options = basic_shaping_part(blk, line, cursor)
    combined << piece
    cursor += step
  end
  [combined, options]
end
basic_shaping_part(blk, line, num) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 319
# Applies a combine stage at one position of the token array.
#
# The stage is called with the token array and the position and is expected
# to return +[args, store_str, options]+:
# * no match (falsy +store_str+): the token at +num+ is kept as is and one
#   token is consumed;
# * match with a String: that string replaces the matched tokens;
# * match with any other truthy value: the matched tokens are joined.
#
# @param blk  the combine stage
# @param line token array being processed
# @param num  [Integer] current position in +line+
# @return [Array] three elements: tokens consumed, resulting string, options
def basic_shaping_part(blk, line, num)
  args, store_str, options = blk.call(line, num)

  # Always consume at least one token. A stage reporting no patterns (nil or
  # empty args) would otherwise yield a step of zero, and the caller's
  # position would never advance.
  inc_count = [args.nil? ? 0 : args.size, 1].max

  unless store_str
    inc_count = 1
    store_str = line[num]
  end

  unless store_str.is_a?(String)
    # Matched without an explicit token: glue the consumed tokens together.
    store_str = T.must(line[num, inc_count]).join
  end

  return inc_count, store_str, options
end
cmb(*args)
Alias for: cmp
scan(line) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 230
# Splits one line of text into raw tokens.
#
# Without a split rule the line is cut into runs of word characters and
# single non-word characters. With a split rule the line is split on the
# separator the rule returns. When +@last_word_check+ is set and differs
# from the last character of the line, the last token is dropped. Empty
# tokens are removed.
#
# @param line [String] the text to split
# @return [Array<String>] the tokens
def scan(line)
  ldebug "scan line = #{line.inspect}" if ldebug?

  text = T.must(line)
  tokens = @split.nil? ? text.scan(/\w+|\W/) : text.split(@split.call)

  if !@last_word_check.nil? && @last_word_check != line[-1]
    ldebug "last_word_check failed; drop last word" if ldebug?
    tokens = tokens[0..-2]
  end

  T.must(tokens).reject!(&:empty?)

  # Normalise to an array: nil becomes empty, a lone value gets wrapped.
  tokens = [] if tokens.nil?
  tokens = [tokens] unless tokens.respond_to?(:each)
  T.cast(tokens, T::Array[String])
end
shaping(line) click to toggle source
# File lib/sheep_ast/tokenizer.rb, line 258
# Runs every registered combine stage over a token array.
#
# Stages are applied in registration order. A stage whose options carry
# +:recursive+ is re-applied until the token array stops changing; any other
# stage is applied once.
#
# @param line token array produced by +scan+
# @return the token array after all stages (an empty array instead of nil)
def shaping(line)
  buf = line

  ldebug? and ldebug2 "#{line} will be combined process"

  prev = T.let(nil, T.nilable(T::Array[String]))
  @tokenize_stage.each do |blk|
    loop do
      buf, options = basic_shaping(buf, blk)
      # basic_shaping reports nil options when the token array is empty (the
      # stage was never called), so treat missing options as non-recursive
      # instead of failing on them.
      break unless options && options[:recursive]

      ldebug? and ldebug 'recursiv option enable'
      ldebug? and ldebug "buf  => #{buf.inspect}"
      ldebug? and ldebug "prev => #{prev.inspect}"
      if buf == prev
        # Fixed point reached: nothing was combined in the last pass.
        prev = nil
        break
      end
      prev = buf.dup
    end
  end
  buf = [] if buf.nil?
  return buf
end