module BlingFire

Constants

VERSION

Attributes

ffi_lib[RW]

Public Class Methods

change_settings_dummy_prefix(model, value) click to toggle source
# File lib/blingfire.rb, line 129
def change_settings_dummy_prefix(model, value)
  # use opposite of value
  ret = FFI.SetNoDummyPrefix(model, value ? 0 : 1)
  raise Error, "Bad status: #{ret}" if ret != 1
end
free_model(model) click to toggle source
# File lib/blingfire.rb, line 116
def free_model(model)
  FFI.FreeModel(model)
end
lib_version() click to toggle source
# File lib/blingfire.rb, line 37
def lib_version
  FFI.GetBlingFireTokVersion
end
load_model(path, **options) click to toggle source
# File lib/blingfire.rb, line 41
def load_model(path, **options)
  Model.new(path, **options)
end
normalize_spaces(text) click to toggle source
# File lib/blingfire.rb, line 120
def normalize_spaces(text)
  u_space = 0x20
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
  out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
  check_status out_size, out
  encode_utf8(out.to_str(out_size))
end
text_to_ids(model, text, max_len = nil, unk_id = 0) click to toggle source
# File lib/blingfire.rb, line 93
def text_to_ids(model, text, max_len = nil, unk_id = 0)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
  out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
  check_status out_size, ids
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
end
text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0) click to toggle source
# File lib/blingfire.rb, line 101
def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)

  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)

  out_size = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)

  check_status out_size, ids

  result = ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
  [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
end
text_to_sentences(text) click to toggle source
# File lib/blingfire.rb, line 69
def text_to_sentences(text)
  text_to(text, "\n") do |t, out|
    FFI.TextToSentences(t, t.bytesize, out, out.size)
  end
end
text_to_sentences_with_model(model, text) click to toggle source
# File lib/blingfire.rb, line 75
def text_to_sentences_with_model(model, text)
  text_to(text, "\n") do |t, out|
    FFI.TextToSentencesWithModel(t, t.bytesize, out, out.size, model)
  end
end
text_to_sentences_with_offsets(text) click to toggle source
# File lib/blingfire.rb, line 81
def text_to_sentences_with_offsets(text)
  text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
    FFI.TextToSentencesWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
  end
end
text_to_sentences_with_offsets_with_model(model, text) click to toggle source
# File lib/blingfire.rb, line 87
def text_to_sentences_with_offsets_with_model(model, text)
  text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
    FFI.TextToSentencesWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
  end
end
text_to_words(text) click to toggle source
# File lib/blingfire.rb, line 45
def text_to_words(text)
  text_to(text, " ") do |t, out|
    FFI.TextToWords(t, t.bytesize, out, out.size)
  end
end
text_to_words_with_model(model, text) click to toggle source
# File lib/blingfire.rb, line 51
def text_to_words_with_model(model, text)
  text_to(text, " ") do |t, out|
    FFI.TextToWordsWithModel(t, t.bytesize, out, out.size, model)
  end
end
text_to_words_with_offsets(text) click to toggle source
# File lib/blingfire.rb, line 57
def text_to_words_with_offsets(text)
  text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
    FFI.TextToWordsWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
  end
end
text_to_words_with_offsets_with_model(model, text) click to toggle source
# File lib/blingfire.rb, line 63
def text_to_words_with_offsets_with_model(model, text)
  text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
    FFI.TextToWordsWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
  end
end

Private Class Methods

check_status(ret, ptr) click to toggle source
# File lib/blingfire.rb, line 137
def check_status(ret, ptr)
  raise Error, "Not enough memory allocated" if ret == -1 || ret > ptr.size
end
encode_utf8(text) click to toggle source
# File lib/blingfire.rb, line 166
def encode_utf8(text)
  text.force_encoding(Encoding::UTF_8)
end
text_to(text, sep) { |text, out| ... } click to toggle source
# File lib/blingfire.rb, line 141
def text_to(text, sep)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  # TODO allocate less, and try again if needed
  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
  out_size = yield(text, out)
  check_status out_size, out
  encode_utf8(out.to_str(out_size - 1)).split(sep)
end
text_to_with_offsets(text, sep) { |text, out, start_offsets, end_offsets| ... } click to toggle source
# File lib/blingfire.rb, line 150
def text_to_with_offsets(text, sep)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  # TODO allocate less, and try again if needed
  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)

  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)

  out_size = yield(text, out, start_offsets, end_offsets)

  check_status out_size, out

  result = encode_utf8(out.to_str(out_size - 1)).split(sep)
  [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
end
unpack_offsets(start_offsets, end_offsets, result, text) click to toggle source
# File lib/blingfire.rb, line 170
def unpack_offsets(start_offsets, end_offsets, result, text)
  start_bytes = start_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
  end_bytes = end_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
  starts = []
  ends = []

  # convert byte offsets to character offsets
  # TODO see if more efficient to store next_pos in variable
  pos = 0
  text.each_char.with_index do |c, i|
    while pos == start_bytes[starts.size] || start_bytes[starts.size] == -1
      starts << i
    end
    pos += c.bytesize
    while pos - 1 == end_bytes[ends.size]
      ends << i + 1
    end
  end

  [starts, ends]
end