class Informers::NER

Public Class Methods

new(model_path) click to toggle source
# File lib/informers/ner.rb, line 18
# Builds the tokenizer and the inference model used by this instance.
#
# The tokenizer file (bert_base_cased_tok.bin) is resolved relative to this
# source file's directory, under ../../vendor.
#
# @param model_path passed unchanged to OnnxRuntime::Model.new
def initialize(model_path)
  @tokenizer = BlingFire.load_model(
    File.expand_path("../../vendor/bert_base_cased_tok.bin", __dir__)
  )
  @model = OnnxRuntime::Model.new(model_path)
end

Public Instance Methods

predict(texts) click to toggle source
# File lib/informers/ner.rb, line 24
# Runs named-entity recognition on one text or on a batch of texts.
#
# For each text: tokenize, wrap the ids in the [CLS] (101) and [SEP] (102)
# ids, run the model, turn each token's logits into probabilities with a
# softmax, keep the tokens whose best label is not "O", and pass them to
# group_entities together with the tokenizer's offsets.
#
# @param texts [String, Array<String>] a single text, or an Array of texts
# @return the group_entities result for the text when a non-Array was given,
#   otherwise an Array with one such result per input text
def predict(texts)
  singular = !texts.is_a?(Array)
  texts = [texts] if singular

  # Label table indexed by the model's class id; index 0 ("O") means "no entity".
  # Built once instead of once per text.
  labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

  result = texts.map do |text|
    # tokenize
    tokens, start_offsets, end_offsets = @tokenizer.text_to_ids_with_offsets(text, nil, 100) # unk token
    tokens.unshift(101) # cls token
    tokens << 102 # sep token
    last_idx = tokens.size - 1

    # infer
    input = {
      input_ids: [tokens],
      attention_mask: [[1] * tokens.size],
      token_type_ids: [[0] * tokens.size]
    }
    output = @model.predict(input)

    # transform: softmax over each token's logits.
    # The row maximum is subtracted first so Math.exp cannot overflow to
    # Infinity (which would turn the probabilities into NaN and make the
    # arg-max comparison below raise). The result is mathematically unchanged.
    probabilities =
      output["output_0"][0].map do |logits|
        peak = logits.max
        exps = logits.map { |v| Math.exp(v - peak) }
        total = exps.sum
        exps.map { |v| v / total }
      end

    candidates = probabilities.each_with_index.map do |probs, idx|
      # Positions 0 and last_idx are the [CLS]/[SEP] ids added above; they have
      # no entry in the offset arrays, so they must never become entities.
      next if idx.zero? || idx == last_idx

      # Arg-max over [probability, label index] pairs (ties go to the higher index).
      label_idx = probs.each_with_index.max[1]
      next if label_idx.zero?

      {
        score: probs[label_idx],
        entity: labels[label_idx],
        index: idx
      }
    end
    entities = candidates.compact

    group_entities(entities, text, start_offsets, end_offsets)
  end

  singular ? result.first : result
end

Private Instance Methods

group_entities(entities, text, start_offsets, end_offsets) click to toggle source
# File lib/informers/ner.rb, line 73
# Merges consecutive token-level entities into spans over the original text.
#
# Two entities fall into the same group when the second one's :index is
# exactly one greater than the previous entity's :index and both carry the
# same :entity label. Entity indices are one higher than the matching
# position in the offset arrays (the caller prepends one token before
# inference), hence the "- 1" when looking up offsets.
#
# @param entities [Array<Hash>] hashes with :score, :entity (e.g. "I-PER") and :index
# @param text [String] the text the offsets refer to
# @param start_offsets [Array<Integer>] start offset per token
# @param end_offsets [Array<Integer>] end offset per token (used as an exclusive bound)
# @return [Array<Hash>] one hash per group with :text, :tag, :score (mean of the
#   group's scores), :start and :end
def group_entities(entities, text, start_offsets, end_offsets)
  # Keyed by the label without its "B-"/"I-" prefix, so that "I-MISC" (the
  # label actually produced upstream; the old "I-MIS" key never matched) and
  # the "B-*" labels all resolve to a tag instead of nil.
  tag_names = {
    "PER" => "person",
    "ORG" => "org",
    "LOC" => "location",
    "MISC" => "misc"
  }

  # Start a new group whenever adjacency or label equality is broken.
  groups = entities.slice_when do |previous, current|
    current[:index] - 1 != previous[:index] || current[:entity] != previous[:entity]
  end

  groups.map do |group|
    start_offset = start_offsets[group.first[:index] - 1]
    end_offset = end_offsets[group.last[:index] - 1]

    {
      text: text[start_offset...end_offset],
      tag: tag_names[group.first[:entity].sub(/\A[BI]-/, "")],
      score: group.sum { |member| member[:score] } / group.size,
      start: start_offset,
      end: end_offset
    }
  end
end