class Informers::NER
Public Class Methods
new(model_path)
# File lib/informers/ner.rb, line 18
def initialize(model_path)
  tokenizer_path = File.expand_path("../../vendor/bert_base_cased_tok.bin", __dir__)
  @tokenizer = BlingFire.load_model(tokenizer_path)
  @model = OnnxRuntime::Model.new(model_path)
end
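
A minimal construction sketch, assuming the BlingFire tokenizer binary ships in the gem's vendor directory (per the code above) and that an exported ONNX NER model is available locally; the "ner.onnx" filename is an assumption for illustration, not part of this documentation:

require "informers"

# hypothetical path to a locally downloaded ONNX NER model
model = Informers::NER.new("ner.onnx")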
Public Instance Methods
predict(texts)
# File lib/informers/ner.rb, line 24
def predict(texts)
  singular = !texts.is_a?(Array)
  texts = [texts] if singular

  result = []
  texts.each do |text|
    # tokenize
    tokens, start_offsets, end_offsets = @tokenizer.text_to_ids_with_offsets(text, nil, 100) # unk token
    tokens.unshift(101) # cls token
    tokens << 102 # sep token

    # infer
    input = {
      input_ids: [tokens],
      attention_mask: [[1] * tokens.size],
      token_type_ids: [[0] * tokens.size]
    }
    output = @model.predict(input)

    # transform
    entities = output["output_0"][0]
    score = entities.map do |e|
      values = e.map { |v| Math.exp(v) }
      sum = values.sum
      values.map { |v| v / sum }
    end
    labels_idx = score.map { |s| s.each_with_index.max[1] }

    labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    entities = []
    filtered_label_idx = labels_idx.map.with_index.reject { |v, i| v == 0 }
    filtered_label_idx.each do |label_idx, idx|
      entities << {
        score: score[idx][label_idx],
        entity: labels[label_idx],
        index: idx
      }
    end

    result << group_entities(entities, text, start_offsets, end_offsets)
  end

  singular ? result.first : result
end
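
A hedged usage sketch, continuing from the constructor example above; the sentence and scores are illustrative, but the returned keys come from group_entities below. Passing a single string returns one array of entity hashes; passing an array of strings returns one such array per input:

model.predict("Nat works at GitHub in San Francisco")
# => [
#   {text: "Nat", tag: "person", score: 0.99, start: 0, end: 3},
#   {text: "GitHub", tag: "org", score: 0.98, start: 13, end: 19},
#   {text: "San Francisco", tag: "location", score: 0.99, start: 23, end: 36}
# ]   (scores illustrative)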
Private Instance Methods
group_entities(entities, text, start_offsets, end_offsets)
# File lib/informers/ner.rb, line 73
def group_entities(entities, text, start_offsets, end_offsets)
  last_entity = {}
  groups = []
  entities.each do |entity|
    if entity[:index] - 1 == last_entity[:index] && entity[:entity] == last_entity[:entity]
      groups.last << entity
    else
      groups << [entity]
    end
    last_entity = entity
  end

  entity_map = {
    "I-PER" => "person",
    "I-ORG" => "org",
    "I-LOC" => "location",
    "I-MISC" => "misc"
  }

  groups.map do |group|
    start_offset = start_offsets[group.first[:index] - 1]
    end_offset = end_offsets[group.last[:index] - 1]

    {
      text: text[start_offset...end_offset],
      tag: entity_map[group.first[:entity]],
      score: group.map { |v| v[:score] }.sum / group.size,
      start: start_offset,
      end: end_offset
    }
  end
end
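
To make the grouping step concrete, here is a sketch with invented token predictions and offsets (reusing the model instance from the earlier sketch; group_entities is private, so send is used purely for illustration). Consecutive token indices carrying the same entity label collapse into a single span, token indices are shifted by one because predict prepends the cls token, and the span's score is the mean of its token scores:

text = "works at GitHub Inc"
# hypothetical token-level predictions: two consecutive I-ORG tokens
entities = [
  {score: 0.97, entity: "I-ORG", index: 3},
  {score: 0.95, entity: "I-ORG", index: 4}
]
start_offsets = [0, 6, 9, 16]
end_offsets = [5, 8, 15, 19]

model.send(:group_entities, entities, text, start_offsets, end_offsets)
# => [{text: "GitHub Inc", tag: "org", score: 0.96, start: 9, end: 19}]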