class Srx::Engine

Engine for performing SRX segmenting

Attributes

data[R]

@return [Data]

Public Class Methods

new(data, format: :text) click to toggle source

@param data [Data] @param format [Symbol] see {Format#get}

# File lib/srx/engine.rb, line 11
# Sets up the engine state used by the segmenting methods.
#
# @param data [Data] stored unchanged in +@data+
# @param format [Symbol] passed to +Format.get+; the object it returns is
#   kept in +@format+ (defaults to +:text+)
def initialize(data, format: :text)
  @format = Format.get(format)
  @data = data
end

Public Instance Methods

segment(str, language:) click to toggle source

@param str [String] @param language [String] @return [Array<String>]

# File lib/srx/engine.rb, line 19
# Splits +str+ into segments using the rules selected for +language+.
#
# Markup is first separated from the text via +@format.extract_markups+, the
# break positions are computed on the remaining plain text, and each slice
# between consecutive breaks is rebuilt with +build_segment!+.
#
# @param str [String]
# @param language [String]
# @return [Array<String>] one entry per break position, in order
def segment(str, language:)
  active_rules = rules(language)
  plain_text, markups = @format.extract_markups(str)

  # Each segment starts where the previous one ended.
  cursor = 0
  breaks_by_pos(plain_text, active_rules).map do |break_pos, _rule|
    piece = build_segment!(plain_text, markups, cursor, break_pos)
    cursor = break_pos
    piece
  end
end

Private Instance Methods

all_matches(str, rule) click to toggle source

@param str [String] @param rule [Data::LanguageRule::Rule] @return [Array<Array(Integer,Data::LanguageRule::Rule)>]

# File lib/srx/engine.rb, line 81
# Collects every offset in +str+ at which +rule+ applies.
#
# The recorded offset is the boundary between the text matched by the rule's
# +before_break+ pattern and the text matched by its +after_break+ pattern:
# the end of the +before_break+ match, or, when the rule has no
# +before_break+, the start of the +after_break+ match. The scan cursor is
# tracked separately from that boundary so that advancing the scan never
# shifts the reported offset.
#
# @param str [String]
# @param rule [Data::LanguageRule::Rule] must respond to +before_break+ and
#   +after_break+, each returning a Regexp or nil
# @return [Array<Array(Integer,Data::LanguageRule::Rule)>] +[offset, rule]+
#   pairs in scan order; offsets are always between 1 and +str.length+
# @raise [RuntimeError] if +str+ is non-empty and the rule has neither pattern
def all_matches(str, rule)
  results = []

  cursor = 0
  while cursor < str.length
    if rule.before_break
      m = rule.before_break.match(str, cursor)
      break unless m

      boundary = m.end(0)
      # Step past a zero-width match so the scan always makes progress; the
      # boundary itself stays where the match ended.
      cursor = m.begin(0) == boundary ? boundary + 1 : boundary
      applies = rule.after_break.nil? || m.post_match.start_with?(rule.after_break)
    elsif rule.after_break
      m = rule.after_break.match(str, cursor)
      break unless m

      # Without a before_break the boundary sits immediately before the
      # after_break match; resume one character later to catch overlaps.
      boundary = m.begin(0)
      cursor = boundary + 1
      applies = true
    else
      raise('Rule has neither before_break nor after_break')
    end

    # Offset 0 has no text in front of it, so it cannot end a segment.
    results << [boundary, rule] if applies && boundary.positive?
  end

  results
end
breaks_by_pos(str, rules) click to toggle source

@param str [String] @param rules [Array<Data::LanguageRule::Rule>] @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs of 1) the position of a break, and 2) the rule that matched at that position. Note that the final break will always be at the end of the string and may not have an associated rule.
# File lib/srx/engine.rb, line 68
# Determines where +str+ should be split.
#
# Every rule is matched against +str+ with +all_matches+. When several rules
# hit the same position, the one that comes first in +rules+ decides; a
# position is kept only if that deciding rule answers true to +break?+.
#
# @param str [String]
# @param rules [Array<Data::LanguageRule::Rule>]
# @return [Array<Array(Integer,Data::LanguageRule::Rule)>] +[position, rule]+
#   pairs sorted by position. The last entry is always at +str.length+; when
#   no rule produced that position it is the one-element array +[str.length]+.
def breaks_by_pos(str, rules)
  winners = {}
  rules.each do |rule|
    all_matches(str, rule).each do |pos, matched|
      # First rule to claim a position wins; later claims are ignored.
      winners[pos] = matched unless winners.key?(pos)
    end
  end

  breaks = winners.select { |_pos, matched| matched.break? }
                  .sort_by { |pos, _matched| pos }
  breaks << [str.length] if breaks.empty? || breaks.last.first != str.length
  breaks
end
build_segment!(str, markups, start, finish) click to toggle source

@param str [String] @param markups [Array<Array(Integer,String)>] @param start [Integer] start offset of segment in str @param finish [Integer] end offset of segment in str

# File lib/srx/engine.rb, line 112
# Cuts one segment out of +str+ and re-inserts the markup that belongs to it.
#
# Markups are consumed from the front of +markups+ (the array is mutated) for
# as long as the first entry's position does not exceed +start+ plus the
# current segment length. That length includes any markup already inserted
# during this call. A markup sitting exactly on the segment's end is only
# taken when +include_edge_formatting?+ approves it.
#
# @param str [String]
# @param markups [Array<Array(Integer,String)>] +[position, markup]+ pairs;
#   consumed entries are removed from the array
# @param start [Integer] start offset of segment in str
# @param finish [Integer] end offset of segment in str
# @return [String] the slice +str[start...finish]+ with the consumed markups
#   inserted
def build_segment!(str, markups, start, finish)
  text = str[start...finish]

  until markups.empty?
    offset, tag = markups.first
    reach = start + text.length

    # The next markup lies beyond this segment: leave it for a later call.
    break if reach < offset
    # Markup exactly on the trailing edge is subject to the edge rules.
    break if reach == offset && !include_edge_formatting?(tag)

    text.insert(offset - start, tag)
    markups.shift
  end

  text
end
include_edge_formatting?(markup) click to toggle source

@param markup [String] @return [Boolean] whether to include the specified edge markup in the current segment, in accordance with <formathandle> rules
# File lib/srx/engine.rb, line 131
# Decides whether markup sitting on a segment edge goes into the current
# segment.
#
# The markup is rejected when +@format+ classifies it as start, end or
# isolated formatting and the matching +include_*_formatting?+ switch on
# +@data+ is off. The three kinds are checked in that order and checking
# stops at the first rejection.
#
# @param markup [String]
# @return [Boolean] true unless one of the three checks rejects the markup
def include_edge_formatting?(markup)
  rejected =
    (!@data.include_start_formatting? && @format.start_formatting?(markup)) ||
    (!@data.include_end_formatting? && @format.end_formatting?(markup)) ||
    (!@data.include_isolated_formatting? && @format.isolated_formatting?(markup))

  !rejected
end
rule_names(language) click to toggle source

@param language [String] nil treated as empty string @return [Array<String>]

# File lib/srx/engine.rb, line 50
# Lists the names of the language rules mapped to +language+.
#
# The entries of +@data.map_rules+ are tried in order against the language
# string. When +@data.cascade?+ is true every matching entry contributes its
# rule name; otherwise only the first matching entry does.
#
# @param language [String] nil treated as empty string
# @return [Array<String>] matching rule names in map order, without nils
def rule_names(language)
  lang = language || ''
  names = []

  @data.map_rules.each do |entry|
    next unless entry.language_pattern.match?(lang)

    cascading = @data.cascade?
    names << entry.language_rule_name
    # Without cascading the first hit is the only one that counts.
    break unless cascading
  end

  names.compact
end
rules(language) click to toggle source

@param language [String] @return [Array<Data::LanguageRule::Rule>]

# File lib/srx/engine.rb, line 38
# Gathers the rules that apply to +language+.
#
# The names returned by +rule_names+ are looked up among
# +@data.language_rules+ (indexed by +name+; if two share a name the later
# one is used) and the +rules+ of each hit are concatenated in name order.
# A name with no corresponding language rule fails with NoMethodError on nil.
#
# @param language [String]
# @return [Array<Data::LanguageRule::Rule>]
def rules(language)
  wanted = rule_names(language)

  by_name = @data.language_rules.each_with_object({}) do |language_rule, index|
    index[language_rule.name] = language_rule
  end

  wanted.flat_map { |rule_name| by_name[rule_name].rules }
end