module Regexgen

Generate regular expressions that match a set of strings

Constants

VERSION

Public Class Methods

common_substring(a, b, side) click to toggle source
# File lib/regexgen/regex.rb, line 130
def common_substring(a, b, side)
  dir = side == :start ? 1 : -1
  a = a.chars
  b = b.chars
  ai = dir == 1 ? 0 : a.length - 1
  ae = dir == 1 ? a.length : -1
  bi = dir == 1 ? 0 : b.length - 1
  be = dir == 1 ? b.length : -1
  res = ''

  while ai != ae && bi != be && a[ai] == b[bi]
    if dir == 1
      res += a[ai]
    else
      res = a[ai] + res
    end
    ai += dir
    bi += dir
  end

  res
end
concat(a, b) click to toggle source
# File lib/regexgen/regex.rb, line 153
def concat(a, b)
  return unless a && b

  return b if a.respond_to?(:empty?) && a.empty?
  return a if b.respond_to?(:empty?) && b.empty?

  return Ast::Literal.new(a.value + b.value) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Literal)

  if a.is_a?(Ast::Literal) && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal)
    return Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b)
  end

  if b.is_a?(Ast::Literal) && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal)
    return Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value))
  end

  Ast::Concatenation.new(a, b)
end
generate(strings, flags = nil) click to toggle source
# File lib/regexgen.rb, line 9
def generate(strings, flags = nil)
  Trie.new.tap { |t| strings.each(&t.method(:add)) }
      .to_regex(flags)
end
minimize(root) click to toggle source

Hopcroft's DSA minimization algorithm en.wikipedia.org/wiki/DFA_minimization#Hopcroft’s_algorithm

Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js

P := {F, Q \ F};
W := {F, Q \ F};
while (W is not empty) do
     choose and remove a set A from W
     for each c in Σ do
          let X be the set of states for which a transition on c leads to a state in A
          for each set Y in P for which X ∩ Y is nonempty and Y \ X is nonempty do
               replace Y in P by the two sets X ∩ Y and Y \ X
               if Y is in W
                    replace Y in W by the same two sets
               else
                    if |X ∩ Y| <= |Y \ X|
                         add X ∩ Y to W
                    else
                         add Y \ X to W
          end;
     end;
end;

Key:

{…} is a Set (yes we have sets of sets here) A \ B is complement (A - B) Q is all states F is final states Σ is the DFA's alphabet c is a letter of the alphabet A ∩ B is intersection (A & B) |A| is the cardinality of A (A.size)

# File lib/regexgen/minimize.rb, line 44
def minimize(root)
  states = root.visit
  final_states = states.select(&:accepting).to_set

  # Create a map of incoming transitions to each state, grouped by
  # character.
  transitions = Hash.new { |h, k| h[k] = Hash.new { |h_, k_| h_[k_] = Set.new } }
  states.each_with_object(transitions) do |s, acc|
    s.transitions.each do |t, st|
      acc[st][t] << s
    end
  end

  p = Set[states, final_states]
  w = Set.new(p)
  until w.empty?
    a = w.shift

    # Collect states that have transitions leading to states in a, grouped
    # by character.
    t = Hash.new { |h, k| h[k] = Set.new }
    a.each_with_object(t) do |s, acc|
      transitions[s].each do |t_, x|
        acc[t_].merge(x)
      end
    end

    t.each_value do |x|
      p.to_a.each do |y|
        intersection = x & y
        next if intersection.empty?

        complement = y - x
        next if complement.empty?

        p.replace(y, intersection, complement)
        if w.include?(y)
          w.replace(y, intersection, complement)
        elsif intersection.size <= complement.size
          w.add(intersection)
        else
          w.add(complement)
        end
      end
    end
  end

  new_states = Hash.new { |hash, key| hash[key] = State.new }
  initial = nil

  p.each do |s|
    first = s.first
    s_ = new_states[s]
    first.transitions.each do |c, old|
      s_.transitions[c] = new_states[p.find { |v| v.include?(old) }]
    end

    s_.accepting = first.accepting

    initial = s_ if s.include?(root)
  end

  initial
end
remove_common_substring(a, b, side) click to toggle source
# File lib/regexgen/regex.rb, line 116
def remove_common_substring(a, b, side)
  al = a.literal(side) if a.respond_to?(:literal)
  bl = b.literal(side) if b.respond_to?(:literal)
  return [a, b, nil] if al.nil? || bl.nil? || al.empty? || bl.empty?

  s = common_substring(al, bl, side)
  return [a, b, ''] if s.empty?

  a = a.remove_substring(side, s.length)
  b = b.remove_substring(side, s.length)

  [a, b, s]
end
star(exp) click to toggle source
# File lib/regexgen/regex.rb, line 80
def star(exp)
  Ast::Repetition.new(exp, '*') if exp
end
to_regex(root) click to toggle source

Brzozowski algebraic method cs.stackexchange.com/a/2392

Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/regex.js

Initialize B

for i = 1 to m:
  if final(i):
    B[i] := ε
  else:
    B[i] := ∅

Initialize A

for i = 1 to m:
  for j = 1 to m:
    for a in Σ:
      if trans(i, a, j):
        A[i,j] := a
      else:
        A[i,j] := ∅

Solve

for n = m decreasing to 1:
  B[n] := star(A[n,n]) . B[n]
  for j = 1 to n:
    A[n,j] := star(A[n,n]) . A[n,j];
  for i = 1 to n:
    B[i] += A[i,n] . B[n]
    for j = 1 to n:
      A[i,j] += A[i,n] . A[n,j]

Result is e := B

# File lib/regexgen/regex.rb, line 43
def to_regex(root)
  states = root.visit.to_a

  a = []
  b = []

  states.each_with_index do |a_, i|
    b[i] = Ast::Literal.new('') if a_.accepting

    a[i] = []
    a_.transitions.each do |t, s|
      j = states.index(s)
      a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t)
    end
  end

  (states.length - 1).downto(0) do |n|
    if a[n][n]
      b[n] = concat(star(a[n][n]), b[n])
      (0...n).each do |j|
        a[n][j] = concat(star(a[n][n]), a[n][j])
      end
    end

    (0...n).each do |i|
      next unless a[i][n]

      b[i] = union(b[i], concat(a[i][n], b[n]))
      (0...n).each do |j|
        a[i][j] = union(a[i][j], concat(a[i][n], a[n][j]))
      end
    end
  end

  b[0].to_s
end
union(a, b) click to toggle source
# File lib/regexgen/regex.rb, line 84
def union(a, b)
  if a && b && a != b
    res = nil
    a, b, start = remove_common_substring(a, b, :start)
    a, b, end_ = remove_common_substring(a, b, :end)

    if (a.respond_to?(:empty?) && a.empty?) || (b.respond_to?(:empty?) && b.empty?)
      res = Ast::Repetition.new(a.empty? ? b : a, '?')
    elsif a.is_a?(Ast::Repetition) && a.type == '?'
      res = Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?')
    elsif b.is_a?(Ast::Repetition) && b.type == '?'
      res = Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?')
    else
      ac = a.char_class if a.respond_to?(:char_class)
      bc = b.char_class if b.respond_to?(:char_class)
      res = if ac && bc
              Ast::CharClass.new(ac, bc)
            else
              Ast::Alternation.new(a, b)
            end
    end

    res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty?

    res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty?

    return res
  end

  a || b
end