module Regexgen
Generate regular expressions that match a set of strings
Constants
- VERSION
Public Class Methods
common_substring(a, b, side)
click to toggle source
# File lib/regexgen/regex.rb, line 130 def common_substring(a, b, side) dir = side == :start ? 1 : -1 a = a.chars b = b.chars ai = dir == 1 ? 0 : a.length - 1 ae = dir == 1 ? a.length : -1 bi = dir == 1 ? 0 : b.length - 1 be = dir == 1 ? b.length : -1 res = '' while ai != ae && bi != be && a[ai] == b[bi] if dir == 1 res += a[ai] else res = a[ai] + res end ai += dir bi += dir end res end
concat(a, b)
click to toggle source
# File lib/regexgen/regex.rb, line 153 def concat(a, b) return unless a && b return b if a.respond_to?(:empty?) && a.empty? return a if b.respond_to?(:empty?) && b.empty? return Ast::Literal.new(a.value + b.value) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Literal) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal) return Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b) end if b.is_a?(Ast::Literal) && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal) return Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value)) end Ast::Concatenation.new(a, b) end
generate(strings, flags = nil)
click to toggle source
# File lib/regexgen.rb, line 9 def generate(strings, flags = nil) Trie.new.tap { |t| strings.each(&t.method(:add)) } .to_regex(flags) end
minimize(root)
click to toggle source
Hopcroft's DSA minimization algorithm en.wikipedia.org/wiki/DFA_minimization#Hopcroft’s_algorithm
Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js
P := {F, Q \ F}; W := {F, Q \ F}; while (W is not empty) do choose and remove a set A from W for each c in Σ do let X be the set of states for which a transition on c leads to a state in A for each set Y in P for which X ∩ Y is nonempty and Y \ X is nonempty do replace Y in P by the two sets X ∩ Y and Y \ X if Y is in W replace Y in W by the same two sets else if |X ∩ Y| <= |Y \ X| add X ∩ Y to W else add Y \ X to W end; end; end;
Key:
{…} is a Set (yes we have sets of sets here) A \ B is complement (A - B) Q is all states F is final states Σ is the DFA's alphabet c is a letter of the alphabet A ∩ B is intersection (A & B) |A| is the cardinality of A (A.size)
# File lib/regexgen/minimize.rb, line 44 def minimize(root) states = root.visit final_states = states.select(&:accepting).to_set # Create a map of incoming transitions to each state, grouped by # character. transitions = Hash.new { |h, k| h[k] = Hash.new { |h_, k_| h_[k_] = Set.new } } states.each_with_object(transitions) do |s, acc| s.transitions.each do |t, st| acc[st][t] << s end end p = Set[states, final_states] w = Set.new(p) until w.empty? a = w.shift # Collect states that have transitions leading to states in a, grouped # by character. t = Hash.new { |h, k| h[k] = Set.new } a.each_with_object(t) do |s, acc| transitions[s].each do |t_, x| acc[t_].merge(x) end end t.each_value do |x| p.to_a.each do |y| intersection = x & y next if intersection.empty? complement = y - x next if complement.empty? p.replace(y, intersection, complement) if w.include?(y) w.replace(y, intersection, complement) elsif intersection.size <= complement.size w.add(intersection) else w.add(complement) end end end end new_states = Hash.new { |hash, key| hash[key] = State.new } initial = nil p.each do |s| first = s.first s_ = new_states[s] first.transitions.each do |c, old| s_.transitions[c] = new_states[p.find { |v| v.include?(old) }] end s_.accepting = first.accepting initial = s_ if s.include?(root) end initial end
remove_common_substring(a, b, side)
click to toggle source
# File lib/regexgen/regex.rb, line 116 def remove_common_substring(a, b, side) al = a.literal(side) if a.respond_to?(:literal) bl = b.literal(side) if b.respond_to?(:literal) return [a, b, nil] if al.nil? || bl.nil? || al.empty? || bl.empty? s = common_substring(al, bl, side) return [a, b, ''] if s.empty? a = a.remove_substring(side, s.length) b = b.remove_substring(side, s.length) [a, b, s] end
star(exp)
click to toggle source
# File lib/regexgen/regex.rb, line 80 def star(exp) Ast::Repetition.new(exp, '*') if exp end
to_regex(root)
click to toggle source
Brzozowski algebraic method cs.stackexchange.com/a/2392
Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/regex.js
Initialize B
for i = 1 to m: if final(i): B[i] := ε else: B[i] := ∅
Initialize A
for i = 1 to m: for j = 1 to m: for a in Σ: if trans(i, a, j): A[i,j] := a else: A[i,j] := ∅
Solve
for n = m decreasing to 1: B[n] := star(A[n,n]) . B[n] for j = 1 to n: A[n,j] := star(A[n,n]) . A[n,j]; for i = 1 to n: B[i] += A[i,n] . B[n] for j = 1 to n: A[i,j] += A[i,n] . A[n,j]
Result is e := B
# File lib/regexgen/regex.rb, line 43 def to_regex(root) states = root.visit.to_a a = [] b = [] states.each_with_index do |a_, i| b[i] = Ast::Literal.new('') if a_.accepting a[i] = [] a_.transitions.each do |t, s| j = states.index(s) a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t) end end (states.length - 1).downto(0) do |n| if a[n][n] b[n] = concat(star(a[n][n]), b[n]) (0...n).each do |j| a[n][j] = concat(star(a[n][n]), a[n][j]) end end (0...n).each do |i| next unless a[i][n] b[i] = union(b[i], concat(a[i][n], b[n])) (0...n).each do |j| a[i][j] = union(a[i][j], concat(a[i][n], a[n][j])) end end end b[0].to_s end
union(a, b)
click to toggle source
# File lib/regexgen/regex.rb, line 84 def union(a, b) if a && b && a != b res = nil a, b, start = remove_common_substring(a, b, :start) a, b, end_ = remove_common_substring(a, b, :end) if (a.respond_to?(:empty?) && a.empty?) || (b.respond_to?(:empty?) && b.empty?) res = Ast::Repetition.new(a.empty? ? b : a, '?') elsif a.is_a?(Ast::Repetition) && a.type == '?' res = Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?') elsif b.is_a?(Ast::Repetition) && b.type == '?' res = Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?') else ac = a.char_class if a.respond_to?(:char_class) bc = b.char_class if b.respond_to?(:char_class) res = if ac && bc Ast::CharClass.new(ac, bc) else Ast::Alternation.new(a, b) end end res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty? res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty? return res end a || b end