class ToknInternal::DFABuilder

Converts NFAs (nondeterministic, finite state automata) to minimal DFAs.

Performs the subset construction algorithm described in (among other placess) en.wikipedia.org/wiki/Powerset_construction

Also implements an innovative algorithm to partition a set of edge labels into a set that has the property that no two elements have overlapping regions. This allows us to perform the subset construction (and closure operations) efficiently while supporting large possible character sets (e.g., unicode, which ranges from 0..0x10ffff. See RangePartition.rb for more details.

Public Class Methods

new(nfaStartState) click to toggle source

Constructs a builder object

# File lib/tokn/dfa_builder.rb, line 80
def initialize(nfaStartState)
  @nextId = 0
  @nfaStart = nfaStartState
  
  # Build a map of nfa state ids => nfa states
  @nfaStateMap = {}
  nfas, _, _ = @nfaStart.reachableStates
  nfas.each {|s| @nfaStateMap[s.id] = s}
  
  # Initialize an array of nfa state lists, indexed by dfa state id
  @nfaStateLists = []
  
  # Map of existing DFA states; key is array of NFA state ids
  @dfaStateMap = {}
end
nfa_to_dfa(startState, db = false) click to toggle source

Convert an NFA to a DFA.

@param startState the start state of the NFA @param db if true, generates PDF files for debug purposes, showing various

steps of the procedure
# File lib/tokn/dfa_builder.rb, line 28
def self.nfa_to_dfa(startState, db = false)
 
  !db || startState.generatePDF("original_nfa")
  
  # Reverse this NFA, convert to DFA, then
  # reverse it, and convert it again.  Apparently this
  # produces a minimal DFA.
  
  rev = startState.reverseNFA()
  !db || rev.generatePDF("reversed_nfa")
  
  bld = DFABuilder.new(rev)
  dfa = bld.build(true, false)  # partition, but don't normalize
  
  !db || dfa.generatePDF("reversed_dfa")
  
  rev2 = dfa.reverseNFA()
  bld = DFABuilder.new(rev2)
  
  # Don't regenerate the partition; it is still valid
  # for this second build process
  #
  dfa = bld.build(false, true) # don't partition, but do normalize
  
  # If there are edges that contain more than one token identifier,
  # remove all but the first (i.e. the one with the highest token id)
  
  stSet, _, _ = dfa.reachableStates
  stSet.each do |s|
    s.edges.each do |lbl, dest|
      a = lbl.array
      if !a.size
        next
      end
      
      primeId = a[0]
      
      next if primeId >= EPSILON-1
      
      lbl.difference!(CodeSet.new(primeId+1, EPSILON))
    end
  end
  
  !db || dfa.generatePDF("minimal_dfa")
  
  dfa
end

Public Instance Methods

build(partition = true, normalize = true) click to toggle source

Perform the build algorithm

@param partition if true, partitions the edge labels into disjoint code sets @param normalize if true, normalizes the states afterward

# File lib/tokn/dfa_builder.rb, line 101
def build(partition = true, normalize = true)
  db = false
  
  !partition || partitionEdges(@nfaStart)
  
  iset = Set.new
  iset.add(@nfaStart)
  epsClosure(iset)
  
  @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
  
  markedStates = Set.new
  
  unmarked = [@dfaStart]
  
  until unmarked.empty? 
    dfaState  = unmarked.pop
    
    nfaIds = @nfaStateLists[dfaState.id]
    
    # map of CodeSet => set of NFA states
    moveMap = {}
    
    nfaIds.each do |nfaId| 
      nfaState = @nfaStateMap[nfaId]
      nfaState.edges.each do |lbl,dest| 
        if lbl.array[0] == EPSILON
          next
        end
        
        nfaStates = moveMap[lbl]
        if !nfaStates
          nfaStates = Set.new
          moveMap[lbl] = nfaStates
        end
        nfaStates.add(dest)
      end
    end
    
    moveMap.each_pair do |charRange,nfaStates|
      # May be better to test if already in set before calc closure; or simply has closure
      epsClosure(nfaStates)
      dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
      if isNew
        unmarked.push(dfaDestState)
      end
      dfaState.addEdge(charRange, dfaDestState)
    end
    
  end
  
  if normalize
    !db || @dfaStart.generatePDF("prior_normalize")
    
    !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
    State.normalizeStates(@dfaStart)
    !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
    !db || @dfaStart.generatePDF("post_normalize")
  end
    
  @dfaStart
end

Private Instance Methods

createDFAState(nfaStateList) click to toggle source

Adds a DFA state for a set of NFA states, if one doesn't already exist for the set @param nfaStateList a sorted array of NFA state ids @return a pair [DFA State,

created flag (boolean): true if this did not already exist]
# File lib/tokn/dfa_builder.rb, line 172
def createDFAState(nfaStateList)
  
  lst = nfaStateList
  
  newState = @nfaStateMap[lst]
  isNewState = !newState
  if isNewState
    newState = State.new(@nextId)
    
    # Determine if any of the NFA states were final states
    newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
    
    if false
      # Set label of DFA state to show which NFA states produced it
      # (useful for debugging)
      newState.label = lst.map {|x| x.to_s}.join(' ')
    end
    
    @nextId += 1
    @nfaStateMap[lst] = newState   
    @nfaStateLists.push(lst)
      
  end
  return [newState,isNewState]
end
epsClosure(stateSet) click to toggle source

Calculate the epsilon closure of a set of NFA states @return a set of states

# File lib/tokn/dfa_builder.rb, line 205
def epsClosure(stateSet)
  stk = stateSet.to_a
  while !stk.empty?
    s = stk.pop
    s.edges.each do |lbl,dest|
      if lbl.contains? EPSILON
        if stateSet.add?(dest)
          stk.push(dest)
        end
      end
    end
  end  
  stateSet
end
partitionEdges(startState) click to toggle source

Modify edges so each is labelled with a disjoint subset of characters. See the notes at the start of this class, as well as RangePartition.rb.

# File lib/tokn/dfa_builder.rb, line 224
def partitionEdges(startState)
 
  db = false
  
  par = RangePartition.new
  
  stateSet, _, _ = startState.reachableStates
  
  stateSet.each do |s|
    s.edges.each {|lbl,dest| par.addSet(lbl) }
  end
  
  par.prepare
  
  stateSet.each do |s|
    newEdges = []
    s.edges.each do |lbl, dest|
      !db||pr(" old edge: %s   => %s\n",d(lbl),d(dest.name))
      newLbls = par.apply(lbl)
      newLbls.each {|x| newEdges.push([x, dest]) }
    end  
    s.clearEdges()
    
    newEdges.each do |lbl,dest| 
      !db||pr(" new edge: %s   => %s\n",d(lbl),d(dest.name))
      s.addEdge(lbl,dest)
    end
    !db||pr("\n")
  end
  
end
stateSetToIdArray(s) click to toggle source
# File lib/tokn/dfa_builder.rb, line 198
def stateSetToIdArray(s)
  s.to_a.map {|x| x.id}.sort
end