class RegextestPreCaseFolding

A script for generating the Unicode case-folding table. It uses the tables published by Unicode.org, i.e. CaseFolding.txt.

Public Class Methods

generate(input_file, output_file) click to toggle source
# File lib/pre/case-folding.rb, line 11
# Builds the case-folding table from a Unicode case-folding file and writes
# the generated Ruby source.
#
# input_file  - path of the Unicode case-folding table to read. When nil,
#               "./contrib/unicode/CaseFolding.txt" is used.
# output_file - path of the Ruby source file to write. When nil,
#               'lib/regextest/front/case-folding.rb' is used.
#
# Returns whatever puts_unicode_case_folding returns.
def self.generate(input_file, output_file)
  # Both arguments used to be ignored in favour of hard-coded paths; they are
  # honoured now, and the former paths remain as the nil fallback.
  input_file ||= "./contrib/unicode/CaseFolding.txt"
  output_file ||= 'lib/regextest/front/case-folding.rb'

  # Get valid casefoldings from unicode table
  case_foldings = read_unicode_case_folding(input_file)
  puts_unicode_case_folding(output_file, case_foldings)
end
puts_unicode_case_folding(case_folding_file, case_folding) click to toggle source

Writes the generated Ruby source for the case-folding table to case_folding_file.

# File lib/pre/case-folding.rb, line 56
  # Renders the Regextest::Front::CaseFolding source text and writes it to
  # +case_folding_file+ (the file is opened in "w" mode, so it is overwritten).
  #
  # case_folding_file - path of the file to write.
  # case_folding      - object whose #inspect output is embedded verbatim as
  #                     the value of CASE_FOLDING_HASH.
  #
  # Returns nil (the result of IO#puts).
  def self.puts_unicode_case_folding(case_folding_file, case_folding)
    # The heredoc terminator includes its four leading spaces, so the closing
    # line must be indented by exactly four spaces. The template body carries
    # six spaces of indentation, which are stripped after interpolation.
    indented_source = <<"    END_OF_TEMPLATE"
      # encoding: utf-8
      # DO NOT Modify This File Since Automatically Generated

      # Range of Unicode
      class Regextest::Front::CaseFolding
        # return case foldings
        def self.ignore_case(letter_array)
          CASE_FOLDING_HASH[letter_array]
        end
        
        # case folding hash [codepoint] => [[mapping_1], ...]
        CASE_FOLDING_HASH =
#{case_folding.inspect}
      end

      # Test suite (execute when this file is specified in command line)
      if __FILE__ == $0 
      end
    END_OF_TEMPLATE
    generated_source = indented_source.gsub(/^      /, "")

    File.open(case_folding_file, "w") { |out| out.puts generated_source }
  end
read_unicode_case_folding(file) click to toggle source

Gets the list of case-folding pairs from the Unicode.org table.

# File lib/pre/case-folding.rb, line 18
# Builds a bidirectional case-folding table from the lines of +file+.
#
# Each line must look like "<code>; <status>; <mapping>;" where <code> is
# 4-6 hex digits, <status> is one of C, F, S or T, and <mapping> is one or
# more space-separated hex code points.
#
# file - path handed to read_unicode_line.
#
# Returns a Hash whose keys are arrays of code points and whose values are
# arrays of the code-point arrays they fold to / from. For every accepted
# line both [code] => mapping and mapping => [code] are recorded.
# Raises RuntimeError ("not matched line: ...") for a line that does not
# have the expected shape.
def self.read_unicode_case_folding(file)
  folding_table = {}

  read_unicode_line(file) do |entry|
    fields = entry.match(/^(\h{4,6});\s*([CFST]); ([ \h]+);/)
    raise "not matched line: #{entry}" unless fields

    source = fields[1].to_i(16)
    targets = fields[3].split(" ").map { |hex| hex.to_i(16) }
    source_text = [source].pack("U*")
    target_text = targets.map { |cp| [cp].pack("U*") }.join("")

    # Keep only pairs that the regexp engine itself treats as equal under the
    # (?ai:) group; every other entry is dropped silently.
    next unless /(?ai:#{source_text})/.match(target_text)

    (folding_table[[source]] ||= []).push(targets)
    (folding_table[targets] ||= []).push([source])
  end

  folding_table
end
read_unicode_line(file) { |line| ... } click to toggle source

Common routine for parsing Unicode.org tables: yields each line that is neither empty nor a comment (starting with '#').

# File lib/pre/case-folding.rb, line 47
# Reads +file+ and yields every line that is neither empty nor a comment.
#
# file - path of the table to read. It is opened with 'r:BOM|UTF-8', so a
#        leading byte-order mark is dropped and the text is read as UTF-8.
#
# Yields each remaining line as a String with its LF / CRLF terminator
# removed. Lines whose first character is '#' are skipped.
#
# Returns the Array of all lines of the file (including skipped ones).
def self.read_unicode_line(file)
  # File.open instead of Kernel#open: Kernel#open treats a path that starts
  # with "|" as a command to run, which must never happen for a data file.
  content = File.open(file, 'r:BOM|UTF-8') { |f| f.read }  # ignore BOM header
  content.split(/\r?\n/).each do |line|
    next if line.empty? || line.start_with?('#')
    yield(line)
  end
end