module AsciiDammit

Constants

DATABASE
DATABASEFILE
DATABASEXML
MANUAL
VERSION

Public Class Methods

parse_lvg_map(url)
# File lib/asciidammit.rb, line 27
# Reads a pipe-separated mapping file and returns a Hash that maps each
# decoded Unicode character to the text in the record's second field.
#
# Lines starting with '#' and blank lines are skipped. For every other line
# the last two '|'-separated fields are dropped, the first remaining field is
# read as hexadecimal code point(s) (an optional "U+" prefix is removed) and
# the rest of the line becomes the mapped value.
#
# url:: anything Kernel#open accepts (a local path; URLs only when the
#       running Ruby/open-uri combination supports that through Kernel#open).
#       NOTE(review): Kernel#open also treats a leading "|" as a command to
#       run, so this must never be handed an untrusted string.
#
# Returns a Hash of String => String (the value is nil for a record that has
# no second field).
# Raises ArgumentError when the code point field is empty or not hexadecimal.
def parse_lvg_map(url)
  # Block form so the handle is closed as soon as the lines are read.
  lines = open(url) { |io| io.readlines }

  map = {}
  lines.each{|line|
    next if line =~ /^#/
    next if line.strip.empty?

    # Drop the two trailing fields; chomp first so a short record does not
    # carry its newline into the replacement text.
    record = line.chomp.sub(/\|[^\|]+\|[^\|]+$/, '')
    codepoint, char = *record.split('|', 2)

    # Non-bang sub: the bang variant returns nil when there is no "U+" prefix.
    hex = codepoint.to_s.sub(/^U\+/, '')
    raise ArgumentError, "missing code point in #{line.inspect}" if hex.strip.empty?

    # Decode without eval; several space-separated code points form one key.
    key = hex.split.collect{|cp| Integer(cp, 16)}.pack('U*')
    map[key] = char
  }
  return map
end
update()
# File lib/asciidammit.rb, line 40
# Rebuilds the Unicode-to-ASCII replacement database from a local copy of
# UnicodeData-3.2.0.txt (read from the current working directory).
#
# The work happens in four passes:
# 1. Parse every row into a record hash and try to assign an ASCII
#    replacement directly (printable ASCII, name patterns, the MANUAL table).
#    Each record is stored twice: under its code point (as a Symbol) and
#    under its character name (as a String).
# 2. For records still lacking a replacement, derive one per general
#    category (decomposition, name permutations, numeric value, ...).
# 3. Re-key the result by the actual character and drop everything that is
#    not a String replacement.
# 4. Write an XML report to DATABASEXML and the msgpack-encoded mapping to
#    DATABASEFILE.
#
# Takes no arguments and does not return: it prints the number of unmatched
# characters and then calls +exit+, terminating the process.
# Raises RuntimeError for a general category no branch handles.
def update
  database = {}
  unmatched = []

  #CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
  # File.read closes the handle, unlike open(...).read.
  CSV.parse(File.read('UnicodeData-3.2.0.txt'), :col_sep => ';').each{|row|
    next if row[1] == '<control>'

    begin
      codepoint = {
        :codepoint => row[0],
        # Printable ASCII maps to itself; everything else starts unresolved.
        :char => [row[0].hex].pack('U') =~ /^[\x20-\x7E]$/ ? [row[0].hex].pack('U') : nil,
        :name => row[1],
        :category => row[2],
        # Keep only the 4-hex-digit code points; this drops tags such as <compat>.
        :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
        :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
        :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
        :upcase => row[12] ? row[12].intern : nil,
        :downcase => row[13] ? row[13].intern : nil,
      }
    rescue
      # Deliberate best effort: a row that cannot be parsed is skipped.
      next
    end

    # sorry guys, but I really don't know what to make of these
    next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
              SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
              OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
              LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
    next if codepoint[:name] =~ /CANADIAN SYLLABICS /
    next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
    next if codepoint[:name] =~ /BOX DRAWINGS /
    next if codepoint[:name] =~ /MUSICAL SYMBOL /
    next if codepoint[:name] =~ /CJK COMPATIBILITY /

    if codepoint[:char].nil?
      # Ignore orientation/shape qualifiers so the base letter can be matched.
      name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')

      # First matching rule wins (||=); the order of these lines matters.
      codepoint[:char] ||= $2           if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $1           if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $3.downcase  if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= $1.downcase  if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= ' '          if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
      codepoint[:char] ||= $2.downcase  if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
      codepoint[:char] ||= $2.downcase  if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
    end

    codepoint[:char] ||= MANUAL[codepoint[:name]]

    # Same record reachable by code point (Symbol) and by name (String).
    database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
  }

  database.each_pair{|k, v|
    # Visit each record once (through its Symbol key) and only if unresolved.
    next unless k.is_a?(Symbol) && !v[:char]

    case v[:category]
      when 'Mn'
        v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]

      when 'Lu', 'Ll'
        if v[:decomp]
          v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
          v[:char] = nil if v[:char] == ''
        else
          names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
          # Longest word combinations first. The permutations are enumerated
          # lazily so the early break avoids building the whole list.
          names.size.downto(1){|n|
            names.permutation(n).each{|name|
              name = "LATIN " + name.join(' ')
              v[:char] = database[name][:char] if database[name] && database[name][:char]
              break if v[:char]
            }
            break if v[:char]
          }
        end

      when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
        names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
                                              RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
                                              QUADRANT UPPER LEFT LOWER RIGHT
                                              ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS
                                              BROKEN_BAR}.include?(w.gsub(' ', '_'))}
        # NOTE(review): looks like a leftover from debug output; kept as-is.
        STDOUT.flush
        names.size.downto(1){|n|
          names.permutation(n).each{|name|
            name = name.join(' ')
            v[:char] = database[name][:char] if database[name] && database[name][:char]
            break if v[:char]
          }
          break if v[:char]
        }

      when 'Lt'
        m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
        if m
          # Either base letter may be absent from the database; treat a
          # missing record as contributing nothing instead of raising.
          first = database["LATIN #{m[1]} LETTER #{m[2]}"]
          second = database["LATIN #{m[3]} LETTER #{m[4]}"]
          v[:char] = (first ? first[:char].to_s : '') + (second ? second[:char].to_s : '')
          v[:char] = nil if v[:char] == ''
        end

      when 'Nl', 'No', 'Nd'
        if v[:decomp]
          v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
          v[:char] = nil if v[:char] == ''
        end
        if v[:char].nil?
          # Fall back to the parsed decimal/digit value, then the numeric one.
          offset = [v[:decimal], v[:numeric]].compact
          v[:char] = offset[0].to_s if offset.size > 0
        end

      when 'Pi', 'Pf'
        v[:char] = "'" if v[:name] =~ /SINGLE/
        v[:char] = '"' if v[:name] =~ /DOUBLE/

      when 'Zs'
        v[:char] = ' '

      when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
        v[:ignore] = true

      else
        raise "Unhandled character category #{v[:category]}"
    end

    if v[:char].nil?
      # Borrow from the case-mapped counterpart. That record may have been
      # filtered out in the first pass, so check it exists before reading it.
      upper = v[:upcase] && database[v[:upcase]]
      lower = v[:downcase] && database[v[:downcase]]
      v[:char] = upper[:char].downcase if upper && upper[:char]
      v[:char] = lower[:char].upcase if lower && lower[:char]
    end

    #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]

    # Records flagged :ignore have no :char, so they are collected here too.
    unmatched << v unless v[:char]
  }

  puts "#{unmatched.size} unmatched"

  categories = {}
  # Iterate over a snapshot of the keys because new keys are added below.
  database.keys.each{|k|
    next unless k.is_a?(Symbol)
    key = [k.to_s.hex].pack('U')
    categories[key] = database[k][:category]
    database[key] = database[k][:char]
  }
  # Only character => replacement String pairs survive; record hashes and
  # unresolved (nil) entries are dropped.
  database = database.reject{|k, v| !v.is_a?(String)}

  reversed = {}
  database.each_pair{|unicode, ascii|
    reversed[ascii] ||= []
    reversed[ascii] << unicode
  }
  # General category => coarse class. The list previously read "Nl," with a
  # stray comma, so category Nl was never recognised as 'num'.
  cls = Hash[*(%w{Lu Ll Lt Lo}.collect{|k| [k, 'alpha']}.flatten)].merge(Hash[*(%w{Nd Nl No}.collect{|k| [k, 'num']}.flatten)])

  builder = Nokogiri::XML::Builder.new{|xml|
    xml.mapping {
      # Characters without a replacement come first, as bare <character> nodes.
      unmatched.each{|cp|
        char = [cp[:codepoint].hex].pack('U')
        xml.character(:codepoint => cp[:codepoint].rjust(4, '0'), :category => categories[char], :class => cls[categories[char]] || 'other') { xml.text(char) }
      }
      # Then one <replacement> per ASCII string, listing every source character.
      reversed.each_pair{|ascii, unicodes|
        xml.replacement(:string => ascii) {
          unicodes.each{|unicode|
            xml.character(:codepoint => unicode.unpack('U')[0].to_s(16).upcase.rjust(4, '0'), :category => categories[unicode], :class => cls[categories[unicode]] || 'other') { xml.text(unicode) }
          }
        }
      }
    }
  }
  File.open(DATABASEXML, 'wb'){|f| f.write(builder.to_xml) }

  File.open(DATABASEFILE, 'wb'){|f| f.write(database.to_msgpack) }
  # Terminates the process once both files are written.
  exit
end

Private Instance Methods

parse_lvg_map(url)
# File lib/asciidammit.rb, line 27
# Reads a pipe-separated mapping file and returns a Hash that maps each
# decoded Unicode character to the text in the record's second field.
#
# Lines starting with '#' and blank lines are skipped. For every other line
# the last two '|'-separated fields are dropped, the first remaining field is
# read as hexadecimal code point(s) (an optional "U+" prefix is removed) and
# the rest of the line becomes the mapped value.
#
# url:: anything Kernel#open accepts (a local path; URLs only when the
#       running Ruby/open-uri combination supports that through Kernel#open).
#       NOTE(review): Kernel#open also treats a leading "|" as a command to
#       run, so this must never be handed an untrusted string.
#
# Returns a Hash of String => String (the value is nil for a record that has
# no second field).
# Raises ArgumentError when the code point field is empty or not hexadecimal.
def parse_lvg_map(url)
  # Block form so the handle is closed as soon as the lines are read.
  lines = open(url) { |io| io.readlines }

  map = {}
  lines.each{|line|
    next if line =~ /^#/
    next if line.strip.empty?

    # Drop the two trailing fields; chomp first so a short record does not
    # carry its newline into the replacement text.
    record = line.chomp.sub(/\|[^\|]+\|[^\|]+$/, '')
    codepoint, char = *record.split('|', 2)

    # Non-bang sub: the bang variant returns nil when there is no "U+" prefix.
    hex = codepoint.to_s.sub(/^U\+/, '')
    raise ArgumentError, "missing code point in #{line.inspect}" if hex.strip.empty?

    # Decode without eval; several space-separated code points form one key.
    key = hex.split.collect{|cp| Integer(cp, 16)}.pack('U*')
    map[key] = char
  }
  return map
end
update()
# File lib/asciidammit.rb, line 40
# Rebuilds the Unicode-to-ASCII replacement database from a local copy of
# UnicodeData-3.2.0.txt (read from the current working directory).
#
# The work happens in four passes:
# 1. Parse every row into a record hash and try to assign an ASCII
#    replacement directly (printable ASCII, name patterns, the MANUAL table).
#    Each record is stored twice: under its code point (as a Symbol) and
#    under its character name (as a String).
# 2. For records still lacking a replacement, derive one per general
#    category (decomposition, name permutations, numeric value, ...).
# 3. Re-key the result by the actual character and drop everything that is
#    not a String replacement.
# 4. Write an XML report to DATABASEXML and the msgpack-encoded mapping to
#    DATABASEFILE.
#
# Takes no arguments and does not return: it prints the number of unmatched
# characters and then calls +exit+, terminating the process.
# Raises RuntimeError for a general category no branch handles.
def update
  database = {}
  unmatched = []

  #CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
  # File.read closes the handle, unlike open(...).read.
  CSV.parse(File.read('UnicodeData-3.2.0.txt'), :col_sep => ';').each{|row|
    next if row[1] == '<control>'

    begin
      codepoint = {
        :codepoint => row[0],
        # Printable ASCII maps to itself; everything else starts unresolved.
        :char => [row[0].hex].pack('U') =~ /^[\x20-\x7E]$/ ? [row[0].hex].pack('U') : nil,
        :name => row[1],
        :category => row[2],
        # Keep only the 4-hex-digit code points; this drops tags such as <compat>.
        :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
        :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
        :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
        :upcase => row[12] ? row[12].intern : nil,
        :downcase => row[13] ? row[13].intern : nil,
      }
    rescue
      # Deliberate best effort: a row that cannot be parsed is skipped.
      next
    end

    # sorry guys, but I really don't know what to make of these
    next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
              SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
              OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
              LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
    next if codepoint[:name] =~ /CANADIAN SYLLABICS /
    next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
    next if codepoint[:name] =~ /BOX DRAWINGS /
    next if codepoint[:name] =~ /MUSICAL SYMBOL /
    next if codepoint[:name] =~ /CJK COMPATIBILITY /

    if codepoint[:char].nil?
      # Ignore orientation/shape qualifiers so the base letter can be matched.
      name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')

      # First matching rule wins (||=); the order of these lines matters.
      codepoint[:char] ||= $2           if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $1           if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $3.downcase  if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= $1.downcase  if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= ' '          if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
      codepoint[:char] ||= $2.downcase  if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
      codepoint[:char] ||= $2.downcase  if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
    end

    codepoint[:char] ||= MANUAL[codepoint[:name]]

    # Same record reachable by code point (Symbol) and by name (String).
    database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
  }

  database.each_pair{|k, v|
    # Visit each record once (through its Symbol key) and only if unresolved.
    next unless k.is_a?(Symbol) && !v[:char]

    case v[:category]
      when 'Mn'
        v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]

      when 'Lu', 'Ll'
        if v[:decomp]
          v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
          v[:char] = nil if v[:char] == ''
        else
          names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
          # Longest word combinations first. The permutations are enumerated
          # lazily so the early break avoids building the whole list.
          names.size.downto(1){|n|
            names.permutation(n).each{|name|
              name = "LATIN " + name.join(' ')
              v[:char] = database[name][:char] if database[name] && database[name][:char]
              break if v[:char]
            }
            break if v[:char]
          }
        end

      when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
        names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
                                              RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
                                              QUADRANT UPPER LEFT LOWER RIGHT
                                              ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS
                                              BROKEN_BAR}.include?(w.gsub(' ', '_'))}
        # NOTE(review): looks like a leftover from debug output; kept as-is.
        STDOUT.flush
        names.size.downto(1){|n|
          names.permutation(n).each{|name|
            name = name.join(' ')
            v[:char] = database[name][:char] if database[name] && database[name][:char]
            break if v[:char]
          }
          break if v[:char]
        }

      when 'Lt'
        m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
        if m
          # Either base letter may be absent from the database; treat a
          # missing record as contributing nothing instead of raising.
          first = database["LATIN #{m[1]} LETTER #{m[2]}"]
          second = database["LATIN #{m[3]} LETTER #{m[4]}"]
          v[:char] = (first ? first[:char].to_s : '') + (second ? second[:char].to_s : '')
          v[:char] = nil if v[:char] == ''
        end

      when 'Nl', 'No', 'Nd'
        if v[:decomp]
          v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
          v[:char] = nil if v[:char] == ''
        end
        if v[:char].nil?
          # Fall back to the parsed decimal/digit value, then the numeric one.
          offset = [v[:decimal], v[:numeric]].compact
          v[:char] = offset[0].to_s if offset.size > 0
        end

      when 'Pi', 'Pf'
        v[:char] = "'" if v[:name] =~ /SINGLE/
        v[:char] = '"' if v[:name] =~ /DOUBLE/

      when 'Zs'
        v[:char] = ' '

      when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
        v[:ignore] = true

      else
        raise "Unhandled character category #{v[:category]}"
    end

    if v[:char].nil?
      # Borrow from the case-mapped counterpart. That record may have been
      # filtered out in the first pass, so check it exists before reading it.
      upper = v[:upcase] && database[v[:upcase]]
      lower = v[:downcase] && database[v[:downcase]]
      v[:char] = upper[:char].downcase if upper && upper[:char]
      v[:char] = lower[:char].upcase if lower && lower[:char]
    end

    #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]

    # Records flagged :ignore have no :char, so they are collected here too.
    unmatched << v unless v[:char]
  }

  puts "#{unmatched.size} unmatched"

  categories = {}
  # Iterate over a snapshot of the keys because new keys are added below.
  database.keys.each{|k|
    next unless k.is_a?(Symbol)
    key = [k.to_s.hex].pack('U')
    categories[key] = database[k][:category]
    database[key] = database[k][:char]
  }
  # Only character => replacement String pairs survive; record hashes and
  # unresolved (nil) entries are dropped.
  database = database.reject{|k, v| !v.is_a?(String)}

  reversed = {}
  database.each_pair{|unicode, ascii|
    reversed[ascii] ||= []
    reversed[ascii] << unicode
  }
  # General category => coarse class. The list previously read "Nl," with a
  # stray comma, so category Nl was never recognised as 'num'.
  cls = Hash[*(%w{Lu Ll Lt Lo}.collect{|k| [k, 'alpha']}.flatten)].merge(Hash[*(%w{Nd Nl No}.collect{|k| [k, 'num']}.flatten)])

  builder = Nokogiri::XML::Builder.new{|xml|
    xml.mapping {
      # Characters without a replacement come first, as bare <character> nodes.
      unmatched.each{|cp|
        char = [cp[:codepoint].hex].pack('U')
        xml.character(:codepoint => cp[:codepoint].rjust(4, '0'), :category => categories[char], :class => cls[categories[char]] || 'other') { xml.text(char) }
      }
      # Then one <replacement> per ASCII string, listing every source character.
      reversed.each_pair{|ascii, unicodes|
        xml.replacement(:string => ascii) {
          unicodes.each{|unicode|
            xml.character(:codepoint => unicode.unpack('U')[0].to_s(16).upcase.rjust(4, '0'), :category => categories[unicode], :class => cls[categories[unicode]] || 'other') { xml.text(unicode) }
          }
        }
      }
    }
  }
  File.open(DATABASEXML, 'wb'){|f| f.write(builder.to_xml) }

  File.open(DATABASEFILE, 'wb'){|f| f.write(database.to_msgpack) }
  # Terminates the process once both files are written.
  exit
end