module AsciiDammit
Constants
- DATABASE
- DATABASEFILE
- DATABASEXML
- MANUAL
- VERSION
Public Class Methods
parse_lvg_map(url)
click to toggle source
# File lib/asciidammit.rb, line 27
# Reads a pipe-delimited character map and returns it as a Hash.
#
# Every data line is expected to look like "U+XXXX|replacement|...|...".
# The last two fields are discarded, the first field is the code point
# (hexadecimal, with or without a "U+" prefix) and whatever remains after the
# first "|" is the replacement text. Lines starting with "#" and blank lines
# are skipped.
#
# url - location of the map; it is handed to Kernel#open unchanged.
#
# Returns a Hash whose keys are one-character UTF-8 Strings and whose values
# are the replacement Strings (nil when nothing is left after the first field).
# Raises ArgumentError when a line's first field is not a hexadecimal code point.
def parse_lvg_map(url)
  map = {}
  # NOTE(review): Kernel#open treats an argument starting with "|" as a
  # command to run, so only pass locations that come from a trusted source.
  # The block form closes the handle once the map has been read.
  open(url) do |io|
    io.each_line do |raw|
      next if raw =~ /^#/

      # Drop the line terminator up front so it can never end up in a value.
      line = raw.chomp
      next if line.strip.empty?

      # Remove the two trailing fields, keeping "codepoint|replacement".
      line = line.sub(/\|[^\|]+\|[^\|]+$/, '')
      codepoint, char = line.split('|', 2)

      # Decode the code point numerically; the input is never evaluated as code.
      hex = codepoint.to_s.strip[/\A(?:U\+)?([0-9A-Fa-f]{1,6})\z/, 1]
      raise ArgumentError, "invalid code point #{codepoint.inspect} in #{url}" unless hex

      map[[hex.hex].pack('U')] = char
    end
  end
  map
end
update()
click to toggle source
# File lib/asciidammit.rb, line 40
# Rebuilds the Unicode-to-ASCII replacement database.
#
# Reads 'UnicodeData-3.2.0.txt' from the current directory as ';'-separated
# CSV, works out an ASCII replacement for every character it knows how to
# handle, prints the number of characters left without a replacement, writes
# the result to DATABASEXML (XML) and DATABASEFILE (msgpack), and finally
# calls exit, so control does not return to the caller.
#
# Raises RuntimeError when a character without a replacement has a category
# that none of the branches below handle.
def update
  database = {}
  unmatched = []

  # Pass 1: one record per row, stored under both its code point (as a Symbol)
  # and its character name (as a String).
  #CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
  # File.read closes the file again; the previous open(...).read left the handle open.
  CSV.parse(File.read('UnicodeData-3.2.0.txt'), :col_sep => ';').each{|row|
    next if row[1] == '<control>'

    begin
      codepoint = {
        :codepoint => row[0],
        # printable ASCII maps to itself
        :char => [row[0].hex].pack('U') =~ /^[\x20-\x7E]$/ ? [row[0].hex].pack('U') : nil,
        :name => row[1],
        :category => row[2],
        :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
        :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
        :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
        :upcase => row[12] ? row[12].intern : nil,
        :downcase => row[13] ? row[13].intern : nil,
      }
    rescue
      # rows whose fields cannot be converted are skipped
      next
    end

    # sorry guys, but I really don't know what to make of these
    next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
    next if codepoint[:name] =~ /CANADIAN SYLLABICS /
    next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
    next if codepoint[:name] =~ /BOX DRAWINGS /
    next if codepoint[:name] =~ /MUSICAL SYMBOL /
    next if codepoint[:name] =~ /CJK COMPATIBILITY /

    if codepoint[:char].nil?
      # Guess the base letter from the character name, ignoring orientation words.
      name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
      codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
    end

    codepoint[:char] ||= MANUAL[codepoint[:name]]
    database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
  }

  # Pass 2: for records still lacking a replacement, derive one from related
  # records according to the character's general category.
  database.each_pair{|k, v|
    next unless k.is_a?(Symbol) && !v[:char]

    case v[:category]
    when 'Mn'
      v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]

    when 'Lu', 'Ll'
      if v[:decomp]
        v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
        v[:char] = nil if v[:char] == ''
      else
        names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
        # Try the longest word combinations first. The permutations are
        # enumerated lazily, so the break below stops the search early.
        names.size.downto(1){|n|
          names.permutation(n).each{|name|
            name = "LATIN " + name.join(' ')
            v[:char] = database[name][:char] if database[name] && database[name][:char]
            break if v[:char]
          }
          break if v[:char]
        }
      end

    when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
      names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING QUADRANT UPPER LEFT LOWER RIGHT ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS BROKEN_BAR}.include?(w.gsub(' ', '_'))}
      STDOUT.flush
      names.size.downto(1){|n|
        names.permutation(n).each{|name|
          name = name.join(' ')
          v[:char] = database[name][:char] if database[name] && database[name][:char]
          break if v[:char]
        }
        break if v[:char]
      }

    when 'Lt'
      m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
      if m
        v[:char] = database["LATIN #{m[1]} LETTER #{m[2]}"][:char].to_s + database["LATIN #{m[3]} LETTER #{m[4]}"][:char].to_s
        v[:char] = nil if v[:char] == ''
      end

    when 'Nl', 'No', 'Nd'
      if v[:decomp]
        v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
        v[:char] = nil if v[:char] == ''
      end
      if v[:char].nil?
        offset = [v[:decimal], v[:numeric]].compact
        v[:char] = offset[0].to_s if offset.size > 0
      end

    when 'Pi', 'Pf'
      v[:char] = "'" if v[:name] =~ /SINGLE/
      v[:char] = '"' if v[:name] =~ /DOUBLE/

    when 'Zs'
      v[:char] = ' '

    when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
      v[:ignore] = true

    else
      raise "Unhandled character category #{v[:category]}"
    end

    # Last resort: borrow the replacement of the case-mapped counterpart.
    if v[:char].nil?
      v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
      v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
    end

    #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
    unmatched << v unless v[:char]
  }

  puts "#{unmatched.size} unmatched"

  # Re-key by the actual character and keep only character => replacement pairs.
  categories = {}
  database.keys.each{|k|
    next unless k.is_a?(Symbol)
    key = [k.to_s.hex].pack('U')
    categories[key] = database[k][:category]
    database[key] = database[k][:char]
  }
  database = database.reject{|k, v| !v.is_a?(String)}

  # replacement => list of characters that map to it
  reversed = {}
  database.each_pair{|unicode, ascii|
    reversed[ascii] ||= []
    reversed[ascii] << unicode
  }

  # Category => class attribute used in the XML. The list used to read
  # %w{Nd Nl, No}; the stray comma produced the key "Nl,", so 'Nl' characters
  # were written with class 'other'.
  cls = {}
  %w{Lu Ll Lt Lo}.each{|k| cls[k] = 'alpha'}
  %w{Nd Nl No}.each{|k| cls[k] = 'num'}

  builder = Nokogiri::XML::Builder.new{|xml|
    xml.mapping {
      unmatched.each{|cp|
        char = [cp[:codepoint].hex].pack('U')
        xml.character(:codepoint => cp[:codepoint].rjust(4, '0'), :category => categories[char], :class => cls[categories[char]] || 'other') { xml.text(char) }
      }
      reversed.each_pair{|ascii, unicodes|
        xml.replacement(:string => ascii) {
          unicodes.each{|unicode|
            xml.character(:codepoint => unicode.unpack('U')[0].to_s(16).upcase.rjust(4, '0'), :category => categories[unicode], :class => cls[categories[unicode]] || 'other') { xml.text(unicode) }
          }
        }
      }
    }
  }

  File.open(DATABASEXML, 'wb'){|f| f.write(builder.to_xml) }
  File.open(DATABASEFILE, 'wb'){|f| f.write(database.to_msgpack) }

  # Terminates the running process once both files are written.
  exit
end
Private Instance Methods
parse_lvg_map(url)
click to toggle source
# File lib/asciidammit.rb, line 27
# Reads a pipe-delimited character map and returns it as a Hash.
#
# Every data line is expected to look like "U+XXXX|replacement|...|...".
# The last two fields are discarded, the first field is the code point
# (hexadecimal, with or without a "U+" prefix) and whatever remains after the
# first "|" is the replacement text. Lines starting with "#" and blank lines
# are skipped.
#
# url - location of the map; it is handed to Kernel#open unchanged.
#
# Returns a Hash whose keys are one-character UTF-8 Strings and whose values
# are the replacement Strings (nil when nothing is left after the first field).
# Raises ArgumentError when a line's first field is not a hexadecimal code point.
def parse_lvg_map(url)
  map = {}
  # NOTE(review): Kernel#open treats an argument starting with "|" as a
  # command to run, so only pass locations that come from a trusted source.
  # The block form closes the handle once the map has been read.
  open(url) do |io|
    io.each_line do |raw|
      next if raw =~ /^#/

      # Drop the line terminator up front so it can never end up in a value.
      line = raw.chomp
      next if line.strip.empty?

      # Remove the two trailing fields, keeping "codepoint|replacement".
      line = line.sub(/\|[^\|]+\|[^\|]+$/, '')
      codepoint, char = line.split('|', 2)

      # Decode the code point numerically; the input is never evaluated as code.
      hex = codepoint.to_s.strip[/\A(?:U\+)?([0-9A-Fa-f]{1,6})\z/, 1]
      raise ArgumentError, "invalid code point #{codepoint.inspect} in #{url}" unless hex

      map[[hex.hex].pack('U')] = char
    end
  end
  map
end
update()
click to toggle source
# File lib/asciidammit.rb, line 40
# Rebuilds the Unicode-to-ASCII replacement database.
#
# Reads 'UnicodeData-3.2.0.txt' from the current directory as ';'-separated
# CSV, works out an ASCII replacement for every character it knows how to
# handle, prints the number of characters left without a replacement, writes
# the result to DATABASEXML (XML) and DATABASEFILE (msgpack), and finally
# calls exit, so control does not return to the caller.
#
# Raises RuntimeError when a character without a replacement has a category
# that none of the branches below handle.
def update
  database = {}
  unmatched = []

  # Pass 1: one record per row, stored under both its code point (as a Symbol)
  # and its character name (as a String).
  #CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
  # File.read closes the file again; the previous open(...).read left the handle open.
  CSV.parse(File.read('UnicodeData-3.2.0.txt'), :col_sep => ';').each{|row|
    next if row[1] == '<control>'

    begin
      codepoint = {
        :codepoint => row[0],
        # printable ASCII maps to itself
        :char => [row[0].hex].pack('U') =~ /^[\x20-\x7E]$/ ? [row[0].hex].pack('U') : nil,
        :name => row[1],
        :category => row[2],
        :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
        :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
        :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
        :upcase => row[12] ? row[12].intern : nil,
        :downcase => row[13] ? row[13].intern : nil,
      }
    rescue
      # rows whose fields cannot be converted are skipped
      next
    end

    # sorry guys, but I really don't know what to make of these
    next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
    next if codepoint[:name] =~ /CANADIAN SYLLABICS /
    next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
    next if codepoint[:name] =~ /BOX DRAWINGS /
    next if codepoint[:name] =~ /MUSICAL SYMBOL /
    next if codepoint[:name] =~ /CJK COMPATIBILITY /

    if codepoint[:char].nil?
      # Guess the base letter from the character name, ignoring orientation words.
      name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
      codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
    end

    codepoint[:char] ||= MANUAL[codepoint[:name]]
    database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
  }

  # Pass 2: for records still lacking a replacement, derive one from related
  # records according to the character's general category.
  database.each_pair{|k, v|
    next unless k.is_a?(Symbol) && !v[:char]

    case v[:category]
    when 'Mn'
      v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]

    when 'Lu', 'Ll'
      if v[:decomp]
        v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
        v[:char] = nil if v[:char] == ''
      else
        names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
        # Try the longest word combinations first. The permutations are
        # enumerated lazily, so the break below stops the search early.
        names.size.downto(1){|n|
          names.permutation(n).each{|name|
            name = "LATIN " + name.join(' ')
            v[:char] = database[name][:char] if database[name] && database[name][:char]
            break if v[:char]
          }
          break if v[:char]
        }
      end

    when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
      names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING QUADRANT UPPER LEFT LOWER RIGHT ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS BROKEN_BAR}.include?(w.gsub(' ', '_'))}
      STDOUT.flush
      names.size.downto(1){|n|
        names.permutation(n).each{|name|
          name = name.join(' ')
          v[:char] = database[name][:char] if database[name] && database[name][:char]
          break if v[:char]
        }
        break if v[:char]
      }

    when 'Lt'
      m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
      if m
        v[:char] = database["LATIN #{m[1]} LETTER #{m[2]}"][:char].to_s + database["LATIN #{m[3]} LETTER #{m[4]}"][:char].to_s
        v[:char] = nil if v[:char] == ''
      end

    when 'Nl', 'No', 'Nd'
      if v[:decomp]
        v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
        v[:char] = nil if v[:char] == ''
      end
      if v[:char].nil?
        offset = [v[:decimal], v[:numeric]].compact
        v[:char] = offset[0].to_s if offset.size > 0
      end

    when 'Pi', 'Pf'
      v[:char] = "'" if v[:name] =~ /SINGLE/
      v[:char] = '"' if v[:name] =~ /DOUBLE/

    when 'Zs'
      v[:char] = ' '

    when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
      v[:ignore] = true

    else
      raise "Unhandled character category #{v[:category]}"
    end

    # Last resort: borrow the replacement of the case-mapped counterpart.
    if v[:char].nil?
      v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
      v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
    end

    #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
    unmatched << v unless v[:char]
  }

  puts "#{unmatched.size} unmatched"

  # Re-key by the actual character and keep only character => replacement pairs.
  categories = {}
  database.keys.each{|k|
    next unless k.is_a?(Symbol)
    key = [k.to_s.hex].pack('U')
    categories[key] = database[k][:category]
    database[key] = database[k][:char]
  }
  database = database.reject{|k, v| !v.is_a?(String)}

  # replacement => list of characters that map to it
  reversed = {}
  database.each_pair{|unicode, ascii|
    reversed[ascii] ||= []
    reversed[ascii] << unicode
  }

  # Category => class attribute used in the XML. The list used to read
  # %w{Nd Nl, No}; the stray comma produced the key "Nl,", so 'Nl' characters
  # were written with class 'other'.
  cls = {}
  %w{Lu Ll Lt Lo}.each{|k| cls[k] = 'alpha'}
  %w{Nd Nl No}.each{|k| cls[k] = 'num'}

  builder = Nokogiri::XML::Builder.new{|xml|
    xml.mapping {
      unmatched.each{|cp|
        char = [cp[:codepoint].hex].pack('U')
        xml.character(:codepoint => cp[:codepoint].rjust(4, '0'), :category => categories[char], :class => cls[categories[char]] || 'other') { xml.text(char) }
      }
      reversed.each_pair{|ascii, unicodes|
        xml.replacement(:string => ascii) {
          unicodes.each{|unicode|
            xml.character(:codepoint => unicode.unpack('U')[0].to_s(16).upcase.rjust(4, '0'), :category => categories[unicode], :class => cls[categories[unicode]] || 'other') { xml.text(unicode) }
          }
        }
      }
    }
  }

  File.open(DATABASEXML, 'wb'){|f| f.write(builder.to_xml) }
  File.open(DATABASEFILE, 'wb'){|f| f.write(database.to_msgpack) }

  # Terminates the running process once both files are written.
  exit
end