#!/usr/bin/python -u # # Original script modified in November 2003 to take advantage of # the character-validation range routines, and updated to the # current Unicode information (Version 4.0.1) # # NOTE: there is an 'alias' facility for blocks which are not present in # the current release, but are needed for ABI compatibility. This # must be accomplished MANUALLY! Please see the comments below under # 'blockAliases' # import sys import string import time

webpage = “www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html” sources = “Blocks-4.0.1.txt UnicodeData-4.0.1.txt”

# # blockAliases is a small hack - it is used for mapping block names which # were were used in the 3.1 release, but are missing or changed in the current # release. The format is “OldBlockName:NewBlockName1[,NewBlockName2]” blockAliases = [] blockAliases.append(“CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols”) blockAliases.append(“Greek:GreekandCoptic”) blockAliases.append(“PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,” +

"SupplementaryPrivateUseArea-B")

# minTableSize gives the minimum number of ranges which must be present # before a range table is produced. If there are less than this # number, inline comparisons are generated minTableSize = 8

(blockfile, catfile) = string.split(sources)

# # Now process the “blocks” file, reducing it to a dictionary # indexed by blockname, containing a tuple with the applicable # block range # BlockNames = {} try:

blocks = open(blockfile, "r")

except:

print "Missing %s, aborting ..." % blockfile
sys.exit(1)

for line in blocks.readlines():

if line[0] == '#':
    continue
line = string.strip(line)
if line == '':
    continue
try:
    fields = string.split(line, ';')
    range = string.strip(fields[0])
    (start, end) = string.split(range, "..")
    name = string.strip(fields[1])
    name = string.replace(name, ' ', '')
except:
    print "Failed to process line: %s" % (line)
    continue
start = "0x" + start
end = "0x" + end
try:
    BlockNames[name].append((start, end))
except:
    BlockNames[name] = [(start, end)]

blocks.close() print “Parsed %d blocks descriptions” % (len(BlockNames.keys()))

for block in blockAliases:

alias = string.split(block,':')
alist = string.split(alias[1],',')
for comp in alist:
    if BlockNames.has_key(comp):
        if alias[0] not in BlockNames:
            BlockNames[alias[0]] = []
        for r in BlockNames[comp]:
            BlockNames[alias[0]].append(r)
    else:
        print "Alias %s: %s not in Blocks" % (alias[0], comp)
        continue

# # Next process the Categories file. This is more complex, since # the file is in code sequence, and we need to invert it. We use # a dictionary with index category-name, with each entry containing # all the ranges (codepoints) of that category. Note that category # names comprise two parts - the general category, and the “subclass” # within that category. Therefore, both “general category” (which is # the first character of the 2-character category-name) and the full # (2-character) name are entered into this dictionary. # try:

data = open(catfile, "r")

except:

print "Missing %s, aborting ..." % catfile
sys.exit(1)

nbchar = 0; Categories = {} for line in data.readlines():

if line[0] == '#':
    continue
line = string.strip(line)
if line == '':
    continue
try:
    fields = string.split(line, ';')
    point = string.strip(fields[0])
    value = 0
    while point != '':
        value = value * 16
        if point[0] >= '0' and point[0] <= '9':
            value = value + ord(point[0]) - ord('0')
        elif point[0] >= 'A' and point[0] <= 'F':
            value = value + 10 + ord(point[0]) - ord('A')
        elif point[0] >= 'a' and point[0] <= 'f':
            value = value + 10 + ord(point[0]) - ord('a')
        point = point[1:]
    name = fields[2]
except:
    print "Failed to process line: %s" % (line)
    continue

nbchar = nbchar + 1
# update entry for "full name"
try:
    Categories[name].append(value)
except:
    try:
        Categories[name] = [value]
    except:
        print "Failed to process line: %s" % (line)
# update "general category" name
try:
    Categories[name[0]].append(value)
except:
    try:
        Categories[name[0]] = [value]
    except:
        print "Failed to process line: %s" % (line)

blocks.close() print “Parsed %d char generating %d categories” % (nbchar, len(Categories.keys()))

# # The data is now all read. Time to process it into a more useful form. # # reduce the number list into ranges for cat in Categories.keys():

list = Categories[cat]
start = -1
prev = -1
end = -1
ranges = []
for val in list:
    if start == -1:
        start = val
        prev = val
        continue
    elif val == prev + 1:
        prev = val
        continue
    elif prev == start:
        ranges.append((prev, prev))
        start = val
        prev = val
        continue
    else:
        ranges.append((start, prev))
        start = val
        prev = val
        continue
if prev == start:
    ranges.append((prev, prev))
else:
    ranges.append((start, prev))
Categories[cat] = ranges

# # Assure all data is in alphabetic order, since we will be doing binary # searches on the tables. # bkeys = BlockNames.keys() bkeys.sort()

ckeys = Categories.keys() ckeys.sort()

# # Generate the resulting files # try:

header = open("include/libxml/xmlunicode.h", "w")

except:

print "Failed to open include/libxml/xmlunicode.h"
sys.exit(1)

try:

output = open("xmlunicode.c", "w")

except:

print "Failed to open xmlunicode.c"
sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write( “”“

Summary: Unicode character APIs
Description: API for the Unicode character APIs

This file is automatically generated from the
UCS description files of the Unicode Character Database
%s
using the genUnicode.py Python script.

Generation date: %s
Sources: %s
Author: Daniel Veillard

ifndef XML_UNICODE_H define XML_UNICODE_H

include <libxml/xmlversion.h>

ifdef LIBXML_UNICODE_ENABLED

ifdef __cplusplus extern “C” { endif

“”“ % (webpage, date, sources));

output.write( “”“

 xmlunicode.c: this module implements the Unicode character APIs

 This file is automatically generated from the
 UCS description files of the Unicode Character Database
 %s
 using the genUnicode.py Python script.

 Generation date: %s
 Sources: %s
 Daniel Veillard <veillard@redhat.com>
/

define IN_LIBXML include “libxml.h”

ifdef LIBXML_UNICODE_ENABLED

include <string.h> include <libxml/xmlversion.h> include <libxml/xmlunicode.h> include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted

typedef struct {

const char *rangename;
xmlIntFunc *func;

} xmlUnicodeRange;

typedef struct {

const xmlUnicodeRange *table;
int             numentries;

} xmlUnicodeNameTable;

static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = { “”“ % (webpage, date, sources));

flag = 0 for block in bkeys:

name = string.replace(block, '-', '')
if flag:
    output.write(',\n')
else:
    flag = 1
output.write('  {"%s", xmlUCSIs%s}' % (block, name))

output.write('};nn')

output.write('static xmlUnicodeRange xmlUnicodeCats[] = {n') flag = 0; for name in ckeys:

if flag:
    output.write(',\n')
else:
    flag = 1
output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))

output.write('};nn')

# # For any categories with more than minTableSize ranges we generate # a range table suitable for xmlCharInRange # for name in ckeys:

if len(Categories[name]) > minTableSize:
  numshort = 0
  numlong = 0
  ranges = Categories[name]
  sptr = "NULL"
  lptr = "NULL"
  for range in ranges:
    (low, high) = range
    if high < 0x10000:
      if numshort == 0:
        pline = "static const xmlChSRange xml%sS[] = {" % name
        sptr = "xml%sS" % name
      else:
        pline += ", "
      numshort += 1
    else:
      if numlong == 0:
        if numshort > 0:
          output.write(pline + " };\n")
        pline = "static const xmlChLRange xml%sL[] = {" % name
        lptr = "xml%sL" % name
      else:
        pline += ", "
      numlong += 1
    if len(pline) > 60:
      output.write(pline + "\n")
      pline = "    "
    pline += "{%s, %s}" % (hex(low), hex(high))
  output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
       % (name, numshort, numlong, sptr, lptr))

output.write( “”“static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s}; static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**

 xmlUnicodeLookup:
 @tptr: pointer to the name table
 @name: name to be found

 binary table lookup for user-supplied name

 Returns pointer to range function if found, otherwise NULL
/

static xmlIntFunc

xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
   int low, high, mid, cmp;
   xmlUnicodeRange *sptr;

   if ((tptr == NULL) || (tname == NULL)) return(NULL);

   low = 0;
   high = tptr->numentries - 1;
   sptr = tptr->table;
   while (low <= high) {
       mid = (low + high) / 2;
       if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
           return (sptr[mid].func);
       if (cmp < 0)
           high = mid - 1;
       else
           low = mid + 1;
   }
   return (NULL);

}

“”“ % (len(BlockNames), len(Categories)) )

for block in bkeys:

name = string.replace(block, '-', '')
header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
             (block))
output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
flag = 0
for (start, end) in BlockNames[block]:
    if flag:
        output.write(" ||\n           ")
    else:
        flag = 1
    output.write("((code >= %s) && (code <= %s))" % (start, end))
output.write(");\n}\n\n")

header.write(“nXMLPUBFUN int XMLCALL xmlUCSIsBlockt(int code, const char *block);nn”) output.write( “”“/**

 xmlUCSIsBlock:
 @code: UCS code point
 @block: UCS block name

 Check whether the character is part of the UCS Block

 Returns 1 if true, 0 if false and -1 on unknown block
/

int xmlUCSIsBlock(int code, const char *block) {

xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
    return (-1);
return (func(code));

}

“”“)

for name in ckeys:

ranges = Categories[name]
header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
             (name))
output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
if len(Categories[name]) > minTableSize:
    output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
        % name)
else:
    start = 1
    for range in ranges:
        (begin, end) = range;
        if start:
            output.write("    return(");
            start = 0
        else:
            output.write(" ||\n           ");
        if (begin == end):
            output.write("(code == %s)" % (hex(begin)))
        else:
            output.write("((code >= %s) && (code <= %s))" % (
                     hex(begin), hex(end)))
output.write(");\n}\n\n")

header.write(“nXMLPUBFUN int XMLCALL xmlUCSIsCatt(int code, const char *cat);n”) output.write( “”“/**

 xmlUCSIsCat:
 @code: UCS code point
 @cat: UCS Category name

 Check whether the character is part of the UCS Category

 Returns 1 if true, 0 if false and -1 on unknown category
/

int xmlUCSIsCat(int code, const char *cat) {

xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
    return (-1);
return (func(code));

}

define bottom_xmlunicode include “elfgcchack.h” endif /* LIBXML_UNICODE_ENABLED */ “”“)

header.write(“”“ ifdef __cplusplus } endif

endif /* LIBXML_UNICODE_ENABLED */

endif /* XML_UNICODE_H */ “”“);

header.close() output.close()