""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE

# put this file (drv_libxml2.py) in PYTHONPATH
import xml.sax
reader = xml.sax.make_parser(["drv_libxml2"])
# ...and the rest is standard python sax.

CAVEATS

- Lexical handlers are supported, except for start/endEntity
  (waiting for XmlReader.ResolveEntity) and start/endDTD
- Error callbacks are not exactly synchronous, they tend
  to be invoked before the corresponding content callback,
  because the underlying reader interface parses
  data by chunks of 512 bytes

TODO

- search for TODO
- some ErrorHandler events (warning)
- some ContentHandler events (setDocumentLocator, skippedEntity)
- EntityResolver (using libxml2.?)
- DTDHandler (if/when libxml2 exposes such node types)
- DeclHandler (if/when libxml2 exposes such node types)
- property_xml_string?
- feature_string_interning?
- Incremental parser
- additional performance tuning:
  - one might cache callbacks to avoid some name lookups
  - one might implement a smarter way to pass attributes to startElement
    (some kind of lazy evaluation?)
  - there might be room for improvement in start/endPrefixMapping
  - other?

"""

__author__ = "Stéphane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

# Python 2/3 compatibility shims.
#
# StringTypes -- type(s) that parse() treats as a file name / URL.
# _d(s)       -- turns a string returned by libxml2 into text; passes
#                None through unchanged.
#
# Only the major version number is compared: sys.version_info is a tuple,
# so comparing the whole tuple against an int is a TypeError on Python 3.
if sys.version_info[0] < 3:
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 byte string to unicode; None is returned as is."""
        if s is None:
            return s
        return _decoder(s)[0]

else:
    StringTypes = str

    def _d(s):
        """Return s unchanged: it is a Unicode `str` (or None) already."""
        return s

# The wildcard import is kept on purpose: the SAX exception classes used
# throughout this module (SAXException, SAXParseException,
# SAXNotRecognizedException, SAXNotSupportedException,
# SAXReaderNotAvailable) all come from it.
from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import (
    feature_namespaces,
    feature_namespace_prefixes,
    feature_string_interning,
    feature_validation,
    feature_external_ges,
    feature_external_pes,
    property_lexical_handler,
    property_declaration_handler,
    property_dom_node,
    property_xml_string,
)

# libxml2 is an optional third-party binding.  A failed import is turned
# into SAXReaderNotAvailable, carrying the original import error message.
try:
    import libxml2
except ImportError:
    # sys.exc_info() is used instead of "except ... as" so the same source
    # keeps working on old Python 2 interpreters.
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])

class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator

    Wraps an object exposing LineNumber() and BaseURI() and presents it
    through the SAX Locator interface.  Column number and public
    identifier are not taken from the wrapped object, so fixed
    placeholders (-1 and None) are returned for them.
    """

    def __init__(self, locator):
        # the wrapped libxml2 locator; queried lazily by the getters below
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()

class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives a libxml2 text reader.

    parse() pulls nodes one by one from the libxml2 reader and translates
    each of them into the matching ContentHandler / LexicalHandler call.
    Errors reported by libxml2 are queued by _errorHandler() and forwarded
    to the SAX ErrorHandler after each Read().

    Supported features: namespaces, namespace-prefixes, validation,
    external-parameter-entities.  external-general-entities is always on.
    DTDHandler, EntityResolver and the declaration-handler property are
    rejected with SAXNotSupportedException.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader; queues one error.

        The error is only recorded here; it is delivered to the SAX
        ErrorHandler later, by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX ErrorHandler.

        fatal -- true when the parse is about to stop; the last queued
                 non-warning is then reported through fatalError().
        """
        # Detach the queue before dispatching: an error handler is allowed
        # to raise, and the queue must not survive into the next Read() or
        # the next parse() in that case.
        errors = self.__errors
        self.__errors = None
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def parse(self, source):
        """Parse source and emit SAX events to the registered handlers.

        source -- a file name / URL (any of StringTypes), or anything
                  accepted by saxutils.prepare_input_source().

        Raises SAXException on an unexpected node type; anything raised by
        the registered handlers propagates to the caller.
        """
        self.__parsing = 1
        # drop errors possibly left over from an interrupted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,
                                     self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # A namespace declaration is exactly "xmlns"
                            # or "xmlns:<prefix>"; a name that merely
                            # starts with "xmlns" is a regular attribute.
                            if qname == "xmlns" or \
                               qname.startswith("xmlns:"):
                                if qname == "xmlns":
                                    newPrefix = None
                                else:
                                    newPrefix = qname[6:]
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName,
                                                            eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(
                                    newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(
                        _d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # the event belongs to the lexical handler; the
                        # reader itself has no startEntity method
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity
                    # events are "waiting for XmlReader.ResolveEntity" --
                    # confirm the binding provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only sent when the input was read to its end
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # close the reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException (not supported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException (not supported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature called name.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature called name to state.

        Raises SAXNotSupportedException while a parse is running, or when
        asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property called name.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property called name to value.

        Only the lexical handler can be set.  The declaration handler is
        recognized but rejected with SAXNotSupportedException; any other
        name raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store value in self.__decl_handler instead of raising
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

def create_parser():
    """Return a new LibXml2Reader.

    The module docstring shows the intended use:
    xml.sax.make_parser(["drv_libxml2"]).
    """
    return LibXml2Reader()