events into a hierarchy of objects. E.g., stack handling, |
events into a hierarchy of objects. E.g., stack handling, |
delegation to node classes, etc. |
delegation to node classes, etc. |
|
|
Note: I originally wrote this module because the Python default DOM |
If all you need is to read an XML file and turn it into objects, you |
for XML (way back in the XML-SIG for Python 1.5.2 days) was horrendously |
came to the right place. If you need an actual model of the XML file |
slow for processing (typically enormous) XMI files. If 2.2 minidom |
that you can manipulate, with absolute fidelity to the original, you |
is fast enough and easy enough to use, this package may go away once |
might be better off with a DOM, since this doesn't retain processing |
I've ported the XMI support to minidom. So let me know if you're using |
instructions or comments. |
this module for anything; I have some non-TransWarp uses for it myself, |
|
but depending on how easy using minidom turns out to be... I may switch |
SOX is faster than 'minidom' or any other DOM that I know of. On the |
them over too! |
other hand, SOX is slower than PyRXP, but SOX handles Unicode correctly. |
""" |
|
|
To use this module, you will need a "document" object that implements |
|
either 'ISOXNode' or 'ISOXNode_NS', depending on whether you want |
from xml.sax import ContentHandler, parse |
namespace support. The interfaces are very similar, except that |
import Interface |
the 'NS' version has some enhancements/simplifications that can't be |
|
added to the non-namespace version for backward-compatibility reasons. |
|
|
|
Once you have your document object, just call |
|
'SOX.load(filenameOrStream,documentObject,namespaces=flag)' to get back |
|
the result of your document object's '_finish()' method after it has |
|
absorbed all of the XML data supplied. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
If you need a simple document or node class, 'Document', 'Document_NS', |
|
'Node', and 'Node_NS' are available for subclassing or idea-stealing. |
|
""" |
|
|
|
|
|
from xml.sax.saxutils import XMLGenerator, quoteattr, escape |
|
from protocols import Interface, advise, Adapter |
|
from kjbuckets import kjGraph |
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    'load',
    'ISOXNode',
    'ISOXNode_NS',
    'IXMLBuilder',
    'ExpatBuilder',
    'Node',
    'Node_NS',
    'Document',
    'Document_NS',
    'IndentedXML',
]
|
|
|
|
class ISOXNode(Interface):

    """Object mapping from an XML element

    Implementations have their processing guided by the XML structures,
    like an interpretive parser: the methods below are invoked as an
    element is opened, receives text and finished children, and is closed.

    Signatures are declared without 'self', as is usual for interface
    declarations based on 'protocols.Interface'.
    """

    def _newNode(name,attributeMap):
        """Create new child node from 'name' and 'attributeMap'

        Child node must implement the 'ISOXNode' interface."""

    def _acquireFrom(parentNode):
        """Parent-child relationship hook

        Called on newly created nodes to give them a chance to acquire
        context information from their parent node"""

    def _addText(text):
        """Add text string 'text' to node"""

    def _addNode(name,subObj):
        """Add finished sub-node 'subObj' to node

        'name' is the name of the element that 'subObj' was built from."""

    def _finish():
        """Return an object to be used in place of this node in call to the
        parent's '_addNode()' method.  Returning 'None' will result in
        nothing being added to the parent."""
|
|
|
|
|
|
class ObjectMakingHandler(ContentHandler): |
class ISOXNode_NS(Interface): |
|
|
"""SAX handler that makes a pseudo-DOM""" |
def _newNode(name, attributeMap): |
|
|
def __init__(self,documentRoot): |
"""Create new child node from 'name' and 'attributeMap' |
self.stack = [documentRoot] |
|
ContentHandler.__init__(self) |
|
|
|
def startElement(self, name, atts): |
|
top = self.stack[-1] |
|
node = top._newNode(name,atts) |
|
node._acquireFrom(top) |
|
self.stack.append(node) |
|
|
|
def characters(self, ch): |
Child node must implement the 'ISOXNode_NS' interface.""" |
self.stack[-1]._addText(ch) |
|
|
|
def endElement(self, name): |
def _setNS(ns2uri, uri2ns): |
stack = self.stack |
"""Set namespace declaration maps""" |
top = stack.pop() |
|
|
def _addText(text): |
|
"""Add text string 'text' to node""" |
|
|
|
|
|
def _addNode(name,subObj): |
|
"""Add finished sub-node 'subObj' to node""" |
|
|
|
|
|
def _finish(): |
|
"""Return an object to be used in place of this node in call to the |
|
parent's '_addNode()' method. Returning 'None' will result in |
|
nothing being added to the parent.""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class IXMLBuilder(Interface):

    """Callbacks through which a parser assembles the result for one element

    'ExpatBuilder' adapts every node it works on to this interface and
    drives it with the methods below.
    """

    def _xml_newTag(name,attrs,newPrefixes,parser):
        """Make and hand back the subnode for a tag

        As called by 'ExpatBuilder': 'name' is the tag name, 'attrs' a list
        of '(name,value)' pairs, 'newPrefixes' the namespace prefixes that
        the tag declares, and 'parser' the builder doing the parsing."""

    def _xml_addText(xml):
        """Hand back a new subnode for the text 'xml'

        NOTE(review): 'ExpatBuilder' installs this as its character data
        handler and ignores the result, and the adapters in this module
        return nothing -- confirm whether a return value is really part of
        the contract."""

    def _xml_addLiteral(xml):
        """Hand back a new subnode for a literal such as a comment, PI, etc.

        NOTE(review): same caveat as '_xml_addText()' regarding the
        return value."""

    def _xml_addChild(data):
        """Append 'data' to the children of the element"""

    def _xml_finish():
        """Produce the finished value that the parent receives through its
        '_xml_addChild()'"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SoxNodeAsXMLBuilder(Adapter):

    """Drive an 'ISOXNode' (available as 'self.subject') as an 'IXMLBuilder'"""

    advise(
        instancesProvide=[IXMLBuilder],
        asAdapterForProtocols=[ISOXNode]
    )

    def _xml_addText(self,text):
        """Forward character data to the node's '_addText()'"""
        self.subject._addText(text)

    def _xml_addLiteral(self,text):
        """Ignore literals (comments etc.); 'ISOXNode' has no hook for them"""
        pass

    def _xml_finish(self):
        """Return whatever the node's '_finish()' returns"""
        return self.subject._finish()

    def _xml_addChild(self,node):
        """Add finished child 'node' under the name of the last tag opened

        'ISOXNode._finish()' documents that a 'None' result means "add
        nothing to the parent", so 'None' is dropped here rather than being
        passed on to '_addNode()'."""
        if node is not None:
            # XXX relies on '_xml_newTag()' having recorded the child's name
            self.subject._addNode(self.lastName,node)

    def _xml_newTag(self,name,attrs,newPrefixes,parser):
        """Create the child node for tag 'name' and let it acquire context

        'attrs' is converted to a dictionary for '_newNode()';
        'newPrefixes' and 'parser' are not used by plain 'ISOXNode's."""
        node = self.subject._newNode(name,dict(attrs))
        node._acquireFrom(self.subject)
        self.lastName = name    # remembered for the matching '_xml_addChild()'
        return node
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NSNodeAsXMLBuilder(Adapter):

    """Drive an 'ISOXNode_NS' (available as 'self.subject') as an 'IXMLBuilder'"""

    advise(
        instancesProvide=[IXMLBuilder],
        asAdapterForProtocols=[ISOXNode_NS]
    )

    def _xml_addText(self,text):
        """Forward character data to the node's '_addText()'"""
        self.subject._addText(text)

    def _xml_addLiteral(self,text):
        """Ignore literals (comments etc.); 'ISOXNode_NS' has no hook for them"""
        pass

    def _xml_finish(self):
        """Return whatever the node's '_finish()' returns"""
        return self.subject._finish()

    def _xml_addChild(self,node):
        """Add finished child 'node' under the name of the last tag opened

        'ISOXNode_NS._finish()' documents that a 'None' result means "add
        nothing to the parent", so 'None' is dropped here rather than being
        passed on to '_addNode()'."""
        if node is not None:
            # XXX relies on '_xml_newTag()' having recorded the child's name
            self.subject._addNode(self.lastName,node)

    def _xml_newTag(self,name,attrs,newPrefixes,parser):
        """Create the child node for tag 'name', updating its namespace maps

        When the tag declares prefixes ('newPrefixes' is non-empty), the
        child is given a prefix->URI dictionary built from the innermost
        declaration of every prefix in 'parser.nsInfo', plus the transposed
        'kjGraph' as the URI->prefix map."""
        node = self.subject._newNode(name,dict(attrs))
        if newPrefixes:
            # A prefix whose declaring elements have all been closed may
            # still be present with an empty URI stack; it is out of scope
            # and indexing it would raise IndexError, so leave it out.
            ns2uri = dict(
                [(prefix,stack[-1])
                    for prefix,stack in parser.nsInfo.items() if stack]
            )
            node._setNS(ns2uri, ~kjGraph(ns2uri.items()))
        self.lastName = name    # remembered for the matching '_xml_addChild()'
        return node
|
|
if top._name != name: |
|
raise SyntaxError,"End tag '%s' found when '%s' was wanted" % (name, top._name) |
|
|
|
out = top._finish() |
|
|
|
if out is not None: |
|
stack[-1]._addNode(name,out) |
|
|
|
def endDocument(self): |
|
self.document = self.stack[0]._finish() |
|
del self.stack |
|
|
|
|
|
|
|
|
|
"""Simple, DOM-like ISOXNode implementation""" |
"""Simple, DOM-like ISOXNode implementation""" |
|
|
__implements__ = ISOXNode |
advise( instancesProvide = [ISOXNode] ) |
|
|
def __init__(self,name='',atts={},**kw): |
def __init__(self,name='',atts={},**kw): |
self._name = name |
self._name = name |
self._subNodes = [] |
self._subNodes = [] |
self._allNodes = [] |
self._allNodes = [] |
d=self.__dict__ |
self.__dict__.update(atts) |
for a in atts.keys(): |
|
d[a]=atts[a] |
|
|
|
self.__dict__.update(kw) |
self.__dict__.update(kw) |
|
|
def _addNode(self,name,node): |
def _addNode(self,name,node): |
d = n._findFirst(name) |
d = n._findFirst(name) |
if d: return d |
if d: return d |
|
|
|
def _finish(self): |
|
return self |
|
|
def _finish(self): return self |
|
|
|
_acquiredAttrs = () |
_acquiredAttrs = () |
|
|
def _newNode(self,name,atts): |
def _newNode(self,name,atts): |
return Node(name,atts) |
return Node(name,atts) |
|
|
|
class Node_NS(Node):

    """'Node' flavor that provides 'ISOXNode_NS' and carries namespace maps"""

    advise( instancesProvide = [ISOXNode_NS] )

    # Class-level defaults; instances get their own maps through the
    # constructor keywords used in '_newNode()' or through '_setNS()'
    ns2uri = {}
    uri2ns = kjGraph()

    def _newNode(self,name,atts):
        """Build a child of this node's own class, handing it this node's
        current namespace maps"""
        return self.__class__(
            name, atts, ns2uri=self.ns2uri, uri2ns=self.uri2ns
        )

    def _setNS(self, ns2uri, uri2ns):
        """Replace this node's prefix->URI and URI->prefix maps"""
        self.ns2uri = ns2uri
        self.uri2ns = uri2ns
|
|
|
|
|
class Document_NS(Node_NS):

    """Root object for trees made of 'Node_NS' nodes"""

    # Borrow the plain function behind 'Document._finish' so that it can be
    # used here although this class does not derive from 'Document'
    _finish = Document._finish.im_func

    def _newNode(self,name,atts):
        """Create a top-level child; always a plain 'Node_NS' built from
        'name' and 'atts' only, with no namespace maps passed along"""
        child = Node_NS(name, atts)
        return child
|
|
def load(filename_or_stream, documentObject=None, namespaces=False):

    """Build a tree from a filename/stream, rooted in a document object

    'filename_or_stream' may be a filename string, an object with a
    'getByteStream()' method (such as a SAX 'InputSource'), or an already
    open stream, which is passed to the parser as is.

    'documentObject' is the root that receives the parsed data.  If it is
    'None', a 'Document_NS' is created when 'namespaces' is true and a
    'Document' otherwise; 'namespaces' is not consulted when a document
    object is supplied.

    Returns the result of 'ExpatBuilder().parseFile()' for the stream and
    the document object.
    """

    if documentObject is None:
        if namespaces:
            documentObject = Document_NS()
        else:
            documentObject = Document()

    opened = None   # set only for a file that we opened ourselves

    if isinstance(filename_or_stream,str):
        # Binary mode: the XML parser must see the raw bytes so that it can
        # apply the document's own encoding declaration
        filename_or_stream = opened = open(filename_or_stream,'rb')

    elif hasattr(filename_or_stream,'getByteStream'):
        filename_or_stream = filename_or_stream.getByteStream()

    try:
        return ExpatBuilder().parseFile(filename_or_stream,documentObject)
    finally:
        # Streams supplied by the caller remain the caller's to close
        if opened is not None:
            opened.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class IndentedXML(XMLGenerator):

    """SAX handler that writes its output to an IndentedStream

    Every start tag is followed by 'push(1)' on the output stream and every
    end tag is preceded by 'pop()' (presumably one indentation level per
    open element -- confirm against 'IndentedStream').  Character data is
    written between 'push()'/'setMargin(absolute=0)' and 'pop()'.
    """

    def __init__(self, out=None, encoding="iso-8859-1"):
        """Write to 'out', or to a new 'IndentedStream' if 'out' is 'None'

        'out' needs 'push()', 'pop()' and 'setMargin()' in addition to the
        'write()' method that 'XMLGenerator' uses."""
        if out is None:
            from IndentedStream import IndentedStream
            out = IndentedStream()
        XMLGenerator.__init__(self,out,encoding)
        # Keep our own reference to the stream: '_out' is a private detail
        # of 'XMLGenerator' that current implementations do not set
        self._out = out

    def startElement(self,name,attrs):
        """Write the start tag, then push the stream's indentation"""
        XMLGenerator.startElement(self,name,attrs)
        self._out.push(1)

    def startElementNS(self,name,qname,attrs):
        """Namespace-aware counterpart of 'startElement()'"""
        XMLGenerator.startElementNS(self,name,qname,attrs)
        self._out.push(1)

    def characters(self,content):
        """Write 'content' with the stream's margin temporarily set to 0"""
        self._out.push()
        try:
            self._out.setMargin(absolute=0)
            XMLGenerator.characters(self,content)
        finally:
            # Restore the saved margin even if the write fails
            self._out.pop()

    def endElement(self,name):
        """Pop the stream's indentation, then write the end tag"""
        self._out.pop()
        XMLGenerator.endElement(self,name)

    def endElementNS(self,name,qname):
        """Namespace-aware counterpart of 'endElement()'"""
        self._out.pop()
        XMLGenerator.endElementNS(self,name,qname)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ExpatBuilder:

    """Parser that assembles a document

    Expat events are dispatched to a stack of 'IXMLBuilder' objects: the
    top of the stack is the element currently being assembled, and the
    bottom is the root node passed to 'parseFile()'.
    """

    def __init__(self):
        self.parser = self.makeParser()
        self.stack = []     # "object being assembled" stack
        self.nsStack = []   # per open element: list of prefixes it declared
        self.nsInfo = {}    # URI stack for each NS prefix

    def makeParser(self):
        """Create an expat parser whose handlers are this builder's methods"""
        from xml.parsers.expat import ParserCreate
        p = ParserCreate()
        # Attributes arrive as a flat [name, value, name, value, ...] list
        p.ordered_attributes = True
        p.returns_unicode = True
        p.specified_attributes = True
        p.StartElementHandler = self.startElement
        p.EndElementHandler = self.endElement
        p.CommentHandler = self.comment
        p.DefaultHandler = self.buildLiteral
        # We don't use:
        # .StartDoctypeDeclHandler
        # .StartNamespaceDeclHandler
        # .EndNamespaceDeclHandler
        # .XmlDeclHandler(version, encoding, standalone)
        # .ElementDeclHandler(name, model)
        # .AttlistDeclHandler(elname, attname, type, default, required)
        # .EndDoctypeDeclHandler()
        # .ProcessingInstructionHandler(target, data)
        # .UnparsedEntityDeclHandler(entityN,base,systemId,publicId,notationN)
        # .EntityDeclHandler(
        #       entityName, is_parameter_entity, value, base,
        #       systemId, publicId, notationName)
        # .NotationDeclHandler(notationName, base, systemId, publicId)
        # .StartCdataSectionHandler()
        # .EndCdataSectionHandler()
        # .NotStandaloneHandler()
        return p

    def parseFile(self, stream, rootNode):
        """Parse 'stream' into 'rootNode'; return the root's finished value

        'rootNode' is adapted to 'IXMLBuilder'.  The builder re-runs its own
        '__init__()' first, so each call starts with a new parser and empty
        stacks."""
        self.__init__()
        self.stack.append(IXMLBuilder(rootNode))
        self.parser.CharacterDataHandler = self.stack[-1]._xml_addText
        self.parser.ParseFile(stream)
        return self.stack[-1]._xml_finish()

    def comment(self,data):
        """Pass a comment on as a literal, restoring its '<!--'/'-->' markup"""
        self.buildLiteral(u'<!--%s-->' % data)

    def buildLiteral(self,xml):
        """Give literal text 'xml' to the element being assembled"""
        self.stack[-1]._xml_addLiteral(xml)

    def _scanAttributes(self, attrs):
        """Pair up a flat attribute list and record namespace declarations

        'attrs' is expat's ordered '[name, value, name, value, ...]' list;
        it is left unmodified.  Returns '(pairs, prefixes)': the attributes
        as '(name,value)' tuples in document order, and the prefixes
        declared by 'xmlns'/'xmlns:prefix' attributes ('' stands for the
        default namespace).  Every declared URI is pushed on that prefix's
        stack in 'self.nsInfo'."""
        pairs = [(attrs[i], attrs[i+1]) for i in range(0, len(attrs), 2)]
        prefixes = []

        for k,v in pairs:
            if k == 'xmlns':
                ns = ''
            elif k.startswith('xmlns:'):
                ns = k[6:]
            else:
                # Ordinary attribute, including names such as 'xmlnsfoo'
                continue
            self.nsInfo.setdefault(ns,[]).append(v)
            prefixes.append(ns)

        return pairs, prefixes

    def startElement(self, name, attrs):
        """Open a child element on the builder at the top of the stack

        The child returned by '_xml_newTag()' is adapted to 'IXMLBuilder',
        pushed on the stack, and made the target of character data."""
        pairs, prefixes = self._scanAttributes(attrs)
        self.nsStack.append(prefixes)
        element = self.stack[-1]._xml_newTag(name, pairs, prefixes, self)
        self.stack.append(IXMLBuilder(element))
        self.parser.CharacterDataHandler = self.stack[-1]._xml_addText

    def endElement(self, name):
        """Finish the current element and add its result to its parent

        The namespace declarations made by the element go out of scope only
        after it has been finished and added."""
        last = self.stack.pop()
        self.parser.CharacterDataHandler = self.stack[-1]._xml_addText
        self.stack[-1]._xml_addChild(last._xml_finish())
        for prefix in self.nsStack.pop():
            uris = self.nsInfo[prefix]
            uris.pop()
            if not uris:
                # No declaration left in scope: drop the entry so that users
                # of 'nsInfo' never find an empty URI stack
                del self.nsInfo[prefix]
|
|
|
|
|
|
|
|
|
|
|
|