refactor this dictionary-to-xml converter in python
Question
It's a small thing, really: I have this function that converts dict objects to xml.
Here's the function:
def dictToXml(d):
    """Render a nested dict as indented XML text.

    Every key becomes an element name. A dict value becomes a nested
    element whose children are indented four more spaces; a list value
    produces one sibling element per entry, all named after the key; any
    other value is converted to text (``None`` becomes the empty string)
    and XML-escaped.

    Args:
        d: dict mapping element names to dicts, lists or scalar values.

    Returns:
        The XML as a unicode string with one tag per line. The result ends
        with a newline when *d* is non-empty and is empty otherwise.
    """
    from xml.sax.saxutils import escape

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already unicode text

    lines = []

    def unicodify(o):
        return u'' if o is None else text_type(o)

    def addItem(name, item, offset):
        # The single place that knows how to write one element; both plain
        # values and list entries are routed through here.
        if isinstance(item, dict):
            lines.append(offset + u"<%s>" % name)
            addDict(item, offset + u" " * 4)
            lines.append(offset + u"</%s>" % name)
        else:
            lines.append(
                offset + u"<%s>%s</%s>" % (name, escape(unicodify(item)), name)
            )

    def addDict(node, offset):
        # .items() exists on both Python 2 and Python 3 dicts.
        for name, value in node.items():
            # A list is flattened into repeated same-named elements; any
            # other value is treated as a one-entry list.
            entries = value if isinstance(value, list) else [value]
            for entry in entries:
                addItem(name, entry, offset)

    addDict(d, u"")
    # The trailing empty entry makes the joined text end with a newline.
    lines.append(u"")
    return u"\n".join(lines)
For example, it converts this dictionary
{ 'site': { 'name': 'stackoverflow', 'blogger': [ 'Jeff', 'Joel' ] } }
to:
<site>
<name>stackoverflow</name>
<blogger>Jeff</blogger>
<blogger>Joel</blogger>
</site>
It works, but the addDict
function looks a little too repetitive. I'm sure there's a way to refactor it into 3 co-recursive functions named addDict
, addList
and addElse
, but my brain is stuck. Any help?
Also, any way to get rid of the offset +
thing in every line would be nice.
NOTE: I chose these semantics because I'm trying to match the behavior of the json-to-xml converter in org.json, which I use in a different part of my project. If you got to this page just looking for a dictionary to xml converter, there are some really good options in some of the answers. (Especially pyfo).
Solution
I noticed you have commonality in adding items. Using this commonality I would refactor adding an item to a separate function.
def addItem(item, name, offset):
    """Append the XML line(s) for one value stored under *name*.

    A dict is written as an opening tag, its entries four spaces deeper
    (via ``addDict``), and a closing tag. Anything else is written as a
    single ``<name>text</name>`` line with the text escaped.

    Uses ``lines``, ``escape``, ``unicodify`` and ``addDict`` from the
    surrounding scope.
    """
    if not isinstance(item, dict):
        text = escape(unicodify(item))
        lines.append(offset + u"<%s>%s</%s>" % (name, text, name))
        return
    child_offset = offset + u" " * 4
    lines.append(offset + u"<%s>" % name)
    addDict(item, child_offset)
    lines.append(offset + u"</%s>" % name)
def addList(value, name, offset):
    """Emit one element named *name* for every entry of the list *value*.

    Each entry is passed to ``addItem`` with the same name and the same
    indentation, so the entries come out as siblings.
    """
    for entry in value:
        addItem(entry, name, offset)
def addDict(node, offset):
    """Walk the dict *node* and emit each entry at indentation *offset*.

    List values are handed to ``addList`` (one element per entry); every
    other value goes directly to ``addItem``.
    """
    for name, value in node.iteritems():
        emit = addList if isinstance(value, list) else addItem
        emit(value, name, offset)
Advisory warning: this code is not tested or written by anybody who actually uses Python.
OTHER TIPS
>>> from pyfo import pyfo
>>> d = ('site', { 'name': 'stackoverflow', 'blogger': [ 'Jeff', 'Joel' ] } )
>>> result = pyfo(d, pretty=True, prolog=True, encoding='ascii')
>>> print result.encode('ascii', 'xmlcharrefreplace')
<?xml version="1.0" encoding="ascii"?>
<site>
<blogger>
Jeff
Joel
</blogger>
<name>stackoverflow</name>
</site>
To install pyfo:
$ easy_install pyfo
To get rid of repeated "offset+":
# Current nesting depth; each level is rendered as four spaces.
offset = 0


def addLine(str):
    """Append *str* to ``lines``, indented four spaces per ``offset`` level.

    ``offset`` is read when the function is called, so changing it between
    calls changes the indentation of subsequent lines. ``lines`` is taken
    from the surrounding scope.

    The parameter keeps its original name ``str`` (which shadows the
    builtin inside this function) so existing call sites are unaffected.
    """
    # The original line was missing its closing parenthesis.
    lines.append(u" " * (offset * 4) + str)
then
...
# Open the element at the current indentation level.
addLine(u"<%s>" % name)
# Children are written one level deeper.
# NOTE(review): if this fragment lives inside a function, rebinding `offset`
# here makes it a local name unless it is declared `global` (or kept in a
# mutable container) -- confirm the scoping before using this as-is.
offset = offset + 1
# NOTE(review): addDict is called without an offset argument here, so it is
# presumably rewritten to emit through addLine -- confirm.
addDict(value)
offset = offset - 1
# Back at the parent's level: close the element.
addLine(u"</%s>" % name)
Don't have access to an interpreter here, so take this with a grain of salt :(
Your original code produces malformed XML and can produce the same XML for two different dictionaries (it is not injective, mathematically speaking).
For example, if you have a list as a value of the only key in a dictionary:
d = { 'list': [1,2,3] }
I expect that your code would produce
<list>1</list><list>2</list><list>3</list>
and there is no root element. Any XML should have one and only one root element.
Then given the XML produced by your code, it is impossible to say if this XML
<tag>1</tag>
was produced from { 'tag': 1 }
or from { 'tag': [1] }
.
So, I suggest
- always start from the root element
- represent lists with either two special tags (e.g.
<list/>
and <item/>
) or mark them as such in attributes
Then, once these conceptual shortcomings have been addressed, we can generate correct and unambiguous XML. I chose to use attributes to mark up lists, and used ElementTree to construct the XML tree automatically. Also, recursion helps (add_value_to_xml
is called recursively):
from xml.etree.ElementTree import Element, SubElement, tostring
try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3: basestring no longer exists


def is_scalar(v):
    """Return True if *v* is a string, float, int or bool, else False.

    On Python 2 "string" means any ``basestring``; on Python 3 it means
    ``str``.
    """
    return isinstance(v, _STRING_TYPES + (float, int, bool))
try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: str is already unicode text


def add_value_to_xml(root, v):
    """Populate the element *root* from the Python value *v* and return it.

    * dict (or subclass): one child element per key, each filled
      recursively from the key's value.
    * list (or subclass): *root* is marked ``type="list"`` and receives one
      child per entry; each child reuses *root*'s tag and is marked
      ``type="item"``.
    * scalar (as decided by ``is_scalar``): becomes *root*'s text.

    Args:
        root: the ElementTree element to fill in place.
        v: the value to convert.

    Returns:
        *root* itself.

    Raises:
        TypeError: if *v* is none of the kinds above.
    """
    # isinstance (rather than an exact type comparison) lets dict and list
    # subclasses such as OrderedDict through.
    if isinstance(v, dict):
        for key, child_value in v.items():
            add_value_to_xml(SubElement(root, _text_type(key)), child_value)
    elif isinstance(v, list):
        root.set('type', 'list')
        for entry in v:
            item = SubElement(root, root.tag)
            add_value_to_xml(item, entry)
            # Set after recursing: when the entry is itself a list, this
            # replaces the 'list' marker the recursive call put on it.
            item.set('type', 'item')
    elif is_scalar(v):
        root.text = _text_type(v)
    else:
        raise TypeError("add_value_to_xml: unsupported type (%s)" % type(v))
    return root
def dict_to_xml(d, root='dict'):
    """Build a new element tagged *root* and fill it from *d*.

    The filling is delegated to ``add_value_to_xml``; the element it
    returns is handed back to the caller.
    """
    return add_value_to_xml(Element(root), d)
# Demo input covering a float, a string, an int, a list and a nested dict
# holding a bool.
d = { 'float': 5194.177, 'str': 'eggs', 'int': 42,
'list': [1,2], 'dict': { 'recursion': True } }
# No root name is passed, so the default tag 'dict' wraps the structure.
x = dict_to_xml(d)
# Serialize the element tree and print it (Python 2 print statement).
print tostring(x)
The result of the conversion of the test dict is:
<dict><int>42</int><dict><recursion>True</recursion></dict><float>5194.177</float><list type="list"><list type="item">1</list><list type="item">2</list></list><str>eggs</str></dict>
Here is my short sketch for a solution:
have a general addSomething()
function that dispatches based on the type of the value to addDict()
, addList()
or addElse()
. Those functions recursively call addSomething()
again.
Basically you are factoring out the parts in the if
clause and add a recursive call.
Here's what I find helpful when working with XML. Actually create the XML node structure first, then render this into text second.
This separates two unrelated concerns.
How do I transform my Python structure into an XML object model?
How do I format that XML object model?
It's hard when you put these two things together into one function. If, on the other hand, you separate them, then you have two things. First, you have a considerably simpler function to "walk" your Python structure and return an XML node. Your XML Nodes can be rendered into text with some preferred encoding and formatting rules applied.
from xml.sax.saxutils import escape
class Node(object):
    """An XML element: a tag name plus zero or more child nodes."""

    def __init__(self, name, *children):
        self.name = name
        self.children = children

    def toXml(self, indent):
        """Render this element as text, indented four spaces per level.

        * No children: a self-closing tag.
        * Exactly one child: the child is rendered inline (at indent 0)
          between the opening and closing tags, all on one line.
        * Several children: opening tag, each child on its own line one
          level deeper, then the closing tag, joined by newlines.
        """
        pad = u' ' * (indent * 4)
        count = len(self.children)
        if count == 0:
            return u"%s<%s/>" % (pad, self.name)
        if count == 1:
            inner = self.children[0].toXml(0)
            return u"%s<%s>%s</%s>" % (pad, self.name, inner, self.name)
        parts = [u"%s<%s>" % (pad, self.name)]
        for kid in self.children:
            parts.append(kid.toXml(indent + 1))
        parts.append(u"%s</%s>" % (pad, self.name))
        return u"\n".join(parts)
class Text(Node):
    """Leaf node holding a raw value that is rendered as escaped text."""

    def __init__(self, value):
        # Node.__init__ is not invoked here; only `value` is stored.
        self.value = value

    def toXml(self, indent):
        """Return the value as escaped text, indented four spaces per level.

        ``None`` renders as the empty string; any other value is passed
        through ``unicode()`` before escaping.
        """
        if self.value is None:
            text = u''
        else:
            text = unicode(self.value)
        return "%s%s" % (indent * 4 * u' ', escape(text))
def dictToXml(d):
    """Convert a nested dict to XML text by way of a ``Node`` tree.

    Every key becomes an element. A dict value becomes an element whose
    children come from that dict; a list value produces one same-named
    element per entry; any other value becomes an element wrapping a
    ``Text`` leaf.

    Args:
        d: dict mapping element names to dicts, lists or scalar values.

    Returns:
        The top-level elements rendered with ``toXml(0)`` and joined by
        newlines.
    """
    def valueToNode(name, value):
        # One element for one value: nested element for a dict, text leaf
        # for everything else.
        if isinstance(value, dict):
            return Node(name, *dictToNodeList(value))
        return Node(name, Text(value))

    def dictToNodeList(node):
        nodes = []
        # .items() exists on both Python 2 and Python 3 dicts.
        for name, value in node.items():
            # Each list entry is converted on its own. The original recursed
            # into the whole list for dict entries, which fails because a
            # list has no dict interface.
            entries = value if isinstance(value, list) else [value]
            nodes.extend(valueToNode(name, entry) for entry in entries)
        return nodes

    return u"\n".join([n.toXml(0) for n in dictToNodeList(d)])