Strumenti di scrittura XML per Python

https://stackoverflow.com/questions/56229

09-06-2019
|

Domanda

Attualmente sto provando ElementTree e sembra a posto, sfugge alle entità HTML e così via.Mi sto perdendo qualcosa di veramente meraviglioso di cui non ho sentito parlare?

Questo è simile a quello che sto effettivamente facendo:

import xml.etree.ElementTree as ET
root = ET.Element('html')
head = ET.SubElement(root,'head')
script = ET.SubElement(head,'script')
script.set('type','text/javascript')
script.text = "var a = 'I love &aacute; letters'"
body = ET.SubElement(root,'body')
h1 = ET.SubElement(body,'h1')
h1.text = "And I like the fact that 3 > 1"
tree = ET.ElementTree(root)
tree.write('foo.xhtml')

# more foo.xhtml
<html><head><script type="text/javascript">var a = 'I love &amp;aacute;
letters'</script></head><body><h1>And I like the fact that 3 &gt; 1</h1>
</body></html>

Soluzione

Presumo che tu stia effettivamente creando un albero DOM XML, perché vuoi verificare che ciò che entra in questo file sia un XML valido, poiché altrimenti scriveresti semplicemente una stringa statica in un file.Se convalidare il tuo output è davvero il tuo obiettivo, allora suggerirei

from xml.dom.minidom import parseString

doc = parseString("""<html>
    <head>
        <script type="text/javascript">
            var a = 'I love &amp;aacute; letters'
        </script>
    </head>
    <body>
        <h1>And I like the fact that 3 &gt; 1</h1>
    </body>
    </html>""")

with open("foo.xhtml", "w") as f:
    f.write( doc.toxml() )

Ciò ti consente semplicemente di scrivere l'XML che desideri restituire, verificare che sia corretto (poiché parseString solleverà un'eccezione se non è valido) e che il tuo codice abbia un aspetto molto più gradevole.

Presumibilmente non stai semplicemente scrivendo lo stesso XML statico ogni volta e desideri qualche sostituzione.In questo caso avrei righe come

var a = '%(message)s'

e quindi utilizzare l'operatore % per eseguire la sostituzione, ad esempio

</html>""" % {"message": "I love &amp;aacute; letters"})

Altri suggerimenti

Un altro modo è usare il file Fabbrica E builder da lxml (disponibile in Elementtree pure)

>>> from lxml import etree

>>> from lxml.builder import E

>>> def CLASS(*args): # class is a reserved word in Python
...     return {"class":' '.join(args)}

>>> html = page = (
...   E.html(       # create an Element called "html"
...     E.head(
...       E.title("This is a sample document")
...     ),
...     E.body(
...       E.h1("Hello!", CLASS("title")),
...       E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
...       E.p("This is another paragraph, with a", "\n      ",
...         E.a("link", href="http://www.python.org"), "."),
...       E.p("Here are some reserved characters: <spam&egg>."),
...       etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
...     )
...   )
... )

>>> print(etree.tostring(page, pretty_print=True))
<html>
  <head>
    <title>This is a sample document</title>
  </head>
  <body>
    <h1 class="title">Hello!</h1>
    <p>This is a paragraph with <b>bold</b> text in it!</p>
    <p>This is another paragraph, with a
      <a href="http://www.python.org">link</a>.</p>
    <p>Here are some reservered characters: &lt;spam&amp;egg&gt;.</p>
    <p>And finally an embedded XHTML fragment.</p>
  </body>
</html>

C'è sempre SimpleXMLWriter, parte del toolkit ElementTree.L'interfaccia è semplicissima.

Ecco un esempio:

from elementtree.SimpleXMLWriter import XMLWriter
import sys

w = XMLWriter(sys.stdout)
html = w.start("html")

w.start("head")
w.element("title", "my document")
w.element("meta", name="generator", value="my application 1.0")
w.end()

w.start("body")
w.element("h1", "this is a heading")
w.element("p", "this is a paragraph")

w.start("p")
w.data("this is ")
w.element("b", "bold")
w.data(" and ")
w.element("i", "italic")
w.data(".")
w.end("p")

w.close(html)

https://github.com/galvez/xmlwitch:

import xmlwitch
xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
with xml.feed(xmlns='http://www.w3.org/2005/Atom'):
    xml.title('Example Feed')
    xml.updated('2003-12-13T18:30:02Z')
    with xml.author:
        xml.name('John Doe')
    xml.id('urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6')
    with xml.entry:
        xml.title('Atom-Powered Robots Run Amok')
        xml.id('urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a')
        xml.updated('2003-12-13T18:30:02Z')
        xml.summary('Some text.')
print(xml)

non vuoi davvero qualcosa del tipo:

html(head(script(type='text/javascript', content='var a = ...')),
body(h1('And I like the fact that 3 < 1'), p('just some paragraph'))

Penso di aver visto qualcosa del genere da qualche parte.Sarebbe meraviglioso.

MODIFICARE: In realtà, oggi sono andato a scrivere una biblioteca da fare solo quello: magictree

Puoi usarlo in questo modo:

from magictree import html, head, script, body, h1, p
root = html(
         head(
           script('''var a = 'I love &amp;aacute; letters''', 
                  type='text/javascript')),
         body(
           h1('And I like the fact that 3 > 1')))

# root is a plain Element object, like those created with ET.Element...
# so you can write it out using ElementTree :)
tree = ET.ElementTree(root)
tree.write('foo.xhtml')

La magia dentro magictree sta nel modo in cui funziona l'importazione:IL Element le fabbriche vengono create quando necessario.Avere un guarda la fonte, è basato su una risposta a un'altra domanda StackOverflow.

Ho finito per utilizzare saxutils.escape(str) per generare stringhe XML valide e quindi convalidarle con l'approccio di Eli per essere sicuro di non perdere alcun tag

from xml.sax import saxutils
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError

xml = '''<?xml version="1.0" encoding="%s"?>\n
<contents title="%s" crawl_date="%s" in_text_date="%s" 
url="%s">\n<main_post>%s</main_post>\n</contents>''' %
(self.encoding, saxutils.escape(title), saxutils.escape(time), 
saxutils.escape(date), saxutils.escape(url), saxutils.escape(contents))
try:
    minidoc = parseString(xml)
catch ExpatError:
    print "Invalid xml"

Per chiunque lo incontri adesso, c'è in realtà un modo per farlo nascosto nella libreria standard di Python in xml.sax.utils.XMLGenerator.Eccone un esempio in azione:

>>> from xml.sax.saxutils import XMLGenerator
>>> import StringIO
>>> w = XMLGenerator(out, 'utf-8')
>>> w.startDocument()
>>> w.startElement("test", {'bar': 'baz'})
>>> w.characters("Foo")
>>> w.endElement("test")
>>> w.endDocument()
>>> print out.getvalue()
<?xml version="1.0" encoding="utf-8"?>
<test bar="baz">Foo</test>

Tentativo http://uche.ogbuji.net/tech/4suite/amara.È abbastanza completo e dispone di una serie di strumenti di accesso semplici.Supporto Unicode normale, ecc.

#
#Output the XML entry
#
def genFileOLD(out,label,term,idval):
    filename=entryTime() + ".html"
    writer=MarkupWriter(out, indent=u"yes")
    writer.startDocument()
    #Test element and attribute writing
    ans=namespace=u'http://www.w3.org/2005/Atom'
    xns=namespace=u'http://www.w3.org/1999/xhtml'
    writer.startElement(u'entry',
       ans,
       extraNss={u'x':u'http://www.w3.org/1999/xhtml' ,
                 u'dc':u'http://purl.org/dc/elements/1.1'})
    #u'a':u'http://www.w3.org/2005/Atom',
    #writer.attribute(u'xml:lang',unicode("en-UK"))

    writer.simpleElement(u'title',ans,content=unicode(label))
    #writer.simpleElement(u'a:subtitle',ans,content=u' ')
    id=unicode("http://www.dpawson.co.uk/nodesets/"+afn.split(".")[0])
    writer.simpleElement(u'id',ans,content=id)
    writer.simpleElement(u'updated',ans,content=unicode(dtime()))
    writer.startElement(u'author',ans)
    writer.simpleElement(u'name',ans,content=u'Dave ')
    writer.simpleElement(u'uri',ans,
      content=u'http://www.dpawson.co.uk/nodesets/'+afn+".xml")
    writer.endElement(u'author')
    writer.startElement(u'category', ans)
    if (prompt):
        label=unicode(raw_input("Enter label "))
    writer.attribute(u'label',unicode(label))
    if (prompt):
        term = unicode(raw_input("Enter term to use "))
    writer.attribute(u'term', unicode(term))
    writer.endElement(u'category')
    writer.simpleElement(u'rights',ans,content=u'\u00A9 Dave 2005-2008')
    writer.startElement(u'link',ans)
    writer.attribute(u'href',
         unicode("http://www.dpawson.co.uk/nodesets/entries/"+afn+".html"))
    writer.attribute(u'rel',unicode("alternate"))
    writer.endElement(u'link')
    writer.startElement(u'published', ans)
    dt=dtime()
    dtu=unicode(dt)
    writer.text(dtu)
    writer.endElement(u'published')
    writer.simpleElement(u'summary',ans,content=unicode(label))
    writer.startElement(u'content',ans)
    writer.attribute(u'type',unicode("xhtml"))
    writer.startElement(u'div',xns)
    writer.simpleElement(u'h3',xns,content=unicode(label))
    writer.endElement(u'div')
    writer.endElement(u'content')
    writer.endElement(u'entry')

Autorizzato sotto: CC-BY-SA insieme a attribuzione

Non affiliato a StackOverflow