Use lxml:
# Sample markup used as parser input.
# Things to notice about it:
#   * attribute values are unquoted (e.g. concordance=brown), so this is not
#     well-formed XML;
#   * the "...." lines are literal text inside the string (presumably
#     placeholders for omitted content -- confirm against the real data);
#   * there is a closing </s> tag with no matching <s> open tag in this excerpt.
mytext="""<contextfile concordance=brown>
<context filename=br-a01 paras=yes>
<p pnum=1>
....
<wf cmd=done pos=VB lemma=say wnsn=1 lexsn=2:32:00::>said</wf>
<wf cmd=done pos=NN lemma=friday wnsn=1 lexsn=1:28:00::>Friday</wf>
<wf cmd=ignore pos=DT>an</wf>
....
....
<punc>``</punc>
<wf cmd=ignore pos=DT>no</wf>
<wf cmd=done pos=NN lemma=evidence wnsn=1 lexsn=1:09:00::>evidence</wf>
<punc>''</punc>
....
<wf cmd=done pos=NN lemma=irregularity wnsn=1 lexsn=1:04:00::>irregularities</wf>
<punc>.</punc>
</s>
</p>
</context>
</contextfile>"""
from lxml import html
# Parse the sample with lxml's HTML parser; the markup has unquoted attribute
# values, which a strict XML parser would reject.
parsed = html.fromstring(mytext)

# Dump every element in document order: tag name, attributes, the text
# directly inside the element, and the tail text that follows its end tag.
# iter() replaces the deprecated getiterator(); the single-argument print(...)
# form runs unchanged on both Python 2 and Python 3.
for x in parsed.iter():
    print("%s %s %s %s" % (x.tag, x.attrib, x.text, x.tail))
output:
contextfile {'concordance': 'brown'} None None
context {'paras': 'yes', 'filename': 'br-a01'} None None
p {'pnum': '1'}
....
wf {'lemma': 'say', 'cmd': 'done', 'wnsn': '1', 'pos': 'VB', 'lexsn': '2:32:00::'} said None
wf {'lemma': 'friday', 'cmd': 'done', 'wnsn': '1', 'pos': 'NN', 'lexsn': '1:28:00::'} Friday None
wf {'cmd': 'ignore', 'pos': 'DT'} an
....
....
punc {} `` None
wf {'cmd': 'ignore', 'pos': 'DT'} no None
wf {'lemma': 'evidence', 'cmd': 'done', 'wnsn': '1', 'pos': 'NN', 'lexsn': '1:09:00::'} evidence None
punc {} ''
....
wf {'lemma': 'irregularity', 'cmd': 'done', 'wnsn': '1', 'pos': 'NN', 'lexsn': '1:04:00::'} irregularities None
punc {} . None