This is my current solution. You give it an XPath for the nodes that are messed up and a set of element names whose content might include messed-up HTML and other problems. It works roughly as follows:
- Pull out text content of nodes matched by XPATH
- Run regex to wrap problematic child elements in CDATA
- Wrap text in temporary element (otherwise it crashes if there are multiple root nodes)
- Parse text back to DOM
- Add child nodes of temporary node back in place of previous text content.
The regex solution in step 2 is probably not foolproof, but I don't really see a better solution at the moment. If you do, let me know!
CDataFixer
import java.util.*;
import java.util.regex.*;
import javax.xml.xpath.*;
import org.w3c.dom.*;
/**
 * Repairs nodes whose text content is really unparsed markup.
 *
 * <p>For every node selected by an XPath expression the text content is taken,
 * the bodies of the "excluded" elements are wrapped in CDATA sections, and the
 * result is parsed back into DOM nodes that replace the original text.
 *
 * <p>The element matching is regex-based and therefore approximate: nested
 * elements of the same name and self-closing tags are not handled specially.
 */
public class CDataFixer {
    /** Terminator of a CDATA section; must never appear inside one verbatim. */
    private static final String CDATA_END = "]]>";

    /**
     * Replacement for {@link #CDATA_END} inside wrapped content: closes the
     * current section after "]]" and opens a new one before ">", so the
     * parsed character data is unchanged.
     */
    private static final String CDATA_END_SPLIT = "]]]]><![CDATA[>";

    private final XmlHelper xml = XmlHelper.getInstance();

    /**
     * Returns a fixed copy of {@code document}; the input document itself is not modified.
     *
     * @param document   the document to repair
     * @param nodesToFix XPath expression selecting the nodes whose text content is markup
     * @param excludes   names of elements whose content is wrapped in CDATA instead of parsed
     * @return a copy of {@code document} with the selected nodes re-parsed
     * @throws XPathExpressionException if {@code nodesToFix} cannot be compiled or evaluated
     * @throws XmlException             if the text of a selected node cannot be parsed
     */
    public Document fix(Document document, String nodesToFix, Set<String> excludes)
            throws XPathExpressionException, XmlException {
        return fix(document, xml.newXPath().compile(nodesToFix), excludes);
    }

    /** Same as the public overload, for an already compiled XPath expression. */
    private Document fix(Document document, XPathExpression nodesToFix, Set<String> excludes)
            throws XPathExpressionException, XmlException {
        Document wc = xml.copy(document);
        NodeList nodes = (NodeList) nodesToFix.evaluate(wc, XPathConstants.NODESET);
        int nodeCount = nodes.getLength();
        if (nodeCount == 0) {
            return wc;
        }

        // The patterns depend only on the exclude names, so build them once
        // rather than once per selected node.
        List<Pattern> excludePatterns = compileExcludePatterns(excludes);
        for (int n = 0; n < nodeCount; n++) {
            parse(nodes.item(n), excludePatterns);
        }
        return wc;
    }

    /** Builds one "open tag, lazy body, close tag" pattern per excluded element name. */
    private static List<Pattern> compileExcludePatterns(Set<String> excludes) {
        List<Pattern> patterns = new ArrayList<Pattern>(excludes.size());
        for (String exclude : excludes) {
            String regex = String.format("(?s)(<%1$s\\b[^>]*>)(.*?)(</%1$s>)", Pattern.quote(exclude));
            patterns.add(Pattern.compile(regex));
        }
        return patterns;
    }

    /** Replaces the text content of {@code node} with the nodes obtained by parsing that text. */
    private void parse(Node node, List<Pattern> excludePatterns) throws XmlException {
        String text = node.getTextContent();
        for (Pattern excludePattern : excludePatterns) {
            text = wrapInCData(text, excludePattern);
        }

        // The text may contain several top-level nodes, so give the parser a
        // single root. The UUID-based name is presumably there to avoid
        // clashing with any element name that occurs in the text itself.
        String randomNode = "tmp_" + UUID.randomUUID().toString();
        text = String.format("<%1$s>%2$s</%1$s>", randomNode, text);

        // Parse before touching the node: if parsing fails, the node keeps its text.
        NodeList parsed = xml.parse(text).getDocumentElement().getChildNodes();

        node.setTextContent(null);
        Document owner = node.getOwnerDocument();
        for (int n = 0; n < parsed.getLength(); n++) {
            // importNode copies, so the list being iterated is not altered.
            node.appendChild(owner.importNode(parsed.item(n), true));
        }
    }

    /**
     * Wraps the body of every element matched by {@code excludePattern} in a
     * CDATA section, splitting any "]]>" in the body so that it cannot end
     * the section prematurely.
     */
    private static String wrapInCData(String text, Pattern excludePattern) {
        Matcher matcher = excludePattern.matcher(text);
        StringBuffer out = new StringBuffer(text.length() + 32);
        while (matcher.find()) {
            String body = matcher.group(2).replace(CDATA_END, CDATA_END_SPLIT);
            String wrapped = matcher.group(1) + "<![CDATA[" + body + "]]>" + matcher.group(3);
            // quoteReplacement: '$' and '\' in the markup must be taken literally.
            matcher.appendReplacement(out, Matcher.quoteReplacement(wrapped));
        }
        matcher.appendTail(out);
        return out.toString();
    }
}
XmlHelper
import java.io.*;
import javax.xml.parsers.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.sax.*;
import javax.xml.transform.stream.*;
import javax.xml.xpath.*;
import org.w3c.dom.*;
import org.xml.sax.*;
/**
 * Singleton bundling the JAXP factories used for parsing, XPath evaluation,
 * serialisation and copying of DOM documents.
 *
 * <p>NOTE(review): apart from namespace awareness the DocumentBuilderFactory is
 * left at its defaults; no DTD or external-entity restrictions are configured
 * here. Confirm that is acceptable before parsing untrusted input.
 */
public final class XmlHelper {
    private static final XmlHelper instance = new XmlHelper();

    private final SAXTransformerFactory transformerFactory;
    private final DocumentBuilderFactory documentBuilderFactory;
    private final XPathFactory xpathFactory;

    /** Creates the three factories; fails if the TransformerFactory lacks SAX support. */
    private XmlHelper() {
        DocumentBuilderFactory builders = DocumentBuilderFactory.newInstance();
        builders.setNamespaceAware(true);
        documentBuilderFactory = builders;

        xpathFactory = XPathFactory.newInstance();

        TransformerFactory candidate = TransformerFactory.newInstance();
        boolean saxCapable = candidate.getFeature(SAXTransformerFactory.FEATURE);
        if (!saxCapable) {
            throw new RuntimeException("Failed to create SAX-compatible TransformerFactory.");
        }
        transformerFactory = (SAXTransformerFactory) candidate;
    }

    /** Returns the shared instance. */
    public static XmlHelper getInstance() {
        return instance;
    }

    /** Maps a flag to the "yes"/"no" value expected by transformer output properties. */
    private static String yesOrNo(boolean flag) {
        if (flag) {
            return "yes";
        }
        return "no";
    }

    /**
     * Creates a namespace-aware DocumentBuilder.
     *
     * @throws RuntimeException if the factory cannot create a builder
     */
    public DocumentBuilder newDocumentBuilder() {
        try {
            return documentBuilderFactory.newDocumentBuilder();
        } catch (ParserConfigurationException cause) {
            throw new RuntimeException("Failed to create new " + DocumentBuilder.class, cause);
        }
    }

    /** Creates a fresh XPath object from the shared factory. */
    public XPath newXPath() {
        return xpathFactory.newXPath();
    }

    /**
     * Creates a stylesheet-less transformer with the given output settings.
     *
     * @param omitXmlDeclaration whether the XML declaration is left out of the output
     * @param indent             whether the output is indented
     * @throws RuntimeException if the transformer cannot be created
     */
    public Transformer newIdentityTransformer(boolean omitXmlDeclaration, boolean indent) {
        final Transformer identity;
        try {
            identity = transformerFactory.newTransformer();
        } catch (TransformerConfigurationException cause) {
            throw new RuntimeException("Failed to create Transformer instance: " + cause.getMessage(), cause);
        }
        identity.setOutputProperty(OutputKeys.INDENT, yesOrNo(indent));
        identity.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, yesOrNo(omitXmlDeclaration));
        return identity;
    }

    /**
     * Parses {@code xslt} and compiles it into reusable Templates.
     *
     * @throws XmlException     if the stylesheet text cannot be parsed
     * @throws RuntimeException if the parsed stylesheet cannot be compiled
     */
    public Templates newTemplates(String xslt) throws XmlException {
        try {
            Document stylesheet = parse(xslt);
            DOMSource stylesheetSource = new DOMSource(stylesheet);
            return transformerFactory.newTemplates(stylesheetSource);
        } catch (TransformerConfigurationException cause) {
            throw new RuntimeException("Failed to create templates: " + cause.getMessage(), cause);
        }
    }

    /** Parses XML held in a string; see {@link #parse(InputSource)}. */
    public Document parse(String xml) throws XmlException {
        StringReader reader = new StringReader(xml);
        return parse(new InputSource(reader));
    }

    /**
     * Parses the given source into a DOM document.
     *
     * @throws XmlException if the input cannot be read or is rejected by the parser
     */
    public Document parse(InputSource xml) throws XmlException {
        try {
            DocumentBuilder builder = newDocumentBuilder();
            return builder.parse(xml);
        } catch (IOException cause) {
            throw new XmlException("Failed to read xml: " + cause.getMessage(), cause);
        } catch (SAXException cause) {
            throw new XmlException("Failed to parse xml: " + cause.getMessage(), cause);
        }
    }

    /** Serialises {@code node} without an XML declaration and without indentation. */
    public String toString(Node node) {
        return toString(node, true, false);
    }

    /**
     * Serialises {@code node} to a string.
     *
     * @param omitXMLDeclaration whether the XML declaration is left out
     * @param indent             whether the output is indented
     * @throws RuntimeException if the transformation fails
     */
    public String toString(Node node, boolean omitXMLDeclaration, boolean indent) {
        StringWriter buffer = new StringWriter();
        Transformer serializer = newIdentityTransformer(omitXMLDeclaration, indent);
        try {
            serializer.transform(new DOMSource(node), new StreamResult(buffer));
        } catch (TransformerException cause) {
            throw new RuntimeException("Failed to transform XML into string: " + cause.getMessage(), cause);
        }
        return buffer.toString();
    }

    /**
     * Produces a new Document by running {@code document} through a
     * stylesheet-less transformer into a DOM result.
     *
     * @throws RuntimeException if the transformation fails
     */
    public Document copy(Document document) {
        DOMResult target = new DOMResult();
        Transformer copier = newIdentityTransformer(true, false);
        try {
            copier.transform(new DOMSource(document), target);
        } catch (TransformerException cause) {
            throw new RuntimeException("Failed to copy XML: " + cause.getMessage(), cause);
        }
        return (Document) target.getNode();
    }
}