Parse XML escaped in CDATA mixed with invalid HTML

Question 1

This is my current solution. You give it an XPath expression selecting the nodes that are messed up, and a set of names of elements whose content might include messed-up HTML and other problems. It works roughly as follows:

1. Pull out the text content of the nodes matched by the XPath expression.
2. Run a regex to wrap the content of the problematic child elements in CDATA.
3. Wrap the text in a temporary element (otherwise parsing crashes if there are multiple root nodes).
4. Parse the text back into a DOM.
5. Add the child nodes of the temporary node back in place of the previous text content.

The regex solution in step 2 is probably not fool-proof, but I don't really see a better solution at the moment. If you do, let me know!

CDataFixer

import java.util.*;
import java.util.regex.*;
import javax.xml.xpath.*;
import org.w3c.dom.*;

/**
 * Repairs nodes whose text content is really escaped XML mixed with markup
 * that is not well-formed (for example sloppy HTML).
 *
 * <p>For every node selected by an XPath expression the text content is taken,
 * the content of each "excluded" element is wrapped in a CDATA section so the
 * parser will not choke on it, and the result is parsed and put back into the
 * node as real child nodes.
 */
public class CDataFixer
{
    /** Terminator of a CDATA section; must never appear inside one. */
    private static final String CDATA_END = "]]>";

    /** Closes the current CDATA section after "]]" and reopens a new one before ">". */
    private static final String CDATA_END_SPLIT = "]]]]><![CDATA[>";

    private final XmlHelper xml = XmlHelper.getInstance();

    /**
     * Returns a fixed copy of the given document; the document passed in is not modified.
     *
     * @param document   the document containing the broken nodes
     * @param nodesToFix XPath expression selecting the nodes whose text content should be re-parsed
     * @param excludes   names of elements whose content should be wrapped in CDATA instead of parsed;
     *                   {@code null} is treated like an empty set
     * @return a copy of {@code document} with the selected nodes repaired
     * @throws XPathExpressionException if {@code nodesToFix} cannot be compiled or evaluated
     * @throws XmlException             if the text of a selected node still cannot be parsed
     */
    public Document fix(Document document, String nodesToFix, Set<String> excludes) throws XPathExpressionException, XmlException
    {
        return fix(document, xml.newXPath().compile(nodesToFix), excludes);
    }

    /** Same as the public overload, but with an already compiled XPath expression. */
    private Document fix(Document document, XPathExpression nodesToFix, Set<String> excludes) throws XPathExpressionException, XmlException
    {
        // Work on a copy so the caller's document stays untouched.
        Document wc = xml.copy(document);

        NodeList nodes = (NodeList) nodesToFix.evaluate(wc, XPathConstants.NODESET);
        int nodeCount = nodes.getLength();

        for (int n = 0; n < nodeCount; n++)
        {
            parse(nodes.item(n), excludes);
        }

        return wc;
    }

    /**
     * Replaces the text content of {@code node} with the nodes obtained by parsing that text.
     * The node is only modified after the text has been parsed successfully.
     */
    private void parse(Node node, Set<String> excludes) throws XmlException
    {
        String text = node.getTextContent();
        if (text == null)
        {
            // getTextContent() is defined to return null for some node types; nothing to parse then.
            return;
        }

        if (excludes != null)
        {
            for (String exclude : excludes)
            {
                text = wrapInCData(text, exclude);
            }
        }

        // A temporary root is required because the text may contain several top-level nodes.
        String randomNode = "tmp_" + UUID.randomUUID();
        String wrapped = "<" + randomNode + ">" + text + "</" + randomNode + ">";

        NodeList parsed = xml
            .parse(wrapped)
            .getDocumentElement()
            .getChildNodes();

        Document owner = node.getOwnerDocument();
        node.setTextContent(null);
        // importNode copies, so the parsed list is not modified while it is being iterated.
        for (int n = 0; n < parsed.getLength(); n++)
        {
            node.appendChild(owner.importNode(parsed.item(n), true));
        }
    }

    /**
     * Wraps the content of every {@code <elementName ...>...</elementName>} occurrence in a CDATA section.
     *
     * <p>Self-closing tags ({@code <elementName/>}) are not treated as opening tags, and a
     * {@code ]]>} sequence inside the content is split over two CDATA sections so the
     * result stays well-formed.
     */
    private static String wrapInCData(String text, String elementName)
    {
        String quotedName = Pattern.quote(elementName);
        // (?<!/) rejects "<name/>": it has no content and must not be paired with a later "</name>".
        Pattern element = Pattern.compile(
            "(?s)(<" + quotedName + "\\b[^>]*(?<!/)>)(.*?)(</" + quotedName + ">)");

        Matcher matcher = element.matcher(text);
        StringBuffer fixed = new StringBuffer();
        while (matcher.find())
        {
            String content = matcher.group(2).replace(CDATA_END, CDATA_END_SPLIT);
            String replacement = matcher.group(1) + "<![CDATA[" + content + "]]>" + matcher.group(3);
            // The replacement is literal text; '$' and '\' in it must not be interpreted.
            matcher.appendReplacement(fixed, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(fixed);
        return fixed.toString();
    }
}

XmlHelper

import java.io.*;    
import javax.xml.parsers.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.sax.*;
import javax.xml.transform.stream.*;
import javax.xml.xpath.*;    
import org.w3c.dom.*;
import org.xml.sax.*;

/**
 * Singleton bundling the JAXP factories used for parsing, XPath evaluation
 * and (identity) transformations.
 *
 * <p>NOTE(review): the {@link DocumentBuilderFactory} is only configured to be
 * namespace aware; no restrictions on DTDs or external entities are set here.
 * Confirm that all parsed input is trusted.
 */
public final class XmlHelper
{
    private static final XmlHelper instance = new XmlHelper();

    private final SAXTransformerFactory transformerFactory;
    private final DocumentBuilderFactory documentBuilderFactory;
    private final XPathFactory xpathFactory;

    /**
     * Creates the shared factories.
     *
     * @throws RuntimeException if the platform's TransformerFactory does not support SAX
     */
    private XmlHelper()
    {
        DocumentBuilderFactory builders = DocumentBuilderFactory.newInstance();
        builders.setNamespaceAware(true);
        documentBuilderFactory = builders;

        xpathFactory = XPathFactory.newInstance();

        TransformerFactory candidate = TransformerFactory.newInstance();
        boolean saxCapable = candidate.getFeature(SAXTransformerFactory.FEATURE);
        if (!saxCapable)
        {
            throw new RuntimeException("Failed to create SAX-compatible TransformerFactory.");
        }
        transformerFactory = (SAXTransformerFactory) candidate;
    }

    /** Returns the single shared instance. */
    public static XmlHelper getInstance()
    {
        return instance;
    }

    /**
     * Creates a new namespace-aware DocumentBuilder.
     *
     * @throws RuntimeException wrapping the ParserConfigurationException if no builder can be created
     */
    public DocumentBuilder newDocumentBuilder()
    {
        DocumentBuilder builder;
        try
        {
            builder = documentBuilderFactory.newDocumentBuilder();
        }
        catch (ParserConfigurationException e)
        {
            throw new RuntimeException("Failed to create new "+DocumentBuilder.class, e);
        }
        return builder;
    }

    /** Creates a new XPath object from the shared XPathFactory. */
    public XPath newXPath()
    {
        XPath xpath = xpathFactory.newXPath();
        return xpath;
    }

    /**
     * Creates a transformer that copies its source to its result unchanged.
     *
     * @param omitXmlDeclaration whether the XML declaration is left out of the output
     * @param indent             whether the output is indented
     * @throws RuntimeException wrapping the TransformerConfigurationException on failure
     */
    public Transformer newIdentityTransformer(boolean omitXmlDeclaration, boolean indent)
    {
        Transformer identity;
        try
        {
            identity = transformerFactory.newTransformer();
        }
        catch (TransformerConfigurationException e)
        {
            throw new RuntimeException("Failed to create Transformer instance: "+e.getMessage(), e);
        }
        identity.setOutputProperty(OutputKeys.INDENT, yesNo(indent));
        identity.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, yesNo(omitXmlDeclaration));
        return identity;
    }

    /**
     * Compiles an XSLT stylesheet given as a string.
     *
     * @throws XmlException     if the stylesheet text cannot be parsed as XML
     * @throws RuntimeException wrapping the TransformerConfigurationException if compilation fails
     */
    public Templates newTemplates(String xslt) throws XmlException
    {
        try
        {
            Document stylesheet = parse(xslt);
            return transformerFactory.newTemplates(new DOMSource(stylesheet));
        }
        catch (TransformerConfigurationException e)
        {
            throw new RuntimeException("Failed to create templates: "+e.getMessage(), e);
        }
    }

    /**
     * Parses XML held in a string.
     *
     * @throws XmlException if the text cannot be read or parsed
     */
    public Document parse(String xml) throws XmlException
    {
        StringReader reader = new StringReader(xml);
        return parse(new InputSource(reader));
    }

    /**
     * Parses XML from the given input source.
     *
     * @throws XmlException if the source cannot be read or parsed
     */
    public Document parse(InputSource xml) throws XmlException
    {
        DocumentBuilder builder = newDocumentBuilder();
        try
        {
            return builder.parse(xml);
        }
        catch (SAXException e)
        {
            throw new XmlException("Failed to parse xml: "+e.getMessage(), e);
        }
        catch (IOException e)
        {
            throw new XmlException("Failed to read xml: "+e.getMessage(), e);
        }
    }

    /** Serializes a node without XML declaration and without indentation. */
    public String toString(Node node)
    {
        return toString(node, true, false);
    }

    /**
     * Serializes a node to a string.
     *
     * @param omitXMLDeclaration whether the XML declaration is left out
     * @param indent             whether the output is indented
     * @throws RuntimeException wrapping the TransformerException on failure
     */
    public String toString(Node node, boolean omitXMLDeclaration, boolean indent)
    {
        StringWriter out = new StringWriter();
        try
        {
            Transformer serializer = newIdentityTransformer(omitXMLDeclaration, indent);
            serializer.transform(new DOMSource(node), new StreamResult(out));
        }
        catch (TransformerException e)
        {
            throw new RuntimeException("Failed to transform XML into string: " + e.getMessage(), e);
        }
        return out.toString();
    }

    /**
     * Copies a document by running it through an identity transformation into a new DOM.
     *
     * @throws RuntimeException wrapping the TransformerException on failure
     */
    public Document copy(Document document)
    {
        DOMResult target = new DOMResult();
        try
        {
            Transformer copier = newIdentityTransformer(true, false);
            copier.transform(new DOMSource(document), target);
            return (Document) target.getNode();
        }
        catch (TransformerException e)
        {
            throw new RuntimeException("Failed to copy XML: " + e.getMessage(), e);
        }
    }

    /** Maps a flag to the "yes"/"no" values expected by transformer output properties. */
    private static String yesNo(boolean flag)
    {
        if (flag)
        {
            return "yes";
        }
        return "no";
    }
}

Question 2

This is actually lifted from the project I'm working on right now.

    /**
     * Parses an XML string and returns the root element of the resulting document.
     *
     * @param textContent the XML text to parse; may be {@code null}
     * @return the document element, or {@code null} if {@code textContent} is
     *         {@code null} or could not be parsed (the failure is logged)
     */
    private Node stringToNode(String textContent) {
        if (textContent == null) {
            return null;
        }
        try {
            // Parse from a character stream: the string is already decoded, so going through
            // getBytes() with the platform default charset could corrupt non-ASCII content.
            return DocumentBuilderFactory.newInstance().newDocumentBuilder()
                    .parse(new org.xml.sax.InputSource(new java.io.StringReader(textContent)))
                    .getDocumentElement();
        } catch (SAXException e) {
            logger.error(e.getMessage(), e);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } catch (ParserConfigurationException e) {
            logger.error(e.getMessage(), e);
        }
        return null;
    }

This will give you the root element of the document parsed from the string (or null if parsing fails). I use this to get it back into the original document:

if (textContent.contains(XML_HEADER)) {
  textContent = textContent.substring(textContent.indexOf(XML_HEADER) + XML_HEADER.length());
}
Node newNode = stringToNode(textContent);
if (newNode != null) {
  Node importedNode = soapBody.getOwnerDocument().importNode(newNode, true);
  nextChild.setTextContent(null);
  nextChild.appendChild(importedNode);
}