While fixing another issue, I found a relatively clean way of filtering out unwanted characters from the source of an NSXMLDocument. Pasting it here just in case someone encounters a similar issue:
@implementation NSXMLDocument (FilterIllegalCharacters)

/**
 * Initializes the receiver from XML data after stripping unwanted characters
 * that appear between double quotes (i.e. in double-quoted attribute values).
 *
 * The data is decoded as UTF-8 and scanned once. Every '"' toggles an
 * "inside quotes" state; while that state is active, each character that is a
 * member of illegalChars is marked for removal. The quote characters
 * themselves are never removed. The scan does not otherwise parse the XML, so
 * single-quoted attribute values are not filtered, and double quotes that
 * occur in text content are treated like attribute delimiters.
 *
 * If the data cannot be decoded as UTF-8, or nothing needs to be removed, the
 * original data is handed to the parser untouched.
 *
 * @param data         The raw XML source.
 * @param illegalChars Characters to drop from quoted sections. Passing nil
 *                     removes nothing.
 * @param error        On failure, receives the error reported by
 *                     -initWithData:options:error:.
 * @return The initialized document, or nil if parsing failed. The document is
 *         read with NSXMLNodeOptionsNone.
 */
- (NSXMLDocument *)initWithDataAndIgnoreIllegalCharacters:(NSData *)data illegalChars:(NSCharacterSet *)illegalChars error:(NSError **)error {
    // -- Decode the source. This yields nil for non-UTF-8 data, in which case
    //    the length below is 0 and the scan is skipped.
    NSMutableString *xml = [[NSMutableString alloc] initWithData:data encoding:NSUTF8StringEncoding];

    // -- Walk the XML, only caring about what sits between double quotes.
    NSMutableIndexSet *illegalPositions = [NSMutableIndexSet indexSet];
    BOOL insideQuotes = NO;
    NSUInteger length = xml.length;
    for (NSUInteger pos = 0; pos < length; ++pos) {
        unichar currentChar = [xml characterAtIndex:pos];
        if (currentChar == '"') {
            insideQuotes = !insideQuotes;
        } else if (insideQuotes && [illegalChars characterIsMember:currentChar]) {
            // -- Only record the position; the string is edited after the scan.
            [illegalPositions addIndex:pos];
        }
    }

    if (illegalPositions.count > 0) {
        // -- Delete back to front so earlier recorded positions stay valid.
        //    Adjacent positions are coalesced into a single range deletion.
        [illegalPositions enumerateRangesWithOptions:NSEnumerationReverse
                                          usingBlock:^(NSRange range, BOOL *stop) {
            [xml deleteCharactersInRange:range];
        }];
        // -- Re-encode the corrected source for the parser.
        data = [xml dataUsingEncoding:NSUTF8StringEncoding];
    }

    // -- Initialize the receiver itself instead of allocating a second document.
    return [self initWithData:data options:NSXMLNodeOptionsNone error:error];
}

@end
You can pass any character set you want. Note that the document is read with `NSXMLNodeOptionsNone`, i.e. without any parsing options; you may want to pass different options for your own purposes.
This only filters the contents of attribute strings, which is where my malformed string came from.