Advanced Parsing
While basic XML parsing handles simple documents, advanced parsing techniques are essential for complex scenarios involving large documents, custom processing requirements, performance optimization, and specialized data extraction needs.
Parsing Approaches Comparison
Approach | Memory Usage | Speed | Random Access | Best For |
---|---|---|---|---|
DOM | High | Slow | Yes | Small documents, frequent access |
SAX | Low | Fast | No | Large documents, sequential processing |
StAX | Low | Fast | Limited | Controlled streaming, partial parsing |
Custom | Variable | Variable | Depends | Specialized requirements |
Advanced SAX Parsing
Custom Content Handlers
/**
 * SAX handler that collects {@code Book} objects from a document shaped like
 * {@code <library><book id=".." category=".."><title/><author><first/><last/></author><price currency=".."/></book></library>}.
 *
 * <p>Element text is accumulated in a buffer (SAX may deliver it in several
 * {@code characters} calls) and applied when the element closes. The element
 * path is tracked so that only elements at the expected location are used.
 */
public class AdvancedBookHandler extends DefaultHandler {
    private Book currentBook;
    private final StringBuilder textBuffer;
    private final Stack<String> elementStack;
    private final List<Book> books;
    // Written whenever an author's last name is seen; nothing in this class reads it back.
    private final Map<String, Author> authorCache;

    public AdvancedBookHandler() {
        this.textBuffer = new StringBuilder();
        this.elementStack = new Stack<>();
        this.books = new ArrayList<>();
        this.authorCache = new HashMap<>();
    }

    /**
     * Pushes the element onto the path stack, discards any text buffered so far,
     * and reads the attributes of {@code book} and {@code price} elements.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) {
        elementStack.push(qName);
        textBuffer.setLength(0);
        switch (qName) {
            case "book":
                currentBook = new Book();
                currentBook.setId(attrs.getValue("id"));
                currentBook.setCategory(attrs.getValue("category"));
                break;
            case "price":
                String currency = attrs.getValue("currency");
                // A <price> outside any <book> has no book to attach to; ignore it
                // instead of failing with a NullPointerException.
                if (currency != null && currentBook != null) {
                    currentBook.setCurrency(currency);
                }
                break;
            default:
                break;
        }
    }

    /** Appends a chunk of character data to the buffer for the current element. */
    @Override
    public void characters(char[] chars, int start, int length) {
        textBuffer.append(chars, start, length);
    }

    /**
     * Applies the buffered text to the current book according to the full element
     * path, then pops the element from the path stack.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        // Stack iterates bottom-to-top, so this yields "root/.../current".
        String currentPath = String.join("/", elementStack);
        String text = textBuffer.toString().trim();
        switch (currentPath) {
            case "library/book/title":
                currentBook.setTitle(text);
                break;
            case "library/book/author/first":
                ensureAuthor().setFirstName(text);
                break;
            case "library/book/author/last":
                // <last> may appear before <first>, so the author cannot be assumed to exist yet.
                Author author = ensureAuthor();
                author.setLastName(text);
                authorCache.put(author.getFullName(), author);
                break;
            case "library/book/price":
                try {
                    currentBook.setPrice(Double.parseDouble(text));
                } catch (NumberFormatException e) {
                    // Unparseable price: fall back to 0.0 rather than aborting the parse.
                    currentBook.setPrice(0.0);
                }
                break;
            case "library/book":
                books.add(currentBook);
                currentBook = null;
                break;
            default:
                break;
        }
        elementStack.pop();
        textBuffer.setLength(0);
    }

    /** Reports a recoverable parse error on stderr and rethrows it to abort parsing. */
    @Override
    public void error(SAXParseException e) throws SAXException {
        System.err.println("Parse error: " + e.getMessage());
        throw e;
    }

    /** Returns the books collected so far (the handler's own list, not a copy). */
    public List<Book> getBooks() {
        return books;
    }

    /** Returns the current book's author, creating and attaching one if absent. */
    private Author ensureAuthor() {
        if (currentBook.getAuthor() == null) {
            currentBook.setAuthor(new Author());
        }
        return currentBook.getAuthor();
    }
}
Filtered SAX Parsing
/**
 * SAX front-end that forwards only the elements accepted by two predicates to a
 * delegate {@link ContentHandler}. A rejected element is dropped together with
 * its entire subtree; parsing resumes normally with its next sibling.
 */
public class FilteredSAXParser {
    private final Predicate<String> elementFilter;
    private final Predicate<Map<String, String>> attributeFilter;

    /**
     * @param elementFilter   tested against each element's qualified name
     * @param attributeFilter tested against a qName-to-value map of the element's attributes
     */
    public FilteredSAXParser(Predicate<String> elementFilter,
                             Predicate<Map<String, String>> attributeFilter) {
        this.elementFilter = elementFilter;
        this.attributeFilter = attributeFilter;
    }

    /**
     * Parses {@code xml} and forwards start/end/character events of accepted
     * elements to {@code handler}.
     *
     * @throws Exception if the parser cannot be created or the document cannot be parsed
     */
    public void parse(InputStream xml, ContentHandler handler) throws Exception {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser parser = factory.newSAXParser();
        XMLReader reader = parser.getXMLReader();
        reader.setContentHandler(new FilteringHandler(handler));
        reader.parse(new InputSource(xml));
    }

    /** Handler that suppresses rejected subtrees and forwards everything else. */
    private class FilteringHandler extends DefaultHandler {
        private static final int NOT_SKIPPING = -1;

        private final ContentHandler delegate;
        // Number of currently open elements.
        private int depth = 0;
        // Depth of the rejected element whose subtree is being skipped, or NOT_SKIPPING.
        private int skipRootDepth = NOT_SKIPPING;

        public FilteringHandler(ContentHandler delegate) {
            this.delegate = delegate;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs)
                throws SAXException {
            depth++;
            if (skipRootDepth != NOT_SKIPPING) {
                return; // inside a rejected subtree
            }
            if (shouldProcessElement(qName, attrs)) {
                delegate.startElement(uri, localName, qName, attrs);
            } else {
                // Remember where the rejected subtree starts so the matching
                // end tag (and only that one) switches forwarding back on.
                skipRootDepth = depth;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (skipRootDepth == NOT_SKIPPING) {
                delegate.endElement(uri, localName, qName);
            } else if (depth == skipRootDepth) {
                skipRootDepth = NOT_SKIPPING; // closing tag of the rejected element itself
            }
            depth--;
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Text belongs to the enclosing element, so it is forwarded only when
            // that element was forwarded.
            if (skipRootDepth == NOT_SKIPPING) {
                delegate.characters(ch, start, length);
            }
        }

        /** Returns true when both the name filter and the attribute filter accept the element. */
        private boolean shouldProcessElement(String qName, Attributes attrs) {
            if (!elementFilter.test(qName)) {
                return false;
            }
            Map<String, String> attrMap = new HashMap<>();
            for (int i = 0; i < attrs.getLength(); i++) {
                attrMap.put(attrs.getQName(i), attrs.getValue(i));
            }
            return attributeFilter.test(attrMap);
        }
    }
}
Advanced StAX Parsing
Pull Parsing with State Machine
/**
 * StAX pull parser that builds {@code Book} objects using an explicit state machine.
 *
 * <p>All mutable parse state lives in a {@link ParseContext} that is passed to the
 * event helpers. Java passes object references by value, so assigning to a
 * {@code Book}/{@code Author} parameter inside a helper would never be visible
 * to the parsing loop; mutating a shared context object is.
 */
public class StateMachineStAXParser {
    enum ParsingState {
        START, IN_LIBRARY, IN_BOOK, IN_TITLE, IN_AUTHOR, IN_PRICE, END
    }

    /** Mutable state shared between the parsing loop and the event helpers. */
    private static final class ParseContext {
        final List<Book> books = new ArrayList<>();
        // Text of the element currently being read; CHARACTERS may arrive in several chunks.
        final StringBuilder text = new StringBuilder();
        Book currentBook;
        Author currentAuthor;
        ParsingState state = ParsingState.START;
    }

    /**
     * Reads every {@code <book>} in the stream.
     *
     * @param xml document to parse
     * @return the books in document order
     * @throws XMLStreamException if the document is malformed or a price is not a number
     */
    public List<Book> parseBooks(InputStream xml) throws XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLStreamReader reader = factory.createXMLStreamReader(xml);
        ParseContext ctx = new ParseContext();
        try {
            while (reader.hasNext()) {
                int event = reader.next();
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        handleStartElement(reader, ctx);
                        break;
                    case XMLStreamConstants.CHARACTERS:
                    case XMLStreamConstants.CDATA:
                        handleCharacters(reader, ctx);
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        handleEndElement(reader, ctx);
                        break;
                    default:
                        break;
                }
            }
        } finally {
            reader.close();
        }
        return ctx.books;
    }

    /** Updates the state for an opening tag and reads the tag's attributes. */
    private void handleStartElement(XMLStreamReader reader, ParseContext ctx) {
        ctx.text.setLength(0);
        switch (reader.getLocalName()) {
            case "library":
                ctx.state = ParsingState.IN_LIBRARY;
                break;
            case "book":
                ctx.currentBook = new Book();
                ctx.currentBook.setId(reader.getAttributeValue(null, "id"));
                ctx.currentBook.setCategory(reader.getAttributeValue(null, "category"));
                ctx.state = ParsingState.IN_BOOK;
                break;
            case "title":
                ctx.state = ParsingState.IN_TITLE;
                break;
            case "author":
                ctx.currentAuthor = new Author();
                ctx.state = ParsingState.IN_AUTHOR;
                break;
            case "price":
                String currency = reader.getAttributeValue(null, "currency");
                if (ctx.currentBook != null && currency != null) {
                    ctx.currentBook.setCurrency(currency);
                }
                ctx.state = ParsingState.IN_PRICE;
                break;
            default:
                // Unknown elements (including <first>/<last> inside <author>) keep the current state.
                break;
        }
    }

    /** Buffers text while inside an element whose content is used. */
    private void handleCharacters(XMLStreamReader reader, ParseContext ctx) {
        switch (ctx.state) {
            case IN_TITLE:
            case IN_AUTHOR:
            case IN_PRICE:
                ctx.text.append(reader.getText());
                break;
            default:
                break;
        }
    }

    /** Applies the buffered text for a closing tag and moves back to the enclosing state. */
    private void handleEndElement(XMLStreamReader reader, ParseContext ctx)
            throws XMLStreamException {
        String text = ctx.text.toString().trim();
        ctx.text.setLength(0);
        switch (reader.getLocalName()) {
            case "title":
                if (ctx.currentBook != null) {
                    ctx.currentBook.setTitle(text);
                }
                ctx.state = ParsingState.IN_BOOK;
                break;
            case "first":
                if (ctx.state == ParsingState.IN_AUTHOR && ctx.currentAuthor != null) {
                    ctx.currentAuthor.setFirstName(text);
                }
                break;
            case "last":
                if (ctx.state == ParsingState.IN_AUTHOR && ctx.currentAuthor != null) {
                    ctx.currentAuthor.setLastName(text);
                }
                break;
            case "author":
                if (ctx.currentBook != null && ctx.currentAuthor != null) {
                    ctx.currentBook.setAuthor(ctx.currentAuthor);
                }
                ctx.currentAuthor = null;
                ctx.state = ParsingState.IN_BOOK;
                break;
            case "price":
                if (ctx.currentBook != null && !text.isEmpty()) {
                    try {
                        ctx.currentBook.setPrice(Double.parseDouble(text));
                    } catch (NumberFormatException e) {
                        throw new XMLStreamException(
                                "Invalid price '" + text + "'", reader.getLocation(), e);
                    }
                }
                ctx.state = ParsingState.IN_BOOK;
                break;
            case "book":
                if (ctx.currentBook != null) {
                    ctx.books.add(ctx.currentBook);
                }
                ctx.currentBook = null;
                ctx.state = ParsingState.IN_LIBRARY;
                break;
            case "library":
                ctx.state = ParsingState.END;
                break;
            default:
                break;
        }
    }
}
Streaming with Custom Filters
/**
 * {@link XMLStreamReader} wrapper that hides selected elements, together with
 * their whole subtree, from the consumer.
 *
 * <p>Extends the JDK's {@code StreamReaderDelegate}, which already forwards every
 * {@code XMLStreamReader} method to the wrapped reader, so only {@link #next()}
 * has to be overridden. Only {@code next()} applies the filter; inherited
 * convenience methods such as {@code nextTag()} go straight to the wrapped reader.
 */
public class StreamingXMLFilter extends javax.xml.stream.util.StreamReaderDelegate {
    private final Set<String> elementsToSkip;
    // Number of currently open elements inside the subtree being skipped; 0 when not skipping.
    private int skipDepth = 0;

    /**
     * @param delegate       reader to wrap
     * @param elementsToSkip local names of the elements to hide
     */
    public StreamingXMLFilter(XMLStreamReader delegate, Set<String> elementsToSkip) {
        super(delegate);
        this.elementsToSkip = elementsToSkip;
    }

    /** Returns the next event that is not part of a skipped subtree. */
    @Override
    public int next() throws XMLStreamException {
        int event = super.next();
        while (shouldSkip(event)) {
            event = super.next();
        }
        return event;
    }

    /**
     * Decides whether {@code event} belongs to a skipped subtree and keeps
     * {@code skipDepth} up to date.
     */
    private boolean shouldSkip(int event) {
        if (skipDepth > 0) {
            // Already inside a skipped subtree: count nesting so that only the end
            // tag matching the skipped element ends the skip. That end tag is
            // itself swallowed, hence the unconditional true.
            if (event == XMLStreamConstants.START_ELEMENT) {
                skipDepth++;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                skipDepth--;
            }
            return true;
        }
        if (event == XMLStreamConstants.START_ELEMENT
                && elementsToSkip.contains(getLocalName())) {
            skipDepth = 1;
            return true;
        }
        return false;
    }
}
Memory-Efficient Parsing
Chunked Processing
/**
 * Streams {@code <book>} elements out of a large document and hands them to a
 * callback in batches, so that at most one batch is built up at a time.
 */
public class ChunkedXMLProcessor {
    private static final int CHUNK_SIZE = 1000;

    /**
     * Parses {@code xml} and invokes {@code chunkProcessor} once per batch of up to
     * {@code CHUNK_SIZE} books, plus once for a final partial batch.
     *
     * <p>Each invocation receives its own list, so the callback may keep a
     * reference to it (for example to process it on another thread).
     *
     * @param xml            document to read
     * @param chunkProcessor callback receiving each batch; its return value is ignored
     * @throws XMLStreamException if the document is malformed
     */
    public void processLargeXML(InputStream xml, Function<List<Book>, Void> chunkProcessor)
            throws XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLStreamReader reader = factory.createXMLStreamReader(xml);
        try {
            List<Book> currentChunk = new ArrayList<>(CHUNK_SIZE);
            while (reader.hasNext()) {
                int event = reader.next();
                if (event == XMLStreamConstants.START_ELEMENT && "book".equals(reader.getLocalName())) {
                    currentChunk.add(parseBook(reader));
                    if (currentChunk.size() >= CHUNK_SIZE) {
                        chunkProcessor.apply(currentChunk);
                        // Start a fresh list instead of clearing the one just handed out.
                        // No System.gc(): the collector reclaims finished batches on its own.
                        currentChunk = new ArrayList<>(CHUNK_SIZE);
                    }
                }
            }
            // Process remaining books
            if (!currentChunk.isEmpty()) {
                chunkProcessor.apply(currentChunk);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Reads one book. The reader must be positioned on the {@code <book>} start
     * tag; on return it is positioned on the matching end tag.
     */
    private Book parseBook(XMLStreamReader reader) throws XMLStreamException {
        Book book = new Book();
        // Parse book attributes
        book.setId(reader.getAttributeValue(null, "id"));
        book.setCategory(reader.getAttributeValue(null, "category"));
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamConstants.END_ELEMENT && "book".equals(reader.getLocalName())) {
                break;
            }
            if (event == XMLStreamConstants.START_ELEMENT) {
                // getElementText() is only valid on text-only elements and throws on
                // elements with child elements, so call it per known leaf rather
                // than for every child.
                switch (reader.getLocalName()) {
                    case "title":
                        book.setTitle(reader.getElementText());
                        break;
                    case "price":
                        book.setPrice(Double.parseDouble(reader.getElementText().trim()));
                        break;
                    // ... other elements
                    default:
                        break;
                }
            }
        }
        return book;
    }
}
Lazy Loading with Suppliers
/**
 * Indexes an XML file once at construction time and defers the actual parsing of
 * each element type until it is requested through {@link #getElements(String)}.
 */
public class LazyXMLDocument {
    private final Path xmlFile;
    private final Map<String, Supplier<List<Element>>> lazyElements;

    /**
     * @param xmlFile file to index
     * @throws RuntimeException if the file cannot be read or is not well-formed
     */
    public LazyXMLDocument(Path xmlFile) {
        this.xmlFile = xmlFile;
        this.lazyElements = new HashMap<>();
        indexDocument();
    }

    /**
     * Scans the file once and registers, per element name, a supplier bound to
     * the character offset recorded just before the first occurrence of that name.
     */
    private void indexDocument() {
        try (InputStream is = Files.newInputStream(xmlFile)) {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            XMLStreamReader reader = factory.createXMLStreamReader(is);
            try {
                long position = 0;
                while (reader.hasNext()) {
                    int event = reader.next();
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        // Lambdas may only capture effectively-final locals, so take
                        // per-iteration copies of the name and offset.
                        final String elementName = reader.getLocalName();
                        final long elementPosition = position;
                        // Keep the first occurrence: a later element with the same
                        // name must not replace the earlier start offset.
                        lazyElements.putIfAbsent(elementName,
                                () -> parseElementsAt(elementPosition, elementName));
                    }
                    position = reader.getLocation().getCharacterOffset();
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to index XML document", e);
        }
    }

    /**
     * Returns the elements with the given name, parsing them on demand.
     *
     * @return the parsed elements, or an empty list if the name never occurs in the document
     */
    public List<Element> getElements(String elementName) {
        Supplier<List<Element>> supplier = lazyElements.get(elementName);
        return supplier != null ? supplier.get() : Collections.emptyList();
    }

    private List<Element> parseElementsAt(long position, String elementName) {
        // Implementation to parse elements starting at specific position
        // This would involve seeking to the position and parsing
        return new ArrayList<>();
    }
}
Custom Parser Implementation
Event-Driven Parser with Callbacks
/**
 * Event-driven parser: callers register callbacks per element name, and every
 * start tag with that local name is dispatched to them in registration order.
 */
public class CallbackXMLParser {
    private final Map<String, List<Consumer<XMLEvent>>> callbacks;

    public CallbackXMLParser() {
        this.callbacks = new HashMap<>();
    }

    /** Adds {@code callback} to the callbacks invoked for start tags named {@code elementName}. */
    public void registerCallback(String elementName, Consumer<XMLEvent> callback) {
        List<Consumer<XMLEvent>> registered =
                callbacks.computeIfAbsent(elementName, key -> new ArrayList<>());
        registered.add(callback);
    }

    /**
     * Reads the whole document and fires the registered callbacks for each
     * matching start element; all other events are ignored.
     *
     * @throws XMLStreamException if the document cannot be parsed
     */
    public void parse(InputStream xml) throws XMLStreamException {
        XMLEventReader events = XMLInputFactory.newInstance().createXMLEventReader(xml);
        while (events.hasNext()) {
            XMLEvent current = events.nextEvent();
            if (!current.isStartElement()) {
                continue;
            }
            String localName = current.asStartElement().getName().getLocalPart();
            List<Consumer<XMLEvent>> listeners = callbacks.get(localName);
            if (listeners == null) {
                continue;
            }
            for (Consumer<XMLEvent> listener : listeners) {
                listener.accept(current);
            }
        }
    }
}
// Usage example
CallbackXMLParser parser = new CallbackXMLParser();
parser.registerCallback("book", event -> {
    StartElement element = event.asStartElement();
    // getAttributeByName returns null when the attribute is absent, so a <book>
    // without an id must not be dereferenced blindly.
    QName idName = new QName("id");
    if (element.getAttributeByName(idName) != null) {
        System.out.println("Found book with ID: " +
                element.getAttributeByName(idName).getValue());
    } else {
        System.out.println("Found book without an ID attribute");
    }
});
parser.registerCallback("title", event -> {
    // Handle title elements
});
Async XML Processing
/**
 * Parses a document on one task and processes the resulting events on several
 * worker tasks, connected by an unbounded queue.
 *
 * <p>The feeder and the workers are submitted as independent tasks and combined
 * with {@code allOf}; no pool thread ever blocks waiting for another task, so
 * the pipeline also completes when the pool has fewer threads than tasks
 * (the feeder is submitted first and never waits on the workers).
 */
public class AsyncXMLProcessor {
    /** Number of event-processing tasks, and therefore of end-of-stream markers. */
    private static final int PROCESSOR_COUNT = 3;

    private final ExecutorService executor;
    private final BlockingQueue<XMLEvent> eventQueue;

    /** @param threadPoolSize number of threads in the fixed pool that runs all tasks */
    public AsyncXMLProcessor(int threadPoolSize) {
        this.executor = Executors.newFixedThreadPool(threadPoolSize);
        this.eventQueue = new LinkedBlockingQueue<>();
    }

    /**
     * Starts parsing and processing {@code xml}.
     *
     * @return a future that completes when the document has been read and every
     *         worker has drained its share of the queue; it completes
     *         exceptionally if parsing or processing fails
     */
    public CompletableFuture<Void> processAsync(InputStream xml) {
        CompletableFuture<?>[] tasks = new CompletableFuture<?>[PROCESSOR_COUNT + 1];
        // Submit the feeder first so it is never queued behind workers that are
        // blocked waiting for events.
        tasks[0] = CompletableFuture.runAsync(() -> {
            try {
                parseXMLAsync(xml);
            } catch (XMLStreamException e) {
                throw new RuntimeException(e);
            }
        }, executor);
        for (int i = 1; i <= PROCESSOR_COUNT; i++) {
            tasks[i] = CompletableFuture.runAsync(this::processEvents, executor);
        }
        return CompletableFuture.allOf(tasks);
    }

    /** Stops accepting new work; tasks that were already submitted still run to completion. */
    public void shutdown() {
        executor.shutdown();
    }

    /** Reads all events into the queue, then posts one end marker per worker. */
    private void parseXMLAsync(InputStream xml) throws XMLStreamException {
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            XMLEventReader reader = factory.createXMLEventReader(xml);
            try {
                // Feed events to queue
                while (reader.hasNext()) {
                    XMLEvent event = reader.nextEvent();
                    try {
                        eventQueue.put(event);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            // Signal end of events even when parsing failed; otherwise the workers
            // would block in take() forever.
            for (int i = 0; i < PROCESSOR_COUNT; i++) {
                eventQueue.offer(new EndDocumentEvent());
            }
        }
    }

    /** Worker loop: processes events until an end marker arrives or the thread is interrupted. */
    private void processEvents() {
        while (true) {
            try {
                XMLEvent event = eventQueue.take();
                if (event instanceof EndDocumentEvent) {
                    break;
                }
                // Process the event
                processEvent(event);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
}
Performance Optimization
XML Reader Pooling
/**
 * Hands out {@link XMLStreamReader}s created from a single, pre-configured
 * {@link XMLInputFactory}.
 *
 * <p>The StAX API offers no way to point an existing reader at new input, and a
 * reader must not be used after {@code close()}. The reusable (and expensive to
 * look up) object is therefore the factory: it is created and configured once,
 * every borrow creates a fresh reader from it, and every return closes that reader.
 */
public class XMLReaderPool {
    private final XMLInputFactory factory;

    /**
     * @param maxSize retained for API compatibility; must be positive
     * @throws IllegalArgumentException if {@code maxSize} is not positive
     */
    public XMLReaderPool(int maxSize) {
        if (maxSize <= 0) {
            throw new IllegalArgumentException("maxSize must be positive: " + maxSize);
        }
        this.factory = XMLInputFactory.newInstance();
        // Configure factory for performance; disabling DTD support also avoids
        // resolving external entities.
        factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
        factory.setProperty(XMLInputFactory.IS_VALIDATING, false);
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    }

    /**
     * Creates a reader for {@code input} using the shared factory configuration.
     *
     * @throws XMLStreamException if the reader cannot be created
     */
    public XMLStreamReader borrowReader(InputStream input) throws XMLStreamException {
        return factory.createXMLStreamReader(input);
    }

    /**
     * Releases a reader obtained from {@link #borrowReader(InputStream)}. A
     * {@code null} argument is ignored.
     */
    public void returnReader(XMLStreamReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (XMLStreamException e) {
            // Closing is best-effort; the reader is discarded either way.
            System.err.println("Failed to close XML reader: " + e.getMessage());
        }
    }
}
Buffer Management
/**
 * Streams a document through a buffered input stream and reports the text of
 * each element to {@code processText} when the element closes.
 */
public class BufferedXMLParser {
    private static final int BUFFER_SIZE = 8192;

    /**
     * Parses {@code input}, calling {@code processText} for every element that
     * ends with non-whitespace text buffered since the most recent start tag.
     *
     * @throws XMLStreamException if the document is malformed
     */
    public void parseWithBuffering(InputStream input) throws XMLStreamException {
        BufferedInputStream bufferedInput = new BufferedInputStream(input, BUFFER_SIZE);
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Optimize factory settings
        factory.setProperty(XMLInputFactory.IS_COALESCING, true);
        factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false);
        XMLStreamReader reader = factory.createXMLStreamReader(bufferedInput);
        try {
            // One reusable builder for text accumulation
            StringBuilder textBuffer = new StringBuilder(1024);
            while (reader.hasNext()) {
                int event = reader.next();
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        // Drop text that preceded this child so a parent's leading
                        // text is not reported as part of the child's content.
                        textBuffer.setLength(0);
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        if (!reader.isWhiteSpace()) {
                            textBuffer.append(reader.getText());
                        }
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        if (textBuffer.length() > 0) {
                            processText(reader.getLocalName(), textBuffer.toString());
                            textBuffer.setLength(0); // Clear buffer
                        }
                        break;
                    default:
                        break;
                }
            }
        } finally {
            reader.close();
        }
    }

    private void processText(String elementName, String text) {
        // Process accumulated text
    }
}
Error Handling and Recovery
Resilient Parser
/**
 * Parser that records parse errors instead of only throwing, and can optionally
 * try to continue after an error.
 */
public class ResilientXMLParser {
    private final boolean continueOnError;

    /**
     * @param continueOnError when true, the parser tries to skip past a failing
     *                        event and keep going; when false, the first error ends the parse
     */
    public ResilientXMLParser(boolean continueOnError) {
        this.continueOnError = continueOnError;
    }

    /**
     * Parses {@code xml} and returns the collected elements together with the
     * errors encountered during this call.
     *
     * <p>Errors are collected per call, so results of earlier parses never show
     * up in later ones. If parsing is stopped by an error (fail-fast mode, or a
     * failed recovery), the result carries an empty element list.
     */
    public ParseResult parse(InputStream xml) {
        List<ParseError> errors = new ArrayList<>();
        XMLStreamReader reader = null;
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            reader = factory.createXMLStreamReader(xml);
            List<Element> elements = new ArrayList<>();
            while (reader.hasNext()) {
                try {
                    int event = reader.next();
                    processEvent(reader, event, elements);
                } catch (XMLStreamException e) {
                    errors.add(new ParseError(
                            reader.getLocation().getLineNumber(),
                            reader.getLocation().getColumnNumber(),
                            e.getMessage()));
                    if (!continueOnError) {
                        // Return directly: rethrowing would make the outer handler
                        // record the same failure a second time.
                        return new ParseResult(Collections.emptyList(), errors);
                    }
                    // Try to recover by skipping to next element
                    skipToNextElement(reader);
                }
            }
            return new ParseResult(elements, errors);
        } catch (XMLStreamException e) {
            // Reached when the reader cannot be created, hasNext() fails, or recovery fails.
            errors.add(new ParseError(-1, -1, "Fatal parse error: " + e.getMessage()));
            return new ParseResult(Collections.emptyList(), errors);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException ignored) {
                    // The outcome is already decided; a failure to close changes nothing.
                }
            }
        }
    }

    /** Advances the reader until the end tag that balances the start tags seen from here on. */
    private void skipToNextElement(XMLStreamReader reader) throws XMLStreamException {
        int depth = 0;
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                depth++;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                depth--;
                if (depth <= 0) {
                    break;
                }
            }
        }
    }
}
Integration Patterns
Parser Factory Pattern
/** Factory that maps a parser type, or basic document traits, to an {@code XMLProcessor}. */
public class XMLParserFactory {
    public enum ParserType {
        DOM, SAX, STAX, STREAMING, ASYNC
    }

    /**
     * Instantiates the processor that corresponds to {@code type}.
     *
     * @param type   which processor to build
     * @param config configuration handed to the processor's constructor
     * @throws IllegalArgumentException if {@code type} has no matching processor
     */
    public static XMLProcessor createParser(ParserType type, Map<String, Object> config) {
        final XMLProcessor processor;
        switch (type) {
            case ASYNC:
                processor = new AsyncProcessor(config);
                break;
            case STREAMING:
                processor = new StreamingProcessor(config);
                break;
            case STAX:
                processor = new StAXProcessor(config);
                break;
            case SAX:
                processor = new SAXProcessor(config);
                break;
            case DOM:
                processor = new DOMProcessor(config);
                break;
            default:
                throw new IllegalArgumentException("Unknown parser type: " + type);
        }
        return processor;
    }

    /**
     * Chooses a processor from the file size and access pattern: DOM below 1 MB,
     * otherwise StAX when random access is requested and streaming when it is not.
     * The processor is created with an empty configuration.
     */
    public static XMLProcessor createOptimalParser(long fileSize, boolean randomAccess) {
        final ParserType choice;
        if (fileSize < 1024 * 1024) { // < 1MB
            choice = ParserType.DOM;
        } else {
            choice = randomAccess ? ParserType.STAX : ParserType.STREAMING;
        }
        return createParser(choice, Collections.emptyMap());
    }
}
Best Practices
Performance Guidelines
- Choose the right parser: DOM for small docs, SAX/StAX for large ones
- Disable unnecessary features: Turn off validation and DTD processing when they are not needed
- Use buffering: Buffer input streams for better I/O performance
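- Pool resources: Reuse configured factories (for example `XMLInputFactory` and `SAXParserFactory`) rather than creating them per document; StAX stream readers themselves are single-use and must be created anew for each input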
- Process incrementally: Use streaming for large documents
Memory Management
// Good: Process in chunks
/**
 * Processes the document one {@code <record>} at a time, so only the record
 * currently being handled is referenced by this method.
 *
 * @throws XMLStreamException if the document is malformed
 */
public void processLargeXML(InputStream xml) throws XMLStreamException {
    XMLStreamReader reader = factory.createXMLStreamReader(xml);
    try {
        while (reader.hasNext()) {
            if (reader.isStartElement() && "record".equals(reader.getLocalName())) {
                // The record is a loop-scoped local: it becomes unreachable at the
                // end of the iteration, so no explicit nulling is needed.
                Element record = parseRecord(reader);
                processRecord(record);
            }
            reader.next();
        }
    } finally {
        reader.close();
    }
}
// Avoid: Loading entire document
// Anti-example, kept for contrast with the streaming version: DocumentBuilder.parse
// returns only after the complete document tree has been built in memory.
// NOTE(review): newDocumentBuilder() and parse(File) throw checked exceptions that this
// signature does not declare — confirm whether the throws clause was elided for brevity.
public void avoidThis(File xmlFile) {
Document doc = DocumentBuilderFactory.newInstance()
.newDocumentBuilder()
.parse(xmlFile); // Loads entire file into memory
}
Error Handling
- Validate early: Check document structure as soon as possible
- Provide context: Include line numbers and element paths in error messages
- Fail gracefully: Continue processing when possible, collect errors
- Log appropriately: Use appropriate log levels for different error types
Conclusion
Advanced XML parsing techniques enable efficient processing of complex XML documents while maintaining performance and memory efficiency. Choose the appropriate parsing strategy based on your specific requirements for document size, access patterns, and processing needs.
Next Steps
- Explore XML Processing for implementation details
- Study Performance Optimization for tuning tips
- Learn Best Practices for production systems