1. xml
  2. /advanced
  3. /advanced-parsing

Advanced Parsing

While basic XML parsing handles simple documents, advanced parsing techniques are essential for complex scenarios involving large documents, custom processing requirements, performance optimization, and specialized data extraction needs.

Parsing Approaches Comparison

| Approach | Memory Usage | Speed | Random Access | Best For |
| --- | --- | --- | --- | --- |
| DOM | High | Slow | Yes | Small documents, frequent access |
| SAX | Low | Fast | No | Large documents, sequential processing |
| StAX | Low | Fast | Limited | Controlled streaming, partial parsing |
| Custom | Variable | Variable | Depends | Specialized requirements |

Advanced SAX Parsing

Custom Content Handlers

/**
 * SAX handler that assembles {@code Book} objects from a document shaped like
 * {@code library/book/{title, author/{first,last}, price}}.
 *
 * <p>Elements are recognised by their full path from the root, built from the
 * stack of open element qNames. Text is accumulated per element: the buffer is
 * reset on every start tag and consumed on the matching end tag.
 */
public class AdvancedBookHandler extends DefaultHandler {
    private Book currentBook;
    private final StringBuilder textBuffer;
    private final Stack<String> elementStack;
    private final List<Book> books;
    // Authors keyed by full name; written here but not read anywhere in this class.
    private final Map<String, Author> authorCache;
    
    public AdvancedBookHandler() {
        this.textBuffer = new StringBuilder();
        this.elementStack = new Stack<>();
        this.books = new ArrayList<>();
        this.authorCache = new HashMap<>();
    }
    
    /**
     * Tracks the open element and captures attributes of {@code book} and {@code price}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) {
        elementStack.push(qName);
        textBuffer.setLength(0);
        
        switch (qName) {
            case "book":
                currentBook = new Book();
                currentBook.setId(attrs.getValue("id"));
                currentBook.setCategory(attrs.getValue("category"));
                break;
                
            case "price":
                String currency = attrs.getValue("currency");
                // A <price> outside any <book> must not dereference a null book.
                if (currentBook != null && currency != null) {
                    currentBook.setCurrency(currency);
                }
                break;
                
            default:
                break;
        }
    }
    
    /** Appends a chunk of character data; SAX may deliver one text node in several calls. */
    @Override
    public void characters(char[] chars, int start, int length) {
        textBuffer.append(chars, start, length);
    }
    
    /**
     * Applies the trimmed text of the closing element to the current book,
     * dispatching on the element's full path, then pops the element.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        // Stack iterates bottom-to-top, so the join yields a root-first path.
        String currentPath = String.join("/", elementStack);
        String text = textBuffer.toString().trim();
        
        switch (currentPath) {
            case "library/book/title":
                currentBook.setTitle(text);
                break;
                
            case "library/book/author/first":
                ensureAuthor().setFirstName(text);
                break;
                
            case "library/book/author/last":
                // <last> may legally precede (or appear without) <first>.
                Author author = ensureAuthor();
                author.setLastName(text);
                // Cache author for reuse
                authorCache.put(author.getFullName(), author);
                break;
                
            case "library/book/price":
                try {
                    currentBook.setPrice(Double.parseDouble(text));
                } catch (NumberFormatException e) {
                    // Invalid or empty price text falls back to zero.
                    currentBook.setPrice(0.0);
                }
                break;
                
            case "library/book":
                books.add(currentBook);
                currentBook = null;
                break;
                
            default:
                break;
        }
        
        elementStack.pop();
        textBuffer.setLength(0);
    }
    
    /** Logs a recoverable parse error to stderr and aborts parsing by rethrowing it. */
    @Override
    public void error(SAXParseException e) throws SAXException {
        System.err.println("Parse error: " + e.getMessage());
        throw e;
    }
    
    /** Returns the books collected so far (the handler's own list, not a copy). */
    public List<Book> getBooks() {
        return books;
    }
    
    /** Returns the current book's author, creating and attaching one if absent. */
    private Author ensureAuthor() {
        if (currentBook.getAuthor() == null) {
            currentBook.setAuthor(new Author());
        }
        return currentBook.getAuthor();
    }
}

Filtered SAX Parsing

/**
 * SAX front end that forwards only the elements accepted by two predicates.
 *
 * <p>An element is forwarded to the target handler when {@code elementFilter}
 * accepts its qName and {@code attributeFilter} accepts its attributes
 * (qName to value). A rejected element is dropped together with its entire
 * subtree; forwarding resumes with the next sibling.
 */
public class FilteredSAXParser {
    private final Predicate<String> elementFilter;
    private final Predicate<Map<String, String>> attributeFilter;
    
    /**
     * @param elementFilter   tested against each element's qName
     * @param attributeFilter tested against each element's attributes (qName to value)
     */
    public FilteredSAXParser(Predicate<String> elementFilter, 
                            Predicate<Map<String, String>> attributeFilter) {
        this.elementFilter = elementFilter;
        this.attributeFilter = attributeFilter;
    }
    
    /**
     * Parses {@code xml} and forwards the accepted elements, plus the character
     * data outside rejected subtrees, to {@code handler}.
     *
     * @param xml     document to parse
     * @param handler receiver of the filtered events
     * @throws Exception if the parser cannot be configured or the document cannot be parsed
     */
    public void parse(InputStream xml, ContentHandler handler) throws Exception {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        // Do not resolve external entities: the input may be untrusted (XXE).
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        SAXParser parser = factory.newSAXParser();
        
        XMLReader reader = parser.getXMLReader();
        reader.setContentHandler(new FilteringHandler(handler));
        reader.parse(new InputSource(xml));
    }
    
    /** Forwards events to the delegate except while inside a rejected subtree. */
    private class FilteringHandler extends DefaultHandler {
        private final ContentHandler delegate;
        // Open-element count inside the rejected subtree; 0 means "forwarding".
        private int skipDepth = 0;
        
        public FilteringHandler(ContentHandler delegate) {
            this.delegate = delegate;
        }
        
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) 
                throws SAXException {
            if (skipDepth > 0) {
                // Descendant of a rejected element: just track nesting.
                skipDepth++;
                return;
            }
            if (shouldProcessElement(qName, attrs)) {
                delegate.startElement(uri, localName, qName, attrs);
            } else {
                skipDepth = 1;
            }
        }
        
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (skipDepth > 0) {
                // Reaching 0 here closes the rejected element itself, which is
                // not forwarded either.
                skipDepth--;
                return;
            }
            delegate.endElement(uri, localName, qName);
        }
        
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (skipDepth == 0) {
                delegate.characters(ch, start, length);
            }
        }
        
        /** Applies the element-name filter first, then the attribute filter. */
        private boolean shouldProcessElement(String qName, Attributes attrs) {
            if (!elementFilter.test(qName)) {
                return false;
            }
            
            Map<String, String> attrMap = new HashMap<>();
            for (int i = 0; i < attrs.getLength(); i++) {
                attrMap.put(attrs.getQName(i), attrs.getValue(i));
            }
            
            return attributeFilter.test(attrMap);
        }
    }
}

Advanced StAX Parsing

Pull Parsing with State Machine

/**
 * StAX pull parser that builds {@code Book} objects using an explicit state machine.
 *
 * <p>The book and author under construction live in a {@link ParseContext}
 * shared by the event handlers. Java passes references by value, so handing
 * them over as plain parameters and reassigning them inside a handler would
 * never reach the main loop.
 */
public class StateMachineStAXParser {
    enum ParsingState {
        START, IN_LIBRARY, IN_BOOK, IN_TITLE, IN_AUTHOR, IN_PRICE, END
    }
    
    /** Mutable parse state shared between the event handlers. */
    private static final class ParseContext {
        final List<Book> books = new ArrayList<>();
        final StringBuilder text = new StringBuilder();
        Book currentBook;
        Author currentAuthor;
    }
    
    /**
     * Reads every {@code book} element from the stream.
     *
     * @param xml document to read; the stream itself is not closed here
     * @return the books in document order
     * @throws XMLStreamException if the document is not well-formed
     */
    public List<Book> parseBooks(InputStream xml) throws XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLStreamReader reader = factory.createXMLStreamReader(xml);
        
        ParseContext context = new ParseContext();
        ParsingState state = ParsingState.START;
        
        try {
            while (reader.hasNext()) {
                int event = reader.next();
                
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        state = handleStartElement(reader, state, context);
                        break;
                        
                    case XMLStreamConstants.CHARACTERS:
                        handleCharacters(reader, state, context);
                        break;
                        
                    case XMLStreamConstants.END_ELEMENT:
                        state = handleEndElement(reader, state, context);
                        break;
                        
                    default:
                        break;
                }
            }
        } finally {
            reader.close();
        }
        
        return context.books;
    }
    
    /** Opens a book/author as needed and returns the state for the element just entered. */
    private ParsingState handleStartElement(XMLStreamReader reader, ParsingState currentState,
            ParseContext context) {
        
        // Text is collected per element, so every start tag begins a fresh buffer.
        context.text.setLength(0);
        String localName = reader.getLocalName();
        
        switch (localName) {
            case "library":
                return ParsingState.IN_LIBRARY;
                
            case "book":
                context.currentBook = new Book();
                context.currentBook.setId(reader.getAttributeValue(null, "id"));
                context.currentBook.setCategory(reader.getAttributeValue(null, "category"));
                return ParsingState.IN_BOOK;
                
            case "title":
                return ParsingState.IN_TITLE;
                
            case "author":
                context.currentAuthor = new Author();
                return ParsingState.IN_AUTHOR;
                
            case "price":
                String currency = reader.getAttributeValue(null, "currency");
                if (context.currentBook != null && currency != null) {
                    context.currentBook.setCurrency(currency);
                }
                return ParsingState.IN_PRICE;
                
            default:
                // Includes <first>/<last>, which stay in IN_AUTHOR.
                return currentState;
        }
    }
    
    /** Buffers character data while inside an element whose text is consumed. */
    private void handleCharacters(XMLStreamReader reader, ParsingState state, ParseContext context) {
        if (state == ParsingState.IN_TITLE
                || state == ParsingState.IN_AUTHOR
                || state == ParsingState.IN_PRICE) {
            context.text.append(reader.getText());
        }
    }
    
    /** Stores the buffered text on the current book/author and returns the parent state. */
    private ParsingState handleEndElement(XMLStreamReader reader, ParsingState currentState,
            ParseContext context) {
        
        String localName = reader.getLocalName();
        String text = context.text.toString().trim();
        context.text.setLength(0);
        
        switch (localName) {
            case "title":
                if (context.currentBook != null) {
                    context.currentBook.setTitle(text);
                }
                return ParsingState.IN_BOOK;
                
            case "first":
                if (currentState == ParsingState.IN_AUTHOR && context.currentAuthor != null) {
                    context.currentAuthor.setFirstName(text);
                }
                return currentState;
                
            case "last":
                if (currentState == ParsingState.IN_AUTHOR && context.currentAuthor != null) {
                    context.currentAuthor.setLastName(text);
                }
                return currentState;
                
            case "author":
                if (context.currentBook != null && context.currentAuthor != null) {
                    context.currentBook.setAuthor(context.currentAuthor);
                }
                context.currentAuthor = null;
                return ParsingState.IN_BOOK;
                
            case "price":
                if (context.currentBook != null) {
                    try {
                        context.currentBook.setPrice(Double.parseDouble(text));
                    } catch (NumberFormatException e) {
                        // Same fallback as the SAX handler: invalid price becomes zero.
                        context.currentBook.setPrice(0.0);
                    }
                }
                return ParsingState.IN_BOOK;
                
            case "book":
                if (context.currentBook != null) {
                    context.books.add(context.currentBook);
                    context.currentBook = null;
                }
                return ParsingState.IN_LIBRARY;
                
            case "library":
                return ParsingState.END;
                
            default:
                return currentState;
        }
    }
}

Streaming with Custom Filters

/**
 * {@link XMLStreamReader} wrapper whose {@link #next()} hides selected elements.
 *
 * <p>When a start tag's local name is in {@code elementsToSkip}, that element,
 * everything nested inside it, and its closing tag are consumed silently.
 * All other reader methods are inherited from {@code StreamReaderDelegate},
 * which forwards them to the wrapped reader unchanged. Only {@code next()} is
 * filtered; {@code nextTag()} and {@code getElementText()} go straight to the
 * wrapped reader.
 */
public class StreamingXMLFilter extends javax.xml.stream.util.StreamReaderDelegate {
    private final Set<String> elementsToSkip;
    // Number of currently open elements inside the subtree being skipped.
    private int skipDepth = 0;
    
    /**
     * @param delegate       reader to pull events from
     * @param elementsToSkip local names of the elements to suppress
     */
    public StreamingXMLFilter(XMLStreamReader delegate, Set<String> elementsToSkip) {
        super(delegate);
        this.elementsToSkip = elementsToSkip;
    }
    
    /**
     * Advances to the next event that is not part of a skipped element.
     *
     * @throws XMLStreamException if the wrapped reader fails, including when the
     *         input ends inside a skipped element
     */
    @Override
    public int next() throws XMLStreamException {
        int event = super.next();
        
        while (shouldSkip(event)) {
            event = super.next();
        }
        
        return event;
    }
    
    /** Updates the nesting depth for {@code event} and reports whether to hide it. */
    private boolean shouldSkip(int event) {
        if (skipDepth > 0) {
            // Inside a skipped subtree: track nesting whatever the element name.
            if (event == XMLStreamConstants.START_ELEMENT) {
                skipDepth++;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                skipDepth--;
            }
            // Always hide, including the end tag that brings the depth back to 0.
            return true;
        }
        
        if (event == XMLStreamConstants.START_ELEMENT
                && elementsToSkip.contains(super.getLocalName())) {
            skipDepth = 1;
            return true;
        }
        
        return false;
    }
}

Memory-Efficient Parsing

Chunked Processing

/**
 * Streams {@code book} elements from a large document and hands them to a
 * callback in batches of at most {@link #CHUNK_SIZE}.
 */
public class ChunkedXMLProcessor {
    private static final int CHUNK_SIZE = 1000;
    
    /**
     * Parses the stream and invokes {@code chunkProcessor} once per full chunk,
     * plus once for a trailing partial chunk.
     *
     * <p>Each invocation receives its own list; it is never cleared or reused
     * afterwards, so the callback may keep a reference to it.
     *
     * @param xml            document to read; the stream itself is not closed here
     * @param chunkProcessor receives each chunk; its return value is ignored
     * @throws XMLStreamException if the document is not well-formed
     */
    public void processLargeXML(InputStream xml, Function<List<Book>, Void> chunkProcessor) 
            throws XMLStreamException {
        
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLStreamReader reader = factory.createXMLStreamReader(xml);
        
        try {
            List<Book> currentChunk = new ArrayList<>(CHUNK_SIZE);
            
            while (reader.hasNext()) {
                int event = reader.next();
                
                if (event == XMLStreamConstants.START_ELEMENT && "book".equals(reader.getLocalName())) {
                    currentChunk.add(parseBook(reader));
                    
                    if (currentChunk.size() >= CHUNK_SIZE) {
                        chunkProcessor.apply(currentChunk);
                        // Start a fresh list rather than clearing the one just handed out.
                        currentChunk = new ArrayList<>(CHUNK_SIZE);
                    }
                }
            }
            
            // Process remaining books
            if (!currentChunk.isEmpty()) {
                chunkProcessor.apply(currentChunk);
            }
        } finally {
            reader.close();
        }
    }
    
    /**
     * Reads one book; the reader must be positioned on the {@code book} start
     * tag and is left on the matching end tag.
     */
    private Book parseBook(XMLStreamReader reader) throws XMLStreamException {
        Book book = new Book();
        
        // Parse book attributes
        book.setId(reader.getAttributeValue(null, "id"));
        book.setCategory(reader.getAttributeValue(null, "category"));
        
        while (reader.hasNext()) {
            int event = reader.next();
            
            if (event == XMLStreamConstants.END_ELEMENT && "book".equals(reader.getLocalName())) {
                break;
            }
            
            if (event == XMLStreamConstants.START_ELEMENT) {
                // getElementText() throws on elements with child elements (such as
                // an <author> holding <first>/<last>), so call it only for the
                // text-only elements handled here.
                switch (reader.getLocalName()) {
                    case "title":
                        book.setTitle(reader.getElementText());
                        break;
                    case "price":
                        book.setPrice(Double.parseDouble(reader.getElementText()));
                        break;
                    // ... other elements
                    default:
                        break;
                }
            }
        }
        
        return book;
    }
}

Lazy Loading with Suppliers

/**
 * Indexes an XML file once and defers element parsing until requested.
 *
 * <p>The constructor scans the file and registers, per element local name, a
 * supplier that parses elements of that name on demand.
 */
public class LazyXMLDocument {
    private final Path xmlFile;
    private final Map<String, Supplier<List<Element>>> lazyElements;
    
    /**
     * @param xmlFile file to index
     * @throws RuntimeException if the file cannot be read or is not well-formed
     */
    public LazyXMLDocument(Path xmlFile) {
        this.xmlFile = xmlFile;
        this.lazyElements = new HashMap<>();
        indexDocument();
    }
    
    /** Scans the file and records a lazy supplier for each distinct element name. */
    private void indexDocument() {
        try (InputStream is = Files.newInputStream(xmlFile)) {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            XMLStreamReader reader = factory.createXMLStreamReader(is);
            
            try {
                // Offset reported after the previous event.
                long position = 0;
                
                while (reader.hasNext()) {
                    int event = reader.next();
                    
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        // Captured by the lambda, so both must be final per iteration.
                        final String elementName = reader.getLocalName();
                        final long elementPosition = position;
                        
                        // Keep the offset of the first occurrence of each name;
                        // later occurrences must not overwrite it.
                        lazyElements.putIfAbsent(elementName,
                            () -> parseElementsAt(elementPosition, elementName));
                    }
                    
                    position = reader.getLocation().getCharacterOffset();
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to index XML document", e);
        }
    }
    
    /**
     * Returns the elements with the given local name, parsing them on demand.
     *
     * @return the parsed elements, or an empty list if the name never occurred
     */
    public List<Element> getElements(String elementName) {
        Supplier<List<Element>> supplier = lazyElements.get(elementName);
        return supplier != null ? supplier.get() : Collections.emptyList();
    }
    
    private List<Element> parseElementsAt(long position, String elementName) {
        // Implementation to parse elements starting at specific position
        // This would involve seeking to the position and parsing
        return new ArrayList<>();
    }
}

Custom Parser Implementation

Event-Driven Parser with Callbacks

/**
 * Event-driven parser that notifies registered listeners whenever a start tag
 * with a matching local name is read.
 */
public class CallbackXMLParser {
    private final Map<String, List<Consumer<XMLEvent>>> callbacks;
    
    public CallbackXMLParser() {
        this.callbacks = new HashMap<>();
    }
    
    /**
     * Adds a listener for start tags named {@code elementName}. Listeners for
     * the same name are invoked in registration order.
     */
    public void registerCallback(String elementName, Consumer<XMLEvent> callback) {
        List<Consumer<XMLEvent>> listeners = callbacks.get(elementName);
        if (listeners == null) {
            listeners = new ArrayList<>();
            callbacks.put(elementName, listeners);
        }
        listeners.add(callback);
    }
    
    /**
     * Reads the whole stream, passing each start-element event to the
     * listeners registered for its local name.
     *
     * @throws XMLStreamException if the document is not well-formed
     */
    public void parse(InputStream xml) throws XMLStreamException {
        XMLEventReader events = XMLInputFactory.newInstance().createXMLEventReader(xml);
        
        while (events.hasNext()) {
            XMLEvent current = events.nextEvent();
            if (!current.isStartElement()) {
                continue;
            }
            
            String name = current.asStartElement().getName().getLocalPart();
            List<Consumer<XMLEvent>> listeners = callbacks.get(name);
            if (listeners == null) {
                continue;
            }
            listeners.forEach(listener -> listener.accept(current));
        }
    }
}

// Usage example
CallbackXMLParser parser = new CallbackXMLParser();

parser.registerCallback("book", event -> {
    StartElement element = event.asStartElement();
    // getAttributeByName returns null when the attribute is absent, so guard
    // against <book> elements that carry no id.
    javax.xml.stream.events.Attribute id = element.getAttributeByName(new QName("id"));
    System.out.println("Found book with ID: " + 
        (id != null ? id.getValue() : "(missing)"));
});

parser.registerCallback("title", event -> {
    // Handle title elements
});

Async XML Processing

/**
 * Reads XML events on a thread pool and fans them out to several consumer tasks
 * through a shared queue.
 *
 * <p>Each {@link #processAsync} call submits one producer task and
 * {@code PROCESSOR_COUNT} consumer tasks to the same executor. No task waits
 * on another, so the pipeline completes even with a single-thread pool; in
 * that case the consumers simply run after the producer has finished. Events
 * may be handled by different threads and therefore out of document order.
 *
 * <p>All calls share one queue, so overlapping {@code processAsync} calls on
 * the same instance are not supported.
 */
public class AsyncXMLProcessor implements AutoCloseable {
    /** Number of consumer tasks started per {@link #processAsync} call. */
    private static final int PROCESSOR_COUNT = 3;
    
    /**
     * End-of-input sentinel, recognised by identity so a genuine END_DOCUMENT
     * event coming from the reader is still processed normally.
     */
    private static final XMLEvent END_OF_STREAM =
        javax.xml.stream.XMLEventFactory.newInstance().createEndDocument();
    
    private final ExecutorService executor;
    private final BlockingQueue<XMLEvent> eventQueue;
    
    /**
     * @param threadPoolSize number of worker threads; must be positive
     */
    public AsyncXMLProcessor(int threadPoolSize) {
        this.executor = Executors.newFixedThreadPool(threadPoolSize);
        this.eventQueue = new LinkedBlockingQueue<>();
    }
    
    /**
     * Starts parsing {@code xml} in the background.
     *
     * @return a future that completes once the input has been read and every
     *         consumer has drained its share of the queue; it completes
     *         exceptionally if parsing or event processing fails
     */
    public CompletableFuture<Void> processAsync(InputStream xml) {
        // Submit the producer first: with a single worker thread it must run
        // before any consumer, otherwise a consumer would block that thread
        // forever waiting for events.
        CompletableFuture<Void> producer = CompletableFuture.runAsync(() -> {
            try {
                parseXMLAsync(xml);
            } catch (XMLStreamException e) {
                throw new RuntimeException(e);
            }
        }, executor);
        
        List<CompletableFuture<Void>> processors = IntStream.range(0, PROCESSOR_COUNT)
            .mapToObj(i -> CompletableFuture.runAsync(this::processEvents, executor))
            .collect(Collectors.toList());
        
        return CompletableFuture.allOf(
            producer,
            CompletableFuture.allOf(processors.toArray(new CompletableFuture[0])));
    }
    
    /** Feeds every event of the document into the queue, then one sentinel per consumer. */
    private void parseXMLAsync(InputStream xml) throws XMLStreamException {
        XMLEventReader reader = null;
        try {
            reader = XMLInputFactory.newInstance().createXMLEventReader(xml);
            
            // Feed events to queue
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                try {
                    eventQueue.put(event);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        } finally {
            // Signal end of events even when parsing failed, so no consumer is
            // left blocked on an empty queue.
            for (int i = 0; i < PROCESSOR_COUNT; i++) {
                eventQueue.offer(END_OF_STREAM);
            }
            if (reader != null) {
                reader.close();
            }
        }
    }
    
    /** Consumer loop: handles queued events until the sentinel or an interrupt. */
    private void processEvents() {
        while (true) {
            try {
                XMLEvent event = eventQueue.take();
                
                if (event == END_OF_STREAM) {
                    break;
                }
                
                // Process the event
                processEvent(event);
                
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
    
    /**
     * Hook invoked for every event, possibly from several threads at once.
     * The default implementation does nothing; subclasses override it.
     */
    protected void processEvent(XMLEvent event) {
        // no-op by default
    }
    
    /**
     * Stops the worker threads, interrupting any task still running. Call
     * this after the futures returned by {@link #processAsync} have completed.
     */
    @Override
    public void close() {
        executor.shutdownNow();
    }
}

Performance Optimization

XML Reader Pooling

/**
 * Hands out {@link XMLStreamReader}s built from pooled, pre-configured
 * {@link XMLInputFactory} instances.
 *
 * <p>The StAX API offers no way to re-point an existing reader at new input,
 * and a closed reader cannot be reused, so readers themselves are not pooled.
 * What is reused is the factory, whose lookup and configuration is the
 * repeatable cost. Each borrowed reader is new and must be handed back through
 * {@link #returnReader} so that it gets closed.
 */
public class XMLReaderPool {
    private final BlockingQueue<XMLInputFactory> pool;
    
    /**
     * @param maxSize maximum number of idle factories kept; must be positive
     * @throws IllegalArgumentException if {@code maxSize} is not positive
     */
    public XMLReaderPool(int maxSize) {
        this.pool = new LinkedBlockingQueue<>(maxSize);
    }
    
    /**
     * Creates a reader over {@code input} using an idle factory when one is
     * available, otherwise a newly configured one.
     *
     * @throws XMLStreamException if the reader cannot be created
     */
    public XMLStreamReader borrowReader(InputStream input) throws XMLStreamException {
        XMLInputFactory factory = pool.poll();
        if (factory == null) {
            factory = newConfiguredFactory();
        }
        
        try {
            return factory.createXMLStreamReader(input);
        } finally {
            // The factory is only needed to create the reader; offer() drops it
            // silently when the pool is already full.
            pool.offer(factory);
        }
    }
    
    /**
     * Closes a reader obtained from {@link #borrowReader}. The underlying
     * input stream is not closed. A {@code null} argument is ignored.
     */
    public void returnReader(XMLStreamReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (XMLStreamException ignored) {
            // Best effort: the reader is discarded either way.
        }
    }
    
    /** Builds a factory tuned for speed and with DTD/external entities disabled. */
    private static XMLInputFactory newConfiguredFactory() {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        
        // Configure factory for performance
        factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
        factory.setProperty(XMLInputFactory.IS_VALIDATING, false);
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
        return factory;
    }
}

Buffer Management

/**
 * StAX parser that reads through a {@link BufferedInputStream} and reports the
 * non-whitespace text of each element when its end tag is reached.
 */
public class BufferedXMLParser {
    private static final int BUFFER_SIZE = 8192;
    
    /**
     * Parses {@code input} and calls {@code processText} for every element
     * that closes with pending text.
     *
     * <p>Text is collected per element: the buffer is reset at every start
     * tag, so text that precedes a child element is not attributed to that
     * child.
     *
     * @param input document to read; the stream itself is not closed here
     * @throws XMLStreamException if the document is not well-formed
     */
    public void parseWithBuffering(InputStream input) throws XMLStreamException {
        BufferedInputStream bufferedInput = new BufferedInputStream(input, BUFFER_SIZE);
        
        XMLInputFactory factory = XMLInputFactory.newInstance();
        
        // Optimize factory settings
        factory.setProperty(XMLInputFactory.IS_COALESCING, true);
        factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false);
        
        XMLStreamReader reader = factory.createXMLStreamReader(bufferedInput);
        
        // One reusable builder for text accumulation
        StringBuilder textBuffer = new StringBuilder(1024);
        
        try {
            while (reader.hasNext()) {
                int event = reader.next();
                
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        // Drop text belonging to the enclosing element so it is
                        // not reported under this element's name.
                        textBuffer.setLength(0);
                        break;
                        
                    case XMLStreamConstants.CHARACTERS:
                        // Whitespace-only runs (indentation) are ignored.
                        if (!reader.isWhiteSpace()) {
                            textBuffer.append(reader.getText());
                        }
                        break;
                        
                    case XMLStreamConstants.END_ELEMENT:
                        if (textBuffer.length() > 0) {
                            processText(reader.getLocalName(), textBuffer.toString());
                            textBuffer.setLength(0); // Clear buffer
                        }
                        break;
                        
                    default:
                        break;
                }
            }
        } finally {
            reader.close();
        }
    }
    
    private void processText(String elementName, String text) {
        // Process accumulated text
    }
}

Error Handling and Recovery

Resilient Parser

/**
 * StAX-based parser that records parse errors instead of failing outright and,
 * when {@code continueOnError} is set, tries to resume after an error.
 *
 * <p>Errors accumulate in one list for the lifetime of this instance; the same
 * list is passed to every {@code ParseResult}. {@code processEvent} is called
 * but not defined in this snippet.
 */
public class ResilientXMLParser {
    private final List<ParseError> errors;
    private final boolean continueOnError;
    
    /**
     * @param continueOnError whether to attempt recovery after a parse error
     */
    public ResilientXMLParser(boolean continueOnError) {
        this.errors = new ArrayList<>();
        this.continueOnError = continueOnError;
    }
    
    /**
     * Parses the stream, collecting elements and errors.
     *
     * <p>If parsing has to stop early, a {@code "Fatal parse error"} entry is
     * added. In continue-on-error mode the elements gathered up to that point
     * are still returned; otherwise the element list is empty.
     *
     * @param xml document to read; the stream itself is not closed here
     * @return the parsed elements together with the recorded errors
     */
    public ParseResult parse(InputStream xml) {
        List<Element> elements = new ArrayList<>();
        XMLStreamReader reader = null;
        
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            reader = factory.createXMLStreamReader(xml);
            
            while (reader.hasNext()) {
                try {
                    int event = reader.next();
                    processEvent(reader, event, elements);
                    
                } catch (XMLStreamException e) {
                    ParseError error = new ParseError(
                        reader.getLocation().getLineNumber(),
                        reader.getLocation().getColumnNumber(),
                        e.getMessage()
                    );
                    errors.add(error);
                    
                    if (!continueOnError) {
                        throw e;
                    }
                    
                    // Try to recover by skipping to next element; a failure
                    // here propagates to the outer catch below.
                    skipToNextElement(reader);
                }
            }
            
            return new ParseResult(elements, errors);
            
        } catch (XMLStreamException e) {
            errors.add(new ParseError(-1, -1, "Fatal parse error: " + e.getMessage()));
            // Keep what was parsed before recovery failed when the caller
            // asked for best-effort parsing.
            List<Element> salvaged = continueOnError ? elements : Collections.<Element>emptyList();
            return new ParseResult(salvaged, errors);
        } finally {
            closeQuietly(reader);
        }
    }
    
    /**
     * Advances the reader until the end tag that closes the next element (or
     * the first end tag seen, if no start tag precedes it).
     */
    private void skipToNextElement(XMLStreamReader reader) throws XMLStreamException {
        int depth = 0;
        
        while (reader.hasNext()) {
            int event = reader.next();
            
            if (event == XMLStreamConstants.START_ELEMENT) {
                depth++;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                depth--;
                if (depth <= 0) {
                    break;
                }
            }
        }
    }
    
    /** Closes the reader if one was created, ignoring failures on close. */
    private void closeQuietly(XMLStreamReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (XMLStreamException ignored) {
            // Nothing useful can be done; the parse result is already decided.
        }
    }
}

Integration Patterns

Parser Factory Pattern

/**
 * Creates {@code XMLProcessor} implementations, either for an explicitly
 * requested parser type or for one chosen from file size and access pattern.
 */
public class XMLParserFactory {
    public enum ParserType {
        DOM, SAX, STAX, STREAMING, ASYNC
    }
    
    /** Files below this size (1 MB) are handled with DOM. */
    private static final long SMALL_FILE_LIMIT = 1024 * 1024;
    
    /**
     * Instantiates the processor for {@code type}, passing {@code config} to
     * its constructor.
     *
     * @throws IllegalArgumentException if the type has no matching processor
     */
    public static XMLProcessor createParser(ParserType type, Map<String, Object> config) {
        final XMLProcessor processor;
        switch (type) {
            case DOM:
                processor = new DOMProcessor(config);
                break;
            case SAX:
                processor = new SAXProcessor(config);
                break;
            case STAX:
                processor = new StAXProcessor(config);
                break;
            case STREAMING:
                processor = new StreamingProcessor(config);
                break;
            case ASYNC:
                processor = new AsyncProcessor(config);
                break;
            default:
                throw new IllegalArgumentException("Unknown parser type: " + type);
        }
        return processor;
    }
    
    /**
     * Picks a parser type with an empty configuration: DOM for files under
     * 1 MB; otherwise StAX when random access is requested, else streaming.
     */
    public static XMLProcessor createOptimalParser(long fileSize, boolean randomAccess) {
        final ParserType choice;
        if (fileSize < SMALL_FILE_LIMIT) {
            choice = ParserType.DOM;
        } else {
            choice = randomAccess ? ParserType.STAX : ParserType.STREAMING;
        }
        return createParser(choice, Collections.emptyMap());
    }
}

Best Practices

Performance Guidelines

  • Choose the right parser: DOM for small docs, SAX/StAX for large ones
  • Disable unnecessary features: Turn off validation, DTD processing when not needed
  • Use buffering: Buffer input streams for better I/O performance
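  • Pool resources: Reuse factories (SAXParserFactory, XMLInputFactory) and resettable parsers such as SAXParser; StAX readers are single-use, so create a new one per input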
  • Process incrementally: Use streaming for large documents

Memory Management

// Good: Process in chunks
/**
 * Streams the document and handles one {@code record} element at a time, so
 * only the record being processed is held in memory.
 *
 * @param xml document to read; the stream itself is not closed here
 * @throws XMLStreamException if the reader cannot be created or the document
 *         is not well-formed
 */
public void processLargeXML(InputStream xml) throws XMLStreamException {
    XMLStreamReader reader = factory.createXMLStreamReader(xml);
    
    try {
        while (reader.hasNext()) {
            if (reader.isStartElement() && "record".equals(reader.getLocalName())) {
                // The record is local to this iteration and becomes unreachable
                // as soon as it has been processed.
                // NOTE(review): assumes parseRecord leaves the cursor on the
                // record's end tag, since next() below advances past it; confirm.
                processRecord(parseRecord(reader));
            }
            reader.next();
        }
    } finally {
        reader.close();
    }
}

// Avoid: Loading entire document
/**
 * Anti-example: builds a full DOM tree, which holds the whole document in
 * memory at once.
 */
public void avoidThis(File xmlFile) {
    DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
    // Loads entire file into memory
    Document doc = domFactory.newDocumentBuilder().parse(xmlFile);
}

Error Handling

  • Validate early: Check document structure as soon as possible
  • Provide context: Include line numbers and element paths in error messages
  • Fail gracefully: Continue processing when possible, collect errors
  • Log appropriately: Use appropriate log levels for different error types

Conclusion

Advanced XML parsing techniques enable efficient processing of complex XML documents while maintaining performance and memory efficiency. Choose the appropriate parsing strategy based on your specific requirements for document size, access patterns, and processing needs.

Next Steps