|
SAX-based XML pretty printer
Russell Bateman |
Written because I'm tired of wrestling with the broken org.dom4j code, which can't handle missing namespaces (among other problems) in incoming XML and which is, besides, far more memory-heavy than SAX.
package com.windofkeltia.prettyprint;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
/**
 * Entry point of the SAX-based pretty printer. It couples one XML input stream
 * with a {@link Handler} and returns whatever the handler accumulated once the
 * parse has finished. Instances are created through {@link Builder}.
 */
public class PrettyPrintXml
{
  private static final SAXParserFactory factory = SAXParserFactory.newInstance();

  private final SAXParser   parser;
  private final Handler     handler;
  private final InputStream inputStream;
  private final String      tabWidth;

  /**
   * Creates the parser and the output handler for the input held by the builder.
   *
   * @param builder carries the input stream and the indentation unit.
   * @throws ParserConfigurationException if the factory cannot create a parser.
   * @throws SAXException if the factory fails for a SAX-related reason.
   */
  protected PrettyPrintXml( Builder builder ) throws ParserConfigurationException, SAXException
  {
    inputStream = builder.inputStream;
    tabWidth    = builder.tabWidth;
    parser      = factory.newSAXParser();
    handler     = new Handler();

    // Hand the requested indentation unit to the handler; without this call the
    // builder's tabWidth option is silently ignored.
    if( tabWidth != null )
      handler.setTab( tabWidth );
  }

  /**
   * Execute the parsing and generate the output. The input stream is consumed by
   * this call.
   *
   * @return the handler's accumulated output as a string.
   * @throws IOException on failure to read the input stream.
   * @throws SAXException on failure to parse the XML.
   */
  public String parse() throws IOException, SAXException
  {
    parser.parse( inputStream, handler );
    return handler.getOutput().toString();
  }

  /** Fluent builder gathering the input and the indentation unit. */
  public static class Builder
  {
    private InputStream inputStream;
    private String      tabWidth = " ";

    /**
     * Uses the given XML text as input. The text is encoded as UTF-8 rather than
     * with the platform charset so that parsing does not depend on the machine
     * it runs on.
     */
    public Builder inputStream( final String input )
    {
      this.inputStream = new ByteArrayInputStream( input.getBytes( StandardCharsets.UTF_8 ) );
      return this;
    }

    /** Uses the given bytes, as they are, as input. */
    public Builder inputStream( final byte[] input )
    {
      this.inputStream = new ByteArrayInputStream( input );
      return this;
    }

    /** Uses the given stream as input. */
    public Builder inputStream( InputStream inputStream )
    {
      this.inputStream = inputStream;
      return this;
    }

    /**
     * Sets the string emitted once per nesting level. It should consist of space
     * characters: HandlerUtilities.countLeadingSpaces(), used when the handler
     * post-processes its output, counts only spaces.
     */
    public Builder tabWidth( String tabWidth )
    {
      this.tabWidth = tabWidth;
      return this;
    }

    /**
     * Builds the pretty printer.
     *
     * @throws ParserConfigurationException if no SAX parser can be created.
     * @throws SAXException if parser creation fails for a SAX-related reason.
     */
    public PrettyPrintXml build() throws ParserConfigurationException, SAXException
    {
      return new PrettyPrintXml( this );
    }
  }
}
package com.windofkeltia.prettyprint;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import com.windofkeltia.utilities.StringUtilities;
/**
 * This SAX-based printer requires no stack because it prints as it goes: every
 * opening tag, closing tag and non-blank run of text is written on a line of its
 * own, indented by one tab string per nesting level. When the document ends, the
 * accumulated output is post-processed to fold empty elements (see
 * {@link #endDocument()}).
 *
 * Character data is written as the parser delivers it (trimmed); markup
 * characters in it are not re-escaped by this class.
 */
@SuppressWarnings( { "DanglingJavadoc" } )
public class Handler extends DefaultHandler
{
  private static final Logger logger = LoggerFactory.getLogger( Handler.class );

  private int                   indentation = 0;   // indentation is a count of tabs
  private String                tab         = " "; // string emitted once per indentation level
  private ByteArrayOutputStream output      = new ByteArrayOutputStream();

  /** Replaces the string emitted once per indentation level. */
  public void setTab( final String tab ) { this.tab = tab; }

  /** Returns the buffer holding everything printed so far. */
  public ByteArrayOutputStream getOutput() { return output; }

  protected Handler() { super(); }

  @Override
  public void startDocument() { }

  /** Prints the opening tag with its attributes, then indents what follows. */
  @Override
  public void startElement( String uri, String localName, String elementName, Attributes attributes )
  {
    write( indent() + '<' + elementName + attributesAsString( attributes ) + ">\n" );
    indentation++;
  }

  /** Outdents, then prints the closing tag. */
  @Override
  public void endElement( String uri, String localName, String elementName )
  {
    indentation--;
    write( indent() + "</" + elementName + ">\n" );
  }

  /** Prints a run of text on its own line unless it is blank once trimmed. */
  @Override
  public void characters( char[] ch, int start, int length )
  {
    String characters = new String( ch, start, length ).trim();
    if( !StringUtilities.isEmpty( characters ) )
      doCharacters( characters );
  }

  /**
   * Prints comment text like ordinary character data.
   * NOTE(review): DefaultHandler declares no comment() callback and this class
   * does not implement org.xml.sax.ext.LexicalHandler, so the parser presumably
   * never calls this -- confirm before relying on it.
   */
  public void comment( char[] ch, int start, int length )
  {
    String comment = new String( ch, start, length );
    if( !StringUtilities.isEmpty( comment ) )
      doCharacters( comment );
  }

  /** Writes the trimmed input on a line of its own at the current indentation. */
  public void doCharacters( String input )
  {
    write( indent() + input.trim() + '\n' );
  }

  /** Builds the indentation prefix for the current nesting level. */
  private String indent()
  {
    StringBuilder sb = new StringBuilder();
    for( int level = 0; level < indentation; level++ )
      sb.append( tab );
    return sb.toString();
  }

  private void write( byte[] bytes )
  {
    try
    {
      output.write( bytes );
    }
    catch( IOException e )
    {
      throw new RuntimeException( e );
    }
  }

  private void write( String string )
  {
    // Platform charset on purpose: endDocument() reads the buffer back with a
    // default-charset reader, so both sides must agree.
    write( string.getBytes() );
  }

  /**
   * Do some post-processing of this handler's output to clean up stuff like:
   *
   * <pre>{@code
   *   <!-- a change here: -->
   *   <realmCode code="US">      becomes      <realmCode code="US" />
   *   </realmCode>
   * }</pre>
   *
   * But, don't go wild and clean by folding the last two lines here:
   *
   * <pre>{@code
   *   <!-- no change here: -->
   *   <csmk:div csmk:class="demographic" npmk:nlp="off">
   *     <csmk:div csmk:class="extended_data" npmk:nlp="off">
   *       full_name: BEITEL, ELISE
   *     </csmk:div>
   *   </csmk:div>
   * }</pre>
   *
   * This is best effort: if folding fails, the unfolded output is kept whole.
   */
  @Override
  public void endDocument()
  {
    try
    {
      output = foldEmptyElements( output.toByteArray() );
    }
    catch( IOException e )
    {
      logger.warn( "Failed stream I/O: {}", e.getMessage(), e );
    }
    catch( RuntimeException e )
    {
      logger.warn( "Stopped I/O, failed tokenization and/or line folding: {}", e.getMessage(), e );
    }
  }

  /**
   * Copies the printed lines into a fresh buffer, replacing every opening tag
   * that is immediately followed by its own closing tag with one self-closing
   * tag that keeps the attributes.
   */
  private static ByteArrayOutputStream foldEmptyElements( byte[] unfolded ) throws IOException
  {
    ByteArrayOutputStream folded = new ByteArrayOutputStream( unfolded.length );
    BufferedReader        reader = new BufferedReader(
                                       new InputStreamReader( new ByteArrayInputStream( unfolded ) ) );

    String current = reader.readLine();
    while( nonNull( current ) )
    {
      String next = reader.readLine();

      if( nonNull( next ) && isEmptyElement( current, next ) )
      {
        // turn the trailing '>' into " />" and drop the closing-tag line...
        current = current.substring( 0, current.length() - 1 ) + " />";
        next    = reader.readLine();
      }

      folded.write( ( current + '\n' ).getBytes() );
      current = next;
    }

    return folded;
  }

  /**
   * Tells whether the first line is an opening tag and the second line, at the
   * same indentation, is the closing tag of that very element.
   */
  private static boolean isEmptyElement( String opening, String closing )
  {
    if( HandlerUtilities.countLeadingSpaces( opening ) != HandlerUtilities.countLeadingSpaces( closing ) )
      return false;
    if( !HandlerUtilities.startWithOpeningElement( opening, closing ) )
      return false;

    String first = opening.trim();
    if( first.startsWith( "</" ) || !first.endsWith( ">" ) )
      return false;                     // a closing tag (or not a tag) cannot be folded

    List< String > tokens = HandlerUtilities.tokenize( first );
    if( tokens.isEmpty() )
      return false;

    return closing.trim().equals( "</" + tokens.get( 0 ) + ">" );
  }

  /**
   * Format XML element attributes for concatenating to their element name: each
   * one is preceded by a single space and written as name="value", in the order
   * the parser reports them. Values pass through StringUtilities.smash().
   * If we had uri (namespaces) defined, we'd have to get a lot more serious
   * about how to use uri, localName and qName.
   */
  private String attributesAsString( Attributes saxAttributes )
  {
    StringBuilder sb         = new StringBuilder();
    int           attrLength = saxAttributes.getLength();

    for( int attr = 0; attr < attrLength; attr++ )
      sb.append( ' ' )
        .append( saxAttributes.getQName( attr ) )
        .append( "=\"" )
        .append( StringUtilities.smash( saxAttributes.getValue( attr ) ) )
        .append( '"' );

    return sb.toString();
  }
}
package com.windofkeltia.prettyprint;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
/**
 * Line-oriented helpers tailored to the text layout produced by the
 * pretty-printer's SAX handler: one tag or text run per line, indented
 * with spaces.
 */
public class HandlerUtilities
{
  /** Number of tokens the callers are interested in; not referenced within this class. */
  private static final int    MAXTOKENCOUNT  = 2;
  /** Characters that separate tokens: space plus the XML punctuation '<', '>' and '/'. */
  private static final String XML_DELIMITERS = " <>/";

  /**
   * Splits a line on spaces, angle brackets and slashes. The delimiters
   * themselves are discarded, so for a line holding a tag the first token
   * is the element name. This is what lets a caller recognize an opening
   * element with no characters (text) followed by its closing element:
   *
   *   <realmCode code="US">
   *   </realmCode>
   *
   * Both lines yield "realmCode" as their first token.
   *
   * @param line text to split.
   * @return every token found, in order; empty if the line holds only delimiters.
   */
  public static List< String > tokenize( String line )
  {
    StringTokenizer pieces = new StringTokenizer( line, XML_DELIMITERS );
    List< String >  result = new ArrayList<>();

    for( int remaining = pieces.countTokens(); remaining > 0; remaining-- )
      result.add( pieces.nextToken() );

    return result;
  }

  /**
   * Counts the space characters at the very start of a line. Comparing
   * these counts keeps two consecutive lines from being taken for a match
   * merely because they look alike once trimmed, when in fact they sit at
   * different indentation levels. E.g.:
   *
   *   <csmk:div csmk:class="demographic" npmk:nlp="off">
   *     <csmk:div csmk:class="extended_data" npmk:nlp="off">
   *       full_name: BEITEL, ELISE
   *     </csmk:div>
   *   </csmk:div>
   *
   * @param line text to examine.
   * @return how many ' ' characters precede the first other character.
   */
  public static int countLeadingSpaces( String line )
  {
    final int length   = line.length();
    int       position = 0;

    while( position < length && line.charAt( position ) == ' ' )
      position++;

    return position;
  }

  /** Returns a string of as many spaces as the line has leading spaces. */
  public static String indent( String line )
  {
    // the leading run consists of spaces only, so cutting it out of the
    // line yields exactly the indentation string...
    return line.substring( 0, countLeadingSpaces( line ) );
  }

  /** Tells whether both lines, once trimmed, begin with '<'. */
  public static boolean startWithOpeningElement( String line1, String line2 )
  {
    final String first  = line1.trim();
    final String second = line2.trim();

    if( !first.startsWith( "<" ) )
      return false;

    return second.startsWith( "<" );
  }
}
package com.windofkeltia.prettyprint;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
/**
 * Exercises the pretty printer on a few sample documents. Each test prints the
 * result for eyeballing and also checks that the output opens and closes with
 * the root element, so that a broken printer makes the test fail instead of
 * passing silently.
 */
public class PrettyPrintXmlTest
{
  /** A CDA-like document wrapped in other elements, with self-closing elements to fold. */
  @Test
  public void test() throws ParserConfigurationException, SAXException, IOException
  {
    final String CCD_BODY = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        + "<record>\n"
        + " <justforfun>\n"
        + " ...\n"
        + " </justforfun>\n"
        + " <ccdmessage>\n"
        + "<ClinicalDocument xmlns=\"urn:hl7-org:v3\" xmlns:sdtc=\"urn:hl7-org:sdtc\"\n"
        + " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
        + " xsi:schemaLocation=\"urn:hl7-org:v3\n"
        + " http://xreg2.nist.gov:8080/hitspValidation/schema/cdar2c32/infrastructure/cda/C32_CDA.xsd\">\n"
        + " <realmCode code=\"US\"/>\n"
        + " <typeId root=\"2.16.840.1.113883.1.3\" extension=\"POCD_HD000040\"/>\n"
        + " <component>\n"
        + " <stuff>\n"
        + " ...\n"
        + " </stuff>\n"
        + " </component>\n"
        + "</ClinicalDocument>\n"
        + " </ccdmessage>\n"
        + "</record>\n";
    PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( CCD_BODY ).build();
    String output = prettyPrintXml.parse();
    System.out.println( output );
    assertWrappedIn( "record", output );
  }

  /** Prefixed names whose namespaces are never declared must not stop the printer. */
  @Test
  public void testWithNameSpaces() throws ParserConfigurationException, SAXException, IOException
  {
    final String NAMESPACE_CONTENT = ""
        + "<record>\n"
        + " <csmk:div csmk:class=\"demographic\" npmk:nlp=\"off\">\n"
        + " <csmk:div csmk:class=\"extended_data\" npmk:nlp=\"off\">\n"
        + " full_name: BEITEL, ELISE\n"
        + " </csmk:div>\n"
        + " </csmk:div>\n"
        + "</record>\n";
    PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( NAMESPACE_CONTENT ).build();
    String output = prettyPrintXml.parse();
    System.out.println( output );
    assertWrappedIn( "record", output );
  }

  /** Mixed content: text both before and after an empty child element. */
  @Test
  public void testWithAdditionalWiggle() throws ParserConfigurationException, SAXException, IOException
  {
    final String WIGGLE = ""
        + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        + "<test>test\n"
        + " <br />test\n"
        + "</test>\n"
        + "";
    PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( WIGGLE ).build();
    String output = prettyPrintXml.parse();
    System.out.println( output );
    assertWrappedIn( "test", output );
  }

  /**
   * Fails the calling test unless the output starts with the root element's
   * opening tag on a line of its own and ends with its closing tag line.
   */
  private static void assertWrappedIn( String rootElement, String output )
  {
    if( output == null )
      throw new AssertionError( "Pretty printer returned no output" );

    String opening = "<" + rootElement + ">\n";
    String closing = "</" + rootElement + ">\n";

    if( !output.startsWith( opening ) )
      throw new AssertionError( "Output does not begin with <" + rootElement + ">:\n" + output );
    if( !output.endsWith( closing ) )
      throw new AssertionError( "Output does not end with </" + rootElement + ">:\n" + output );
  }
}