001 package ca.uhn.hl7v2.util;
002
003 import ca.uhn.hl7v2.parser.*;
004 import ca.uhn.hl7v2.model.Message;
005 import ca.uhn.hl7v2.HL7Exception;
006 import java.util.regex.*;
007 import org.w3c.dom.*;
008 import org.xml.sax.SAXException;
009 import org.xml.sax.InputSource;
010 import org.apache.xml.serialize.*;
011 import org.apache.xerces.parsers.DOMParser;
012 import org.apache.xerces.parsers.StandardParserConfiguration;
013 import java.io.*;
014
015 /**
016 * Tools for testing message strings for semantic equivalence without assuming the correctness
017 * of parsers.
018 * @author Bryan Tripp
019 */
020 public class EncodedMessageComparator {
021
022 private static GenericParser parser = new GenericParser();
023
024 /**
025 * Returns a "standardized" equivalent of the given message string. For delimited
026 * messages, the returned value is the shortest string that has an equivalent
027 * meaning in HL7. For XML-encoded messages, the returned value is equivalent XML output
028 * using a standard pretty-print format. An automatic determination is made about whether
029 * the given string is XML or ER7 (i.e. traditionally) encoded.
030 * @param message an XML-encoded or ER7-encoded message string
031 */
032 public static String standardize(String message) throws SAXException {
033 String result = null;
034 String encoding = parser.getEncoding(message);
035 if (encoding.equals("XML")) {
036 result = standardizeXML(message);
037 } else {
038 result = standardizeER7(message);
039 }
040 return result;
041 }
042
043 /**
044 * Returns the shortest string that is semantically equivalent to a given ER7-encoded
045 * message string.
046 */
047 public static String standardizeER7(String message) {
048
049 //make delimiter sequences (must quote with \ if not alphanumeric; can't otherwise because of regexp rules)
050 char fieldDelimChar = message.charAt(3);
051 String fieldDelim = String.valueOf(fieldDelimChar);
052 if (!Character.isLetterOrDigit(fieldDelimChar)) fieldDelim = "\\" + fieldDelimChar;
053
054 char compSepChar = message.charAt(4);
055 String compSep = String.valueOf(compSepChar);
056 if (!Character.isLetterOrDigit(compSepChar)) compSep = "\\" + compSepChar;
057
058 char repSepChar = message.charAt(5);
059 String repSep = String.valueOf(repSepChar);
060 if (!Character.isLetterOrDigit(repSepChar)) repSep = "\\" + repSepChar;
061
062 char subSepChar = message.charAt(7);
063 String subSep = String.valueOf(subSepChar);
064 if (!Character.isLetterOrDigit(subSepChar)) subSep = "\\" + subSepChar;
065
066 //char space = ' ';
067
068 /* Things to strip (cumulative):
069 * - all delimiters and repetition separators before end line (i.e. end segment)
070 * - repetition separators, comp and subcomp delims before new field
071 * - subcomponent delimiters before new component
072 */
073 Pattern endSegment = Pattern.compile("[" + fieldDelim + compSep + repSep + subSep + "]*[\n\r]+");
074 message = endSegment.matcher(message).replaceAll("\r");
075
076 Pattern endField = Pattern.compile("[" + repSep + compSep + subSep + "]*" + fieldDelim);
077 message = endField.matcher(message).replaceAll(String.valueOf(fieldDelim));
078
079 Pattern endComp = Pattern.compile("[" + subSep + "]*" + compSep);
080 message = endComp.matcher(message).replaceAll(String.valueOf(compSep));
081
082 //Pattern endSub = Pattern.compile("[ ]*" + subSep);
083 //message = endSub.matcher(message).replaceAll(String.valueOf(subSep));
084
085 //handle special case of subcomp delim in encoding characters
086 message = message.substring(0, 7) + subSepChar + message.substring(7);
087
088 return message;
089 }
090
091 /**
092 * Returns a semantic equivalent of a given XML-encoded message in a default format.
093 * Attributes, comments, and processing instructions are not considered to change the
094 * HL7 meaning of the message, and are removed in the standardized representation.
095 */
096 public static String standardizeXML(String message) throws SAXException {
097 DOMParser parser = new DOMParser(new StandardParserConfiguration());
098 parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
099
100 Document doc = null;
101 StringWriter out = new StringWriter();
102 try {
103 synchronized (parser) {
104 parser.parse(new InputSource(new StringReader(message)));
105 doc = parser.getDocument();
106 }
107 clean(doc.getDocumentElement());
108 OutputFormat outputFormat = new OutputFormat("", null, true);
109 XMLSerializer ser = new XMLSerializer(out, outputFormat);
110 ser.serialize(doc);
111 } catch (IOException e) {
112 throw new RuntimeException("IOException doing IO to a string!!! " + e.getMessage());
113 }
114 return out.toString();
115 }
116
117 /** Removes attributes, comments, and processing instructions. */
118 private static void clean(Element elem) {
119 NodeList children = elem.getChildNodes();
120 for (int i = 0; i < children.getLength(); i++) {
121 Node child = children.item(i);
122 if (child.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE
123 || child.getNodeType() == Node.COMMENT_NODE)
124 {
125 elem.removeChild(child);
126 } else if (child.getNodeType() == Node.ELEMENT_NODE) {
127 clean((Element) child);
128 }
129 }
130
131 NamedNodeMap attributes = elem.getAttributes();
132 //get names
133 String[] names = new String[attributes.getLength()];
134 for (int i = 0; i < names.length; i++) {
135 names[i] = attributes.item(i).getNodeName();
136 }
137 //remove by name
138 for (int i = 0; i < names.length; i++) {
139 attributes.removeNamedItem(names[i]);
140 }
141 }
142
143 /**
144 * <p>Compares two HL7 messages to see if they are equivalent (in terms of their
145 * HL7 meaning). Semantically irrelevant differences (e.g. spaces in an XML tag;
146 * extra field delimiters at the end of a segment; XML vs. ER7 encoding; XML attributes)
147 * are ignored. This check is performed without assuming the correctness of the HAPI parsers,
148 * and can therefore be used to test them. This is done by parsing a message, encoding it
149 * again, and comparing the result with this original. </p>
150 * <p>If one message is in XML and the other in ER7, the former is converted to ER7 to
151 * perform the comparison. This process relies on the HAPI parsers. However, the
152 * parsed message is first encoded as XML and compared to the original, so that the
153 * integrity of the parser can be verified. An exception is thrown if this comparison
154 * is unsuccessful. </p>
155 * @return true if given messages are semantically equivalent
156 */
157 public static boolean equivalent(String message1, String message2) throws HL7Exception {
158 String encoding1 = parser.getEncoding(message1);
159 String encoding2 = parser.getEncoding(message2);
160
161 if (!encoding1.equals(encoding2)) {
162 if (encoding1.equals("XML")) {
163 message1 = safeER7Conversion(message1);
164 } else {
165 message2 = safeER7Conversion(message2);
166 }
167 }
168
169 String std1, std2;
170 try {
171 std1 = standardize(message1);
172 std2 = standardize(message2);
173 } catch (SAXException e) {
174 throw new HL7Exception("Equivalence check failed due to SAXException: " + e.getMessage());
175 }
176
177 return std1.equals(std2);
178 }
179
180 /**
181 * Converts XML message to ER7, first checking integrity of parse and throwing
182 * an exception if parse not correct
183 */
184 private static String safeER7Conversion(String xmlMessage) throws HL7Exception {
185 Message m = parser.parse(xmlMessage);
186
187 String check = parser.encode(m, "XML");
188 if (!equivalent(xmlMessage, check)) {
189 throw new HL7Exception("Parsed and encoded message not equivalent to original (possibilities: invalid message, bug in parser)");
190 }
191
192 return parser.encode(m, "VB");
193 }
194
195 /**
196 * Compares messages in two files
197 */
198 public static void main(String args[]) {
199
200 }
201
202 }