001    package ca.uhn.hl7v2.preparser;
002    
003    import java.io.IOException;
004    import java.util.ArrayList;
005    import java.util.Collection;
006    import java.util.Iterator;
007    import java.util.List;
008    import java.util.Map;
009    import java.util.Properties;
010    import java.util.SortedMap;
011    import java.util.TreeMap;
012    
013    import javax.xml.parsers.ParserConfigurationException;
014    import javax.xml.parsers.SAXParser;
015    import javax.xml.parsers.SAXParserFactory;
016    
017    import org.xml.sax.Attributes;
018    import org.xml.sax.InputSource;
019    import org.xml.sax.SAXException;
020    import org.xml.sax.SAXParseException;
021    import org.xml.sax.helpers.DefaultHandler;
022    
023    import ca.uhn.hl7v2.HL7Exception;
024    
025    public class XML
026    {
027            @SuppressWarnings("serial")
028            protected static class StopParsingException extends SAXException
029            {
030                    public StopParsingException() 
031                    {
032                            super("ca.uhn.hl7.....StopParsingException");
033                    }
034            }
035    
036            /** the SAXParser reports parsing events to an object of this class.
037            We keep track of some parsing state, and the Properties object that 
038            we're supposed to write our data to.
039            */
040            static protected class HL7MessageHandler extends DefaultHandler 
041            {
042                    /* m_props & m_msgMask should be set by the user of this handler before
043                    they pass this handler to SAXParser.parse() or whatever */
044    
045                    /** The data that is found while parsing, and which passes m_msgMask, 
046                    will be dumped to m_props, as (DatumPath.toString() / text) key/value
047                    pairs */
048                    public Properties m_props = null;
049    
050                    /** Specifies what parts of a message should be dumped to m_props. 
051                    */
052                    public Collection<DatumPath> m_msgMask = null;
053    
054                    /* All other fields are parser state. */
055    
056                    protected boolean m_startedDocument = false;
057    
058                    /* m_msgID / m_curPath together keep track of where we are in the document.
059    
060                    If m_msgID.length() != 0, then we're within the message element.  (We're only
061                    expecting one message per document.)  Then m_msgID will be the name of the 
062                    message.  ("ACK" or whatever).  
063    
064                    m_curPath keeps track of where within the message we are.  See notes at 
065                    DatumPath class definition.  If m_curPath.size() != 0, then we must be 
066                    within a message.
067    
068                    At any point in the code below: 
069    
070                    if m_msgID.length() == 0, 
071                            then m_curPath().size() == 0
072    
073                    if m_curPath.length()  != 0
074                            then m_msgID.length() != 0
075                    
076                    Note that our DatumPaths count indices starting from 0 (not 1) -- they're 
077                    only converted to 1-based in the string representations that wind up 
078                    as m_props keys.
079                    */
080                    StringBuffer m_msgID = new StringBuffer();
081                    DatumPath m_curPath = new DatumPath();
082    
083                    /* the location in the document of the last datum we dumped to m_props. */
084                    DatumPath m_lastDumpedPath = new DatumPath();
085    
086                    /** For handling repeat segments.   segmentID (String) -> next repeat idx
087                    (Integer).  So when we hit a segment ZYX, we'll know how many times we've
088                    hit a ZYX before, and set the segmentRepIdx part of m_curPath
089                    appropriately. */
090                    SortedMap<String, Integer> m_segmentId2nextRepIdx = new TreeMap<String, Integer>();
091    
092                    /* m_depthWithinUselessElement and m_depthWithinUsefulElement 
093                    reflect what m_msgMask thinks about our location in the document at any
094                    given time.  
095    
096                    Both should always be >= -1.  Note that both can be >= 0 at the same time
097                    -- explained in a minute....
098    
099                    If m_depthWithinUsefulElement >= 0, this means that we are however deep
100                    (in terms of nested elements: 0 => just within) within an area of the
101                    message that passes m_msgMask.  We should should dump whatever we find
102                    there to m_props.  As we move around within such an element, we will still
103                    update m_curPath appropriately.
104    
105                    If m_depthWithinUsefulElement >= 0, we are however deep within an element
106                    which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>
107                    -- a few other things maybe), or more importantly that we're within an
108                    element that otherwise has no hope of having any useful elements within it
109                    according to m_msgMask.  (eg. m_msgMask says it wants only ZYX segment
110                    contents, we're in an <MSH>).  So we can safely ignore all content within,
111                    and just keep track of how deep we are within this useless element (with
112                    m_depthWithinUselessElement, of course.)  We don't update m_curPath when
113                    m_depthWithinUselessElement >= 0, there's no point and how would we
114                    extract information for the DatumPath out of nonsensical element names
115                    anyway.
116    
117                    If they are both >= 0, this means that there we've found some useless
118                    stuff (nonsensical element names?) within a known-useful element.
119                    */
120                    int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;
121    
122                    /* With this we keep the text that we've found within a certain element.
123                    It's cleared whenever we enter a (sub) element or leave an element. */
124                    StringBuffer m_chars = new StringBuffer(10);
125    
126                    public HL7MessageHandler()
127                    {
128                            this.clear();
129                    }
130    
131                    void clear()
132                    {
133                            // reset the state (m_props & m_msgMask are not state)
134                            m_startedDocument = false;
135                            m_msgID.delete(0, m_msgID.length());
136                            m_curPath.clear();
137                            // will always be "less than" (according to DatumPath.numbersLessThan)
138                            // any sensible DatumPath: 
139                            m_lastDumpedPath.clear().add(new String()).add(-42).add(-42).add(-42).add(-42).add(-42);
140                            m_segmentId2nextRepIdx.clear();
141                            m_depthWithinUsefulElement = -1;
142                            m_depthWithinUselessElement = -1;
143                            m_chars.delete(0, m_chars.length());
144                    }
145    
146                    public void startDocument() throws SAXException
147                    {
148                            boolean ok = false;
149                            if(!m_startedDocument && (m_props != null)) {
150                                    m_startedDocument = true;
151                                    ok = true;
152                            }
153    
154                            if(!ok) {
155                                    clear();
156                                    throw new StopParsingException();
157                            }
158                    }
159    
160                    public void endDocument() throws SAXException
161                    {
162                            boolean ok = false;
163                            if(m_startedDocument) {
164                                    this.clear();
165                                    ok = true;
166                            }
167    
168                            if(!ok) {
169                                    clear();
170                                    throw new StopParsingException();
171                            }
172                    }
173    
174                    public void startElement(String uri, String localName, String qName, 
175                                    Attributes attributes) throws SAXException 
176                    {
177                            //System.err.println("startelem: " + qName + " curpathsize; " +
178                            //m_curPath.size());
179                            boolean ok = false;
180                            if(m_startedDocument) {
181                                    // A single unit of text data will be within a single element, 
182                                    // -- none of it will be in sub-elements and there will be no 
183                                    // sub-elements fragmenting the data text.
184                                    // Right now we're entering a new element: this means that anything
185                                    // in m_chars will be whitespace (likely), or text left over from, 
186                                    // say, the last field, or text that was somewhere it shouldn't have been.
187                                    // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>"
188                                    m_chars.delete(0, m_chars.length());
189    
190                                    if(m_depthWithinUselessElement >= 0) {
191                                            ++m_depthWithinUselessElement;
192                                    }
193                                    else {
194                                            int oldCurPathSize = m_curPath.size();
195                                            if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath, 
196                                                    m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) 
197                                            {
198                                                    if(m_curPath.size() > oldCurPathSize) {
199                                                            // assert (m_depthWithinUselessElement == -1) // m_curPath
200                                                            // should not have grown if we're within a useless element.
201                                                            if(m_depthWithinUsefulElement == -1) {
202                                                                    // this new element could match one of the DatumPaths in
203                                                                    // m_msgMask -- if that's the case, we've just entered a
204                                                                    // useful element.
205                                                                    // TODO: functional stylee (a la C++'s std::accumulate) ? 
206                                                                    boolean curPathStartsWithAMaskElem = false;
207                                                                    for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
208                                                                            !curPathStartsWithAMaskElem && maskIt.hasNext(); )
209                                                                    {
210                                                                            curPathStartsWithAMaskElem 
211                                                                                    = m_curPath.startsWith(maskIt.next());
212                                                                    }
213    
214                                                                    if(curPathStartsWithAMaskElem) 
215                                                                            m_depthWithinUsefulElement = 0;
216                                                                    else {
217                                                                            // so this element we're entering is not specified by m_msgMask
218                                                                            // to be useful -- but might it contains elements that
219                                                                            // are?
220                                                                            boolean aMaskElemStartsWithCurPath = false;
221                                                                            for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
222                                                                                    !aMaskElemStartsWithCurPath && maskIt.hasNext(); )
223                                                                            {
224                                                                                    aMaskElemStartsWithCurPath 
225                                                                                            = maskIt.next().startsWith(m_curPath);
226                                                                            }
227    
228                                                                            if(!aMaskElemStartsWithCurPath) {
229                                                                                    // ... nope!  useless.
230                                                                                    m_depthWithinUselessElement = 0;
231                                                                                    m_curPath.setSize(oldCurPathSize);
232                                                                            } // else => ok, carry on, m_depthWithinUse{less,ful}Element
233                                                                            // still both -1.
234                                                                    }
235                                                            }
236                                                            // else => already within a useful element, don't need to compare 
237                                                            // against m_msgMask.
238                                                    }
239                                            }
240                                            else
241                                                    m_depthWithinUselessElement = 0;
242                                    }
243                                    ok = true;
244                            }
245    
246                            if(!ok) {
247                                    clear();
248                                    throw new StopParsingException();
249                            }
250                    }
251    
252                    /* doc location == msgID & curPath together.  
253                    If we've encountered an element called "elementNam", then this tries 
254                    to determine what it is, based on what we already know about the document.
255                    returns true if we can make sense of this new element name given the
256                    position we're at (represented by msgID / curPath), 
257                    false if we can't (which probably means this should be a useless element). 
258                    returning true doesn't mean that we actually changed msgID or curPath, it
259                    might mean that we just passed through a segment group element OK.
260                    */
261                    protected static boolean tryToGrowDocLocationFromElementName(
262                            StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/, 
263                            Map<String, Integer> segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/, 
264                            String elementName /*in*/)
265                    {
266                            boolean ok = false; // ok == can we make sense of this new element?
267                            // hmm ... where are we in the document: 
268                            if((msgID.length() == 0) && (curPath.size() == 0)) {
269                                    // we're entering a message
270                                    msgID.replace(0, msgID.length(), elementName);
271                                    segmentId2nextRepIdx.clear();
272                                    ok = true;
273                            }
274                            else if((msgID.length() > 0) && (curPath.size() == 0)) {
275                                    // we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>)
276                                    // or an actual segment element.
277                                    if(!(elementName.startsWith("" + msgID + '.'))) {
278                                            // must be an actual segment.
279                                            curPath.add(elementName);
280    
281                                            if(segmentId2nextRepIdx.containsKey(elementName)) 
282                                                    curPath.add(segmentId2nextRepIdx.get(elementName));
283                                            else
284                                                    curPath.add(new Integer(0));
285    
286                                            segmentId2nextRepIdx.put(elementName, ((Integer)curPath.get(curPath.size()-1)).intValue() + 1);
287                                    }
288                                    ok = true;
289                            }
290                            else if((msgID.length() > 0) && (curPath.size() > 0)) {
291                                    // we're entering a field or a component or a subcomponent.
292                                    if(curPath.size() == 2) { // we're entering a field element
293                                            // all fields should start with segment-ID + '.' 
294                                            if(elementName.startsWith("" + curPath.get(0) + '.')) {
295                                                    try {
296                                                            int fieldIdxFromElementName 
297                                                                    = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
298    
299                                                            curPath.add(new Integer(fieldIdxFromElementName));
300    
301                                                            // now add the repetition idx to curPath: 
302                                                            if((lastDumpedPath.size() >= 4) 
303                                                                    && (((Integer)lastDumpedPath.get(2)).intValue() 
304                                                                            == fieldIdxFromElementName))
305                                                            {
306                                                                    // lastDumpedPath has a fieldIdx and a fieldRepIdx.
307                                                                    curPath.add(new Integer(((Integer)lastDumpedPath.get(3)).intValue() + 1));
308                                                            }
309                                                            else
310                                                                    curPath.add(new Integer(0));
311    
312                                                            ok = true;
313                                                    } catch(NumberFormatException e) {}
314                                            } // else => this isn't a field -- must be useless.
315                                    }
316                                    else if((curPath.size() == 4) || (curPath.size() == 5)) {
317                                            // we're entering a component or subcomponent element
318                                            try {
319                                                    int idxFromElementName 
320                                                            = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
321                                                    curPath.add(new Integer(idxFromElementName));
322                                                    ok = true;
323                                            } catch(NumberFormatException e) {}
324                                    }
325                            }
326                            return ok;
327                    }
328    
329                    public void endElement(String uri, String localName, String qName) 
330                            throws SAXException 
331                    {
332                            //System.err.println("endElement: " + qName);
333                            boolean ok = false;
334                            if(m_startedDocument) {
335                                    if(m_depthWithinUselessElement >= 0) {
336                                            --m_depthWithinUselessElement;
337                                            ok = true;
338                                    }
339                                    else {
340                                            if((m_msgID.length() > 0) && (m_curPath.size() == 0)) {
341                                                    // we're exiting either a message element or a 
342                                                    // segment group element.
343                                                    if((""+qName).compareTo(""+m_msgID) == 0)
344                                                            m_msgID.delete(0, m_msgID.length()); // => exiting message element
345                                                    // else => segment group element -- do nothing.
346    
347                                                    ok = true;
348                                            }
349                                            else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) {
350                                                    tryToDumpDataToProps();
351    
352                                                    if(m_curPath.size() == 2) {
353                                                            // exiting a segment element
354                                                            m_curPath.setSize(0);
355                                                            ok = true;
356                                                    }
357                                                    else if(m_curPath.size() == 4) {
358                                                            // exiting a field element 
359                                                            m_curPath.setSize(2);
360                                                            ok = true;
361                                                    }
362                                                    else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) {
363                                                            // exiting a component or a subcomponent
364                                                            m_curPath.setSize(m_curPath.size() - 1);
365                                                            ok = true;
366                                                    }
367                                            }
368    
369                                            if(m_depthWithinUsefulElement >= 0) 
370                                                    --m_depthWithinUsefulElement;
371                                    }
372                            }
373    
374                            if(!ok) {
375                                    clear();
376                                    throw new StopParsingException();
377                            }
378                    }
379    
380                    /** try to dump whatever we've got in m_chars to m_props, 
381                    with a key of m_curPath.toString(). 
382                    */
383                    protected void tryToDumpDataToProps()
384                    {
385                            if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) {
386                                    /* m_curPath.toString() will be the property key whose value will be
387                                    m_chars.
388    
389                                    This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9>
390                                    <PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something
391                                    like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element.  (note: internal
392                                    DatumPath elements are 0-indexed, string representations of DatumPaths and
393                                    the XML text is 1-indexed.)  So in m_props the key for "P" would have been
394                                    "ZYX[0]-9[0]-1-1".  (the last "-1" is a default that got added by
395                                    toString()).
396                                    
397                                    Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0,
398                                    9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when
399                                    exiting the ZYX.9 element, we might have written that whitespace to m_props
400                                    with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1":
401                                    the same as the key for the "P" ... clobbering "P" in m_props with
402                                    whitespace.
403    
404                                    But since we know that HL7 fields / components / etc are always in order
405                                    (numerically), we can count on m_lastDumpedPath and use
406                                    DatumPath.numbersLessThan to avoid the clobbering.
407                                    */
408                                    if((m_lastDumpedPath.get(0).equals(m_curPath.get(0))) 
409                                                    ? (m_lastDumpedPath.numbersLessThan(m_curPath)) 
410                                                    : true)
411                                    {
412                                            if(m_depthWithinUsefulElement >= 0) {
413                                                    // TODO: remove!  or assert 
414                                                    if(m_props.containsKey("" + m_curPath)) 
415                                                            System.err.println("ALAAAARM: CLOBBERING PROPERTY in " + getClass());
416    
417                                                    m_props.setProperty("" + m_curPath, "" + m_chars);
418                                                    m_lastDumpedPath.copy(m_curPath);
419                                                    m_chars.delete(0, m_chars.length());
420                                            }
421                                    }
422                            }
423                    }
424    
425                    public void characters(char[] chars, int start, int length)
426                    {
427                            // note that a contiguous run of characters in the document 
428                            // might get reported to us in several chunks. 
429                            // (In the order that the text appears in the document, 
430                            // non-overlapping and with no gaps between chunks.) 
431                            // An entity like &amp; will reach us as an actual & character.
432                            
433                            if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
434                                    m_chars.append(chars, start, length);
435                            }
436                    }
437    
438                    public void ignoreableWhitespace(char []chars, int start, int length)
439                    {
440                            // it's unclear which whitespace is considered ignorable for us.  
441                            // what the heck, add it to m_chars. 
442                            characters(chars, start, length);
443                    }
444    
445                    public void error(SAXParseException e)
446                    {
447                            // TODO: remove.
448                            System.err.println("Error in " + getClass() + ": " + e);
449                    }
450    
451                    public void fatalError(SAXParseException e) throws SAXException 
452                    {
453                            throw e;
454                    }
455            }
456    
457            /** parse message according to our HL7 XML handler, and dump the data found
458            to props.  
459            
460            returns true if we parsed ok, which means well-formed XML, and
461            that's about it.  We just barely check against HL7 structure, and ignore any
462            elements / text that is unexpected (that is, impossible in any HL7 message:
463            independent of any message / segment definitions).
464    
465            "message" should be an XML document with one top-level element -- that being
466            the message.  (<ACK> or whatever).  We're only expecting one message to be in
467            "message".
468    
469            props can be null if you don't want the data (we still parse).  The message
470            data found in message (that passes msgMask) will be added to props as key /
471            value pairs with the key a toString() of the appropriate DatumPath for the
472            location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and
473            the value the corresponding text.  So, after calling parseMessage
474            successfully, if you wanted to retrieve the message data from props you
475            might call something like 
476            props.getProperty((new DatumPath()).add("MSH").add(1).toString())
477            and that would return a String with "|", probably.
478    
479            Note that this package facilitates the extraction of message data in a way
480            independent of message version (i.e. components and whatever getting added):
481    
482            With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>",
483            "ZYX[0]-42[0]-1-1" will be the key that ends up in props (see notes at
484            DatumPath.toString())
485    
486            So if you, coding for a future version of the FOO message but
487            receiving old-version message data, tried
488            props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString()) 
489            with the message above (that is, trying to extract a repetition and
490            component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to 
491            "fieldy-field-field" in the resulting props.  
492    
493            If the message was
494            "<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>"
495            and you, coding for an old version of this FOO message but receiving
496            new-version FOO message data, tried 
497            props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())
498            you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting 
499            props.
500    
501            msgMask lets you specify which parts of the message you want dumped to props.
502            Passing in null gets you everything.  Otherwise, msgMask's elements should
503            all be DatumPaths (! => ClassCastException), and a particular part of the
504            message will be dumped to props only if its location, as represented by a
505            DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of
506            msgMask.  So if one element of msgMask was a (new DatumPath()).add(new
507            String("ZYX")), then everything in all ZYX segments would get dumped to props.
508            A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first
509            repetitions of same (if there is one) dumped to props.  etc. etc.  Note that
510            a DatumPath of size() == 0 in msgMask will get you everything, no matter what
511            the other elements of msgMask are, because all DatumPaths startsWith the
512            zero-length DatumPath.
513    
514            Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they
515            aren't addressed in msgMask or in the output in props -- basically any
516            element tags at the level immediately inside the message element, and having
517            a name that starts with the message element name + '.', are ignored (meaning
518            their contents are dealt with the same as if the start and end tags just
519            weren't there.)
520            */
521            public static boolean parseMessage(Properties props, String message, 
522                            Collection<DatumPath> msgMask) throws HL7Exception
523            {
524                    // Always returns true on success; any parser failure surfaces as HL7Exception.
525                    try {
526                            // HL7 XML usually arrives from another system, so treat it as untrusted:
527                            // refuse to resolve external entities (XXE) declared in a DOCTYPE.
528                            SAXParserFactory factory = SAXParserFactory.newInstance();
529                            factory.setFeature(
530                                    "http://xml.org/sax/features/external-general-entities", false);
531                            factory.setFeature(
532                                    "http://xml.org/sax/features/external-parameter-entities", false);
533                            SAXParser parser = factory.newSAXParser();
534                            HL7MessageHandler handler = new HL7MessageHandler();
535                            // The handler expects a non-null m_props even when the caller wants no data.
536                            handler.m_props = (props != null) ? props : new Properties();
537                            if (msgMask != null) {
538                                    handler.m_msgMask = msgMask;
539                            } else {
540                                    // null mask means "everything": every path startsWith the empty DatumPath.
541                                    handler.m_msgMask = new ArrayList<DatumPath>();
542                                    handler.m_msgMask.add(new DatumPath());
543                            }
544    
545                            InputSource inSrc = new InputSource(new java.io.StringReader(message));
546                            parser.parse(inSrc, handler);
547                            return true;
548                    } catch (ParserConfigurationException e) {
549                            throw new HL7Exception(e);
550                    } catch (IOException e) {
551                            throw new HL7Exception(e);
552                    } catch (SAXException e) { // also covers StopParsingException (a SAXException)
553                            throw new HL7Exception(e);
554                    }
555            }
556    
557            /** Demo: parses args[0] as an XML message and lists the kept data on System.err. */
558            public static void main(String args[]) 
559            {
560                    if (args.length < 1) {
561                            return; // no message argument: nothing to do
562                    }
563                    List<DatumPath> mask = new ArrayList<DatumPath>();
564                    mask.add(new DatumPath().add("MSH").add(0).add(9)); // keep only this path
565                    Properties extracted = new Properties();
566                    try {
567                            boolean ok = XML.parseMessage(extracted, args[0], mask);
568                            System.err.println("parseMessage returned " + ok);
569                    } catch (HL7Exception e) {
570                            e.printStackTrace();
571                    }
572                    extracted.list(System.err); // dump whatever was collected, even after a failure
573            }
574    }
575