001/*
002 * (c) Copyright 2009 University of Bristol
003 * All rights reserved.
004 * [See end of file]
005 */
006package net.rootdev.javardfa;
007
008import net.rootdev.javardfa.uri.URIExtractor10;
009import net.rootdev.javardfa.uri.URIExtractor;
010import net.rootdev.javardfa.uri.IRIResolver;
011import net.rootdev.javardfa.literal.LiteralCollector;
012import java.util.Collection;
013import java.util.EnumSet;
014import java.util.Iterator;
015import java.util.LinkedList;
016import java.util.List;
017import java.util.Set;
018import javax.xml.namespace.QName;
019import javax.xml.stream.XMLEventFactory;
020import javax.xml.stream.XMLOutputFactory;
021import javax.xml.stream.XMLStreamException;
022import javax.xml.stream.events.Attribute;
023import javax.xml.stream.events.StartElement;
024import javax.xml.stream.events.XMLEvent;
025import org.xml.sax.Attributes;
026import org.xml.sax.ContentHandler;
027import org.xml.sax.ErrorHandler;
028import org.xml.sax.Locator;
029import org.xml.sax.SAXException;
030import org.xml.sax.SAXParseException;
031
032/**
033 * @author Damian Steer <pldms@mac.com>
034 */
035public class Parser implements ContentHandler, ErrorHandler {
036
    /** Factory used to turn SAX callbacks into StAX events. */
    private final XMLEventFactory eventFactory;
    /** Receiver of the triples, prefixes and base changes produced while parsing. */
    private final StatementSink sink;
    /** Currently enabled settings; the same set instance is handed to the extractor. */
    private final Set<Setting> settings;
    /** Collects element content as literals from the events forwarded to it. */
    private final LiteralCollector literalCollector;
    /** Extracts URI-valued attributes from start elements. */
    private final URIExtractor extractor;
042
043    public Parser(StatementSink sink) {
044        this(   sink,
045                XMLOutputFactory.newInstance(),
046                XMLEventFactory.newInstance(),
047                new URIExtractor10(new IRIResolver()));
048    }
049
050    public Parser(StatementSink sink,
051            XMLOutputFactory outputFactory,
052            XMLEventFactory eventFactory,
053            URIExtractor extractor) {
054        this.sink = sink;
055        this.eventFactory = eventFactory;
056        this.settings = EnumSet.noneOf(Setting.class);
057        this.extractor = extractor;
058        this.literalCollector = new LiteralCollector(this, eventFactory, outputFactory);
059
060        extractor.setSettings(settings);
061
062        // Important, although I guess the caller doesn't get total control
063        outputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true);
064    }
065    
    /**
     * Reports whether a setting is currently switched on.
     *
     * @param setting the setting to look up
     * @return {@code true} if {@code setting} is in the enabled set
     */
    public boolean isEnabled(Setting setting) {
        return settings.contains(setting);
    }
069    
    /**
     * Switches a setting on by adding it to the enabled set.
     *
     * @param setting the setting to enable
     */
    public void enable(Setting setting) {
        settings.add(setting);
    }
073
    /**
     * Switches a setting off by removing it from the enabled set.
     *
     * @param setting the setting to disable
     */
    public void disable(Setting setting) {
        settings.remove(setting);
    }
077    
078    public void setBase(String base) {
079        this.context = new EvalContext(base);
080        if (isEnabled(Setting.OnePointOne)) context.setPrefixes(Constants.CORE_DEFAULT_PREFIXES);
081        sink.setBase(context.getBase());
082    }
083
084    EvalContext parse(EvalContext context, StartElement element)
085            throws XMLStreamException {
086        String currentLanguage = context.language;
087        boolean inXHTML = Constants.xhtmlNS.equals(element.getName().getNamespaceURI());
088        
089        // Respect xml:base outside xhtml
090        if (element.getAttributeByName(Constants.xmlbaseNS) != null && !inXHTML) {
091            context.setBase(element.getAttributeByName(Constants.xmlbaseNS).getValue());
092            sink.setBase(context.getBase());
093        }
094        
095        if (Constants.base.equals(element.getName()) &&
096                element.getAttributeByName(Constants.href) != null) {
097            context.setBase(element.getAttributeByName(Constants.href).getValue());
098            sink.setBase(context.getBase());
099        }
100        
101        // The xml / html namespace matching is a bit ropey. I wonder if the html 5
102        // parser has a setting for this?
103        if (settings.contains(Setting.ManualNamespaces)) {
104            if (element.getAttributeByName(Constants.xmllang) != null) {
105                currentLanguage = element.getAttributeByName(Constants.xmllang).getValue();
106                if (currentLanguage.length() == 0) currentLanguage = null;
107            } else if (element.getAttributeByName(Constants.lang) != null) {
108                currentLanguage = element.getAttributeByName(Constants.lang).getValue();
109                if (currentLanguage.length() == 0) currentLanguage = null;
110            }
111        } else if (element.getAttributeByName(Constants.xmllangNS) != null) {
112            currentLanguage = element.getAttributeByName(Constants.xmllangNS).getValue();
113            if (currentLanguage.length() == 0) currentLanguage = null;
114        }
115        
116        if (settings.contains(Setting.OnePointOne)) {
117
118            if (element.getAttributeByName(Constants.vocab) != null) {
119                String vocab =
120                    element.getAttributeByName(Constants.vocab).getValue().trim();
121                // empty vocab removes default vocab
122                if (vocab.length() == 0) {
123                    context.vocab = null;
124                } else {
125                    context.vocab = vocab;
126                    emitTriples(context.base, Constants.rdfaUses, vocab);
127                }
128            }
129
130            if (element.getAttributeByName(Constants.prefix) != null) {
131                parsePrefixes(element.getAttributeByName(Constants.prefix).getValue(), context);
132            }
133        }
134        
135        String about = extractor.getURI(element, Constants.about, context);
136        String src = extractor.getURI(element, Constants.src, context);
137        String href = extractor.getURI(element, Constants.href, context);
138        String resource = extractor.getURI(element, Constants.resource, context);
139        String datatype = extractor.getURI(element, Constants.datatype, context);
140        Attribute contentAttr = element.getAttributeByName(Constants.content);
141        String content = (contentAttr == null) ? null : contentAttr.getValue();
142        
143        List<String> typeof = extractor.getURIs(element, Constants.typeof, context);
144        List<String> rel = extractor.getURIs(element, Constants.rel, context);
145        List<String> rev = extractor.getURIs(element, Constants.rev, context);
146        List<String> property = extractor.getURIs(element, Constants.property, context);
147        
148        if (settings.contains(Setting.OnePointOne)) {
149            return parse11(rev, rel, about, src, resource, href, context, inXHTML, 
150                    element, typeof, property, content, datatype, currentLanguage);
151        } else {
152            return parse10(rev, rel, about, src, resource, href, context, inXHTML,
153                    element, typeof, property, content, datatype, currentLanguage);
154        }
155    }
156
    /**
     * Element processing used by {@code parse} when {@link Setting#OnePointOne}
     * is not enabled. Works out the subject and object for the element, emits
     * type, property and rel/rev triples, completes triples left pending by the
     * parent context, and builds the context for the element's children.
     *
     * Arguments are the values already extracted by {@code parse}; list-valued
     * arguments and strings are {@code null} when treated as absent.
     *
     * @return a new context derived from {@code context}
     */
    private EvalContext parse10(List<String> rev, List<String> rel, String about, String src, String resource, String href, EvalContext context, boolean inXHTML, StartElement element, List<String> typeof, List<String> property, String content, String datatype, String currentLanguage) {
        boolean skipElement = false;
        String newSubject = null;
        String currentObject = null;
        // Predicates whose object is still unknown; handed to the child context.
        List<String> forwardProperties = new LinkedList();
        List<String> backwardProperties = new LinkedList();
        
        if (rev == null && rel == null) {
            // No rel/rev: any of about, src, resource, href may name the subject.
            newSubject = coalesce(about, src, resource, href);
            if (newSubject == null) {
                if (context.parent == null && !inXHTML) {
                    // Root element outside XHTML: subject is the base.
                    newSubject = context.base;
                } else if (Constants.body.equals(element.getName()) ||
                            Constants.head.equals(element.getName())) {
                    newSubject = context.base;
                } else if (typeof != null) {
                    newSubject = createBNode();
                } else {
                    if (context.parentObject != null) {
                        newSubject = context.parentObject;
                    }
                    // Nothing on this element produces triples of its own.
                    if (property == null) {
                        skipElement = true;
                    }
                }
            }
        } else {
            // rel/rev present: only about and src name the subject;
            // resource and href name the object.
            newSubject = coalesce(about, src);
            if (newSubject == null) {
                if (context.parent == null && !inXHTML) {
                    newSubject = context.base;
                } else if (Constants.head.equals(element.getName()) ||
                        Constants.body.equals(element.getName())) {
                    newSubject = context.base;
                } else if (typeof != null) {
                    newSubject = createBNode();
                } else if (context.parentObject != null) {
                    newSubject = context.parentObject;
                }
            }
            currentObject = coalesce(resource, href);
        }

        // One rdf:type triple per typeof value.
        if (newSubject != null && typeof != null) {
            for (String type : typeof) {
                emitTriples(newSubject,
                        Constants.rdfType,
                        type);
            }
        }

        // Dodgy extension: forms are typed, and a named input's object becomes
        // "?" followed by the input name.
        if (settings.contains(Setting.FormMode)) {
            if (Constants.form.equals(element.getName())) {
                emitTriples(newSubject, Constants.rdfType, "http://www.w3.org/1999/xhtml/vocab/#form"); // Signal entering form
            }
            if (Constants.input.equals(element.getName()) &&
                    element.getAttributeByName(Constants.name) != null) {
                currentObject = "?" + element.getAttributeByName(Constants.name).getValue();
            }

        }
        
        if (property != null) {
            
            if (content != null) { // The easy bit: the literal is the content attribute
                if (datatype == null || datatype.length() == 0) {
                    emitTriplesPlainLiteral(newSubject, property, content, currentLanguage);
                } else {
                    emitTriplesDatatypeLiteral(newSubject, property, content, datatype);
                }
            } else {
                // Literal comes from the element's content; the collector gathers it.
                literalCollector.collect(newSubject, property, datatype, currentLanguage);
            }
        }
        
        if (currentObject != null) {
            // Object known: rel/rev triples can be emitted now.
            if (element.getAttributeByName(Constants.rel) != null) {
                emitTriples(newSubject, rel, currentObject);
            }
            if (element.getAttributeByName(Constants.rev) != null) {
                emitTriples(currentObject, rev, newSubject);
            }
        } else {
            // Object unknown: remember the predicates for the child context.
            if (element.getAttributeByName(Constants.rel) != null) {
                forwardProperties.addAll(rel);
            }
            if (element.getAttributeByName(Constants.rev) != null) {
                backwardProperties.addAll(rev);
            }
            if (!forwardProperties.isEmpty() || !backwardProperties.isEmpty()) {
                // if predicate present
                currentObject = createBNode();
            }
        }

        // Complete the triples the parent context left pending.
        if (!skipElement && newSubject != null) {
            emitTriples(context.parentSubject,
                    context.forwardProperties,
                    newSubject);

            emitTriples(newSubject,
                    context.backwardProperties,
                    context.parentSubject);
        }

        EvalContext ec = new EvalContext(context);
        if (skipElement) {
            // Skipped element: only the language is set on the new context here.
            ec.language = currentLanguage;
        } else {
            if (newSubject != null) {
                ec.parentSubject = newSubject;
            } else {
                ec.parentSubject = context.parentSubject;
            }

            if (currentObject != null) {
                ec.parentObject = currentObject;
            } else if (newSubject != null) {
                ec.parentObject = newSubject;
            } else {
                ec.parentObject = context.parentSubject;
            }

            ec.language = currentLanguage;
            ec.forwardProperties = forwardProperties;
            ec.backwardProperties = backwardProperties;
        }
        return ec;
    }
287
288    private EvalContext parse11(List<String> rev, List<String> rel, String about, String src, String resource, String href, EvalContext context, boolean inXHTML, StartElement element, List<String> typeof, List<String> property, String content, String datatype, String currentLanguage) {
289        boolean skipElement = false;
290        String newSubject = null;
291        String currentObject = null;
292        String typedResource = null;
293        List<String> forwardProperties = new LinkedList();
294        List<String> backwardProperties = new LinkedList();
295        
296        if (rev == null && rel == null) {
297            if (property != null && content == null && datatype == null) {
298                if (about != null && about != URIExtractor.NONE) newSubject = about;
299                else if (context.parent == null) newSubject = context.base;
300                else if (context.parentObject != null) newSubject = context.parentObject;
301                
302                if (typeof != null) {
303                    if (about != null && about != URIExtractor.NONE) typedResource = about;
304                    else if (context.parent == null) typedResource = context.base;
305                    else typedResource = coalesce(resource, href, src);
306                    
307                    if (typedResource == null) typedResource = createBNode();
308                                        
309                    currentObject = typedResource;
310                }
311            } else {
312                newSubject = coalesce(about, resource, href, src);
313                                
314                if (newSubject == null) {
315                    if (context.parent == null) newSubject = context.base;
316                    else if (typeof != null) newSubject = createBNode();
317                    else if (context.parentObject != null) {
318                        newSubject = context.parentObject;
319                        if (property == null) skipElement = true;
320                    }
321                }
322                
323                if (typeof != null) typedResource = newSubject;
324            }
325        } else { // rev or rel present
326            if (about != null && about != URIExtractor.NONE) newSubject = about;
327            if (typeof != null) typedResource = newSubject;
328            
329            if (newSubject == null) {
330                if (context.parent == null) newSubject = context.base;
331                else if (context.parentObject != null) newSubject = context.parentObject;
332            }
333            
334            currentObject = coalesce(resource, href, src);
335            
336            if (currentObject == null && typeof != null && about == null) currentObject = createBNode();
337            
338            if (typeof != null && about == null) typedResource = currentObject;
339        }
340        
341        if (typedResource != null) {
342            for (String type : typeof) {
343                emitTriples(typedResource,
344                        Constants.rdfType,
345                        type);
346            }
347        }
348        
349        // STEP 8 skipped... list etc
350        
351        if (currentObject != null) {
352            if (rel != null) emitTriples(newSubject, rel, currentObject);
353            if (rev != null) emitTriples(currentObject, rev, newSubject);
354        } else {
355            // Do I really want to add all here, or simply assign???
356            if (rel != null) forwardProperties.addAll(rel);
357            if (rev != null) backwardProperties.addAll(rev);
358            if (rev != null || rel != null) currentObject = createBNode();
359        }
360        
361        if (property != null) {
362                        
363            String propertyValue = null;
364            
365            if (content != null) { // The easy bit
366                if (datatype == null || datatype.length() == 0) {
367                    emitTriplesPlainLiteral(newSubject, property, content, currentLanguage);
368                } else {
369                    emitTriplesDatatypeLiteral(newSubject, property, content, datatype);
370                }
371                propertyValue = URIExtractor.NONE;
372            } else if (datatype != null) {
373                literalCollector.collect(newSubject, property, datatype, currentLanguage);
374                propertyValue = URIExtractor.NONE;
375            } else if (rev == null && rev == null && content == null) {
376                propertyValue = coalesce(resource, href, src);
377            }
378                        
379            if (propertyValue == null && typeof != null && about == null) {
380                propertyValue = typedResource;
381            }
382            
383            if (propertyValue == null && content == null && datatype == null) {
384                literalCollector.collect(newSubject, property, datatype, currentLanguage);
385            }
386            
387            if (propertyValue != null && propertyValue != URIExtractor.NONE) emitTriples(newSubject, property, propertyValue);
388        }
389        
390        if (!skipElement && newSubject != null) {
391            emitTriples(context.parentSubject,
392                    context.forwardProperties,
393                    newSubject);
394
395            emitTriples(newSubject,
396                    context.backwardProperties,
397                    context.parentSubject);
398        }
399        
400        EvalContext ec = new EvalContext(context);
401        if (skipElement) {
402            ec.language = currentLanguage;
403        } else {
404            if (newSubject != null) {
405                ec.parentSubject = newSubject;
406            } else {
407                ec.parentSubject = context.parentSubject;
408            }
409
410            if (currentObject != null) {
411                ec.parentObject = currentObject;
412            } else if (newSubject != null) {
413                ec.parentObject = newSubject;
414            } else {
415                ec.parentObject = context.parentSubject;
416            }
417
418            ec.language = currentLanguage;
419            ec.forwardProperties = forwardProperties;
420            ec.backwardProperties = backwardProperties;
421        }
422        return ec;
423    }
424
425    public void emitTriples(String subj, Collection<String> props, String obj) {
426        for (String prop : props) {
427            if (!prop.startsWith("_")) sink.addObject(subj, prop, obj);
428        }
429    }
430
431    public void emitTriplesPlainLiteral(String subj, Collection<String> props, String lex, String language) {
432        for (String prop : props) {
433            if (!prop.startsWith("_")) sink.addLiteral(subj, prop, lex, language, null);
434        }
435    }
436
437    public void emitTriplesDatatypeLiteral(String subj, Collection<String> props, String lex, String datatype) {
438        for (String prop : props) {
439            if (!prop.startsWith("_")) sink.addLiteral(subj, prop, lex, null, datatype);
440        }
441    }
442
443    int bnodeId = 0;
444    
445    private String createBNode() // TODO probably broken? Can you write bnodes in rdfa directly?
446    {
447        return "_:node" + (bnodeId++);
448    }
449
450    private void getNamespaces(Attributes attrs) {
451        for (int i = 0; i < attrs.getLength(); i++) {
452            String qname = attrs.getQName(i);
453            String prefix = getPrefix(qname);
454            if ("xmlns".equals(prefix)) {
455                String pre = getLocal(prefix, qname);
456                String uri = attrs.getValue(i);
457                if (!settings.contains(Setting.ManualNamespaces) && pre.contains("_"))
458                    continue; // not permitted
459                context.setNamespaceURI(pre, uri);
460                sink.addPrefix(pre, uri);
461            }
462        }
463    }
464
465    private String getPrefix(String qname) {
466        if (!qname.contains(":")) {
467            return "";
468        }
469        return qname.substring(0, qname.indexOf(":"));
470    }
471
472    private String getLocal(String prefix, String qname) {
473        if (prefix.length() == 0) {
474            return qname;
475        }
476        return qname.substring(prefix.length() + 1);
477    }
    /*
     * SAX methods and the state they share
     */

    /** Locator handed over in setDocumentLocator; its system id is used as the base. */
    private Locator locator;
    /** Context for the current position in the document; stays null until setBase has run. */
    private EvalContext context;
483
484    public void setDocumentLocator(Locator arg0) {
485        this.locator = arg0;
486        if (locator.getSystemId() != null)
487            this.setBase(arg0.getSystemId());
488    }
489
    /** Tells the sink that a document is starting. */
    public void startDocument() throws SAXException {
        sink.start();
    }
493
    /** Tells the sink that the document has ended. */
    public void endDocument() throws SAXException {
        sink.end();
    }
497
498    public void startPrefixMapping(String arg0, String arg1)
499            throws SAXException {
500        context.setNamespaceURI(arg0, arg1);
501        sink.addPrefix(arg0, arg1);
502    }
503
    /** No-op: nothing is undone here when a prefix mapping goes out of scope. */
    public void endPrefixMapping(String arg0) throws SAXException {
    }
506
507    public void startElement(String arg0, String localname, String qname, Attributes arg3) throws SAXException {
508        try {
509            //System.err.println("Start element: " + arg0 + " " + arg1 + " " + arg2);
510
511            // This is set very late in some html5 cases (not even ready by document start)
512            if (context == null) {
513                this.setBase(locator.getSystemId());
514            }
515
516            // Dammit, not quite the same as XMLEventFactory
517            String prefix = /*(localname.equals(qname))*/
518                    (qname.indexOf(':') == -1 ) ? ""
519                    : qname.substring(0, qname.indexOf(':'));
520            if (settings.contains(Setting.ManualNamespaces)) {
521                getNamespaces(arg3);
522                if (prefix.length() != 0) {
523                    arg0 = context.getNamespaceURI(prefix);
524                    localname = localname.substring(prefix.length() + 1);
525                }
526            }
527            StartElement e = eventFactory.createStartElement(
528                    prefix, arg0, localname,
529                    fromAttributes(arg3), null, context);
530
531            if (literalCollector.isCollecting()) literalCollector.handleEvent(e);
532
533            // If we are gathering XML we stop parsing
534            if (!literalCollector.isCollectingXML()) context = parse(context, e);
535        } catch (XMLStreamException ex) {
536            throw new RuntimeException("Streaming issue", ex);
537        }
538
539    }
540
541    public void endElement(String arg0, String localname, String qname) throws SAXException {
542        //System.err.println("End element: " + arg0 + " " + arg1 + " " + arg2);
543        if (literalCollector.isCollecting()) {
544            String prefix = (localname.equals(qname)) ? ""
545                    : qname.substring(0, qname.indexOf(':'));
546            XMLEvent e = eventFactory.createEndElement(prefix, arg0, localname);
547            literalCollector.handleEvent(e);
548        }
549        // If we aren't collecting an XML literal keep parsing
550        if (!literalCollector.isCollectingXML()) context = context.parent;
551    }
552
553    public void characters(char[] arg0, int arg1, int arg2) throws SAXException {
554        if (literalCollector.isCollecting()) {
555            XMLEvent e = eventFactory.createCharacters(String.valueOf(arg0, arg1, arg2));
556            literalCollector.handleEvent(e);
557        }
558    }
559
560    public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
561        //System.err.println("Whitespace...");
562        if (literalCollector.isCollecting()) {
563            XMLEvent e = eventFactory.createIgnorableSpace(String.valueOf(arg0, arg1, arg2));
564            literalCollector.handleEvent(e);
565        }
566    }
567
    /** No-op: processing instructions are not acted upon. */
    public void processingInstruction(String arg0, String arg1) throws SAXException {
    }
570
    /** No-op: skipped entities are not acted upon. */
    public void skippedEntity(String arg0) throws SAXException {
    }
573
574    private Iterator fromAttributes(Attributes attributes) {
575        List toReturn = new LinkedList();
576        
577        for (int i = 0; i < attributes.getLength(); i++) {
578            String qname = attributes.getQName(i);
579            String prefix = qname.contains(":") ? qname.substring(0, qname.indexOf(":")) : "";
580            Attribute attr = eventFactory.createAttribute(
581                    prefix, attributes.getURI(i),
582                    attributes.getLocalName(i), attributes.getValue(i));
583
584            if (!qname.equals("xmlns") && !qname.startsWith("xmlns:"))
585                toReturn.add(attr);
586        }
587        
588        return toReturn.iterator();
589    }
590
591    // 1.1 method
592
593    private void parsePrefixes(String value, EvalContext context) {
594        String[] parts = value.split("\\s+");
595        for (int i = 0; i < parts.length; i += 2) {
596            String prefix = parts[i];
597            if (i + 1 < parts.length && prefix.endsWith(":")) {
598                String prefixFix = prefix.substring(0, prefix.length() - 1);
599                context.setPrefix(prefixFix, parts[i+1]);
600                sink.addPrefix(prefixFix, parts[i+1]);
601            }
602        }
603    }
604    
605    // SAX error handling
606    
607    public void warning(SAXParseException exception) throws SAXException {
608        System.err.printf("Warning: %s\n", exception.getLocalizedMessage());
609    }
610
611    public void error(SAXParseException exception) throws SAXException {
612        System.err.printf("Error: %s\n", exception.getLocalizedMessage());
613    }
614
615    public void fatalError(SAXParseException exception) throws SAXException {
616        System.err.printf("Fatal error: %s\n", exception.getLocalizedMessage());
617    }
618    
619    // Coalesce utility functions. Useful in parsing.
620    
621    private static <T> T coalesce(T a, T b) {
622        if (a != null && a != URIExtractor.NONE) return a;
623        return b;
624    }
625    
626    private static <T> T coalesce(T a, T b, T c) {
627        if (a != null && a != URIExtractor.NONE) return a;
628        if (b != null && b != URIExtractor.NONE) return b;
629        return c;
630    }
631    
632    private static <T> T coalesce(T a, T b, T c, T d) {
633        if (a != null && a != URIExtractor.NONE) return a;
634        if (b != null && b != URIExtractor.NONE) return b;
635        if (c != null && c != URIExtractor.NONE) return c;
636        return d;
637    }
638}
639
640/*
641 * (c) Copyright 2009 University of Bristol
642 * All rights reserved.
643 *
644 * Redistribution and use in source and binary forms, with or without
645 * modification, are permitted provided that the following conditions
646 * are met:
647 * 1. Redistributions of source code must retain the above copyright
648 *    notice, this list of conditions and the following disclaimer.
649 * 2. Redistributions in binary form must reproduce the above copyright
650 *    notice, this list of conditions and the following disclaimer in the
651 *    documentation and/or other materials provided with the distribution.
652 * 3. The name of the author may not be used to endorse or promote products
653 *    derived from this software without specific prior written permission.
654 *
655 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
656 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
657 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
658 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
659 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
660 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
661 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
662 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
663 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
664 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
665 */