001/* 002 * (c) Copyright 2009 University of Bristol 003 * All rights reserved. 004 * [See end of file] 005 */ 006package net.rootdev.javardfa; 007 008import net.rootdev.javardfa.uri.URIExtractor10; 009import net.rootdev.javardfa.uri.URIExtractor; 010import net.rootdev.javardfa.uri.IRIResolver; 011import net.rootdev.javardfa.literal.LiteralCollector; 012import java.util.Collection; 013import java.util.EnumSet; 014import java.util.Iterator; 015import java.util.LinkedList; 016import java.util.List; 017import java.util.Set; 018import javax.xml.namespace.QName; 019import javax.xml.stream.XMLEventFactory; 020import javax.xml.stream.XMLOutputFactory; 021import javax.xml.stream.XMLStreamException; 022import javax.xml.stream.events.Attribute; 023import javax.xml.stream.events.StartElement; 024import javax.xml.stream.events.XMLEvent; 025import org.xml.sax.Attributes; 026import org.xml.sax.ContentHandler; 027import org.xml.sax.ErrorHandler; 028import org.xml.sax.Locator; 029import org.xml.sax.SAXException; 030import org.xml.sax.SAXParseException; 031 032/** 033 * @author Damian Steer <pldms@mac.com> 034 */ 035public class Parser implements ContentHandler, ErrorHandler { 036 037 private final XMLEventFactory eventFactory; 038 private final StatementSink sink; 039 private final Set<Setting> settings; 040 private final LiteralCollector literalCollector; 041 private final URIExtractor extractor; 042 043 public Parser(StatementSink sink) { 044 this( sink, 045 XMLOutputFactory.newInstance(), 046 XMLEventFactory.newInstance(), 047 new URIExtractor10(new IRIResolver())); 048 } 049 050 public Parser(StatementSink sink, 051 XMLOutputFactory outputFactory, 052 XMLEventFactory eventFactory, 053 URIExtractor extractor) { 054 this.sink = sink; 055 this.eventFactory = eventFactory; 056 this.settings = EnumSet.noneOf(Setting.class); 057 this.extractor = extractor; 058 this.literalCollector = new LiteralCollector(this, eventFactory, outputFactory); 059 060 extractor.setSettings(settings); 061 062 // Important, although I guess the caller doesn't get total control 063 outputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true); 064 } 065 066 public boolean isEnabled(Setting setting) { 067 return settings.contains(setting); 068 } 069 070 public void enable(Setting setting) { 071 settings.add(setting); 072 } 073 074 public void disable(Setting setting) { 075 settings.remove(setting); 076 } 077 078 public void setBase(String base) { 079 this.context = new EvalContext(base); 080 if (isEnabled(Setting.OnePointOne)) context.setPrefixes(Constants.CORE_DEFAULT_PREFIXES); 081 sink.setBase(context.getBase()); 082 } 083 084 EvalContext parse(EvalContext context, StartElement element) 085 throws XMLStreamException { 086 String currentLanguage = context.language; 087 boolean inXHTML = Constants.xhtmlNS.equals(element.getName().getNamespaceURI()); 088 089 // Respect xml:base outside xhtml 090 if (element.getAttributeByName(Constants.xmlbaseNS) != null && !inXHTML) { 091 context.setBase(element.getAttributeByName(Constants.xmlbaseNS).getValue()); 092 sink.setBase(context.getBase()); 093 } 094 095 if (Constants.base.equals(element.getName()) && 096 element.getAttributeByName(Constants.href) != null) { 097 context.setBase(element.getAttributeByName(Constants.href).getValue()); 098 sink.setBase(context.getBase()); 099 } 100 101 // The xml / html namespace matching is a bit ropey. I wonder if the html 5 102 // parser has a setting for this? 103 if (settings.contains(Setting.ManualNamespaces)) { 104 if (element.getAttributeByName(Constants.xmllang) != null) { 105 currentLanguage = element.getAttributeByName(Constants.xmllang).getValue(); 106 if (currentLanguage.length() == 0) currentLanguage = null; 107 } else if (element.getAttributeByName(Constants.lang) != null) { 108 currentLanguage = element.getAttributeByName(Constants.lang).getValue(); 109 if (currentLanguage.length() == 0) currentLanguage = null; 110 } 111 } else if (element.getAttributeByName(Constants.xmllangNS) != null) { 112 currentLanguage = element.getAttributeByName(Constants.xmllangNS).getValue(); 113 if (currentLanguage.length() == 0) currentLanguage = null; 114 } 115 116 if (settings.contains(Setting.OnePointOne)) { 117 118 if (element.getAttributeByName(Constants.vocab) != null) { 119 String vocab = 120 element.getAttributeByName(Constants.vocab).getValue().trim(); 121 // empty vocab removes default vocab 122 if (vocab.length() == 0) { 123 context.vocab = null; 124 } else { 125 context.vocab = vocab; 126 emitTriples(context.base, Constants.rdfaUses, vocab); 127 } 128 } 129 130 if (element.getAttributeByName(Constants.prefix) != null) { 131 parsePrefixes(element.getAttributeByName(Constants.prefix).getValue(), context); 132 } 133 } 134 135 String about = extractor.getURI(element, Constants.about, context); 136 String src = extractor.getURI(element, Constants.src, context); 137 String href = extractor.getURI(element, Constants.href, context); 138 String resource = extractor.getURI(element, Constants.resource, context); 139 String datatype = extractor.getURI(element, Constants.datatype, context); 140 Attribute contentAttr = element.getAttributeByName(Constants.content); 141 String content = (contentAttr == null) ? null : contentAttr.getValue(); 142 143 List<String> typeof = extractor.getURIs(element, Constants.typeof, context); 144 List<String> rel = extractor.getURIs(element, Constants.rel, context); 145 List<String> rev = extractor.getURIs(element, Constants.rev, context); 146 List<String> property = extractor.getURIs(element, Constants.property, context); 147 148 if (settings.contains(Setting.OnePointOne)) { 149 return parse11(rev, rel, about, src, resource, href, context, inXHTML, 150 element, typeof, property, content, datatype, currentLanguage); 151 } else { 152 return parse10(rev, rel, about, src, resource, href, context, inXHTML, 153 element, typeof, property, content, datatype, currentLanguage); 154 } 155 } 156 157 private EvalContext parse10(List<String> rev, List<String> rel, String about, String src, String resource, String href, EvalContext context, boolean inXHTML, StartElement element, List<String> typeof, List<String> property, String content, String datatype, String currentLanguage) { 158 boolean skipElement = false; 159 String newSubject = null; 160 String currentObject = null; 161 List<String> forwardProperties = new LinkedList(); 162 List<String> backwardProperties = new LinkedList(); 163 164 if (rev == null && rel == null) { 165 newSubject = coalesce(about, src, resource, href); 166 if (newSubject == null) { 167 if (context.parent == null && !inXHTML) { 168 newSubject = context.base; 169 } else if (Constants.body.equals(element.getName()) || 170 Constants.head.equals(element.getName())) { 171 newSubject = context.base; 172 } else if (typeof != null) { 173 newSubject = createBNode(); 174 } else { 175 if (context.parentObject != null) { 176 newSubject = context.parentObject; 177 } 178 if (property == null) { 179 skipElement = true; 180 } 181 } 182 } 183 } else { 184 newSubject = coalesce(about, src); 185 if (newSubject == null) { 186 if (context.parent == null && !inXHTML) { 187 newSubject = context.base; 188 } else if (Constants.head.equals(element.getName()) || 189 Constants.body.equals(element.getName())) { 190 newSubject = context.base; 191 } else if (typeof != null) { 192 newSubject = createBNode(); 193 } else if (context.parentObject != null) { 194 newSubject = context.parentObject; 195 } 196 } 197 currentObject = coalesce(resource, href); 198 } 199 200 if (newSubject != null && typeof != null) { 201 for (String type : typeof) { 202 emitTriples(newSubject, 203 Constants.rdfType, 204 type); 205 } 206 } 207 208 // Dodgy extension 209 if (settings.contains(Setting.FormMode)) { 210 if (Constants.form.equals(element.getName())) { 211 emitTriples(newSubject, Constants.rdfType, "http://www.w3.org/1999/xhtml/vocab/#form"); // Signal entering form 212 } 213 if (Constants.input.equals(element.getName()) && 214 element.getAttributeByName(Constants.name) != null) { 215 currentObject = "?" + element.getAttributeByName(Constants.name).getValue(); 216 } 217 218 } 219 220 if (property != null) { 221 222 if (content != null) { // The easy bit 223 if (datatype == null || datatype.length() == 0) { 224 emitTriplesPlainLiteral(newSubject, property, content, currentLanguage); 225 } else { 226 emitTriplesDatatypeLiteral(newSubject, property, content, datatype); 227 } 228 } else { 229 literalCollector.collect(newSubject, property, datatype, currentLanguage); 230 } 231 } 232 233 if (currentObject != null) { 234 if (element.getAttributeByName(Constants.rel) != null) { 235 emitTriples(newSubject, rel, currentObject); 236 } 237 if (element.getAttributeByName(Constants.rev) != null) { 238 emitTriples(currentObject, rev, newSubject); 239 } 240 } else { 241 if (element.getAttributeByName(Constants.rel) != null) { 242 forwardProperties.addAll(rel); 243 } 244 if (element.getAttributeByName(Constants.rev) != null) { 245 backwardProperties.addAll(rev); 246 } 247 if (!forwardProperties.isEmpty() || !backwardProperties.isEmpty()) { 248 // if predicate present 249 currentObject = createBNode(); 250 } 251 } 252 253 if (!skipElement && newSubject != null) { 254 emitTriples(context.parentSubject, 255 context.forwardProperties, 256 newSubject); 257 258 emitTriples(newSubject, 259 context.backwardProperties, 260 context.parentSubject); 261 } 262 263 EvalContext ec = new EvalContext(context); 264 if (skipElement) { 265 ec.language = currentLanguage; 266 } else { 267 if (newSubject != null) { 268 ec.parentSubject = newSubject; 269 } else { 270 ec.parentSubject = context.parentSubject; 271 } 272 273 if (currentObject != null) { 274 ec.parentObject = currentObject; 275 } else if (newSubject != null) { 276 ec.parentObject = newSubject; 277 } else { 278 ec.parentObject = context.parentSubject; 279 } 280 281 ec.language = currentLanguage; 282 ec.forwardProperties = forwardProperties; 283 ec.backwardProperties = backwardProperties; 284 } 285 return ec; 286 } 287 288 private EvalContext parse11(List<String> rev, List<String> rel, String about, String src, String resource, String href, EvalContext context, boolean inXHTML, StartElement element, List<String> typeof, List<String> property, String content, String datatype, String currentLanguage) { 289 boolean skipElement = false; 290 String newSubject = null; 291 String currentObject = null; 292 String typedResource = null; 293 List<String> forwardProperties = new LinkedList(); 294 List<String> backwardProperties = new LinkedList(); 295 296 if (rev == null && rel == null) { 297 if (property != null && content == null && datatype == null) { 298 if (about != null && about != URIExtractor.NONE) newSubject = about; 299 else if (context.parent == null) newSubject = context.base; 300 else if (context.parentObject != null) newSubject = context.parentObject; 301 302 if (typeof != null) { 303 if (about != null && about != URIExtractor.NONE) typedResource = about; 304 else if (context.parent == null) typedResource = context.base; 305 else typedResource = coalesce(resource, href, src); 306 307 if (typedResource == null) typedResource = createBNode(); 308 309 currentObject = typedResource; 310 } 311 } else { 312 newSubject = coalesce(about, resource, href, src); 313 314 if (newSubject == null) { 315 if (context.parent == null) newSubject = context.base; 316 else if (typeof != null) newSubject = createBNode(); 317 else if (context.parentObject != null) { 318 newSubject = context.parentObject; 319 if (property == null) skipElement = true; 320 } 321 } 322 323 if (typeof != null) typedResource = newSubject; 324 } 325 } else { // rev or rel present 326 if (about != null && about != URIExtractor.NONE) newSubject = about; 327 if (typeof != null) typedResource = newSubject; 328 329 if (newSubject == null) { 330 if (context.parent == null) newSubject = context.base; 331 else if (context.parentObject != null) newSubject = context.parentObject; 332 } 333 334 currentObject = coalesce(resource, href, src); 335 336 if (currentObject == null && typeof != null && about == null) currentObject = createBNode(); 337 338 if (typeof != null && about == null) typedResource = currentObject; 339 } 340 341 if (typedResource != null) { 342 for (String type : typeof) { 343 emitTriples(typedResource, 344 Constants.rdfType, 345 type); 346 } 347 } 348 349 // STEP 8 skipped... list etc 350 351 if (currentObject != null) { 352 if (rel != null) emitTriples(newSubject, rel, currentObject); 353 if (rev != null) emitTriples(currentObject, rev, newSubject); 354 } else { 355 // Do I really want to add all here, or simply assign??? 356 if (rel != null) forwardProperties.addAll(rel); 357 if (rev != null) backwardProperties.addAll(rev); 358 if (rev != null || rel != null) currentObject = createBNode(); 359 } 360 361 if (property != null) { 362 363 String propertyValue = null; 364 365 if (content != null) { // The easy bit 366 if (datatype == null || datatype.length() == 0) { 367 emitTriplesPlainLiteral(newSubject, property, content, currentLanguage); 368 } else { 369 emitTriplesDatatypeLiteral(newSubject, property, content, datatype); 370 } 371 propertyValue = URIExtractor.NONE; 372 } else if (datatype != null) { 373 literalCollector.collect(newSubject, property, datatype, currentLanguage); 374 propertyValue = URIExtractor.NONE; 375 } else if (rev == null && rev == null && content == null) { 376 propertyValue = coalesce(resource, href, src); 377 } 378 379 if (propertyValue == null && typeof != null && about == null) { 380 propertyValue = typedResource; 381 } 382 383 if (propertyValue == null && content == null && datatype == null) { 384 literalCollector.collect(newSubject, property, datatype, currentLanguage); 385 } 386 387 if (propertyValue != null && propertyValue != URIExtractor.NONE) emitTriples(newSubject, property, propertyValue); 388 } 389 390 if (!skipElement && newSubject != null) { 391 emitTriples(context.parentSubject, 392 context.forwardProperties, 393 newSubject); 394 395 emitTriples(newSubject, 396 context.backwardProperties, 397 context.parentSubject); 398 } 399 400 EvalContext ec = new EvalContext(context); 401 if (skipElement) { 402 ec.language = currentLanguage; 403 } else { 404 if (newSubject != null) { 405 ec.parentSubject = newSubject; 406 } else { 407 ec.parentSubject = context.parentSubject; 408 } 409 410 if (currentObject != null) { 411 ec.parentObject = currentObject; 412 } else if (newSubject != null) { 413 ec.parentObject = newSubject; 414 } else { 415 ec.parentObject = context.parentSubject; 416 } 417 418 ec.language = currentLanguage; 419 ec.forwardProperties = forwardProperties; 420 ec.backwardProperties = backwardProperties; 421 } 422 return ec; 423 } 424 425 public void emitTriples(String subj, Collection<String> props, String obj) { 426 for (String prop : props) { 427 if (!prop.startsWith("_")) sink.addObject(subj, prop, obj); 428 } 429 } 430 431 public void emitTriplesPlainLiteral(String subj, Collection<String> props, String lex, String language) { 432 for (String prop : props) { 433 if (!prop.startsWith("_")) sink.addLiteral(subj, prop, lex, language, null); 434 } 435 } 436 437 public void emitTriplesDatatypeLiteral(String subj, Collection<String> props, String lex, String datatype) { 438 for (String prop : props) { 439 if (!prop.startsWith("_")) sink.addLiteral(subj, prop, lex, null, datatype); 440 } 441 } 442 443 int bnodeId = 0; 444 445 private String createBNode() // TODO probably broken? Can you write bnodes in rdfa directly? 446 { 447 return "_:node" + (bnodeId++); 448 } 449 450 private void getNamespaces(Attributes attrs) { 451 for (int i = 0; i < attrs.getLength(); i++) { 452 String qname = attrs.getQName(i); 453 String prefix = getPrefix(qname); 454 if ("xmlns".equals(prefix)) { 455 String pre = getLocal(prefix, qname); 456 String uri = attrs.getValue(i); 457 if (!settings.contains(Setting.ManualNamespaces) && pre.contains("_")) 458 continue; // not permitted 459 context.setNamespaceURI(pre, uri); 460 sink.addPrefix(pre, uri); 461 } 462 } 463 } 464 465 private String getPrefix(String qname) { 466 if (!qname.contains(":")) { 467 return ""; 468 } 469 return qname.substring(0, qname.indexOf(":")); 470 } 471 472 private String getLocal(String prefix, String qname) { 473 if (prefix.length() == 0) { 474 return qname; 475 } 476 return qname.substring(prefix.length() + 1); 477 } 478 /** 479 * SAX methods 480 */ 481 private Locator locator; 482 private EvalContext context; 483 484 public void setDocumentLocator(Locator arg0) { 485 this.locator = arg0; 486 if (locator.getSystemId() != null) 487 this.setBase(arg0.getSystemId()); 488 } 489 490 public void startDocument() throws SAXException { 491 sink.start(); 492 } 493 494 public void endDocument() throws SAXException { 495 sink.end(); 496 } 497 498 public void startPrefixMapping(String arg0, String arg1) 499 throws SAXException { 500 context.setNamespaceURI(arg0, arg1); 501 sink.addPrefix(arg0, arg1); 502 } 503 504 public void endPrefixMapping(String arg0) throws SAXException { 505 } 506 507 public void startElement(String arg0, String localname, String qname, Attributes arg3) throws SAXException { 508 try { 509 //System.err.println("Start element: " + arg0 + " " + arg1 + " " + arg2); 510 511 // This is set very late in some html5 cases (not even ready by document start) 512 if (context == null) { 513 this.setBase(locator.getSystemId()); 514 } 515 516 // Dammit, not quite the same as XMLEventFactory 517 String prefix = /*(localname.equals(qname))*/ 518 (qname.indexOf(':') == -1 ) ? "" 519 : qname.substring(0, qname.indexOf(':')); 520 if (settings.contains(Setting.ManualNamespaces)) { 521 getNamespaces(arg3); 522 if (prefix.length() != 0) { 523 arg0 = context.getNamespaceURI(prefix); 524 localname = localname.substring(prefix.length() + 1); 525 } 526 } 527 StartElement e = eventFactory.createStartElement( 528 prefix, arg0, localname, 529 fromAttributes(arg3), null, context); 530 531 if (literalCollector.isCollecting()) literalCollector.handleEvent(e); 532 533 // If we are gathering XML we stop parsing 534 if (!literalCollector.isCollectingXML()) context = parse(context, e); 535 } catch (XMLStreamException ex) { 536 throw new RuntimeException("Streaming issue", ex); 537 } 538 539 } 540 541 public void endElement(String arg0, String localname, String qname) throws SAXException { 542 //System.err.println("End element: " + arg0 + " " + arg1 + " " + arg2); 543 if (literalCollector.isCollecting()) { 544 String prefix = (localname.equals(qname)) ? "" 545 : qname.substring(0, qname.indexOf(':')); 546 XMLEvent e = eventFactory.createEndElement(prefix, arg0, localname); 547 literalCollector.handleEvent(e); 548 } 549 // If we aren't collecting an XML literal keep parsing 550 if (!literalCollector.isCollectingXML()) context = context.parent; 551 } 552 553 public void characters(char[] arg0, int arg1, int arg2) throws SAXException { 554 if (literalCollector.isCollecting()) { 555 XMLEvent e = eventFactory.createCharacters(String.valueOf(arg0, arg1, arg2)); 556 literalCollector.handleEvent(e); 557 } 558 } 559 560 public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { 561 //System.err.println("Whitespace..."); 562 if (literalCollector.isCollecting()) { 563 XMLEvent e = eventFactory.createIgnorableSpace(String.valueOf(arg0, arg1, arg2)); 564 literalCollector.handleEvent(e); 565 } 566 } 567 568 public void processingInstruction(String arg0, String arg1) throws SAXException { 569 } 570 571 public void skippedEntity(String arg0) throws SAXException { 572 } 573 574 private Iterator fromAttributes(Attributes attributes) { 575 List toReturn = new LinkedList(); 576 577 for (int i = 0; i < attributes.getLength(); i++) { 578 String qname = attributes.getQName(i); 579 String prefix = qname.contains(":") ? qname.substring(0, qname.indexOf(":")) : ""; 580 Attribute attr = eventFactory.createAttribute( 581 prefix, attributes.getURI(i), 582 attributes.getLocalName(i), attributes.getValue(i)); 583 584 if (!qname.equals("xmlns") && !qname.startsWith("xmlns:")) 585 toReturn.add(attr); 586 } 587 588 return toReturn.iterator(); 589 } 590 591 // 1.1 method 592 593 private void parsePrefixes(String value, EvalContext context) { 594 String[] parts = value.split("\\s+"); 595 for (int i = 0; i < parts.length; i += 2) { 596 String prefix = parts[i]; 597 if (i + 1 < parts.length && prefix.endsWith(":")) { 598 String prefixFix = prefix.substring(0, prefix.length() - 1); 599 context.setPrefix(prefixFix, parts[i+1]); 600 sink.addPrefix(prefixFix, parts[i+1]); 601 } 602 } 603 } 604 605 // SAX error handling 606 607 public void warning(SAXParseException exception) throws SAXException { 608 System.err.printf("Warning: %s\n", exception.getLocalizedMessage()); 609 } 610 611 public void error(SAXParseException exception) throws SAXException { 612 System.err.printf("Error: %s\n", exception.getLocalizedMessage()); 613 } 614 615 public void fatalError(SAXParseException exception) throws SAXException { 616 System.err.printf("Fatal error: %s\n", exception.getLocalizedMessage()); 617 } 618 619 // Coalesce utility functions. Useful in parsing. 620 621 private static <T> T coalesce(T a, T b) { 622 if (a != null && a != URIExtractor.NONE) return a; 623 return b; 624 } 625 626 private static <T> T coalesce(T a, T b, T c) { 627 if (a != null && a != URIExtractor.NONE) return a; 628 if (b != null && b != URIExtractor.NONE) return b; 629 return c; 630 } 631 632 private static <T> T coalesce(T a, T b, T c, T d) { 633 if (a != null && a != URIExtractor.NONE) return a; 634 if (b != null && b != URIExtractor.NONE) return b; 635 if (c != null && c != URIExtractor.NONE) return c; 636 return d; 637 } 638} 639 640/* 641 * (c) Copyright 2009 University of Bristol 642 * All rights reserved. 643 * 644 * Redistribution and use in source and binary forms, with or without 645 * modification, are permitted provided that the following conditions 646 * are met: 647 * 1. Redistributions of source code must retain the above copyright 648 * notice, this list of conditions and the following disclaimer. 649 * 2. Redistributions in binary form must reproduce the above copyright 650 * notice, this list of conditions and the following disclaimer in the 651 * documentation and/or other materials provided with the distribution. 652 * 3. The name of the author may not be used to endorse or promote products 653 * derived from this software without specific prior written permission. 654 * 655 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 656 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 657 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 658 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 659 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 660 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 661 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 662 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 663 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 664 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 665 */