/*
 * Copyright 2008-2011 Thomas Nichols.  http://blog.thomnichols.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * You are receiving this code free of charge, which represents many hours of
 * effort from other individuals and corporations.  As a responsible member
 * of the community, you are encouraged (but not required) to donate any
 * enhancements or improvements back to the community under a similar open
 * source license.  Thank you. -TMN
 */
package groovyx.net.http;

import groovy.json.JsonSlurper;
import groovy.lang.Closure;
import groovy.util.XmlSlurper;
import groovy.util.slurpersupport.GPathResult;
import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.message.BasicHeader;
import org.apache.xml.resolver.Catalog;
import org.apache.xml.resolver.CatalogManager;
import org.apache.xml.resolver.tools.CatalogResolver;
import org.codehaus.groovy.runtime.MethodClosure;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;


/**
 * <p>Keeps track of response parsers for each content type.  Each parser
 * should be a closure that accepts an {@link HttpResponse} instance,
 * and returns whatever handler is appropriate for reading the response
 * data for that content-type.  For example, a plain-text response should
 * probably be parsed with a <code>Reader</code>, while an XML response
 * might be parsed by an XmlSlurper, which would then be passed to the
 * response closure. </p>
 *
 * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
 * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
 * a NullPointerException is not thrown by passing a response that contains no
 * entity.</p>
 *
 * <p>You can see the list of content-type parsers that are built-in to the
 * ParserRegistry class in {@link #buildDefaultParserMap()}.</p>
 *
 * @see ContentType
 * @author <a href='mailto:tomstrummer+httpbuilder@gmail.com'>Tom Nichols</a>
 */
public class ParserRegistry {

    /**
     * The default parser used for unregistered content-types.  This is a
     * closure over {@link #parseStream(HttpResponse)}, which is like a no-op
     * that just returns the unaltered response stream.
     */
    protected final Closure DEFAULT_PARSER = new MethodClosure( this, "parseStream" );

    /**
     * The default charset to use when no charset is given in the Content-Type
     * header of a response.  This can be modified via {@link #setDefaultCharset(String)}.
     */
    public static final String DEFAULT_CHARSET = "UTF-8";

    /** Parser returned by {@link #getAt(Object)} when no parser is registered. */
    private Closure defaultParser = DEFAULT_PARSER;

    /** Registered parsers, keyed by content-type string (without parameters). */
    private Map<String,Closure> registeredParsers = buildDefaultParserMap();

    /** Charset used when a response does not declare one; never <code>null</code>. */
    private static String defaultCharset = DEFAULT_CHARSET;

    protected static final Log log = LogFactory.getLog( ParserRegistry.class );

    /**
     * This CatalogResolver is static to avoid the overhead of re-parsing
     * the catalog definition file every time.  Unfortunately, there's no
     * way to share a single Catalog instance between resolvers.  The
     * {@link Catalog} class is technically not thread-safe, but as long as you
     * do not parse catalog files while using the resolver, it should be fine.
     */
    protected static CatalogResolver catalogResolver;

    static {
        CatalogManager catalogManager = new CatalogManager();
        catalogManager.setIgnoreMissingProperties( true );
        catalogManager.setUseStaticCatalog( false );
        catalogManager.setRelativeCatalogs( true );
        try {
            catalogResolver = new CatalogResolver( catalogManager );
            URL defaultCatalog = ParserRegistry.class.getResource( "/catalog/html.xml" );
            if ( defaultCatalog == null ) {
                // Loading the catalog is best-effort: a missing resource must
                // not abort class initialization with an unchecked exception.
                log.warn( "Could not resolve default XML catalog: "
                        + "/catalog/html.xml not found on the classpath" );
            }
            else {
                catalogResolver.getCatalog().parseCatalog( defaultCatalog );
            }
        } catch ( IOException ex ) {
            log.warn( "Could not resolve default XML catalog", ex );
        }
    }

    /**
     * Set the charset to use for parsing character streams when no charset
     * is given in the Content-Type header.
     * @param charset the charset to use, or <code>null</code> to use
     *   {@link #DEFAULT_CHARSET}
     */
    public static void setDefaultCharset( String charset ) {
        defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
    }

    /**
     * Helper method to get the charset from the response.  This should be done
     * when manually parsing any text response to ensure it is decoded using the
     * correct charset. For instance:<pre>
     * Reader reader = new InputStreamReader( resp.getEntity().getContent(),
     *   ParserRegistry.getCharset( resp ) );</pre>
     * @param resp response whose entity's Content-Type header is inspected
     * @return the <code>charset</code> parameter of the Content-Type header,
     *   or the configured default charset (see
     *   {@link #setDefaultCharset(String)}) if the parameter is absent, blank,
     *   or the header cannot be read
     */
    public static String getCharset( HttpResponse resp ) {
        try {
            NameValuePair charset = resp.getEntity().getContentType()
                    .getElements()[0].getParameterByName( "charset" );

            if ( charset == null || charset.getValue().trim().equals( "" ) ) {
                log.debug( "Could not find charset in response; using " + defaultCharset );
                return defaultCharset;
            }

            return charset.getValue();
        }
        catch ( RuntimeException ex ) { // NPE or OOB Exceptions
            log.warn( "Could not parse charset from content-type header in response" );
            // Fall back to the configured default rather than the JVM's
            // platform charset, so decoding does not vary from host to host
            // and setDefaultCharset() is honoured when the header is missing.
            return defaultCharset;
        }
    }

    /**
     * Helper method to get the content-type string from the response
     * (no charset).
     * @param resp response whose entity's Content-Type header is inspected
     * @return the name of the first element of the Content-Type header
     * @throws IllegalArgumentException if the response has no entity, the
     *   entity has no Content-Type header, or the header cannot be parsed
     */
    public static String getContentType( HttpResponse resp ) {
        if ( resp.getEntity() == null )
            throw new IllegalArgumentException( "Response does not contain data" );
        if ( resp.getEntity().getContentType() == null )
            throw new IllegalArgumentException( "Response does not have a content-type header" );
        try {
            return resp.getEntity().getContentType().getElements()[0].getName();
        }
        catch ( RuntimeException ex ) {  // NPE or OOB Exceptions
            // Keep the underlying failure as the cause so it is not lost.
            throw new IllegalArgumentException(
                    "Could not parse content-type from response", ex );
        }
    }

    /**
     * Default parser used for binary data.  This simply returns the underlying
     * response InputStream.
     * @see ContentType#BINARY
     * @see HttpEntity#getContent()
     * @param resp HTTP response from which to read content
     * @return an InputStream the binary response stream
     * @throws IllegalStateException
     * @throws IOException
     */
    public InputStream parseStream( HttpResponse resp ) throws IOException {
        return resp.getEntity().getContent();
    }

    /**
     * Default parser used to handle plain text data.  The response text
     * is decoded using the charset passed in the response content-type
     * header, as determined by {@link #getCharset(HttpResponse)}.
     * @see ContentType#TEXT
     * @param resp HTTP response from which to read content
     * @return a Reader over the response content stream
     * @throws UnsupportedEncodingException
     * @throws IllegalStateException
     * @throws IOException
     */
    public Reader parseText( HttpResponse resp ) throws IOException {
        return new InputStreamReader( resp.getEntity().getContent(),
                ParserRegistry.getCharset( resp ) );
    }

    /**
     * Default parser used to decode a URL-encoded response.
     * @see ContentType#URLENC
     * @param resp HTTP response from which to parse content
     * @return a map of parameter names to values; if a name occurs more than
     *   once, the last value parsed is the one kept
     * @throws IOException
     */
    public Map<String,String> parseForm( final HttpResponse resp ) throws IOException {
        HttpEntity entity = resp.getEntity();
        /* URLEncodedUtils won't parse the content unless the content-type is
           application/x-www-form-urlencoded.  Since we want to be able to force
           parsing regardless of what the content-type header says, we need to
           'spoof' the content-type if it's not already acceptable. */
        if ( ! ContentType.URLENC.toString().equals( ParserRegistry.getContentType( resp ) ) ) {
            entity = new HttpEntityWrapper( entity ) {
                @Override public org.apache.http.Header getContentType() {
                    String value = ContentType.URLENC.toString();
                    String charset = ParserRegistry.getCharset( resp );
                    if ( charset != null ) value += "; charset=" + charset;
                    return new BasicHeader( "Content-Type", value );
                }
            };
        }
        List<NameValuePair> params = URLEncodedUtils.parse( entity );
        Map<String,String> paramMap = new HashMap<String,String>( params.size() );
        for ( NameValuePair param : params )
            paramMap.put( param.getName(), param.getValue() );
        return paramMap;
    }

    /**
     * Parse an HTML document by passing it through the NekoHTML parser.
     * @see ContentType#HTML
     * @see org.cyberneko.html.parsers.SAXParser
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
        XMLReader p = new org.cyberneko.html.parsers.SAXParser();
        p.setEntityResolver( catalogResolver );
        return new XmlSlurper( p ).parse( parseText( resp ) );
    }

    /**
     * Default parser used to decode an XML response.
     * @see ContentType#XML
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
        XmlSlurper xml = new XmlSlurper();
        xml.setEntityResolver( catalogResolver );
        return xml.parse( parseText( resp ) );
    }

    /**
     * Default parser used to decode a JSON response.
     * @see ContentType#JSON
     * @param resp HTTP response from which to parse content
     * @return the result of {@link JsonSlurper#parse(Reader)} on the response text
     * @throws IOException
     */
    public Object parseJSON( HttpResponse resp ) throws IOException {
        return new JsonSlurper().parse( parseText( resp ) );
    }

    /**
     * <p>Returns a map of default parsers.  Override this method to change
     * what parsers are registered by default.  A 'parser' is really just a
     * closure that accepts an {@link HttpResponse} instance and returns
     * some parsed data.  You can of course call
     * <code>super.buildDefaultParserMap()</code> and then add or remove
     * from that result as well.</p>
     *
     * <p>Default registered parsers are:
     * <ul>
     *  <li>{@link ContentType#BINARY} : {@link #parseStream(HttpResponse) parseStream()}</li>
     *  <li>{@link ContentType#TEXT} : {@link #parseText(HttpResponse) parseText()}</li>
     *  <li>{@link ContentType#URLENC} : {@link #parseForm(HttpResponse) parseForm()}</li>
     *  <li>{@link ContentType#HTML} : {@link #parseHTML(HttpResponse) parseHTML()}</li>
     *  <li>{@link ContentType#XML} : {@link #parseXML(HttpResponse) parseXML()}</li>
     *  <li>{@link ContentType#JSON} : {@link #parseJSON(HttpResponse) parseJSON()}</li>
     * </ul>
     *
     * @return a new, mutable map of content-type string to parser closure
     */
    protected Map<String,Closure> buildDefaultParserMap() {
        Map<String,Closure> parsers = new HashMap<String,Closure>();

        parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
        parsers.put( ContentType.TEXT.toString(), new MethodClosure( this, "parseText" ) );
        parsers.put( ContentType.URLENC.toString(), new MethodClosure( this, "parseForm" ) );
        parsers.put( ContentType.HTML.toString(), new MethodClosure( this, "parseHTML" ) );

        // XML and JSON each cover several content-type strings; every one of
        // them is mapped to the same closure instance.
        Closure pClosure = new MethodClosure( this, "parseXML" );
        for ( String ct : ContentType.XML.getContentTypeStrings() )
            parsers.put( ct, pClosure );

        pClosure = new MethodClosure( this, "parseJSON" );
        for ( String ct : ContentType.JSON.getContentTypeStrings() )
            parsers.put( ct, pClosure );

        return parsers;
    }

    /**
     * Add a new XML catalog definition to the static XML resolver catalog.
     * See the <a href='http://fisheye.codehaus.org/browse/gmod/httpbuilder/trunk/src/main/resources/catalog/html.xml?r=root:'>
     * HTTPBuilder source catalog</a> for an example.
     *
     * @param catalogLocation URL of a catalog definition file
     * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
     */
    public static void addCatalog( URL catalogLocation ) throws IOException {
        catalogResolver.getCatalog().parseCatalog( catalogLocation );
    }

    /**
     * Access the default catalog used by all HTTPBuilder instances.
     * @return the static {@link CatalogResolver} instance
     */
    public static CatalogResolver getCatalogResolver() {
        return catalogResolver;
    }

    /**
     * Get the default parser used for unregistered content-types.
     * @return the parser returned by {@link #getAt(Object)} when no parser is
     *   registered for a content-type
     */
    public Closure getDefaultParser() {
        return this.defaultParser;
    }

    /**
     * Set the default parser used for unregistered content-types.
     * @param defaultParser the parser to use, or <code>null</code> to restore
     *   {@link #DEFAULT_PARSER}
     */
    public void setDefaultParser( Closure defaultParser ) {
        // A null argument resets to the built-in parser; the field is never
        // left null, so getAt() can always return a usable closure.
        this.defaultParser = ( defaultParser == null ) ? DEFAULT_PARSER : defaultParser;
    }

    /**
     * Retrieve a parser for the given response content-type string.  This
     * is called by HTTPBuilder to retrieve the correct parser for a given
     * content-type.  The parser is then used to decode the response data prior
     * to passing it to a response handler.
     * @param contentType content-type to look up; its <code>toString()</code>
     *   value is used, and anything from the first ';' onward (e.g. a charset
     *   parameter) is ignored
     * @return parser that can interpret the given response content type,
     *   or the default parser if no parser is registered for the given
     *   content-type.  It should NOT return a null value.
     */
    public Closure getAt( Object contentType ) {
        String ct = contentType.toString();
        int idx = ct.indexOf( ';' );
        if ( idx > 0 ) ct = ct.substring( 0, idx );
        // Drop surrounding whitespace (e.g. "text/xml ; charset=UTF-8") so it
        // does not cause a spurious lookup miss.
        ct = ct.trim();

        Closure parser = registeredParsers.get( ct );
        if ( parser != null ) return parser;

        log.warn( "Cannot find parser for content-type: " + ct
                + " -- using default parser.");
        return defaultParser;
    }

    /**
     * Register a new parser for the given content-type.  The parser closure
     * should accept an {@link HttpResponse} argument and return a type suitable
     * to be passed as the 'parsed data' argument of a
     * {@link RequestConfigDelegate#getResponse() response handler} closure.
     * @param contentType <code>content-type</code> string, or a
     *   {@link ContentType}, in which case the parser is registered for every
     *   one of its content-type strings
     * @param value code that will parse the HttpResponse and return parsed
     *   data to the response handler.
     */
    public void putAt( Object contentType, Closure value ) {
        if ( contentType instanceof ContentType ) {
            for ( String ct : ((ContentType)contentType).getContentTypeStrings() )
                this.registeredParsers.put( ct, value );
        }
        else this.registeredParsers.put( contentType.toString(), value );
    }

    /**
     * Alias for {@link #getAt(Object)} to allow property-style access.
     * @param key content-type string
     * @return the parser registered for the key, or the default parser
     */
    public Closure propertyMissing( Object key ) {
        return this.getAt( key );
    }

    /**
     * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
     * @param key content-type string
     * @param value parser closure
     */
    public void propertyMissing( Object key, Closure value ) {
        this.putAt( key, value );
    }

    /**
     * Iterate over the entire parser map.
     * @return an iterator over the registered content-type/parser entries
     */
    public Iterator<Map.Entry<String,Closure>> iterator() {
        return this.registeredParsers.entrySet().iterator();
    }
}