001/*
002 * Copyright 2008-2011 Thomas Nichols.  http://blog.thomnichols.org
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * You are receiving this code free of charge, which represents many hours of
017 * effort from other individuals and corporations.  As a responsible member
018 * of the community, you are encouraged (but not required) to donate any
019 * enhancements or improvements back to the community under a similar open
020 * source license.  Thank you. -TMN
021 */
022package groovyx.net.http;
023
024import groovy.json.JsonSlurper;
025import groovy.lang.Closure;
026import groovy.util.XmlSlurper;
027import groovy.util.slurpersupport.GPathResult;
028import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;
029
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.InputStreamReader;
033import java.io.Reader;
034import java.io.UnsupportedEncodingException;
035import java.net.URL;
036import java.nio.charset.Charset;
037import java.util.HashMap;
038import java.util.Iterator;
039import java.util.List;
040import java.util.Map;
041
042import javax.xml.parsers.ParserConfigurationException;
043
044import org.apache.commons.logging.Log;
045import org.apache.commons.logging.LogFactory;
046import org.apache.http.HttpEntity;
047import org.apache.http.HttpResponse;
048import org.apache.http.NameValuePair;
049import org.apache.http.client.utils.URLEncodedUtils;
050import org.apache.http.entity.HttpEntityWrapper;
051import org.apache.http.message.BasicHeader;
052import org.apache.xml.resolver.Catalog;
053import org.apache.xml.resolver.CatalogManager;
054import org.apache.xml.resolver.tools.CatalogResolver;
055import org.codehaus.groovy.runtime.MethodClosure;
056import org.xml.sax.SAXException;
057import org.xml.sax.XMLReader;
058
059
060/**
061 * <p>Keeps track of response parsers for each content type.  Each parser
062 * should should be a closure that accepts an {@link HttpResponse} instance,
063 * and returns whatever handler is appropriate for reading the response
064 * data for that content-type.  For example, a plain-text response should
065 * probably be parsed with a <code>Reader</code>, while an XML response
066 * might be parsed by an XmlSlurper, which would then be passed to the
067 * response closure. </p>
068 *
069 * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
070 * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
071 * a NullPointerException is not thrown by passing a response that contains no
072 * entity.</p>
073 *
074 * <p>You can see the list of content-type parsers that are built-in to the
075 * ParserRegistry class in {@link #buildDefaultParserMap()}.</p>
076 *
077 * @see ContentType
078 * @author <a href='mailto:tomstrummer+httpbuilder@gmail.com'>Tom Nichols</a>
079 */
080public class ParserRegistry {
081
082    /**
083     * The default parser used for unregistered content-types.  This is a copy
084     * of {@link #parseStream(HttpResponse)}, which is like a no-op that just
085     * returns the unaltered response stream.
086     */
087    protected final Closure DEFAULT_PARSER = new MethodClosure( this, "parseStream" );
088    /**
089     * The default charset to use when no charset is given in the Content-Type
090     * header of a response.  This can be modifid via {@link #setDefaultCharset(String)}.
091     */
092    public static final String DEFAULT_CHARSET = "UTF-8";
093
094    private Closure defaultParser = DEFAULT_PARSER;
095    private Map<String,Closure> registeredParsers = buildDefaultParserMap();
096    private static String defaultCharset = DEFAULT_CHARSET;
097
098    protected static final Log log = LogFactory.getLog( ParserRegistry.class );
099
100    /**
101     * This CatalogResolver is static to avoid the overhead of re-parsing
102     * the catalog definition file every time.  Unfortunately, there's no
103     * way to share a single Catalog instance between resolvers.  The
104     * {@link Catalog} class is technically not thread-safe, but as long as you
105     * do not parse catalog files while using the resolver, it should be fine.
106     */
107    protected static CatalogResolver catalogResolver;
108
109    static {
110        CatalogManager catalogManager = new CatalogManager();
111        catalogManager.setIgnoreMissingProperties( true );
112        catalogManager.setUseStaticCatalog( false );
113        catalogManager.setRelativeCatalogs( true );
114        try {
115            catalogResolver = new CatalogResolver( catalogManager );
116            catalogResolver.getCatalog().parseCatalog(
117                    ParserRegistry.class.getResource( "/catalog/html.xml" ) );
118        } catch ( IOException ex ) {
119            LogFactory.getLog( ParserRegistry.class )
120                .warn( "Could not resolve default XML catalog", ex );
121        }
122    }
123
124    /**
125     * Set the charset to use for parsing character streams when no charset
126     * is given in the Content-Type header.
127     * @param charset the charset to use, or <code>null</code> to use
128     *     {@link #DEFAULT_CHARSET}
129     */
130    public static void setDefaultCharset( String charset ) {
131        defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
132    }
133
134    /**
135     * Helper method to get the charset from the response.  This should be done
136     * when manually parsing any text response to ensure it is decoded using the
137     * correct charset. For instance:<pre>
138     * Reader reader = new InputStreamReader( resp.getEntity().getContent(),
139     *   ParserRegistry.getCharset( resp ) );</pre>
140     * @param resp
141     */
142    public static String getCharset( HttpResponse resp ) {
143        try {
144            NameValuePair charset = resp.getEntity().getContentType()
145                .getElements()[0].getParameterByName("charset");
146
147            if ( charset == null || charset.getValue().trim().equals("") ) {
148                log.debug( "Could not find charset in response; using " + defaultCharset );
149                return defaultCharset;
150            }
151
152            return charset.getValue();
153        }
154        catch ( RuntimeException ex ) { // NPE or OOB Exceptions
155            log.warn( "Could not parse charset from content-type header in response" );
156            return Charset.defaultCharset().name();
157        }
158    }
159
160    /**
161     * Helper method to get the content-type string from the response
162     * (no charset).
163     * @param resp
164     */
165    public static String getContentType( HttpResponse resp ) {
166        if ( resp.getEntity() == null )
167            throw new IllegalArgumentException( "Response does not contain data" );
168        if ( resp.getEntity().getContentType() == null )
169            throw new IllegalArgumentException( "Response does not have a content-type header" );
170        try {
171            return resp.getEntity().getContentType().getElements()[0].getName();
172        }
173        catch ( RuntimeException ex ) {  // NPE or OOB Exceptions
174            throw new IllegalArgumentException( "Could not parse content-type from response" );
175        }
176    }
177
178    /**
179     * Default parser used for binary data.  This simply returns the underlying
180     * response InputStream.
181     * @see ContentType#BINARY
182     * @see HttpEntity#getContent()
183     * @param resp
184     * @return an InputStream the binary response stream
185     * @throws IllegalStateException
186     * @throws IOException
187     */
188    public InputStream parseStream( HttpResponse resp ) throws IOException {
189        return resp.getEntity().getContent();
190    }
191
192    /**
193     * Default parser used to handle plain text data.  The response text
194     * is decoded using the charset passed in the response content-type
195     * header.
196     * @see ContentType#TEXT
197     * @param resp
198     * @return
199     * @throws UnsupportedEncodingException
200     * @throws IllegalStateException
201     * @throws IOException
202     */
203    public Reader parseText( HttpResponse resp ) throws IOException {
204        return new InputStreamReader( resp.getEntity().getContent(),
205                ParserRegistry.getCharset( resp ) );
206    }
207
208    /**
209     * Default parser used to decode a URL-encoded response.
210     * @see ContentType#URLENC
211     * @param resp
212     * @return
213     * @throws IOException
214     */
215    public Map<String,String> parseForm( final HttpResponse resp ) throws IOException {
216        HttpEntity entity = resp.getEntity();
217        /* URLEncodedUtils won't parse the content unless the content-type is
218           application/x-www-form-urlencoded.  Since we want to be able to force
219           parsing regardless of what the content-type header says, we need to
220           'spoof' the content-type if it's not already acceptable. */
221        if ( ! ContentType.URLENC.toString().equals( ParserRegistry.getContentType( resp ) ) ) {
222            entity = new HttpEntityWrapper( entity ) {
223                @Override public org.apache.http.Header getContentType() {
224                    String value = ContentType.URLENC.toString();
225                    String charset = ParserRegistry.getCharset( resp );
226                    if ( charset != null ) value += "; charset=" + charset;
227                    return new BasicHeader( "Content-Type", value );
228                };
229            };
230        }
231        List<NameValuePair> params = URLEncodedUtils.parse( entity );
232        Map<String,String> paramMap = new HashMap<String,String>(params.size());
233        for ( NameValuePair param : params )
234            paramMap.put( param.getName(), param.getValue() );
235        return paramMap;
236    }
237
238    /**
239     * Parse an HTML document by passing it through the NekoHTML parser.
240     * @see ContentType#HTML
241     * @see org.cyberneko.html.parsers.SAXParser
242     * @see XmlSlurper#parse(Reader)
243     * @param resp HTTP response from which to parse content
244     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
245     * @throws IOException
246     * @throws SAXException
247     */
248    public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
249        XMLReader p = new org.cyberneko.html.parsers.SAXParser();
250        p.setEntityResolver( catalogResolver );
251        return new XmlSlurper( p ).parse( parseText( resp ) );
252    }
253
254    /**
255     * Default parser used to decode an XML response.
256     * @see ContentType#XML
257     * @see XmlSlurper#parse(Reader)
258     * @param resp HTTP response from which to parse content
259     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
260     * @throws IOException
261     * @throws SAXException
262     * @throws ParserConfigurationException
263     */
264    public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
265        XmlSlurper xml = new XmlSlurper();
266        xml.setEntityResolver( catalogResolver );
267        return xml.parse( parseText( resp ) );
268    }
269
270    /**
271     * Default parser used to decode a JSON response.
272     * @see ContentType#JSON
273     * @param resp
274     * @return
275     * @throws IOException
276     */
277    public Object parseJSON( HttpResponse resp ) throws IOException {
278        // there is a bug in the JsonSlurper.parse method...
279        //String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );
280        return new JsonSlurper().parse( parseText( resp ) );
281    }
282
283    /**
284     * <p>Returns a map of default parsers.  Override this method to change
285     * what parsers are registered by default.  A 'parser' is really just a
286     * closure that acceipts an {@link HttpResponse} instance and returns
287     * some parsed data.  You can of course call
288     * <code>super.buildDefaultParserMap()</code> and then add or remove
289     * from that result as well.</p>
290     *
291     * <p>Default registered parsers are:
292     * <ul>
293     * <li>{@link ContentType#BINARY} :  {@link #parseStream(HttpResponse) parseStream()}</li>
294     * <li>{@link ContentType#TEXT} :  {@link #parseText(HttpResponse) parseText()}</li>
295     * <li>{@link ContentType#URLENC} :  {@link #parseForm(HttpResponse) parseForm()}</li>
296     * <li>{@link ContentType#XML} :  {@link #parseXML(HttpResponse) parseXML()}</li>
297     * <li>{@link ContentType#JSON} :  {@link #parseJSON(HttpResponse) parseJSON()}</li>
298     * </ul>
299     */
300    protected Map<String,Closure> buildDefaultParserMap() {
301        Map<String,Closure> parsers = new HashMap<String,Closure>();
302
303        parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
304        parsers.put( ContentType.TEXT.toString(), new MethodClosure(this,"parseText") );
305        parsers.put( ContentType.URLENC.toString(), new MethodClosure(this,"parseForm") );
306        parsers.put( ContentType.HTML.toString(), new MethodClosure(this,"parseHTML") );
307
308        Closure pClosure = new MethodClosure(this,"parseXML");
309        for ( String ct : ContentType.XML.getContentTypeStrings() )
310            parsers.put( ct, pClosure );
311
312        pClosure = new MethodClosure(this,"parseJSON");
313        for ( String ct : ContentType.JSON.getContentTypeStrings() )
314            parsers.put( ct, pClosure );
315
316        return parsers;
317    }
318
319    /**
320     * Add a new XML catalog definiton to the static XML resolver catalog.
321     * See the <a href='http://fisheye.codehaus.org/browse/gmod/httpbuilder/trunk/src/main/resources/catalog/html.xml?r=root:'>
322     * HTTPBuilder source catalog</a> for an example.
323     *
324     * @param catalogLocation URL of a catalog definition file
325     * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
326     */
327    public static void addCatalog( URL catalogLocation ) throws IOException {
328        catalogResolver.getCatalog().parseCatalog( catalogLocation );
329    }
330
331    /**
332     * Access the default catalog used by all HTTPBuilder instances.
333     * @return the static {@link CatalogResolver} instance
334     */
335    public static CatalogResolver getCatalogResolver() {
336        return catalogResolver;
337    }
338
339    /**
340     * Get the default parser used for unregistered content-types.
341     * @return
342     */
343    public Closure getDefaultParser() {
344        return this.defaultParser;
345    }
346
347    /**
348     * Set the default parser used for unregistered content-types.
349     * @param defaultParser if
350     */
351    public void setDefaultParser( Closure defaultParser ) {
352        if ( defaultParser == null ) this.defaultParser = DEFAULT_PARSER;
353        this.defaultParser = defaultParser;
354    }
355
356    /**
357     * Retrieve a parser for the given response content-type string.  This
358     * is called by HTTPBuildre to retrieve the correct parser for a given
359     * content-type.  The parser is then used to decode the response data prior
360     * to passing it to a response handler.
361     * @param contentType
362     * @return parser that can interpret the given response content type,
363     *   or the default parser if no parser is registered for the given
364     *   content-type.  It should NOT return a null value.
365     */
366    public Closure getAt( Object contentType ) {
367        String ct = contentType.toString();
368        int idx = ct.indexOf( ';' );
369        if ( idx > 0 ) ct = ct.substring( 0, idx );
370
371        Closure parser = registeredParsers.get(ct);
372        if ( parser != null ) return parser;
373
374        log.warn( "Cannot find parser for content-type: " + ct
375                    + " -- using default parser.");
376        return defaultParser;
377    }
378
379    /**
380     * Register a new parser for the given content-type.  The parser closure
381     * should accept an {@link HttpResponse} argument and return a type suitable
382     * to be passed as the 'parsed data' argument of a
383     * {@link RequestConfigDelegate#getResponse() response handler} closure.
384     * @param contentType  <code>content-type</code> string
385     * @param value code that will parse the HttpResponse and return parsed
386     *   data to the response handler.
387     */
388    public void putAt( Object contentType, Closure value ) {
389        if ( contentType instanceof ContentType ) {
390            for ( String ct : ((ContentType)contentType).getContentTypeStrings() )
391                this.registeredParsers.put( ct, value );
392        }
393        else this.registeredParsers.put( contentType.toString(), value );
394    }
395
396    /**
397     * Alias for {@link #getAt(Object)} to allow property-style access.
398     * @param key content-type string
399     * @return
400     */
401    public Closure propertyMissing( Object key ) {
402        return this.getAt( key );
403    }
404
405    /**
406     * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
407     * @param key content-type string
408     * @param value parser closure
409     */
410    public void propertyMissing( Object key, Closure value ) {
411        this.putAt( key, value );
412    }
413
414    /**
415     * Iterate over the entire parser map
416     * @return
417     */
418    public Iterator<Map.Entry<String,Closure>> iterator() {
419        return this.registeredParsers.entrySet().iterator();
420    }
421}