// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.4
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package au.id.jericho.lib.html;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Extracts the textual content from HTML markup.
 * <p>
 * The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
 * especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
 * <p>
 * Use one of the following methods to obtain the output:
 * <ul>
 *  <li>{@link #writeTo(Writer)}</li>
 *  <li>{@link #toString()}</li>
 *  <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
 * </ul>
 * <p>
 * The process removes all of the tags and
 * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
 * Tags are also converted to whitespace unless they belong to an
 * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
 * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to whitespace despite being an inline-level element.
 * <p>
 * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
 * is ignored.
 * <p>
 * Extracting the text from an entire {@link Source} object automatically calls the {@link Source#fullSequentialParse()} method internally.
 * <p>
 * To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
 * <dl>
 *  <dt>Example:</dt>
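 *  <dd>With the {@link #setIncludeAttributes(boolean) IncludeAttributes} property set to <code>true</code>, the source segment:<br />
 *   "<code>&lt;div&gt;&lt;b&gt;O&lt;/b&gt;ne&lt;/div&gt;&lt;div title="Two"&gt;&lt;b&gt;Th&lt;/b&gt;&lt;script&gt;//a&nbsp;script&nbsp;&lt;/script&gt;ree&lt;/div&gt;</code>"<br />
 *   produces the text "<code>One Two Three</code>". Using the default settings, the same segment produces the text "<code>One Three</code>".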
 * </dl>
 */
public final class TextExtractor implements CharStreamSource {
	private final Segment segment;
	private boolean includeAttributes=false;

	/**
	 * Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
	 * @param segment  the segment from which the text will be extracted.
	 * @see Segment#getTextExtractor()
	 */
	public TextExtractor(final Segment segment) {
		this.segment=segment;
	}

	// Documentation inherited from CharStreamSource
	// Implementation note: the complete output is built in memory by toString() before being written,
	// and the writer is flushed (but not closed) afterwards.
	public void writeTo(final Writer writer) throws IOException {
		writer.write(toString());
		writer.flush();
	}

	// Documentation inherited from CharStreamSource
	// Implementation note: the length of the source segment is used as the estimate.
	public long getEstimatedMaximumOutputLength() {
		return segment.length();
	}

	// Documentation inherited from CharStreamSource
	// Implementation note: every call performs a fresh extraction using the current property settings.
	public String toString() {
		return new Processor(segment,getIncludeAttributes()).toString();
	}

	/**
	 * Sets whether the values of 
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
	 * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>
	 * attributes of {@linkplain StartTagType#NORMAL normal} tags are to be included in the output.
	 * <p>
	 * An attribute that is present but has no value (such as <code>&lt;img alt&gt;</code>) contributes nothing to the output.
	 * <p>
	 * The default value is <code>false</code>.
	 *
	 * @param includeAttributes  specifies whether the attribute values are included in the output.
	 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement. 
	 * @see #getIncludeAttributes()
	 */
	public TextExtractor setIncludeAttributes(boolean includeAttributes) {
		this.includeAttributes=includeAttributes;
		return this;
	}
	
	/**
	 * Indicates whether the values of 
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
	 * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>
	 * attributes of {@linkplain StartTagType#NORMAL normal} tags are to be included in the output.
	 * <p>
	 * See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
	 * 
	 * @return <code>true</code> if the attribute values are to be included in the output, otherwise <code>false</code>.
	 */
	public boolean getIncludeAttributes() {
		return includeAttributes;
	}
	
	/**
	 * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
	 * Note at present this is not implemented in a memory-efficient manner.
	 * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (coming in release 3.0),
	 * the main algorithm will be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
	 */
	private static final class Processor {
		/**
		 * Names of the attributes whose values are copied to the output when includeAttributes is set, in output order:
		 * title (any element), alt (APPLET, AREA, IMG and INPUT elements), label (OPTION and OPTGROUP elements)
		 * and summary (TABLE element).
		 * The prompt attribute of the ISINDEX element is deliberately left out as the element is deprecated and very rarely used.
		 */
		private static final String[] INCLUDED_ATTRIBUTE_NAMES={"title","alt","label","summary"};

		private final Segment segment;
		private final Source source;
		private final boolean includeAttributes;

		public Processor(final Segment segment, final boolean includeAttributes) {
			this.segment=segment;
			source=segment.source;
			this.includeAttributes=includeAttributes;	
		}
		
		/**
		 * Builds the extracted text.
		 * <p>
		 * The source text lying between tags is copied into a buffer, tags are dropped or replaced by a space,
		 * and the buffer is finally passed through CharacterReference.decodeCollapseWhiteSpace.
		 *
		 * @return the decoded text with white space collapsed.
		 */
		public String toString() {
			final StringBuffer sb=new StringBuffer(segment.length());
			// position in the source up to which everything has already been copied or deliberately skipped
			int textBegin=segment.begin;
			// use segment.findAllTags().iterator() instead of segment.source.findNextTag(textBegin) to take advantage of allTags cache in Source object
			for (final Iterator i=segment.findAllTags().iterator(); i.hasNext();) {
				final Tag tag=(Tag)i.next();
				final int textEnd=tag.begin;
				// Ignore tags that begin inside a region that has already been consumed.
				// This is also what skips every tag inside a SCRIPT or STYLE element, including its end tag,
				// so no separate iterator fast-forward (which would rely on Tag instance identity) is needed.
				if (textEnd<textBegin) continue;
				appendSourceText(sb,textBegin,textEnd);
				if (tag.getTagType()==StartTagType.NORMAL) {
					if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE) {
						final EndTag endTag=source.findNextEndTag(tag.end,tag.name,EndTagType.NORMAL);
						if (endTag!=null) {
							// drop the whole element: resume copying text after its end tag
							textBegin=endTag.end;
							continue;
						}
						// no end tag found: fall through and treat the start tag like any other tag
					}
					if (includeAttributes) {
						final StartTag startTag=(StartTag)tag;
						for (int n=0; n<INCLUDED_ATTRIBUTE_NAMES.length; n++) {
							appendAttributeValue(sb,startTag,INCLUDED_ATTRIBUTE_NAMES[n]);
						}
					}
				}
				// Treat tags not belonging to inline-level elements as whitespace:
				if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
				textBegin=tag.end;
			}
			// copy whatever text follows the last tag
			appendSourceText(sb,textBegin,segment.end);
			return CharacterReference.decodeCollapseWhiteSpace(sb);
		}

		/**
		 * Appends the source characters in the range [begin,end) to the buffer.
		 * Does nothing if begin is not less than end.
		 */
		private void appendSourceText(final StringBuffer sb, final int begin, final int end) {
			for (int pos=begin; pos<end; pos++) sb.append(source.charAt(pos));
		}

		/**
		 * Appends the value of the named attribute of the start tag to the buffer, surrounded by single spaces.
		 * Nothing is appended if the attribute is absent or if its value segment is null,
		 * which avoids writing the literal text "null" into the output.
		 */
		private static void appendAttributeValue(final StringBuffer sb, final StartTag startTag, final String attributeName) {
			final Attribute attribute=startTag.getAttributes().get(attributeName);
			if (attribute==null) return;
			final Segment valueSegment=attribute.getValueSegment();
			if (valueSegment==null) return;
			sb.append(' ').append(valueSegment).append(' ');
		}
	}
}
