/**
 * This file is released under the GNU General Public License.
 * Refer to the COPYING file distributed with this package.
 *
 * Copyright (c) 2008-2010 WURFL-Pro srl
 */

package net.sourceforge.wurfl.core.handlers.matchers.strategy;

import java.util.Iterator;
import java.util.SortedSet;

import org.apache.commons.lang.text.StrBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Singleton class responsible for applying the <strong>Levenshtein Distance
 * Algorithm</strong> to match strings. This class uses a custom
 * implementation.
 * 
 * @author Fantayeneh Asres Gizaw
 * @author Filippo De Luca
 * 
 * @version $Id: LDMatcher.java 432 2010-05-06 12:12:53Z filippo.deluca $
 */
public final class LDMatcher implements StringMatcher {

	/** Singleton instance */
	public static final LDMatcher INSTANCE = new LDMatcher();

	/** Log */
	private static final Log LOG = LogFactory.getLog(LDMatcher.class);

	/** private constructor to avoid building */
	private LDMatcher() {
		// Empty
	}

	// TODO extract to interface
	/**
	 * Returns the short name of this matching strategy.
	 * 
	 * @return the constant string "LD".
	 */
	public String getName() {

		return "LD";
	}

	/**
	 * Searches for the candidate which has the smallest Levenshtein distance
	 * from the given needle.
	 * 
	 * <p>
	 * A candidate is accepted only if it is identical to the needle or if its
	 * distance is strictly smaller than <code>tolerance</code>; among accepted
	 * candidates the first one (in iteration order) with the smallest distance
	 * wins. The search stops as soon as an exact match is found. Candidates
	 * whose length differs from the needle's length by more than
	 * <code>tolerance</code> are skipped without computing the distance,
	 * because their distance is at least that length difference.
	 * </p>
	 * 
	 * @param candidates
	 *            The SortedSet of possible candidates; every element is cast
	 *            to String.
	 * @param needle
	 *            The String to match, must not be null. An empty needle is
	 *            matched like any other string.
	 * @param tolerance
	 *            the tolerance between needle and candidates.
	 * 
	 * @return Matched candidate String, or null if no candidate is accepted.
	 */
	public String match(SortedSet candidates, String needle, int tolerance) {

		trace(candidates, needle, tolerance);

		// Read once, before the loop: loop-invariant, and a null needle
		// fails here regardless of the candidates.
		int needleLength = needle.length();

		String match = null;
		int best = tolerance;

		Iterator cIt = candidates.iterator();

		while (cIt.hasNext()) {

			String key = (String) cIt.next();

			// The distance can never be smaller than the length difference.
			if (Math.abs(key.length() - needleLength) > tolerance) {
				continue;
			}

			int current = getLevenshteinDistance(key, needle, tolerance);

			if (current == 0) {
				// Exact match: nothing can beat it.
				return key;
			}

			if (current < best) {
				best = current;
				match = key;
			}
		}

		return match;
	}

	/**
	 * Logs, at trace level, the tolerance, the needle and the complete list of
	 * candidates. Does nothing when trace logging is disabled.
	 * 
	 * @param candidates
	 *            The SortedSet of possible candidates.
	 * @param needle
	 *            The String to match.
	 * @param tolerance
	 *            the tolerance between needle and candidates.
	 */
	private void trace(SortedSet candidates, String needle, int tolerance) {
		if (LOG.isTraceEnabled()) {
			StrBuilder sb = new StrBuilder("Applying LD(").append(tolerance)
					.append(") on: ").append(needle).append(
							" using candidates: [");
			for (Iterator uIt = candidates.iterator(); uIt.hasNext();) {
				sb.append(uIt.next());

				if (uIt.hasNext()) {
					sb.append(", ");
				}
			}
			sb.append("]");

			LOG.trace(sb.toString());
		}
	}

	/**
	 * <p>
	 * Find the Levenshtein distance between two Strings, giving up early when
	 * the distance is certain to exceed the given tolerance.
	 * </p>
	 * 
	 * <p>
	 * The distance is the number of changes needed to change one String into
	 * another, where each change is a single character modification (deletion,
	 * insertion or substitution).
	 * </p>
	 * 
	 * <p>
	 * The two-row technique used here is the one written by Chas Emerick, see
	 * <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/
	 * ldjava.htm</a>. It avoids allocating the full distance matrix.
	 * </p>
	 * 
	 * <pre>
	 * getLevenshteinDistance(null, *, *)                  = IllegalArgumentException
	 * getLevenshteinDistance(*, null, *)                  = IllegalArgumentException
	 * getLevenshteinDistance(&quot;&quot;, &quot;&quot;, 3)                 = 0
	 * getLevenshteinDistance(&quot;&quot;, &quot;a&quot;, 3)                = 1
	 * getLevenshteinDistance(&quot;frog&quot;, &quot;fog&quot;, 3)          = 1
	 * getLevenshteinDistance(&quot;fly&quot;, &quot;ant&quot;, 3)           = 3
	 * getLevenshteinDistance(&quot;hello&quot;, &quot;hallo&quot;, 3)       = 1
	 * getLevenshteinDistance(&quot;hello&quot;, &quot;hallo&quot;, 0)       = Integer.MAX_VALUE
	 * getLevenshteinDistance(&quot;hippo&quot;, &quot;zzzzzzzz&quot;, 3)    = Integer.MAX_VALUE
	 * </pre>
	 * 
	 * @param s
	 *            the first String, must not be null
	 * @param t
	 *            the second String, must not be null
	 * @param tolerance
	 *            the largest distance the caller is interested in. With a
	 *            tolerance of 0 only equality is checked.
	 * @return the exact distance whenever it does not exceed
	 *         <code>tolerance</code>; otherwise a value greater than
	 *         <code>tolerance</code>, which is either the exact distance or
	 *         <code>Integer.MAX_VALUE</code> when the computation was
	 *         abandoned early.
	 * @throws IllegalArgumentException
	 *             if either String input <code>null</code>
	 */
	public static int getLevenshteinDistance(String s, String t, int tolerance) {
		if (s == null || t == null) {
			throw new IllegalArgumentException("Strings must not be null");
		}

		// With no tolerance the only acceptable outcome is equality.
		if (tolerance == 0) {
			return s.equals(t) ? 0 : Integer.MAX_VALUE;
		}

		/*
		 * Rather than creating and retaining a matrix of size s.length()+1 by
		 * t.length()+1, we maintain two single-dimensional arrays of length
		 * s.length()+1. The first, d, is the 'current working' distance array
		 * that maintains the newest distance cost counts as we iterate through
		 * the characters of String s. Each time we increment the index of
		 * String t we are comparing, d and p are switched, so that p retains
		 * the previous cost counts as required by the algorithm (taking the
		 * minimum of the cost count to the left, up one, and diagonally up and
		 * to the left of the current cost count being calculated).
		 */

		int n = s.length(); // length of s
		int m = t.length(); // length of t

		if (n == 0) {
			return m;
		} else if (m == 0) {
			return n;
		}

		int p[] = new int[n + 1]; // 'previous' cost array, horizontally
		int d[] = new int[n + 1]; // cost array, horizontally
		int _d[]; // placeholder to assist in swapping p and d

		// indexes into strings s and t
		int i; // iterates through s
		int j; // iterates through t

		char t_j; // jth character of t

		int cost; // cost

		int rowMin; // smallest cost in the row being computed

		for (i = 0; i <= n; i++) {
			p[i] = i;
		}

		for (j = 1; j <= m; j++) {
			t_j = t.charAt(j - 1);
			d[0] = j;
			rowMin = d[0];

			for (i = 1; i <= n; i++) {
				cost = s.charAt(i - 1) == t_j ? 0 : 1;
				// minimum of cell to the left+1, to the top+1, diagonally left
				// and up +cost
				d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1]
						+ cost);

				if (d[i] < rowMin) {
					rowMin = d[i];
				}
			}

			// Early exit: every edit path to the final cell crosses this row
			// and costs never decrease along a path, so the final distance is
			// at least the row minimum. Once that exceeds the tolerance the
			// result cannot be within tolerance any more.
			if (rowMin > tolerance) {
				return Integer.MAX_VALUE;
			}

			// switch current distance counts to 'previous row' distance counts
			_d = p;
			p = d;
			d = _d;
		}

		// our last action in the above loop was to switch d and p, so p now
		// actually has the most recent cost counts
		return p[n];
	}

	/**
	 * Returns the name of this matcher, same as {@link #getName()}.
	 * 
	 * @return the matcher name.
	 */
	public String toString() {

		return getName();
	}

}
