EditDistance.java
/*
* Copyright (C) 2016 Alberto Irurueta Carro (alberto@irurueta.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.irurueta.commons;
/**
* Utility class to compute Levenshtein distance (number of
* substitutions/modifications) between two arrays or strings.
* Levenshtein distance is an approximate comparison measure that has into
* account the number of modifications, added or removed elements required to
* make two collections equal. Thus, this distance measure is appropriate to
* compare similar but not exactly equal Strings (to take into account
* typographic errors, etc).
*/
public class EditDistance {
/**
* Constructor.
*/
protected EditDistance() {
}
/**
* Computes Levenshtein distance between two Strings.
*
* @param x 1st String.
* @param y 2nd String.
* @return Levenshtein distance.
*/
public static int stringDistance(final String x, final String y) {
return distance(new StringArrayWrapper(x), new StringArrayWrapper(y));
}
/**
* Computes Levenshtein distance between two arrays of objects of type
* T.
* NOTE: this method cannot be used with arrays of primitive types,
* instead an array of wrappers of primitive objects must be used or
* this class must be extended to add implementations of ArrayWrappers
* for all primitive types that must be supported.
*
* @param x 1st array of objects of type T.
* @param y 2nd array of objects of type T.
* @param <T> type of arrays.
* @return Levenshtein distance.
*/
public static <T> int distance(final T[] x, final T[] y) {
return distance(new GenericArrayWrapper<>(x),
new GenericArrayWrapper<>(y));
}
/**
* Computes Levenshtein distance between two arrays of type T. To compute
* this distance an ArrayWrapper is used to make a more efficient access to
* primitive datatypes or strings.
*
* @param x 1st array of objects of type T.
* @param y 2nd array of objects of type T.
* @param <T> type of arrays.
* @return Levenshtein distance.
*/
private static <T> int distance(final ArrayWrapper<T> x, final ArrayWrapper<T> y) {
// validate parameters
if (x == null || y == null) {
throw new IllegalArgumentException();
}
// Obtains length of both parameters. If one of them is 0, we return the
// length of the other, since that number of insertions will be required
final int n = x.length();
final int m = y.length();
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
// Instead of keeping a whole matrix (which would require an O(n*m)
// space, we just keep current and next rows, both of which have a length
// of m+1, and hence only O(n) space is required.
// Initialize current row.
int curRow = 0;
int nextRow = 1;
final int[][] rows = new int[][]{new int[m + 1], new int[m + 1]};
for (int j = 0; j <= m; ++j) {
rows[curRow][j] = j;
}
// for each virtual row (since we only keep two)
for (int i = 1; i <= n; ++i) {
// fill values of row
rows[nextRow][0] = i;
for (int j = 1; j <= m; ++j) {
final int dist1 = rows[curRow][j] + 1;
final int dist2 = rows[nextRow][j - 1] + 1;
final int dist3 = rows[curRow][j - 1] +
(x.equals(y, i - 1, j - 1) ? 0 : 1);
rows[nextRow][j] = Math.min(dist1, Math.min(dist2, dist3));
}
// exchanges current and next rows
if (curRow == 0) {
curRow = 1;
nextRow = 0;
} else {
curRow = 0;
nextRow = 1;
}
}
// returns computed distance
return rows[curRow][m];
}
/**
* Interface used as a wrapper for arrays of type T used to generalized
* efficient access to elements of an array of a generic data type T.
* String implementations are provided as well as implementations for
* generic objects of type T. If primitive data types arrays must be
* supported, then the necessary implementation will need to be added
* similarly to existent ones.
*
* @param <T> generic data type.
*/
protected interface ArrayWrapper<T> {
/**
* Obtains length of array.
*
* @return length of array.
*/
int length();
/**
* Compares this array in position x, with the other object at position
* y.
*
* @param other the other array wrapper.
* @param posX position within this array that must be compared.
* @param posY position within the other array that must be compared.
* @return true if elements in provided positions are equal, false
* otherwise.
*/
boolean equals(final ArrayWrapper<T> other, final int posX, final int posY);
}
/**
* Specific implementation of ArrayWrapper for Strings.
* Allows efficient comparison of String elements in a generic way.
*
* @author Alberto Irurueta Carro
*/
protected static class StringArrayWrapper implements ArrayWrapper<String> {
/**
* String to be compared.
*/
private final String mObject;
/**
* Constructor.
*
* @param object string to be compared.
*/
public StringArrayWrapper(final String object) {
mObject = object;
}
/**
* Length of the string to be compared.
*/
@Override
public int length() {
return mObject.length();
}
/**
* Compares the string contained in this instance at position x with
* the string contained in the other array wrapper at position y.
*
* @param other the other String array wrapper.
* @param posX position in this array to be compared.
* @param posY position in the other array to be compared.
* @return true if elements in provided positions are considered to
* be equal, false otherwise.
*/
@Override
public boolean equals(final ArrayWrapper<String> other, final int posX,
final int posY) {
if (!(other instanceof StringArrayWrapper)) {
return false;
}
final String otherObject = ((StringArrayWrapper) other).mObject;
return mObject.charAt(posX) == otherObject.charAt(posY);
}
}
/**
* Implementation of ArrayWrapper for generic objects of type T.
* Allows the efficient and generic comparison of objects of type T, as long
* as T is not a primitive data type.
*
* @param <T> generic data type.
*/
protected static class GenericArrayWrapper<T> implements ArrayWrapper<T[]> {
/**
* Generic array of type T to be compared.
*/
private final T[] mObject;
/**
* Constructor.
*
* @param object generic array of type T to be compared.
*/
public GenericArrayWrapper(final T[] object) {
mObject = object;
}
/**
* Length of array to be compared.
*/
@Override
public int length() {
return mObject.length;
}
/**
* Compares the array contained in this instance at position X with the
* array contained in the other array wrapper at position Y.
*
* @param other the other generic array wrapper.
* @param posX position in this array to be compared.
* @param posY position in the other array to be compared.
* @return true if elements in both arrays at provided positions are
* considered to be equal, false otherwise.
*/
@Override
public boolean equals(final ArrayWrapper<T[]> other, final int posX, final int posY) {
if (!(other instanceof GenericArrayWrapper)) {
return false;
}
final T[] otherObject = ((GenericArrayWrapper<T>) other).mObject;
return mObject[posX].equals(otherObject[posY]);
}
}
}