//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file defines a Levenshtein distance function that works for any two /// sequences, with each element of each sequence being analogous to a character /// in a string. /// //===----------------------------------------------------------------------===// #ifndef LLVM_ADT_EDIT_DISTANCE_H #define LLVM_ADT_EDIT_DISTANCE_H #include "llvm/ADT/ArrayRef.h" #include namespace llvm { /// Determine the edit distance between two sequences. /// /// \param FromArray the first sequence to compare. /// /// \param ToArray the second sequence to compare. /// /// \param Map A Functor to apply to each item of the sequences before /// comparison. /// /// \param AllowReplacements whether to allow element replacements (change one /// element into another) as a single operation, rather than as two operations /// (an insertion and a removal). /// /// \param MaxEditDistance If non-zero, the maximum edit distance that this /// routine is allowed to compute. If the edit distance will exceed that /// maximum, returns \c MaxEditDistance+1. /// /// \returns the minimum number of element insertions, removals, or (if /// \p AllowReplacements is \c true) replacements needed to transform one of /// the given sequences into the other. If zero, the sequences are identical. template unsigned ComputeMappedEditDistance(ArrayRef FromArray, ArrayRef ToArray, Functor Map, bool AllowReplacements = true, unsigned MaxEditDistance = 0) { // The algorithm implemented below is the "classic" // dynamic-programming algorithm for computing the Levenshtein // distance, which is described here: // // http://en.wikipedia.org/wiki/Levenshtein_distance // // Although the algorithm is typically described using an m x n // array, only one row plus one element are used at a time, so this // implementation just keeps one vector for the row. To update one entry, // only the entries to the left, top, and top-left are needed. The left // entry is in Row[x-1], the top entry is what's in Row[x] from the last // iteration, and the top-left entry is stored in Previous. typename ArrayRef::size_type m = FromArray.size(); typename ArrayRef::size_type n = ToArray.size(); if (MaxEditDistance) { // If the difference in size between the 2 arrays is larger than the max // distance allowed, we can bail out as we will always need at least // MaxEditDistance insertions or removals. typename ArrayRef::size_type AbsDiff = m > n ? m - n : n - m; if (AbsDiff > MaxEditDistance) return MaxEditDistance + 1; } SmallVector Row(n + 1); for (unsigned i = 1; i < Row.size(); ++i) Row[i] = i; for (typename ArrayRef::size_type y = 1; y <= m; ++y) { Row[0] = y; unsigned BestThisRow = Row[0]; unsigned Previous = y - 1; const auto &CurItem = Map(FromArray[y - 1]); for (typename ArrayRef::size_type x = 1; x <= n; ++x) { int OldRow = Row[x]; if (AllowReplacements) { Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), std::min(Row[x - 1], Row[x]) + 1); } else { if (CurItem == Map(ToArray[x - 1])) Row[x] = Previous; else Row[x] = std::min(Row[x-1], Row[x]) + 1; } Previous = OldRow; BestThisRow = std::min(BestThisRow, Row[x]); } if (MaxEditDistance && BestThisRow > MaxEditDistance) return MaxEditDistance + 1; } unsigned Result = Row[n]; return Result; } template unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, bool AllowReplacements = true, unsigned MaxEditDistance = 0) { return ComputeMappedEditDistance( FromArray, ToArray, [](const T &X) -> const T & { return X; }, AllowReplacements, MaxEditDistance); } } // End llvm namespace #endif