1% File src/library/utils/man/adist.Rd 2% Part of the R package, https://www.R-project.org 3% Copyright 2011-2015 R Core Team 4% Distributed under GPL 2 or later 5 6\name{adist} 7\alias{adist} 8\title{Approximate String Distances} 9\description{ 10 Compute the approximate string distance between character vectors. 11 The distance is a generalized Levenshtein (edit) distance, giving the 12 minimal possibly weighted number of insertions, deletions and 13 substitutions needed to transform one string into another. 14} 15\usage{ 16adist(x, y = NULL, costs = NULL, counts = FALSE, fixed = TRUE, 17 partial = !fixed, ignore.case = FALSE, useBytes = FALSE) 18} 19\arguments{ 20 \item{x}{a character vector. \link{Long vectors} are not supported.} 21 \item{y}{a character vector, or \code{NULL} (default) indicating 22 taking \code{x} as \code{y}.} 23 \item{costs}{a numeric vector or list with names partially matching 24 \samp{insertions}, \samp{deletions} and \samp{substitutions} giving 25 the respective costs for computing the Levenshtein distance, or 26 \code{NULL} (default) indicating using unit cost for all three 27 possible transformations.} 28 \item{counts}{a logical indicating whether to optionally return the 29 transformation counts (numbers of insertions, deletions and 30 substitutions) as the \code{"counts"} attribute of the return 31 value.} 32 \item{fixed}{a logical. If \code{TRUE} (default), the \code{x} 33 elements are used as string literals. Otherwise, they are taken as 34 regular expressions and \code{partial = TRUE} is implied 35 (corresponding to the approximate string distance used by 36 \code{\link{agrep}} with \code{fixed = FALSE}).} 37 \item{partial}{a logical indicating whether the transformed \code{x} 38 elements must exactly match the complete \code{y} elements, or only 39 substrings of these. The latter corresponds to the approximate 40 string distance used by \code{\link{agrep}} (by default).} 41 \item{ignore.case}{a logical. If \code{TRUE}, case is ignored for 42 computing the distances.} 43 \item{useBytes}{a logical. If \code{TRUE} distance computations are 44 done byte-by-byte rather than character-by-character.} 45} 46\value{ 47 A matrix with the approximate string distances of the elements of 48 \code{x} and \code{y}, with rows and columns corresponding to \code{x} 49 and \code{y}, respectively. 50 51 If \code{counts} is \code{TRUE}, the transformation counts are 52 returned as the \code{"counts"} attribute of this matrix, as a 53 3-dimensional array with dimensions corresponding to the elements of 54 \code{x}, the elements of \code{y}, and the type of transformation 55 (insertions, deletions and substitutions), respectively. 56 Additionally, if \code{partial = FALSE}, the transformation sequences 57 are returned as the \code{"trafos"} attribute of the return value, as 58 character strings with elements \samp{M}, \samp{I}, \samp{D} and 59 \samp{S} indicating a match, insertion, deletion and substitution, 60 respectively. If \code{partial = TRUE}, the offsets (positions of 61 the first and last element) of the matched substrings are returned as 62 the \code{"offsets"} attribute of the return value (with both offsets 63 \eqn{-1} in case of no match). 64} 65\details{ 66 The (generalized) Levenshtein (or edit) distance between two strings 67 \var{s} and \var{t} is the minimal possibly weighted number of 68 insertions, deletions and substitutions needed to transform \var{s} 69 into \var{t} (so that the transformation exactly matches \var{t}). 70 This distance is computed for \code{partial = FALSE}, currently using 71 a dynamic programming algorithm (see, e.g., 72 \url{https://en.wikipedia.org/wiki/Levenshtein_distance}) with space 73 and time complexity \eqn{O(mn)}, where \eqn{m} and \eqn{n} are the 74 lengths of \var{s} and \var{t}, respectively. Additionally computing 75 the transformation sequence and counts is \eqn{O(\max(m, n))}. 76 77 The generalized Levenshtein distance can also be used for approximate 78 (fuzzy) string matching, in which case one finds the substring of 79 \var{t} with minimal distance to the pattern \var{s} (which could be 80 taken as a regular expression, in which case the principle of using 81 the leftmost and longest match applies), see, e.g., 82 \url{https://en.wikipedia.org/wiki/Approximate_string_matching}. This 83 distance is computed for \code{partial = TRUE} using \samp{tre} by 84 Ville Laurikari (\url{https://github.com/laurikari/tre}) and 85 corresponds to the distance used by \code{\link{agrep}}. In this 86 case, the given cost values are coerced to integer. 87 88 Note that the costs for insertions and deletions can be different, in 89 which case the distance between \var{s} and \var{t} can be different 90 from the distance between \var{t} and \var{s}. 91} 92\seealso{ 93 \code{\link{agrep}} for approximate string matching (fuzzy matching) 94 using the generalized Levenshtein distance. 95} 96\examples{ 97## Cf. https://en.wikipedia.org/wiki/Levenshtein_distance 98adist("kitten", "sitting") 99## To see the transformation counts for the Levenshtein distance: 100drop(attr(adist("kitten", "sitting", counts = TRUE), "counts")) 101## To see the transformation sequences: 102attr(adist(c("kitten", "sitting"), counts = TRUE), "trafos") 103 104## Cf. the examples for agrep: 105adist("lasy", "1 lazy 2") 106## For a "partial approximate match" (as used for agrep): 107adist("lasy", "1 lazy 2", partial = TRUE) 108} 109\keyword{character} 110