1% File src/library/utils/man/adist.Rd
2% Part of the R package, https://www.R-project.org
3% Copyright 2011-2015 R Core Team
4% Distributed under GPL 2 or later
5
6\name{adist}
7\alias{adist}
8\title{Approximate String Distances}
9\description{
10  Compute the approximate string distance between character vectors.
11  The distance is a generalized Levenshtein (edit) distance, giving the
12  minimal possibly weighted number of insertions, deletions and
13  substitutions needed to transform one string into another.
14}
15\usage{
16adist(x, y = NULL, costs = NULL, counts = FALSE, fixed = TRUE,
17      partial = !fixed, ignore.case = FALSE, useBytes = FALSE)
18}
19\arguments{
20  \item{x}{a character vector.  \link{Long vectors} are not supported.}
21  \item{y}{a character vector, or \code{NULL} (default) indicating
22    taking \code{x} as \code{y}.}
23  \item{costs}{a numeric vector or list with names partially matching
24    \samp{insertions}, \samp{deletions} and \samp{substitutions} giving
25    the respective costs for computing the Levenshtein distance, or
26    \code{NULL} (default) indicating using unit cost for all three
27    possible transformations.}
28  \item{counts}{a logical indicating whether to optionally return the
29    transformation counts (numbers of insertions, deletions and
30    substitutions) as the \code{"counts"} attribute of the return
31    value.}
32  \item{fixed}{a logical.  If \code{TRUE} (default), the \code{x}
33    elements are used as string literals.  Otherwise, they are taken as
34    regular expressions and \code{partial = TRUE} is implied
35    (corresponding to the approximate string distance used by
36    \code{\link{agrep}} with \code{fixed = FALSE}).}
37  \item{partial}{a logical indicating whether the transformed \code{x}
38    elements must exactly match the complete \code{y} elements, or only
39    substrings of these.  The latter corresponds to the approximate
40    string distance used by \code{\link{agrep}} (by default).}
41  \item{ignore.case}{a logical.  If \code{TRUE}, case is ignored for
42    computing the distances.}
43  \item{useBytes}{a logical.  If \code{TRUE} distance computations are
44    done byte-by-byte rather than character-by-character.}
45}
46\value{
47  A matrix with the approximate string distances of the elements of
48  \code{x} and \code{y}, with rows and columns corresponding to \code{x}
49  and \code{y}, respectively.
50
51  If \code{counts} is \code{TRUE}, the transformation counts are
52  returned as the \code{"counts"} attribute of this matrix, as a
53  3-dimensional array with dimensions corresponding to the elements of
54  \code{x}, the elements of \code{y}, and the type of transformation
55  (insertions, deletions and substitutions), respectively.
56  Additionally, if \code{partial = FALSE}, the transformation sequences
57  are returned as the \code{"trafos"} attribute of the return value, as
58  character strings with elements \samp{M}, \samp{I}, \samp{D} and
59  \samp{S} indicating a match, insertion, deletion and substitution,
60  respectively.  If \code{partial = TRUE}, the offsets (positions of
61  the first and last element) of the matched substrings are returned as
62  the \code{"offsets"} attribute of the return value (with both offsets
63  \eqn{-1} in case of no match).
64}
65\details{
66  The (generalized) Levenshtein (or edit) distance between two strings
67  \var{s} and \var{t} is the minimal possibly weighted number of
68  insertions, deletions and substitutions needed to transform \var{s}
69  into \var{t} (so that the transformation exactly matches \var{t}).
70  This distance is computed for \code{partial = FALSE}, currently using
71  a dynamic programming algorithm (see, e.g.,
72  \url{https://en.wikipedia.org/wiki/Levenshtein_distance}) with space
73  and time complexity \eqn{O(mn)}, where \eqn{m} and \eqn{n} are the
74  lengths of \var{s} and \var{t}, respectively.  Additionally computing
75  the transformation sequence and counts is \eqn{O(\max(m, n))}.
76
77  The generalized Levenshtein distance can also be used for approximate
78  (fuzzy) string matching, in which case one finds the substring of
79  \var{t} with minimal distance to the pattern \var{s} (which could be
80  taken as a regular expression, in which case the principle of using
81  the leftmost and longest match applies), see, e.g.,
82  \url{https://en.wikipedia.org/wiki/Approximate_string_matching}.  This
83  distance is computed for \code{partial = TRUE} using \samp{tre} by
84  Ville Laurikari (\url{https://github.com/laurikari/tre}) and
85  corresponds to the distance used by \code{\link{agrep}}.  In this
86  case, the given cost values are coerced to integer.
87
88  Note that the costs for insertions and deletions can be different, in
89  which case the distance between \var{s} and \var{t} can be different
90  from the distance between \var{t} and \var{s}.
91}
92\seealso{
93  \code{\link{agrep}} for approximate string matching (fuzzy matching)
94  using the generalized Levenshtein distance.
95}
96\examples{
97## Cf. https://en.wikipedia.org/wiki/Levenshtein_distance
98adist("kitten", "sitting")
99## To see the transformation counts for the Levenshtein distance:
100drop(attr(adist("kitten", "sitting", counts = TRUE), "counts"))
101## To see the transformation sequences:
102attr(adist(c("kitten", "sitting"), counts = TRUE), "trafos")
103
104## Cf. the examples for agrep:
105adist("lasy", "1 lazy 2")
106## For a "partial approximate match" (as used for agrep):
107adist("lasy", "1 lazy 2", partial = TRUE)
108}
109\keyword{character}
110