1\name{strapply}
2\alias{strapply}
3\alias{strapply1}
4\alias{ostrapply}
5\alias{strapplyc}
6\alias{tclList2R}
7\title{
8 Apply a function over a string or strings.
9}
10\description{
11Similar to \code{\link{gsubfn}} except that instead of performing substitutions
12it returns the output of \code{FUN}.}
13\usage{
14strapply(X, pattern, FUN = function(x, ...) x, backref = NULL, ...,
15	empty = NULL,
16	ignore.case = FALSE, perl = FALSE, engine = getOption("gsubfn.engine"),
17	simplify = FALSE, USE.NAMES = FALSE, combine = c)
18strapplyc(X, pattern, backref, ignore.case = FALSE, simplify = FALSE, USE.NAMES = FALSE, engine = getOption("gsubfn.engine"))
19}
20\arguments{
21  \item{X}{ list or (atomic) vector of character strings to be used. }
22  \item{pattern}{ character string containing a regular expression (or
23          character string for \code{fixed = TRUE}) to be matched in the
24          given character vector.}
25  \item{FUN}{ a function, formula, character string, list or proto object
26          to be applied to each element of
27          \code{X}.  See discussion in \code{\link{gsubfn}}. }
28  \item{backref}{See \code{\link{gsubfn}}.}
29  \item{empty}{If there is no match to a string return this value.}
30  \item{ignore.case}{If \code{TRUE} then case is ignored in the \code{pattern}
31		argument.}
32  \item{perl}{If \code{TRUE} then \code{engine="R"} is used with
33		perl regular expressions.}
34  \item{engine}{Specifies which engine to use.  If the R installation
35   			has \code{tcltk} capability then the \code{tcl} engine is used
36			unless \code{FUN} is a proto object in which case the
37			\code{"R"} engine is used (regardless of the setting of this
38			argument).}
39  \item{\dots}{ optional arguments to \code{gsubfn}. }
40  \item{simplify}{  logical or function.  If logical, should the result be
41          simplified to a vector or matrix, as in \code{sapply}, if possible?
42          If function, that function is applied to the result with each
43          component of the result passed as a separate argument.  If the
44          function form is used it will typically be specified as \code{rbind}.}
45  \item{USE.NAMES}{ logical; if \code{TRUE} and if \code{X} is
46	character, use \code{X} as
47          \code{names} for the result unless it had names already.}
48  \item{combine}{ a function applied to the components of
49  	 the result of \code{FUN}.
50     The default is \code{c}. \code{list}
51     is another common choice.  The default may change to \code{list}
52     in the future.}
53}
54\details{
55If \code{FUN} is a function then for
56each character string in \code{X} the pattern is repeatedly
57matched;
58each such match, along with
59back references, if any, is passed to
60the function \code{FUN} and the output of \code{FUN} is returned as a list.
61If \code{FUN} is a formula or proto object then it is interpreted
62in the way discussed in \code{\link{gsubfn}}.
63
64If \code{FUN} is a proto object or if \code{perl=TRUE} is specified
65then \code{engine="R"} is used and the \code{engine} argument is ignored.
66
67If \code{backref} is not specified and
68\code{engine="R"} is specified or implied then a heuristic is
69used to calculate the number of backreferences.  The primary situation
70that can fool it is if there are parentheses in the pattern that are
71not back references.
72In those cases the user will have to specify \code{backref}.
73If \code{engine="tcl"} then an exact algorithm is used and this problem
74never occurs.
75
76\code{strapplyc} is like \code{strapply} but specialized to \code{FUN=c} for
77speed.  If the \code{"tcl"} engine is not available then it calls
78\code{strapply} and there will be no speed advantage.
79}
80\value{
81A list of character strings.
82}
83\seealso{ See \code{\link{gsubfn}}.
84For regular expression syntax used in tcl see
85\url{http://www.tcl.tk/man/tcl8.6/TclCmd/re_syntax.htm}
86and for regular expression syntax used in R see the help page for \code{regex}.
87}
88\examples{
89
90strapply("12;34:56,89,,12", "[0-9]+")
91
92# separate leading digits from rest of string
93# creating a 2 column matrix: digits, rest
94s <- c("123abc", "12cd34", "1e23")
95t(strapply(s, "^([[:digit:]]+)(.*)", c, simplify = TRUE))
96
97# same result, but build the matrix directly with simplify = rbind
98strapply(s, "^([[:digit:]]+)(.*)", c, simplify = rbind)
99
100# running window of 5 characters using a zero-width lookahead perl regexp
101# Note that the three ( in the regexp will fool the heuristic into thinking
102# there are three backreferences so specify backref explicitly.
103x <- "abcdefghijkl"
104strapply(x, "(.)(?=(....))",  paste0, backref = -2, perl = TRUE)[[1]]
105
106# Note difference.  First gives character vector.  Second is the same.
107# Third has same elements but is a list.
108# Fourth gives list of two character vectors. Fifth is the same.
109strapply("a:b c:d", "(.):(.)", c)[[1]]
110strapply("a:b c:d", "(.):(.)", list, simplify = unlist) # same
111
112strapply("a:b c:d", "(.):(.)", list)[[1]]
113
114strapply("a:b c:d", "(.):(.)", c, combine = list)[[1]]
115strapply("a:b c:d", "(.):(.)", c, combine = list, simplify = c) # same
116
117# find second CPU_SPEED value given lines of config file
118Lines <- c("DEVICE = 'PC'", "CPU_SPEED = '1999', '233'")
119parms <- strapply(Lines, "[^ ',=]+", c, USE.NAMES = TRUE,
120	simplify = ~ lapply(list(...), "[", -1))
121parms$CPU_SPEED[2]
122
123# return first two words in each string
124p <- proto(fun = function(this, x) if (count <=2) x)
125strapply(c("the brown fox", "the eager beaver"), "\\\\w+", p)
126
127\dontrun{
128# convert to chron
129library(chron)
130x <- c("01/15/2005 23:32:45", "02/27/2005 01:22:30")
131x.chron <- strapply(x, "(../../....) (..:..:..)",  chron, simplify = c)
132
133# time parsing of all 275,546 words from James Joyce's Ulysses
134joyce <- readLines("http://www.gutenberg.org/files/4300/4300-8.txt")
135joycec <- paste(joyce, collapse = " ")
136system.time(s <- strapplyc(joycec, "\\\\w+")[[1]])
137length(s) # 275546
138}
139
140}
141\keyword{character}
142