1########################################################################
2##
3## Copyright (C) 2009-2021 The Octave Project Developers
4##
5## See the file COPYRIGHT.md in the top-level directory of this
6## distribution or <https://octave.org/copyright/>.
7##
8## This file is part of Octave.
9##
10## Octave is free software: you can redistribute it and/or modify it
11## under the terms of the GNU General Public License as published by
12## the Free Software Foundation, either version 3 of the License, or
13## (at your option) any later version.
14##
15## Octave is distributed in the hope that it will be useful, but
16## WITHOUT ANY WARRANTY; without even the implied warranty of
17## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18## GNU General Public License for more details.
19##
20## You should have received a copy of the GNU General Public License
21## along with Octave; see the file COPYING.  If not, see
22## <https://www.gnu.org/licenses/>.
23##
24########################################################################
25
26## -*- texinfo -*-
27## @deftypefn  {} {[@var{cstr}] =} strsplit (@var{str})
28## @deftypefnx {} {[@var{cstr}] =} strsplit (@var{str}, @var{del})
29## @deftypefnx {} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value})
30## @deftypefnx {} {[@var{cstr}, @var{matches}] =} strsplit (@dots{})
31## Split the string @var{str} using the delimiters specified by @var{del} and
32## return a cell string array of substrings.
33##
34## If a delimiter is not specified the string is split at whitespace
35## @code{@{" ", "\f", "\n", "\r", "\t", "\v"@}}.  Otherwise, the delimiter,
## @var{del}, must be a string or cell array of strings.  By default,
## consecutive delimiters in the input string @var{str} are collapsed into one
38## resulting in a single split.
39##
40## Supported @var{name}/@var{value} pair arguments are:
41##
42## @itemize
43## @item @var{collapsedelimiters} which may take the value of @code{true}
44## (default) or @code{false}.
45##
46## @item @var{delimitertype} which may take the value of @qcode{"simple"}
47## (default) or @nospell{@qcode{"regularexpression"}}.  A simple delimiter
48## matches the text exactly as written.  Otherwise, the syntax for regular
49## expressions outlined in @code{regexp} is used.
50## @end itemize
51##
52## The optional second output, @var{matches}, returns the delimiters which were
53## matched in the original string.
54##
55## Examples with simple delimiters:
56##
57## @example
58## strsplit ("a b c")
59##       @result{}
60##           @{
61##             [1,1] = a
62##             [1,2] = b
63##             [1,3] = c
64##           @}
65##
66## strsplit ("a,b,c", ",")
67##       @result{}
68##           @{
69##             [1,1] = a
70##             [1,2] = b
71##             [1,3] = c
72##           @}
73##
74## strsplit ("a foo b,bar c", @{" ", ",", "foo", "bar"@})
75##       @result{}
76##           @{
77##             [1,1] = a
78##             [1,2] = b
79##             [1,3] = c
80##           @}
81##
82## strsplit ("a,,b, c", @{",", " "@}, "collapsedelimiters", false)
83##       @result{}
84##           @{
85##             [1,1] = a
86##             [1,2] =
87##             [1,3] = b
88##             [1,4] =
89##             [1,5] = c
90##           @}
91##
92## @end example
93##
94## Examples with @nospell{regularexpression} delimiters:
95##
96## @smallexample
97## strsplit ("a foo b,bar c", ',|\s|foo|bar', ...
98##           "delimitertype", "regularexpression")
99##       @result{}
100##           @{
101##             [1,1] = a
102##             [1,2] = b
103##             [1,3] = c
104##           @}
105##
106## strsplit ("a,,b, c", '[, ]', "collapsedelimiters", false, ...
107##           "delimitertype", "regularexpression")
108##       @result{}
109##           @{
110##             [1,1] = a
111##             [1,2] =
112##             [1,3] = b
113##             [1,4] =
114##             [1,5] = c
115##           @}
116##
117## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression")
118##       @result{}
119##           @{
120##             [1,1] = a
121##             [1,2] = b
122##             [1,3] = c
123##           @}
124##
125## strsplit ("a,\t,b, c", @{',', ' ', '\t'@}, "collapsedelimiters", false)
126##       @result{}
127##           @{
128##             [1,1] = a
129##             [1,2] =
130##             [1,3] =
131##             [1,4] = b
132##             [1,5] =
133##             [1,6] = c
134##           @}
135## @end smallexample
136##
137## @seealso{ostrsplit, strjoin, strtok, regexp}
138## @end deftypefn
139
function [cstr, matches] = strsplit (str, del, varargin)

  args.collapsedelimiters = true;
  args.delimitertype = "simple";

  ## Separate the optional arguments: REG receives whatever precedes the
  ## first string argument, PARAMS the NAME/VALUE list that starts there.
  [reg, params] = parseparams (varargin);

  if (nargin < 1)
    print_usage ();
  elseif (numel (reg) > 1)
    print_usage ();
  elseif (numel (reg) == 1)
    ## This is undocumented behavior to accept a logical 3rd arg.
    if (islogical (reg{1}))
      args.collapsedelimiters = reg{1};
    else
      print_usage ();
    endif
  endif

  ## Apply the NAME/VALUE options.  Index PARAMS (not VARARGIN) throughout:
  ## the two lists are offset by one whenever a logical argument precedes
  ## the options.
  fields = fieldnames (args);
  nparams = numel (params);
  for n = 1:2:nparams
    name = params{n};
    if (any (strcmpi (name, fields)))
      if (n == nparams)
        ## A recognized option without its value; avoid indexing past the end.
        error ("strsplit: no value given for parameter '%s'", name);
      endif
      args.(tolower (name)) = params{n+1};
    elseif (ischar (name))
      error ("Octave:strsplit:invalid_parameter_name",
             "strsplit: invalid parameter name, '%s'", name);
    else
      print_usage ();
    endif
  endfor

  ## DELIMITERTYPE may be abbreviated, so it is compared only over its own
  ## length.  A zero length is never handed to strncmpi; an empty value
  ## matches neither type and is reported as invalid further below.
  length_deltype = length (args.delimitertype);
  is_simple = false;
  is_regexp = false;
  if (length_deltype > 0)
    if (strncmpi (args.delimitertype, "simple", length_deltype))
      is_simple = true;
    elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype))
      is_regexp = true;
    endif
  endif

  if (nargin == 1 || (nargin > 1 && islogical (del)))
    if (nargin > 1)
      ## Second input is the "collapsedelimiters" parameter
      args.collapsedelimiters = del;
    endif
    ## No delimiter given: default to whitespace in the form that suits
    ## the delimiter type.
    if (is_simple)
      del = {" ", "\f", "\n", "\r", "\t", "\v"};
    else
      del = '\s';
    endif
  endif

  if (! ischar (str) || (! ischar (del) && ! iscellstr (del)))
    error ("strsplit: S and DEL must be string values");
  elseif (! isempty (str) && ! isrow (str))
    error ("strsplit: S must be a char row vector");
  elseif (! isscalar (args.collapsedelimiters))
    error ("strsplit: COLLAPSEDELIMITERS must be a scalar value");
  endif

  if (! (is_simple || is_regexp))
    error ("Octave:strsplit:invalid_delimitertype", "strsplit: Invalid DELIMITERTYPE");
  endif

  if (is_simple)
    ## Simple delimiters are literal text: expand escape sequences such as
    ## '\t' first, then neutralize everything regexp would treat specially.
    if (iscellstr (del))
      del = cellfun (@do_string_escapes, del, "uniformoutput", false);
    else
      del = do_string_escapes (del);
    endif
    ## Escape characters which have a special meaning in regexp.
    del = regexprep (del, '([{}()[\]^$.*?+|\\])', '\\$1');
  endif

  if (isempty (str))
    ## Nothing to split: one (empty) piece and no matched delimiters.
    cstr = {str};
    matches = cell (1, 0);
  else
    if (iscellstr (del))
      ## Join the alternatives into a single pattern "d1|d2|...".
      del = sprintf ("%s|", del{:});
      del(end) = [];
    endif
    if (args.collapsedelimiters)
      ## A run of consecutive delimiters counts as a single split point.
      del = [ "(" del ")+" ];
    endif
    [cstr, matches] = regexp (str, del, "split", "match");
  endif

endfunction
227
228
## Sentence shared by the tests that follow.
%!shared str
%! str = "The rain in Spain stays mainly in the plain.";

## Split on all whitespace.
%!assert (strsplit (str), {"The", "rain", "in", "Spain", "stays", ...
%!                         "mainly", "in", "the", "plain."})
## Split on "ain".
%!assert (strsplit (str, "ain"), {"The r", " in Sp", " stays m", ...
%!                                "ly in the pl", "."})
## Split on " " and "ain" (treating multiple delimiters as one).
%!test
%! s = strsplit (str, '\s|ain', true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
## Same split with the delimiter type spelled out in full.
%!test
%! s = strsplit (str, '\s|ain', true, "delimitertype", "regularexpression");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%!test
%! [s, m] = strsplit (str, {'\s', 'ain'}, true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%! assert (m, {" ", "ain ", " ", "ain ", " ", "ain", " ", " ", " ", "ain"});
## Split on " " and "ain", and treat multiple delimiters separately.
%!test
%! [s, m] = strsplit (str, {" ", "ain"}, "collapsedelimiters", false);
%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", ...
%!             " ", " ", " ", "ain"});

## Simple delimiters.
%!assert (strsplit ("road to hell"), {"road", "to", "hell"})
%!assert (strsplit ("road to hell", " "), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"})
%!assert (strsplit ("road   to--hell", {" ","-"}, true), {"road", "to", "hell"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "s"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"})

## Regular expression delimiters.
%!assert (strsplit ("road to hell", " ", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", '\^| ', "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", "[ ^]", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road   to--hell", "[ -]", false, "delimitertype", "r"), {"road", "", "", "to", "", "hell"})
%!assert (strsplit (["a,bc,de"], ",", "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "r"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,de"], ",", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit ("hello \t world", true, "delimitertype", "r"), {"hello", "world"})

## A single-quoted '\t' delimiter splits at a tab for both delimiter types.
%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"})
%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"})

## Test "match" for consecutive delimiters
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression",
%!   "collapsedelimiters", false);
%! assert (a, {"a", "", "", "b"});
%! assert (m, {"\t", " ", "\n"});
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', false, "delimitertype", "regularexpression");
%! assert (a, {"a", "", "", "b"});
%! assert (m, {"\t", " ", "\n"});
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression");
%! assert (a, {"a", "b"});
%! assert (m, {"\t \n"});
%!test
%! [a, m] = strsplit ("a\t \nb", {"\t", " ", "\n"}, "delimitertype", "simple");
%! assert (a, {"a", "b"});
%! assert (m, {"\t \n"});
%!test
%! [s, m] = strsplit ("hello \t world", true);
%! assert (s, {"hello", "world"});
%! assert (m, {" \t "});

## Compatibility
%!test
%! assert (strsplit ("", "a"), {""});
%! assert (strsplit ("a", "a"), {"", ""});
%! assert (strsplit ("aa", "a"), {"", ""});
%! assert (strsplit ("aaa", "a"), {"", ""});

## Regression tests for the tagged bug reports: "<", "\" and "+" as
## simple delimiters.
%!assert <*44641> (strsplit ("xxx<yyy", "<"), {"xxx", "yyy"})
%!assert <*44641> (strsplit ('xxx\yyy', '\'), {"xxx", "yyy"})

%!assert <*47403> (strsplit ('xxx+yyy', '+'), {"xxx", "yyy"})

## Test input validation
%!error strsplit ()
%!error strsplit ("abc", "b", true, 4)
%!error <invalid parameter name, 'foo'> strsplit ("abc", "b", "foo", "true")
%!error <S and DEL must be string values> strsplit (123, "b")
%!error <S must be a char row vector> strsplit (["abc"; "xyz"])
%!error <S must be a char row vector> strsplit (reshape ("axbycz", [1 3 2]))
%!error <COLLAPSEDELIMITERS must be a scalar value> strsplit ("abc", "def", "collapsedelimiters", ones (3,3))
%!error <Invalid DELIMITERTYPE> strsplit ("abc", "b", "delimitertype", "foobar")
325