########################################################################
##
## Copyright (C) 2009-2021 The Octave Project Developers
##
## See the file COPYRIGHT.md in the top-level directory of this
## distribution or <https://octave.org/copyright/>.
##
## This file is part of Octave.
##
## Octave is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <https://www.gnu.org/licenses/>.
##
########################################################################

## -*- texinfo -*-
## @deftypefn  {} {[@var{cstr}] =} strsplit (@var{str})
## @deftypefnx {} {[@var{cstr}] =} strsplit (@var{str}, @var{del})
## @deftypefnx {} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value})
## @deftypefnx {} {[@var{cstr}, @var{matches}] =} strsplit (@dots{})
## Split the string @var{str} using the delimiters specified by @var{del} and
## return a cell string array of substrings.
##
## If a delimiter is not specified the string is split at whitespace
## @code{@{" ", "\f", "\n", "\r", "\t", "\v"@}}.  Otherwise, the delimiter
## @var{del} must be a string or cell array of strings.  By default,
## consecutive delimiters in the input string @var{str} are collapsed into one
## resulting in a single split.
##
## Supported @var{name}/@var{value} pair arguments are:
##
## @itemize
## @item @var{collapsedelimiters} which may take the value of @code{true}
## (default) or @code{false}.
##
## @item @var{delimitertype} which may take the value of @qcode{"simple"}
## (default) or @nospell{@qcode{"regularexpression"}}.  A simple delimiter
## matches the text exactly as written.  Otherwise, the syntax for regular
## expressions outlined in @code{regexp} is used.
## @end itemize
##
## The optional second output, @var{matches}, returns the delimiters which were
## matched in the original string.  If @var{str} is empty, @var{cstr} is
## @code{@{@var{str}@}} and @var{matches} is an empty cell array.
##
## Examples with simple delimiters:
##
## @example
## strsplit ("a b c")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,b,c", ",")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a foo b,bar c", @{" ", ",", "foo", "bar"@})
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,,b, c", @{",", " "@}, "collapsedelimiters", false)
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] = b
##             [1,4] =
##             [1,5] = c
##           @}
##
## @end example
##
## Examples with @nospell{regularexpression} delimiters:
##
## @smallexample
## strsplit ("a foo b,bar c", ',|\s|foo|bar', ...
##           "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,,b, c", '[, ]', "collapsedelimiters", false, ...
##           "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] = b
##             [1,4] =
##             [1,5] = c
##           @}
##
## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,\t,b, c", @{',', ' ', '\t'@}, "collapsedelimiters", false)
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] =
##             [1,4] = b
##             [1,5] =
##             [1,6] = c
##           @}
## @end smallexample
##
## @seealso{ostrsplit, strjoin, strtok, regexp}
## @end deftypefn

function [cstr, matches] = strsplit (str, del, varargin)

  ## Defaults for the supported NAME/VALUE options.
  args.collapsedelimiters = true;
  args.delimitertype = "simple";

  ## REG holds everything before the first char argument, PARAMS the rest.
  [reg, params] = parseparams (varargin);

  if (nargin < 1)
    print_usage ();
  elseif (numel (reg) > 1)
    print_usage ();
  elseif (numel (reg) == 1)
    ## This is undocumented behavior to accept a logical 3rd arg.
    if (islogical (reg{1}))
      args.collapsedelimiters = reg{1};
    else
      print_usage ();
    endif
  endif

  ## A trailing NAME without a VALUE would otherwise index past the end of
  ## PARAMS below and produce an opaque out-of-bound error.
  if (mod (numel (params), 2) != 0)
    error ("strsplit: NAME and VALUE arguments must occur in pairs");
  endif

  fields = fieldnames (args);
  for n = 1:2:numel (params)
    if (any (strcmpi (params{n}, fields)))
      args.(tolower (params{n})) = params{n+1};
    elseif (ischar (params{n}))
      ## Report the offending name from PARAMS; VARARGIN is offset by one
      ## whenever a logical 3rd argument precedes the NAME/VALUE pairs.
      error ("Octave:strsplit:invalid_parameter_name",
             "strsplit: invalid parameter name, '%s'", params{n});
    else
      print_usage ();
    endif
  endfor

  ## Save the length of the "delimitertype" parameter so that unambiguous
  ## abbreviations such as "s" or "r" are accepted.
  length_deltype = length (args.delimitertype);

  if (nargin == 1 || (nargin > 1 && islogical (del)))
    if (nargin > 1)
      ## Second input is the "collapsedelimiters" parameter
      args.collapsedelimiters = del;
    endif
    ## Set proper default for the delimiter type
    if (strncmpi (args.delimitertype, "simple", length_deltype))
      del = {" ", "\f", "\n", "\r", "\t", "\v"};
    else
      del = '\s';
    endif
  endif

  if (! ischar (str) || (! ischar (del) && ! iscellstr (del)))
    error ("strsplit: S and DEL must be string values");
  elseif (! isempty (str) && ! isrow (str))
    error ("strsplit: S must be a char row vector");
  elseif (! isscalar (args.collapsedelimiters))
    error ("strsplit: COLLAPSEDELIMITERS must be a scalar value");
  endif

  if (strncmpi (args.delimitertype, "simple", length_deltype))
    is_simple = true;
  elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype))
    is_simple = false;
  else
    error ("Octave:strsplit:invalid_delimitertype",
           "strsplit: Invalid DELIMITERTYPE");
  endif

  if (is_simple)
    ## Simple delimiters are converted to literal regular expressions.
    if (iscellstr (del))
      del = cellfun (@do_string_escapes, del, "uniformoutput", false);
    else
      del = do_string_escapes (del);
    endif
    ## Escape characters which have a special meaning in regexp.
    del = regexprep (del, '([{}()[\]^$.*?+|\\])', '\\$1');
  endif

  if (isempty (str))
    cstr = {str};
    ## Nothing can match in an empty string; return the same 1x0 cell that
    ## regexp produces when no delimiter is found so that a second output
    ## is always defined.
    matches = cell (1, 0);
  else
    if (iscellstr (del))
      ## Join alternatives with "|" and drop the trailing separator.
      del = sprintf ("%s|", del{:});
      del(end) = [];
    endif
    if (args.collapsedelimiters)
      del = [ "(" del ")+" ];
    endif
    [cstr, matches] = regexp (str, del, "split", "match");
  endif

endfunction


%!shared str
%! str = "The rain in Spain stays mainly in the plain.";

## Split on all whitespace.
%!assert (strsplit (str), {"The", "rain", "in", "Spain", "stays", ...
%!                         "mainly", "in", "the", "plain."})
## Split on "ain".
%!assert (strsplit (str, "ain"), {"The r", " in Sp", " stays m", ...
%!                                "ly in the pl", "."})
## Split on " " and "ain" (treating multiple delimiters as one).
%!test
%! s = strsplit (str, '\s|ain', true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%!test
%! s = strsplit (str, '\s|ain', true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%!test
%! [s, m] = strsplit (str, {'\s', 'ain'}, true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%! assert (m, {" ", "ain ", " ", "ain ", " ", "ain", " ", " ", " ", "ain"});
## Split on " " and "ain", and treat multiple delimiters separately.
%!test
%! [s, m] = strsplit (str, {" ", "ain"}, "collapsedelimiters", false);
%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", ...
%!             "m", "ly", "in", "the", "pl", "."});
%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", ...
%!             " ", " ", " ", "ain"});

%!assert (strsplit ("road to hell"), {"road", "to", "hell"})
%!assert (strsplit ("road to hell", " "), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"})
%!assert (strsplit ("road   to--hell", {" ","-"}, true), {"road", "to", "hell"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "s"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"})

%!assert (strsplit ("road to hell", " ", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", '\^| ', "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", "[ ^]", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road   to--hell", "[ -]", false, "delimitertype", "r"), {"road", "", "", "to", "", "hell"})
%!assert (strsplit (["a,bc,de"], ",", "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "r"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,de"], ",", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit ("hello \t world", true, "delimitertype", "r"), {"hello", "world"})

%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"})
%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"})

## Test "match" for consecutive delimiters
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression",
%!                    "collapsedelimiters", false);
%! assert (a, {"a", "", "", "b"});
%! assert (m, {"\t", " ", "\n"});
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', false, "delimitertype", "regularexpression");
%! assert (a, {"a", "", "", "b"});
%! assert (m, {"\t", " ", "\n"});
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression");
%! assert (a, {"a", "b"});
%! assert (m, {"\t \n"});
%!test
%! [a, m] = strsplit ("a\t \nb", {"\t", " ", "\n"}, "delimitertype", "simple");
%! assert (a, {"a", "b"});
%! assert (m, {"\t \n"});
%!test
%! [s, m] = strsplit ("hello \t world", true);
%! assert (s, {"hello", "world"});
%! assert (m, {" \t "});

## Compatibility
%!test
%! assert (strsplit ("", "a"), {""});
%! assert (strsplit ("a", "a"), {"", ""});
%! assert (strsplit ("aa", "a"), {"", ""});
%! assert (strsplit ("aaa", "a"), {"", ""});

## Second output must be defined for empty input
%!test
%! [s, m] = strsplit ("", "a");
%! assert (s, {""});
%! assert (m, cell (1, 0));

%!assert <*44641> (strsplit ("xxx<yyy", "<"), {"xxx", "yyy"})
%!assert <*44641> (strsplit ('xxx\yyy', '\'), {"xxx", "yyy"})

%!assert <*47403> (strsplit ('xxx+yyy', '+'), {"xxx", "yyy"})

## Test input validation
%!error strsplit ()
%!error strsplit ("abc", "b", true, 4)
%!error <invalid parameter name, 'foo'> strsplit ("abc", "b", "foo", "true")
%!error <invalid parameter name, 'foo'> strsplit ("abc", "b", true, "foo", "x")
%!error <must occur in pairs> strsplit ("abc", "b", "collapsedelimiters")
%!error <S and DEL must be string values> strsplit (123, "b")
%!error <S must be a char row vector> strsplit (["abc"; "xyz"])
%!error <S must be a char row vector> strsplit (reshape ("axbycz", [1 3 2]))
%!error <COLLAPSEDELIMITERS must be a scalar value> strsplit ("abc", "def", "collapsedelimiters", ones (3,3))
%!error <Invalid DELIMITERTYPE> strsplit ("abc", "b", "delimitertype", "foobar")