1 /* This file is part of the 'stringi' project.
2 * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_bytesearch.h"
36 #include "stri_container_integer.h"
37 #include "stri_container_logical.h"
38 #include <deque>
39 #include <utility>
40 using namespace std;
41
42
43 /**
44 * Split a string into parts [byte compare]
45 *
46 * The pattern matches identify delimiters that separate the input into fields.
47 * The input data between the matches becomes the fields themselves.
48 *
49 * @param str character vector
50 * @param pattern character vector
51 * @param n integer vector
52 * @param omit_empty logical vector
53 * @param tokens_only single logical value
54 * @param simplify single logical value
55 *
56 * @return list of character vectors or character matrix
57 *
58 * @version 0.1-?? (Bartek Tartanus)
59 *
60 * @version 0.1-?? (Marek Gagolewski, 2013-06-25)
61 * StriException friendly, use StriContainerUTF8
62 *
63 * @version 0.1-?? (Marek Gagolewski, 2013-07-10)
64 * BUGFIX: wrong behavior on empty str
65 *
66 * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
67 * stri_split_fixed now uses byte search only
68 *
69 * @version 0.3-1 (Marek Gagolewski, 2014-10-19)
70 * added tokens_only param
71 *
72 * @version 0.3-1 (Marek Gagolewski, 2014-10-23)
73 * added split param
74 *
75 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
76 * allow omit_empty=NA
77 *
78 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
79 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
80 *
81 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
82 * allow `simplify=NA`; FR #126: pass n to stri_list2matrix
83 *
84 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
85 * FR #110, #23: opts_fixed arg added
86 *
87 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
88 * use StriByteSearchMatcher
89 */
stri_split_fixed(SEXP str,SEXP pattern,SEXP n,SEXP omit_empty,SEXP tokens_only,SEXP simplify,SEXP opts_fixed)90 SEXP stri_split_fixed(SEXP str, SEXP pattern, SEXP n,
91 SEXP omit_empty, SEXP tokens_only, SEXP simplify, SEXP opts_fixed)
92 {
93 uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed);
94 bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only");
95 PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
96 PROTECT(str = stri__prepare_arg_string(str, "str"));
97 PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
98 PROTECT(n = stri__prepare_arg_integer(n, "n"));
99 PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty"));
100
101 STRI__ERROR_HANDLER_BEGIN(5)
102 R_len_t vectorize_length = stri__recycling_rule(true, 4,
103 LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty));
104 StriContainerUTF8 str_cont(str, vectorize_length);
105 StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags);
106 StriContainerInteger n_cont(n, vectorize_length);
107 StriContainerLogical omit_empty_cont(omit_empty, vectorize_length);
108
109 SEXP ret;
110 STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
111
112 for (R_len_t i = pattern_cont.vectorize_init();
113 i != pattern_cont.vectorize_end();
114 i = pattern_cont.vectorize_next(i))
115 {
116 if (n_cont.isNA(i)) {
117 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
118 continue;
119 }
120 int n_cur = n_cont.get(i);
121 int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i);
122
123 STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
124 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));,
125 SET_VECTOR_ELT(ret, i,
126 (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1):
127 stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));)
128
129 R_len_t str_cur_n = str_cont.get(i).length();
130 const char* str_cur_s = str_cont.get(i).c_str();
131
132 if (n_cur >= INT_MAX-1)
133 throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n");
134 else if (n_cur < 0)
135 n_cur = INT_MAX;
136 else if (n_cur == 0) {
137 SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
138 continue;
139 }
140 else if (tokens_only1)
141 n_cur++; // we need to do one split ahead here
142
143 StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i);
144 matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length());
145 R_len_t k;
146 deque< pair<R_len_t, R_len_t> > fields; // byte based-indices
147 fields.push_back(pair<R_len_t, R_len_t>(0,0));
148
149 for (k=1; k < n_cur && USEARCH_DONE != matcher->findNext(); ) {
150 R_len_t s1 = (R_len_t)matcher->getMatchedStart();
151 R_len_t s2 = (R_len_t)matcher->getMatchedLength() + s1;
152
153 if (omit_empty_cur && fields.back().first == s1)
154 fields.back().first = s2; // don't start any new field
155 else {
156 fields.back().second = s1;
157 fields.push_back(pair<R_len_t, R_len_t>(s2, s2)); // start a new field here
158 ++k; // another field
159 }
160 }
161 fields.back().second = str_cur_n;
162 if (omit_empty_cur && fields.back().first == fields.back().second)
163 fields.pop_back();
164
165 if (tokens_only1 && n_cur < INT_MAX) {
166 n_cur--; // one split ahead could have been made, see above
167 while (fields.size() > (size_t)n_cur)
168 fields.pop_back(); // get rid of the remainder
169 }
170
171 SEXP ans;
172 STRI__PROTECT(ans = Rf_allocVector(STRSXP, fields.size()));
173
174 deque< pair<R_len_t, R_len_t> >::iterator iter = fields.begin();
175 for (k = 0; iter != fields.end(); ++iter, ++k) {
176 pair<R_len_t, R_len_t> curoccur = *iter;
177 if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i))
178 SET_STRING_ELT(ans, k, NA_STRING);
179 else
180 SET_STRING_ELT(ans, k,
181 Rf_mkCharLenCE(str_cur_s+curoccur.first,
182 curoccur.second-curoccur.first, CE_UTF8));
183 }
184
185 SET_VECTOR_ELT(ret, i, ans);
186 STRI__UNPROTECT(1);
187 }
188
189 if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
190 R_len_t n_min = 0;
191 R_len_t n_length = LENGTH(n);
192 int* n_tab = INTEGER(n);
193 for (R_len_t i=0; i<n_length; ++i) {
194 if (n_tab[i] != NA_INTEGER && n_min < n_tab[i])
195 n_min = n_tab[i];
196 }
197 SEXP robj_TRUE, robj_n_min, robj_na_strings, robj_empty_strings;
198 STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE));
199 STRI__PROTECT(robj_n_min = Rf_ScalarInteger(n_min));
200 STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1));
201 STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1));
202 STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE,
203 (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings
204 :robj_empty_strings,
205 robj_n_min))
206 }
207
208 STRI__UNPROTECT_ALL
209 return ret;
210 STRI__ERROR_HANDLER_END(; /* nothing interesting on error */)
211 }
212