1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf16.h"
35 #include "stri_container_usearch.h"
36 #include "stri_container_integer.h"
37 #include "stri_container_logical.h"
38 #include <deque>
39 #include <utility>
40 using namespace std;
41 
42 
43 /**
44  * Split a string into parts [with collation]
45  *
46  * The pattern matches identify delimiters that separate the input into fields.
47  * The input data between the matches becomes the fields themselves.
48  *
49  * @param str character vector
50  * @param pattern character vector
51  * @param n integer vector
52  * @param omit_empty logical vector
53  * @param opts_collator passed to stri__ucol_open(),
54  * if \code{NA}, then \code{stri_detect_fixed_byte} is called
55  * @param tokens_only single logical value
56  * @param simplify single logical value
57  *
58  * @return list of character vectors or character matrix
59  *
60  *
61  * @version 0.1-?? (Bartek Tartanus)
62  *
63  * @version 0.1-?? (Marek Gagolewski, 2013-06-25)
64  *          StriException friendly, use StriContainerUTF16
65  *
66  * @version 0.1-?? (Marek Gagolewski, 2013-07-10)
67  *          BUGFIX: wrong behavior on empty str
68  *
69  * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
70  *          new fun: stri_split_coll (opts_collator == NA not allowed)
71  *
72  * @version 0.3-1 (Marek Gagolewski, 2014-10-19)
73  *          added tokens_only param
74  *
75  * @version 0.3-1 (Marek Gagolewski, 2014-10-23)
76  *          added split param
77  *
78  * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
79  *          allow omit_empty=NA
80  *
81  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
82  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
83  *
84  * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
85  *    allow `simplify=NA`; FR #126: pass n to stri_list2matrix
86  */
stri_split_coll(SEXP str,SEXP pattern,SEXP n,SEXP omit_empty,SEXP tokens_only,SEXP simplify,SEXP opts_collator)87 SEXP stri_split_coll(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty,
88                      SEXP tokens_only, SEXP simplify, SEXP opts_collator)
89 {
90     bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only");
91     PROTECT(str = stri__prepare_arg_string(str, "str"));
92     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
93     PROTECT(n = stri__prepare_arg_integer(n, "n"));
94     PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty"));
95     PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
96 
97     UCollator* collator = NULL;
98     collator = stri__ucol_open(opts_collator);
99 
100     STRI__ERROR_HANDLER_BEGIN(5)
101     R_len_t vectorize_length = stri__recycling_rule(true, 4,
102                                LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty));
103     StriContainerUTF16 str_cont(str, vectorize_length);
104     StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont
105     StriContainerInteger   n_cont(n, vectorize_length);
106     StriContainerLogical   omit_empty_cont(omit_empty, vectorize_length);
107 
108     SEXP ret;
109     STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
110 
111     for (R_len_t i = pattern_cont.vectorize_init();
112             i != pattern_cont.vectorize_end();
113             i = pattern_cont.vectorize_next(i))
114     {
115         if (n_cont.isNA(i)) {
116             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
117             continue;
118         }
119 
120         int  n_cur        = n_cont.get(i);
121         int  omit_empty_cur   = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i);
122 
123         STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
124                 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));,
125                 SET_VECTOR_ELT(ret, i,
126                                (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1):
127                                stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));)
128 
129         UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
130         usearch_reset(matcher);
131 
132 
133         if (n_cur >= INT_MAX-1)
134             throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n");
135         else if (n_cur < 0)
136             n_cur = INT_MAX;
137         else if (n_cur == 0) {
138             SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
139             continue;
140         }
141         else if (tokens_only1)
142             n_cur++; // we need to do one split ahead here
143 
144         R_len_t k;
145         deque< pair<R_len_t, R_len_t> > fields; // byte based-indices
146         fields.push_back(pair<R_len_t, R_len_t>(0,0));
147         UErrorCode status = U_ZERO_ERROR;
148 
149         for (k=1; k < n_cur && USEARCH_DONE != usearch_next(matcher, &status) && !U_FAILURE(status); ) {
150             R_len_t s1 = (R_len_t)usearch_getMatchedStart(matcher);
151             R_len_t s2 = (R_len_t)usearch_getMatchedLength(matcher) + s1;
152 
153             if (omit_empty_cur && fields.back().first == s1)
154                 fields.back().first = s2; // don't start any new field
155             else {
156                 fields.back().second = s1;
157                 fields.push_back(pair<R_len_t, R_len_t>(s2, s2)); // start a new field here
158                 ++k; // another field
159             }
160         }
161         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
162         fields.back().second = str_cont.get(i).length();
163         if (omit_empty_cur && fields.back().first == fields.back().second)
164             fields.pop_back();
165 
166         if (tokens_only1 && n_cur < INT_MAX) {
167             n_cur--; // one split ahead could have been made, see above
168             while (fields.size() > (size_t)n_cur)
169                 fields.pop_back(); // get rid of the remainder
170         }
171 
172         R_len_t noccurrences = (R_len_t)fields.size();
173         StriContainerUTF16 out_cont(noccurrences);
174         deque< pair<R_len_t, R_len_t> >::iterator iter = fields.begin();
175         for (k = 0; iter != fields.end(); ++iter, ++k) {
176             pair<R_len_t, R_len_t> curoccur = *iter;
177             if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i))
178                 out_cont.setNA(k);
179             else
180                 out_cont.getWritable(k).setTo(str_cont.get(i),
181                                               curoccur.first, curoccur.second-curoccur.first);
182         }
183         SET_VECTOR_ELT(ret, i, out_cont.toR());
184     }
185 
186     if (collator) {
187         ucol_close(collator);
188         collator=NULL;
189     }
190 
191     if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
192         R_len_t n_min = 0;
193         R_len_t n_length = LENGTH(n);
194         int* n_tab = INTEGER(n);
195         for (R_len_t i=0; i<n_length; ++i) {
196             if (n_tab[i] != NA_INTEGER && n_min < n_tab[i])
197                 n_min = n_tab[i];
198         }
199         SEXP robj_TRUE, robj_n_min, robj_na_strings, robj_empty_strings;
200         STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE));
201         STRI__PROTECT(robj_n_min = Rf_ScalarInteger(n_min));
202         STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1));
203         STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1));
204         STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE,
205                                              (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings
206                                              :robj_empty_strings,
207                                              robj_n_min))
208     }
209 
210     STRI__UNPROTECT_ALL
211     return ret;
212     STRI__ERROR_HANDLER_END(
213         if (collator) ucol_close(collator);
214     )
215     }
216