1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8_indexable.h"
35 #include "stri_container_integer.h"
36 #include "stri_brkiter.h"
37 
38 
39 /** Split a string at BreakIterator boundaries
40  *
41  * @param str character vector
42  * @param n integer
43  * @param tokens_only logical
44  * @param simplify logical
45  * @param opts_brkiter named list
46  * @return list
47  *
48  * @version 0.2-2 (Marek Gagolewski, 2014-04-21)
49  *
50  * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
51  *          removed "title": For Unicode 4.0 and above title boundary
52  *          iteration, please use Word Boundary iterator.
53  *
54  * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
55  *          use stri__split_or_locate_boundaries
56  *
57  * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
58  *          use opts_brkiter
59  *
60  * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
61  *          new args: n, tokens_only, simplify
62  *
63  * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
64  *          use StriRuleBasedBreakIterator
65  *
66  * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
67  *    allow `simplify=NA`; FR #126: pass n to stri_list2matrix
68  */
stri_split_boundaries(SEXP str,SEXP n,SEXP tokens_only,SEXP simplify,SEXP opts_brkiter)69 SEXP stri_split_boundaries(SEXP str, SEXP n, SEXP tokens_only, SEXP simplify, SEXP opts_brkiter)
70 {
71     bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only");
72     PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
73     PROTECT(str = stri__prepare_arg_string(str, "str"));
74     PROTECT(n = stri__prepare_arg_integer(n, "n"));
75     StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");
76 
77     STRI__ERROR_HANDLER_BEGIN(3)
78     R_len_t vectorize_length = stri__recycling_rule(true, 2,
79                                LENGTH(str), LENGTH(n));
80     StriContainerUTF8_indexable str_cont(str, vectorize_length);
81     StriContainerInteger n_cont(n, vectorize_length);
82     StriRuleBasedBreakIterator brkiter(opts_brkiter2);
83 
84     SEXP ret;
85     STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
86 
87     for (R_len_t i = 0; i < vectorize_length; ++i)
88     {
89         if (n_cont.isNA(i)) {
90             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
91             continue;
92         }
93         int  n_cur = n_cont.get(i);
94 
95         if (str_cont.isNA(i)) {
96             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
97             continue;
98         }
99 
100         if (n_cur >= INT_MAX-1)
101             throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n");
102         else if (n_cur < 0)
103             n_cur = INT_MAX;
104         else if (n_cur == 0) {
105             SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
106             continue;
107         }
108 
109         R_len_t str_cur_n = str_cont.get(i).length();
110         const char* str_cur_s = str_cont.get(i).c_str();
111         deque< pair<R_len_t,R_len_t> > occurrences;
112         brkiter.setupMatcher(str_cur_s, str_cur_n);
113         brkiter.first();
114 
115         pair<R_len_t,R_len_t> curpair;
116         R_len_t k = 0;
117         while (k < n_cur && brkiter.next(curpair)) {
118             occurrences.push_back(curpair);
119             ++k; // another field
120         }
121 
122 
123         R_len_t noccurrences = (R_len_t)occurrences.size();
124         if (noccurrences <= 0) {
125             SET_VECTOR_ELT(ret, i, stri__vector_empty_strings(0)); // @TODO: Should it be a NA? Hard to say...
126             continue;
127         }
128         if (k == n_cur && !tokens_only1)
129             occurrences.back().second = str_cur_n;
130 
131         SEXP ans;
132         STRI__PROTECT(ans = Rf_allocVector(STRSXP, noccurrences));
133         deque< pair<R_len_t,R_len_t> >::iterator iter = occurrences.begin();
134         for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) {
135             SET_STRING_ELT(ans, j, Rf_mkCharLenCE(str_cur_s+(*iter).first,
136                                                   (*iter).second-(*iter).first, CE_UTF8));
137         }
138         SET_VECTOR_ELT(ret, i, ans);
139         STRI__UNPROTECT(1);
140     }
141 
142     if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
143         R_len_t n_min = 0;
144         R_len_t n_length = LENGTH(n);
145         int* n_tab = INTEGER(n);
146         for (R_len_t i=0; i<n_length; ++i) {
147             if (n_tab[i] != NA_INTEGER && n_min < n_tab[i])
148                 n_min = n_tab[i];
149         }
150         SEXP robj_TRUE, robj_n_min, robj_na_strings, robj_empty_strings;
151         STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE));
152         STRI__PROTECT(robj_n_min = Rf_ScalarInteger(n_min));
153         STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1));
154         STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1));
155         STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE,
156                                              (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings
157                                              :robj_empty_strings,
158                                              robj_n_min))
159     }
160 
161     STRI__UNPROTECT_ALL
162     return ret;
163     STRI__ERROR_HANDLER_END({ /* no action */ })
164 }
165