1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_bytesearch.h"
36 #include <deque>
37 #include <utility>
38 using namespace std;
39 
40 
41 /**
42  * Extract first or last occurrences of pattern in a string [exact byte search]
43  *
44  * @param str character vector
45  * @param pattern character vector
46  * @param first looking for first or last match?
47  * @return character vector
48  *
49  * @version 0.1-?? (Marek Gagolewski, 2013-06-24)
50  *
51  * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
52  *          stri_extract_fixed now uses byte search only
53  *
54  * @version 0.4-1 (Marek Gagolewski, 2014-12-08)
55  *          new args: opts_fixed
56  *
57  * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
58  *    use StriByteSearchMatcher
59  */
stri__extract_firstlast_fixed(SEXP str,SEXP pattern,SEXP opts_fixed,bool first)60 SEXP stri__extract_firstlast_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, bool first)
61 {
62     uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed);
63     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
64     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
65 
66     STRI__ERROR_HANDLER_BEGIN(2)
67     int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
68     StriContainerUTF8 str_cont(str, vectorize_length);
69     StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags);
70 
71     SEXP ret;
72     STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));
73 
74     for (R_len_t i = pattern_cont.vectorize_init();
75             i != pattern_cont.vectorize_end();
76             i = pattern_cont.vectorize_next(i))
77     {
78         STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
79                 SET_STRING_ELT(ret, i, NA_STRING);, SET_STRING_ELT(ret, i, NA_STRING);)
80 
81         StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i);
82         matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length());
83         int start, len;
84         if (first) {
85             start = matcher->findFirst();
86         } else {
87             start = matcher->findLast();
88         }
89         if (start == USEARCH_DONE) {
90             SET_STRING_ELT(ret, i, NA_STRING);
91             continue;
92         }
93 
94         len = matcher->getMatchedLength();
95 
96         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+start, len, CE_UTF8));
97     }
98 
99     STRI__UNPROTECT_ALL
100     return ret;
101     STRI__ERROR_HANDLER_END({ /* no-op */ })
102 }
103 
104 
105 /**
106  * Extract first occurrence of a fixed pattern in each string
107  *
108  * @param str character vector
109  * @param pattern character vector
110  * @return character vector
111  *
112  * @version 0.1-?? (Marek Gagolewski, 2013-06-24)
113  *
114  * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
115  *          stri_extract_fixed now uses byte search only
116  *
117  * @version 0.4-1 (Marek Gagolewski, 2014-12-08)
118  *          new args: opts_fixed
119  */
stri_extract_first_fixed(SEXP str,SEXP pattern,SEXP opts_fixed)120 SEXP stri_extract_first_fixed(SEXP str, SEXP pattern, SEXP opts_fixed)
121 {
122     return stri__extract_firstlast_fixed(str, pattern, opts_fixed, true);
123 }
124 
125 
126 /**
127  * Extract last occurrence of a fixed pattern in each string
128  *
129  * @param str character vector
130  * @param pattern character vector
131  * @return character vector
132  *
133  * @version 0.1-?? (Marek Gagolewski, 2013-06-24)
134  *
135  * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
136  *          stri_extract_fixed now uses byte search only
137  *
138  * @version 0.4-1 (Marek Gagolewski, 2014-12-08)
139  *          new args: opts_fixed
140  */
stri_extract_last_fixed(SEXP str,SEXP pattern,SEXP opts_fixed)141 SEXP stri_extract_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed)
142 {
143     return stri__extract_firstlast_fixed(str, pattern, opts_fixed, false);
144 }
145 
146 
147 /**
148  * Extract all occurrences of pattern in a string [exact byte search]
149  *
150  * @param str character vector
151  * @param pattern character vector
152  * @return character vector
153  *
154  * @version 0.1-?? (Marek Gagolewski, 2013-06-24)
155  *
156  * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
157  *          stri_extract_fixed now uses byte search only
158  *
159  * @version 0.4-1 (Marek Gagolewski, 2014-12-08)
160  *          new args: opts_fixed, omit_no_match, simplify
161  *
162  * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
163  *    use StriByteSearchMatcher
164  */
stri_extract_all_fixed(SEXP str,SEXP pattern,SEXP simplify,SEXP omit_no_match,SEXP opts_fixed)165 SEXP stri_extract_all_fixed(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_fixed)
166 {
167     uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true);
168     bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
169     PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
170     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
171     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
172 
173     STRI__ERROR_HANDLER_BEGIN(3)
174     R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
175     StriContainerUTF8 str_cont(str, vectorize_length);
176     StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags);
177 
178     SEXP ret;
179     STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
180 
181     for (R_len_t i = pattern_cont.vectorize_init();
182             i != pattern_cont.vectorize_end();
183             i = pattern_cont.vectorize_next(i))
184     {
185         STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
186                 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));,
187                 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));)
188 
189         StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i);
190         matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length());
191 
192         int start = matcher->findFirst();
193         deque< pair<R_len_t, R_len_t> > occurrences;
194         while (start != USEARCH_DONE) {
195             occurrences.push_back(pair<R_len_t, R_len_t>(start, start+matcher->getMatchedLength()));
196             start = matcher->findNext();
197         }
198 
199         R_len_t noccurrences = (R_len_t)occurrences.size();
200         if (noccurrences <= 0) {
201             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));
202             continue;
203         }
204 
205         const char* str_cur_s = str_cont.get(i).c_str();
206         SEXP cur_res;
207         STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences));
208         deque< pair<R_len_t, R_len_t> >::iterator iter = occurrences.begin();
209         for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) {
210             pair<R_len_t, R_len_t> curo = *iter;
211             SET_STRING_ELT(cur_res, j,
212                            Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8));
213         }
214         SET_VECTOR_ELT(ret, i, cur_res);
215         STRI__UNPROTECT(1);
216     }
217 
218     if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
219         SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings;
220         STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE));
221         STRI__PROTECT(robj_zero = Rf_ScalarInteger(0));
222         STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1));
223         STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1));
224         STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE,
225                                              (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings
226                                              :robj_empty_strings,
227                                              robj_zero));
228     }
229 
230     STRI__UNPROTECT_ALL
231     return ret;
232     STRI__ERROR_HANDLER_END({/* no-op */})
233 }
234