1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_regex.h"
36 #include <deque>
37 #include <utility>
38 using namespace std;
39 
40 
41 /**
42  * Extract first occurrence of a regex pattern in each string
43  *
44  * @param str character vector
45  * @param pattern character vector
46  * @param opts_regex list
47  * @param first logical - search for the first or the last occurrence?
48  * @return character vector
49  *
50  * @version 0.1-?? (Marek Gagolewski, 2013-06-20)
51  *
52  * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
53  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
54  *
55  * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
56  *    Issue #214: allow a regex pattern like `.*`  to match an empty string
57  *
58  * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
59  *    Use StriContainerRegexPattern::getRegexOptions
60  */
stri__extract_firstlast_regex(SEXP str,SEXP pattern,SEXP opts_regex,bool first)61 SEXP stri__extract_firstlast_regex(SEXP str, SEXP pattern, SEXP opts_regex, bool first)
62 {
63     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
64     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
65     R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
66 
67     StriRegexMatcherOptions pattern_opts =
68         StriContainerRegexPattern::getRegexOptions(opts_regex);
69 
70     UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!
71     STRI__ERROR_HANDLER_BEGIN(2)
72     StriContainerUTF8 str_cont(str, vectorize_length);
73     StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
74 
75     SEXP ret;
76     STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));
77 
78     for (R_len_t i = pattern_cont.vectorize_init();
79             i != pattern_cont.vectorize_end();
80             i = pattern_cont.vectorize_next(i))
81     {
82         STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
83                                               SET_STRING_ELT(ret, i, NA_STRING);)
84 
85         UErrorCode status = U_ZERO_ERROR;
86         RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
87         str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
88         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
89 
90         int m_start = -1;
91         int m_end = -1;
92         int m_res;
93         matcher->reset(str_text);
94         m_res = (int)matcher->find(status);
95         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
96         if (m_res) { // find first match
97             m_start = (int)matcher->start(status); // The **native** position in the input string :-)
98             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
99             m_end   = (int)matcher->end(status);
100             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
101         }
102         else {
103             SET_STRING_ELT(ret, i, NA_STRING);
104             continue;
105         }
106 
107         if (!first) { // continue searching
108             while (1) {
109                 m_res = (int)matcher->find(status);
110                 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
111                 if (!m_res) break;
112                 m_start = (int)matcher->start(status);
113                 m_end   = (int)matcher->end(status);
114                 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
115             }
116         }
117 
118         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+m_start, m_end-m_start, CE_UTF8));
119     }
120 
121     if (str_text) {
122         utext_close(str_text);
123         str_text = NULL;
124     }
125     STRI__UNPROTECT_ALL
126     return ret;
127     STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);)
128     }
129 
130 
131 /**
132  * Extract first occurrence of a regex pattern in each string
133  *
134  * @param str character vector
135  * @param pattern character vector
136  * @param opts_regex list
137  * @return character vector
138  *
139  * @version 0.1-?? (Marek Gagolewski, 2013-06-20)
140  */
stri_extract_first_regex(SEXP str,SEXP pattern,SEXP opts_regex)141 SEXP stri_extract_first_regex(SEXP str, SEXP pattern, SEXP opts_regex)
142 {
143     return stri__extract_firstlast_regex(str, pattern, opts_regex, true);
144 }
145 
146 
147 /**
148  * Extract last occurrence of a regex pattern in each string
149  *
150  * @param str character vector
151  * @param pattern character vector
152  * @param opts_regex list
153  * @return character vector
154  *
155  * @version 0.1-?? (Marek Gagolewski, 2013-06-20)
156  */
stri_extract_last_regex(SEXP str,SEXP pattern,SEXP opts_regex)157 SEXP stri_extract_last_regex(SEXP str, SEXP pattern, SEXP opts_regex)
158 {
159     return stri__extract_firstlast_regex(str, pattern, opts_regex, false);
160 }
161 
162 
163 /**
164  * Extract all occurrences of a regex pattern in each string
165  *
166  * @param str character vector
167  * @param pattern character vector
168  * @param opts_regex list
169  * @param simplify single logical value
170  *
171  * @return list of character vectors  or character matrix
172  *
173  * @version 0.1-?? (Marek Gagolewski, 2013-06-20)
174  *
175  * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
176  *          added simplify param
177  *
178  * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
179  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
180  *
181  * @version 0.4-1 (Marek Gagolewski, 2014-11-27)
182  *    FR #117: omit_no_match arg added
183  *
184  * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
185  *    allow `simplify=NA`
186  *
187  * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
188  *    Issue #214: allow a regex pattern like `.*`  to match an empty string
189  */
stri_extract_all_regex(SEXP str,SEXP pattern,SEXP simplify,SEXP omit_no_match,SEXP opts_regex)190 SEXP stri_extract_all_regex(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_regex)
191 {
192     StriRegexMatcherOptions pattern_opts =
193         StriContainerRegexPattern::getRegexOptions(opts_regex);
194     bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
195     PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
196     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
197     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
198     R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
199 
200     UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!
201     STRI__ERROR_HANDLER_BEGIN(3)
202     StriContainerUTF8 str_cont(str, vectorize_length);
203     StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
204 
205     SEXP ret;
206     STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
207 
208     for (R_len_t i = pattern_cont.vectorize_init();
209             i != pattern_cont.vectorize_end();
210             i = pattern_cont.vectorize_next(i))
211     {
212         STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
213                                               SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));)
214 
215         UErrorCode status = U_ZERO_ERROR;
216         RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
217         str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
218         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
219 
220         matcher->reset(str_text);
221 
222         deque< pair<R_len_t, R_len_t> > occurrences;
223         int m_res;
224         while (1) {
225             m_res = (int)matcher->find(status);
226             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
227             if (!m_res) break;
228 
229             occurrences.push_back(pair<R_len_t, R_len_t>(
230                                       (R_len_t)matcher->start(status), (R_len_t)matcher->end(status)
231                                   ));
232             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
233         }
234 
235         R_len_t noccurrences = (R_len_t)occurrences.size();
236         if (noccurrences <= 0) {
237             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));
238             continue;
239         }
240 
241         const char* str_cur_s = str_cont.get(i).c_str();
242         SEXP cur_res;
243         STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences));
244         deque< pair<R_len_t, R_len_t> >::iterator iter = occurrences.begin();
245         for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) {
246             pair<R_len_t, R_len_t> curo = *iter;
247             SET_STRING_ELT(cur_res, j,
248                            Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8));
249         }
250         SET_VECTOR_ELT(ret, i, cur_res);
251         STRI__UNPROTECT(1);
252     }
253 
254     if (str_text) {
255         utext_close(str_text);
256         str_text = NULL;
257     }
258 
259     if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
260         SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings;
261         STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE));
262         STRI__PROTECT(robj_zero = Rf_ScalarInteger(0));
263         STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1));
264         STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1));
265         STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE,
266                                              (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings
267                                              :robj_empty_strings,
268                                              robj_zero));
269     }
270 
271     STRI__UNPROTECT_ALL
272     return ret;
273     STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);)
274     }
275