1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8_indexable.h"
35 #include "stri_container_integer.h"
36 #include "stri_container_charclass.h"
37 
38 
39 /**
40  * Detect if a string starts with a pattern match
41  *
42  * @param str character vector
43  * @param pattern character vector
44  * @param from integer vector
45  * @return logical vector
46  *
47  * @version 0.3-1 (Marek Gagolewski, 2014-10-31)
48  *
49  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
50  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
51  *
52  * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
53  *    #345: `negate` arg added
54  */
stri_startswith_charclass(SEXP str,SEXP pattern,SEXP from,SEXP negate)55 SEXP stri_startswith_charclass(SEXP str, SEXP pattern, SEXP from, SEXP negate)
56 {
57     bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
58     PROTECT(str = stri__prepare_arg_string(str, "str"));
59     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
60     PROTECT(from = stri__prepare_arg_integer(from, "from"));
61 
62     STRI__ERROR_HANDLER_BEGIN(3)
63     int vectorize_length = stri__recycling_rule(true, 3,
64                            LENGTH(str), LENGTH(pattern), LENGTH(from));
65     StriContainerUTF8_indexable str_cont(str, vectorize_length);
66     StriContainerCharClass pattern_cont(pattern, vectorize_length);
67     StriContainerInteger from_cont(from, vectorize_length);
68 
69     SEXP ret;
70     STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
71     int* ret_tab = LOGICAL(ret);
72 
73     for (R_len_t i = pattern_cont.vectorize_init();
74             i != pattern_cont.vectorize_end();
75             i = pattern_cont.vectorize_next(i))
76     {
77         if (str_cont.isNA(i) || pattern_cont.isNA(i) || from_cont.isNA(i)) {
78             ret_tab[i] = NA_LOGICAL;
79             continue;
80         }
81 
82         R_len_t from_cur = from_cont.get(i);
83         if (from_cur == 1)
84             from_cur = 0; /* most commonly used case */
85         else if (from_cur >= 0)
86             from_cur = str_cont.UChar32_to_UTF8_index_fwd(i, from_cur-1);
87         else
88             from_cur = str_cont.UChar32_to_UTF8_index_back(i, -from_cur);
89         // now surely from_cur >= 0 && from_cur <= cur_n
90 
91         const char* str_cur_s = str_cont.get(i).c_str();
92         R_len_t     str_cur_n = str_cont.get(i).length();
93         const UnicodeSet* pattern_cur = &pattern_cont.get(i);
94 
95         if (from_cur > str_cur_n)
96             ret_tab[i] = negate_1;
97         else {
98             UChar32 chr = 0;
99             U8_NEXT(str_cur_s, from_cur, str_cur_n, chr);
100             if (chr < 0) // invalid utf-8 sequence
101                 throw StriException(MSG__INVALID_UTF8);
102             ret_tab[i] = pattern_cur->contains(chr);
103 
104             if (negate_1)
105                 ret_tab[i] = !ret_tab[i];
106         }
107     }
108 
109     STRI__UNPROTECT_ALL
110     return ret;
111     STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
112 }
113 
114 
115 /**
116  * Detect if a string ends with a pattern match
117  *
118  * @param str character vector
119  * @param pattern character vector
120  * @param to integer vector
121  * @return logical vector
122  *
123  * @version 0.3-1 (Marek Gagolewski, 2014-10-31)
124  *
125  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
126  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
127  *
128  * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
129  *    #345: `negate` arg added
130  */
stri_endswith_charclass(SEXP str,SEXP pattern,SEXP to,SEXP negate)131 SEXP stri_endswith_charclass(SEXP str, SEXP pattern, SEXP to, SEXP negate)
132 {
133     bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
134     PROTECT(str = stri__prepare_arg_string(str, "str"));
135     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
136     PROTECT(to = stri__prepare_arg_integer(to, "to"));
137 
138     STRI__ERROR_HANDLER_BEGIN(3)
139     int vectorize_length = stri__recycling_rule(true, 3,
140                            LENGTH(str), LENGTH(pattern), LENGTH(to));
141     StriContainerUTF8_indexable str_cont(str, vectorize_length);
142     StriContainerCharClass pattern_cont(pattern, vectorize_length);
143     StriContainerInteger to_cont(to, vectorize_length);
144 
145     SEXP ret;
146     STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
147     int* ret_tab = LOGICAL(ret);
148 
149     for (R_len_t i = pattern_cont.vectorize_init();
150             i != pattern_cont.vectorize_end();
151             i = pattern_cont.vectorize_next(i))
152     {
153         if (str_cont.isNA(i) || pattern_cont.isNA(i) || to_cont.isNA(i)) {
154             ret_tab[i] = NA_LOGICAL;
155             continue;
156         }
157 
158         const char* str_cur_s = str_cont.get(i).c_str();
159         R_len_t     str_cur_n = str_cont.get(i).length();
160         const UnicodeSet* pattern_cur = &pattern_cont.get(i);
161 
162         R_len_t to_cur = to_cont.get(i);
163         if (to_cur == -1)
164             to_cur = str_cur_n; /* most commonly used case */
165         else if (to_cur >= 0)
166             to_cur = str_cont.UChar32_to_UTF8_index_fwd(i, to_cur);
167         else
168             to_cur = str_cont.UChar32_to_UTF8_index_back(i, -to_cur-1);
169         // now surely to_cur >= 0 && to_cur <= cur_n
170 
171         if (to_cur <= 0)
172             ret_tab[i] = negate_1;
173         else {
174             UChar32 chr = 0;
175             U8_PREV(str_cur_s, 0, to_cur, chr);
176             if (chr < 0) // invalid utf-8 sequence
177                 throw StriException(MSG__INVALID_UTF8);
178             ret_tab[i] = pattern_cur->contains(chr);
179 
180             if (negate_1)
181                 ret_tab[i] = !ret_tab[i];
182         }
183     }
184 
185     STRI__UNPROTECT_ALL
186     return ret;
187     STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
188 }
189