1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_charclass.h"
36 
37 
38 /**
39  * Trim characters from a charclass from left AND/OR right side of the string
40  *
41  * @param str character vector
42  * @param pattern character vector
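 * @param left trim from the left side?
 * @param right trim from the right side?
 * @param negate logical flag handed to StriContainerCharClass together with pattern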
45  * @return character vector
46  *
47  * @version 0.1-?? (Bartek Tartanus)
48  *
49  * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
50  *          Use StriContainerUTF8 and CharClass
51  *
52  * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
53  *          make StriException-friendly & Use StrContainerCharClass
54  *
55  * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
56  *          detects invalid UTF-8 byte stream
57  *
58  * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
59  *          StriContainerCharClass now relies on UnicodeSet
60  *
61  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
62  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
63  *
64  * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate
65 */
stri__trim_leftright(SEXP str,SEXP pattern,bool left,bool right,bool negate)66 SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right, bool negate)
67 {
68     PROTECT(str = stri__prepare_arg_string(str, "str"));
69     PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
70     R_len_t vectorize_length =
71         stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
72 
73     STRI__ERROR_HANDLER_BEGIN(2)
74     StriContainerUTF8 str_cont(str, vectorize_length);
75     StriContainerCharClass pattern_cont(pattern, vectorize_length, negate);
76 
77     SEXP ret;
78     STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));
79 
80     for (R_len_t i = pattern_cont.vectorize_init();
81             i != pattern_cont.vectorize_end();
82             i = pattern_cont.vectorize_next(i))
83     {
84         if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
85             SET_STRING_ELT(ret, i, NA_STRING);
86             continue;
87         }
88 
89         const UnicodeSet* pattern_cur = &pattern_cont.get(i);
90         R_len_t     str_cur_n = str_cont.get(i).length();
91         const char* str_cur_s = str_cont.get(i).c_str();
92         R_len_t jlast1 = 0;
93         R_len_t jlast2 = str_cur_n;
94 
95         if (left) {
96             UChar32 chr;
97             for (R_len_t j=0; j<str_cur_n; ) {
98                 U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead"
99                 if (chr < 0) // invalid UTF-8 sequence
100                     throw StriException(MSG__INVALID_UTF8);
101                 if (pattern_cur->contains(chr)) {
102                     break; // break at first occurrence
103                 }
104                 jlast1 = j;
105             }
106         }
107 
108         if (right && jlast1 < str_cur_n) {
109             UChar32 chr;
110             for (R_len_t j=str_cur_n; j>0; ) {
111                 U8_PREV(str_cur_s, 0, j, chr); // "look behind"
112                 if (chr < 0) // invalid utf-8 sequence
113                     throw StriException(MSG__INVALID_UTF8);
114                 if (pattern_cur->contains(chr)) {
115                     break; // break at first occurrence
116                 }
117                 jlast2 = j;
118             }
119         }
120 
121         // now jlast is the index, from which we start copying
122         SET_STRING_ELT(ret, i,
123                        Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8));
124     }
125 
126     STRI__UNPROTECT_ALL
127     return ret;
128     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
129 }
130 
131 
132 /**
133  * Trim characters from a charclass from both sides of the string
134  *
135  * @param str character vector
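 * @param pattern character vector
 * @param negate logical value, processed with stri__prepare_arg_logical_1_notNA
 *        and forwarded to stri__trim_leftright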
137  * @return character vector
138  *
139  * @version 0.1-?? (Bartek Tartanus)
140  *
141  * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
142  *          Use stri__trim_leftright
143  *
144  * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
145  *          make StriException-friendly
146  *
147  * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate
148 */
stri_trim_both(SEXP str,SEXP pattern,SEXP negate)149 SEXP stri_trim_both(SEXP str, SEXP pattern, SEXP negate)
150 {
151     bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate");
152     return stri__trim_leftright(str, pattern, true, true, negate_val);
153 }
154 
155 
156 /**
157  * Trim characters from a charclass from the left of the string
158  *
159  * @param str character vector
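 * @param pattern character vector
 * @param negate logical value, processed with stri__prepare_arg_logical_1_notNA
 *        and forwarded to stri__trim_leftright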
161  * @return character vector
162  *
163  * @version 0.1-?? (Bartek Tartanus)
164  *
165  * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
166  *          Use stri__trim_leftright
167  *
168  * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate
169 */
stri_trim_left(SEXP str,SEXP pattern,SEXP negate)170 SEXP stri_trim_left(SEXP str, SEXP pattern, SEXP negate)
171 {
172     bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate");
173     return stri__trim_leftright(str, pattern, true, false, negate_val);
174 }
175 
176 
177 /**
178  * Trim characters from a charclass from the right of the string
179  *
180  * @param str character vector
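 * @param pattern character vector
 * @param negate logical value, processed with stri__prepare_arg_logical_1_notNA
 *        and forwarded to stri__trim_leftright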
182  * @return character vector
183  *
184  * @version 0.1-?? (Bartek Tartanus)
185  *
186  * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
187  *          Use stri__trim_leftright
188  *
189  * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate
190 */
stri_trim_right(SEXP str,SEXP pattern,SEXP negate)191 SEXP stri_trim_right(SEXP str, SEXP pattern, SEXP negate)
192 {
193     bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate");
194     return stri__trim_leftright(str, pattern, false, true, negate_val);
195 }
196