1 /* This file is part of the 'stringi' project.
2 * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33 #include "stri_stringi.h"
34 #include "stri_container_utf8_indexable.h"
35 #include <deque>
36 #include <vector>
37 #include <utility>
38 #include <unicode/brkiter.h>
39 #include <unicode/uniset.h>
40
41
42 /** Greedy word wrap algorithm
43 *
44 * @param wrap_after [out]
45 * @param nwords number of "words"
46 * @param width_val maximal desired out line width
47 * @param widths_orig ith word width original
48 * @param widths_trim ith word width trimmed
49 * @param add_para_1
50 * @param add_para_n
51 *
52 * @version 0.1-?? (Bartek Tartanus)
53 * original implementation
54 *
55 * @version 0.2-2 (Marek Gagolewski, 2014-04-28)
56 * BreakIterator usage mods
57 *
58 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
59 * new args: add_para_1, add_para_n
60 */
stri__wrap_greedy(std::deque<R_len_t> & wrap_after,R_len_t nwords,int width_val,const std::vector<R_len_t> & widths_orig,const std::vector<R_len_t> & widths_trim,int add_para_1,int add_para_n)61 void stri__wrap_greedy(std::deque<R_len_t>& wrap_after,
62 R_len_t nwords, int width_val,
63 const std::vector<R_len_t>& widths_orig,
64 const std::vector<R_len_t>& widths_trim,
65 int add_para_1, int add_para_n)
66 {
67 R_len_t cur_len = add_para_1+widths_orig[0];
68 for (R_len_t j = 1; j < nwords; ++j) {
69 if (cur_len + widths_trim[j] > width_val) {
70 cur_len = add_para_n+widths_orig[j];
71 wrap_after.push_back(j-1);
72 }
73 else {
74 cur_len += widths_orig[j];
75 }
76 }
77 }
78
79
80 /** Dynamic word wrap algorithm
81 * (Knuth's word wrapping algorithm that minimizes raggedness of formatted text)
82 *
83 * @param wrap_after [out]
84 * @param nwords number of "words"
85 * @param width_val maximal desired out line width
86 * @param exponent_val cost function exponent
87 * @param widths_orig ith word width original
88 * @param widths_trim ith word width trimmed
89 * @param add_para_1
90 * @param add_para_a
91 *
92 * @version 0.1-?? (Bartek Tartanus)
93 * original implementation
94 *
95 * @version 0.2-2 (Marek Gagolewski, 2014-04-30)
96 * BreakIterator usage mods
97 *
98 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
99 * new args: add_para_1, add_para_n,
100 * cost of the last line is zero
101 */
stri__wrap_dynamic(std::deque<R_len_t> & wrap_after,R_len_t nwords,int width_val,double exponent_val,const std::vector<R_len_t> & widths_orig,const std::vector<R_len_t> & widths_trim,int add_para_1,int add_para_n)102 void stri__wrap_dynamic(std::deque<R_len_t>& wrap_after,
103 R_len_t nwords, int width_val, double exponent_val,
104 const std::vector<R_len_t>& widths_orig,
105 const std::vector<R_len_t>& widths_trim,
106 int add_para_1, int add_para_n)
107 {
108 #define IDX(i,j) (i)*nwords+(j)
109 vector<double> cost(nwords*nwords);
110 // where cost[IDX(i,j)] == cost of printing words i..j in a single line, i<=j
111
112 // calculate costs:
113 // there is some "punishment" for leaving blanks at the end of each line
114 // (number of "blank" codepoints ^ exponent_val)
115 for (int i=0; i<nwords; i++) {
116 int sum = 0;
117 for (int j=i; j<nwords; j++) {
118 if (j > i) {
119 if (cost[IDX(i,j-1)] < 0.0) { // already Inf
120 cost[IDX(i,j)] = -1.0; // Inf
121 continue;
122 }
123 else {
124 sum -= widths_trim[j-1];
125 sum += widths_orig[j-1];
126 }
127 }
128 sum += widths_trim[j];
129 int ct = width_val - sum;
130 if (i == 0) ct -= add_para_1;
131 else ct -= add_para_n;
132
133 if (j == nwords-1) { // last line == cost 0
134 if (j == i || ct >= 0)
135 cost[IDX(i,j)] = 0.0;
136 else
137 cost[IDX(i,j)] = -1.0/*Inf*/;
138 }
139 else if (j == i)
140 // some words don't fit in a line at all -> cost 0.0
141 cost[IDX(i,j)] = (ct < 0) ? 0.0 : pow((double)ct, exponent_val);
142 else
143 cost[IDX(i,j)] = (ct < 0) ? -1.0/*"Inf"*/ : pow((double)ct, exponent_val);
144 }
145 }
146
147 vector<double> f(nwords); // f[j] == total cost of (optimally) printing words 0..j
148 vector<bool> where(nwords*nwords, false); // where[IDX(i,j)] == false iff
149 // we don't wrap after i-th word, i<=j
150 // when (optimally) printing words 0..j
151
152 for (int j=0; j<nwords; ++j) {
153 if (cost[IDX(0,j)] >= 0.0) {
154 // no breaking needed: words 0..j fit in one line
155 f[j] = cost[IDX(0,j)];
156 continue;
157 }
158
159 // let i = optimal way of printing of words 0..i + printing i+1..j
160 int i = 0;
161 while (i <= j) {
162 if (cost[IDX(i+1,j)] >= 0.0) break;
163 ++i;
164 }
165
166 double best_i = f[i] + cost[IDX(i+1,j)];
167 for (int k=i+1; k<j; ++k) {
168 if (cost[IDX(k+1,j)] < 0.0) continue;
169 double best_cur = f[k] + cost[IDX(k+1,j)];
170 if (best_cur < best_i) {
171 best_i = best_cur;
172 i = k;
173 }
174 }
175 for (int k=0; k<i; ++k)
176 where[IDX(k,j)] = where[IDX(k,i)];
177 where[IDX(i,j)] = true;
178 f[j] = best_i;
179 }
180
181 //result is in the last row of where...
182 for (int k=0; k<nwords; ++k)
183 if (where[IDX(k, nwords-1)])
184 wrap_after.push_back(k);
185 }
186
187
188 struct StriWrapLineStart {
189 std::string str;
190 R_len_t nbytes;
191 R_len_t count;
192 R_len_t width;
193
StriWrapLineStartStriWrapLineStart194 StriWrapLineStart(const String8& s, R_len_t v) :
195 str(s.c_str()) {
196 nbytes = s.length()+v;
197 count = s.countCodePoints()+v;
198 width = stri__width_string(s.c_str(), s.length())+v;
199 str.append(std::string(v, ' '));
200 }
201 };
202
203
204 /** Word wrap text
205 *
206 * @param str character vector
207 * @param width single integer
208 * @param cost_exponent single double
209 * @param indent single integer
210 * @param exdent single integer
211 * @param prefix single string
212 * @param initial single string
213 * @param locale locale identifier or NULL for default locale
214 * @param use_length single logical value
215 *
216 * @return list
217 *
218 * @version 0.1-?? (Bartek Tartanus)
219 *
220 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
221 * single function for wrap_greedy and wrap_dynamic
222 * (dispatch inside);
223 * use BreakIterator
224 *
225 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
226 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
227 *
228 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
229 * new args: indent, exdent, prefix, initial
230 *
231 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
232 * #133 allow width <= 0
233 *
234 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
235 * don't trim so many white spaces at the end of each word (normalize arg does that)
236 * #139: allow a "whitespace" break iterator
237 *
238 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
239 * `use_length` arg added
240 *
241 *
242 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
243 * BIGSKIP: no more CHARSXP on out on "" input
244 */
stri_wrap(SEXP str,SEXP width,SEXP cost_exponent,SEXP indent,SEXP exdent,SEXP prefix,SEXP initial,SEXP whitespace_only,SEXP use_length,SEXP locale)245 SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
246 SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
247 SEXP use_length, SEXP locale)
248 {
249 bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
250 double exponent_val = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
251 bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");
252
253 int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
254 if (width_val <= 0) width_val = 0;
255
256 int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
257 if (indent_val < 0) Rf_error(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_POSITIVE, "indent");
258
259 int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
260 if (exdent_val < 0) Rf_error(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_POSITIVE, "exdent");
261
262
263 const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
264 Locale loc = Locale::createFromName(qloc);
265 PROTECT(str = stri__prepare_arg_string(str, "str"));
266 PROTECT(prefix = stri__prepare_arg_string_1(prefix, "prefix"));
267 PROTECT(initial = stri__prepare_arg_string_1(initial, "initial"));
268
269 BreakIterator* briter = NULL;
270 UText* str_text = NULL;
271
272 STRI__ERROR_HANDLER_BEGIN(3)
273 UErrorCode status = U_ZERO_ERROR;
274 briter = BreakIterator::createLineInstance(loc, status);
275 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
276
277 R_len_t str_length = LENGTH(str);
278 StriContainerUTF8_indexable str_cont(str, str_length);
279 StriContainerUTF8 prefix_cont(prefix, 1);
280 StriContainerUTF8 initial_cont(initial, 1);
281
282
283 // prepare indent/exdent/prefix/initial stuff:
284 // 1st line, 1st para (i==0, u==0): initial+indent
285 // nth line, 1st para (i==0, u> 0): prefix +exdent
286 // 1st line, nth para (i> 0, u==0): prefix +indent
287 // nth line, nth para (i> 0, u> 0): prefix +exdent
288 StriWrapLineStart ii(initial_cont.get(0), indent_val);
289 StriWrapLineStart pi(prefix_cont.get(0), indent_val);
290 StriWrapLineStart pe(prefix_cont.get(0), exdent_val);
291
292
293 status = U_ZERO_ERROR;
294 //Unicode Newline Guidelines - Unicode Technical Report #13
295 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
296 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
297 uset_linebreaks.freeze();
298
299 status = U_ZERO_ERROR;
300 UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
301 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
302 uset_whitespaces.freeze();
303
304 SEXP ret;
305 STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
306 for (R_len_t i = 0; i < str_length; ++i)
307 {
308 if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
309 SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
310 continue;
311 }
312
313 status = U_ZERO_ERROR;
314 const char* str_cur_s = str_cont.get(i).c_str();
315 R_len_t str_cur_n = str_cont.get(i).length();
316 str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
317 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
318
319 status = U_ZERO_ERROR;
320 briter->setText(str_text, status);
321 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
322
323 // first generate a list of positions of line breaks
324 deque< R_len_t > occurrences_list; // this could be an R_len_t queue
325 R_len_t match = briter->first();
326 while (match != BreakIterator::DONE) {
327
328 if (!whitespace_only_val)
329 occurrences_list.push_back(match);
330 else {
331 if (match > 0 && match < str_cur_n) {
332 UChar32 c;
333 U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
334 if (uset_whitespaces.contains(c))
335 occurrences_list.push_back(match);
336 }
337 else
338 occurrences_list.push_back(match);
339 }
340
341 match = briter->next();
342 }
343
344 R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
345 if (noccurrences <= 1) { // no match (1 boundary == 0)
346 SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
347 continue;
348 }
349
350 // the number of "words" is:
351 R_len_t nwords = noccurrences - 1;
352
353 // convert occurrences_list to a vector
354 // in order to obtain end positions (in a string) of each "words",
355 // noting that occurrences_list.at(0) == 0
356 #ifndef NDEBUG
357 if (occurrences_list.at(0) != 0)
358 throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
359 #endif
360
361 std::vector<R_len_t> end_pos_orig(nwords);
362 deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
363 for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
364 end_pos_orig[j] = (*iter); // this is a UTF-8 index
365 }
366
367
368 // now:
369 // we'll get the total widths/number of code points in each "word"
370 std::vector<R_len_t> widths_orig(nwords);
371 // we'll get the total widths/number of code points without trailing whitespaces
372 std::vector<R_len_t> widths_trim(nwords);
373 // we'll get the end positions without trailing whitespaces
374 std::vector<R_len_t> end_pos_trim(nwords);
375 // detect line endings (fail on a match)
376
377 UChar32 p;
378 UChar32 c = 0;
379 bool reset = true;
380 R_len_t j = 0;
381 R_len_t cur_block = 0;
382 R_len_t cur_width_orig = 0;
383 R_len_t cur_width_trim = 0;
384 R_len_t cur_count_orig = 0;
385 R_len_t cur_count_trim = 0;
386 R_len_t cur_end_pos_trim = 0;
387 while (j < str_cur_n) {
388 R_len_t jlast = j;
389 p = c;
390 U8_NEXT(str_cur_s, j, str_cur_n, c);
391 if (c < 0) // invalid utf-8 sequence
392 throw StriException(MSG__INVALID_UTF8);
393
394 if (uset_linebreaks.contains(c))
395 throw StriException(MSG__NEWLINE_FOUND);
396
397 // OLD: cur_width_orig += stri__width_char(c);
398 cur_width_orig += stri__width_char_with_context(c, p, reset);
399 ++cur_count_orig;
400 if (uset_whitespaces.contains(c)) {
401 // OLD: trim all white spaces from the end:
402 // ++cur_count_trim;
403 // [we have the normalize arg for that]
404
405 // NEW: trim just one white space at the end:
406 // OLD: cur_width_trim = stri__width_char(c);
407 cur_width_trim = stri__width_char_with_context(c, p, reset);
408 cur_count_trim = 1;
409 cur_end_pos_trim = jlast;
410 }
411 else {
412 cur_width_trim = 0;
413 cur_count_trim = 0;
414 cur_end_pos_trim = j;
415 }
416
417 if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
418 // we'll start a new block in a moment
419 if (use_length_val) {
420 widths_orig[cur_block] = cur_count_orig;
421 widths_trim[cur_block] = cur_count_orig-cur_count_trim;
422 }
423 else {
424 widths_orig[cur_block] = cur_width_orig;
425 widths_trim[cur_block] = cur_width_orig-cur_width_trim;
426 }
427 end_pos_trim[cur_block] = cur_end_pos_trim;
428 cur_block++;
429 cur_width_orig = 0;
430 cur_width_trim = 0;
431 cur_count_orig = 0;
432 cur_count_trim = 0;
433 cur_end_pos_trim = j;
434 reset = true;
435 }
436 }
437
438 // do wrap
439 std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
440 if (exponent_val <= 0.0) {
441 stri__wrap_greedy(wrap_after, nwords, width_val,
442 widths_orig, widths_trim,
443 (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
444 (use_length_val)?pe.count:pe.width);
445 }
446 else {
447 stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
448 widths_orig, widths_trim,
449 (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
450 (use_length_val)?pe.count:pe.width);
451 }
452
453 // wrap_after.size() line breaks => wrap_after.size()+1 lines
454 R_len_t nlines = (R_len_t)wrap_after.size()+1;
455 R_len_t last_pos = 0;
456 SEXP ans;
457 STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
458 deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
459 for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
460 R_len_t wrap_after_cur = *iter_wrap;
461 R_len_t cur_pos = end_pos_trim[wrap_after_cur];
462
463 std::string cs;
464 if (i == 0 && u == 0) cs = ii.str;
465 else if (i > 0 && u == 0) cs = pi.str;
466 else cs = pe.str;
467 cs.append(str_cur_s+last_pos, cur_pos-last_pos);
468 SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));
469
470 last_pos = end_pos_orig[wrap_after_cur];
471 }
472
473 // last line goes here:
474 std::string cs;
475 if (i == 0 && nlines-1 == 0) cs = ii.str;
476 else if (i > 0 && nlines-1 == 0) cs = pi.str;
477 else cs = pe.str;
478 cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
479 SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));
480
481 SET_VECTOR_ELT(ret, i, ans);
482 STRI__UNPROTECT(1);
483 }
484
485 if (briter) {
486 delete briter;
487 briter = NULL;
488 }
489 if (str_text) {
490 utext_close(str_text);
491 str_text = NULL;
492 }
493 STRI__UNPROTECT_ALL
494 return ret;
495 STRI__ERROR_HANDLER_END({
496 if (briter) {
497 delete briter;
498 briter = NULL;
499 }
500 if (str_text) {
501 utext_close(str_text);
502 str_text = NULL;
503 }
504 })
505 }
506