1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8_indexable.h"
35 #include <deque>
36 #include <vector>
37 #include <utility>
38 #include <unicode/brkiter.h>
39 #include <unicode/uniset.h>
40 
41 
42 /** Greedy word wrap algorithm
43  *
44  * @param wrap_after [out]
45  * @param nwords number of "words"
46  * @param width_val maximal desired out line width
47  * @param widths_orig ith word width original
48  * @param widths_trim ith word width trimmed
49  * @param add_para_1
50  * @param add_para_n
51  *
52  * @version 0.1-?? (Bartek Tartanus)
53  *          original implementation
54  *
55  * @version 0.2-2 (Marek Gagolewski, 2014-04-28)
56  *          BreakIterator usage mods
57  *
58  * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
59  *    new args: add_para_1, add_para_n
60  */
stri__wrap_greedy(std::deque<R_len_t> & wrap_after,R_len_t nwords,int width_val,const std::vector<R_len_t> & widths_orig,const std::vector<R_len_t> & widths_trim,int add_para_1,int add_para_n)61 void stri__wrap_greedy(std::deque<R_len_t>& wrap_after,
62                        R_len_t nwords, int width_val,
63                        const std::vector<R_len_t>& widths_orig,
64                        const std::vector<R_len_t>& widths_trim,
65                        int add_para_1, int add_para_n)
66 {
67     R_len_t cur_len = add_para_1+widths_orig[0];
68     for (R_len_t j = 1; j < nwords; ++j) {
69         if (cur_len + widths_trim[j] > width_val) {
70             cur_len = add_para_n+widths_orig[j];
71             wrap_after.push_back(j-1);
72         }
73         else {
74             cur_len += widths_orig[j];
75         }
76     }
77 }
78 
79 
80 /** Dynamic word wrap algorithm
81  * (Knuth's word wrapping algorithm that minimizes raggedness of formatted text)
82  *
83  * @param wrap_after [out]
84  * @param nwords number of "words"
85  * @param width_val maximal desired out line width
86  * @param exponent_val cost function exponent
87  * @param widths_orig ith word width original
88  * @param widths_trim ith word width trimmed
89  * @param add_para_1
90  * @param add_para_a
91  *
92  * @version 0.1-?? (Bartek Tartanus)
93  *          original implementation
94  *
95  * @version 0.2-2 (Marek Gagolewski, 2014-04-30)
96  *          BreakIterator usage mods
97  *
98  * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
99  *    new args: add_para_1, add_para_n,
100  *    cost of the last line is zero
101  */
stri__wrap_dynamic(std::deque<R_len_t> & wrap_after,R_len_t nwords,int width_val,double exponent_val,const std::vector<R_len_t> & widths_orig,const std::vector<R_len_t> & widths_trim,int add_para_1,int add_para_n)102 void stri__wrap_dynamic(std::deque<R_len_t>& wrap_after,
103                         R_len_t nwords, int width_val, double exponent_val,
104                         const std::vector<R_len_t>& widths_orig,
105                         const std::vector<R_len_t>& widths_trim,
106                         int add_para_1, int add_para_n)
107 {
108 #define IDX(i,j) (i)*nwords+(j)
109     vector<double> cost(nwords*nwords);
110     // where cost[IDX(i,j)] == cost of printing words i..j in a single line, i<=j
111 
112     // calculate costs:
113     // there is some "punishment" for leaving blanks at the end of each line
114     // (number of "blank" codepoints ^ exponent_val)
115     for (int i=0; i<nwords; i++) {
116         int sum = 0;
117         for (int j=i; j<nwords; j++) {
118             if (j > i) {
119                 if (cost[IDX(i,j-1)] < 0.0) { // already Inf
120                     cost[IDX(i,j)] = -1.0; // Inf
121                     continue;
122                 }
123                 else {
124                     sum -= widths_trim[j-1];
125                     sum += widths_orig[j-1];
126                 }
127             }
128             sum += widths_trim[j];
129             int ct = width_val - sum;
130             if (i == 0) ct -= add_para_1;
131             else        ct -= add_para_n;
132 
133             if (j == nwords-1) { // last line == cost 0
134                 if (j == i || ct >= 0)
135                     cost[IDX(i,j)] = 0.0;
136                 else
137                     cost[IDX(i,j)] = -1.0/*Inf*/;
138             }
139             else if (j == i)
140                 // some words don't fit in a line at all -> cost 0.0
141                 cost[IDX(i,j)] = (ct < 0) ? 0.0 : pow((double)ct, exponent_val);
142             else
143                 cost[IDX(i,j)] = (ct < 0) ? -1.0/*"Inf"*/ : pow((double)ct, exponent_val);
144         }
145     }
146 
147     vector<double> f(nwords); // f[j] == total cost of  (optimally) printing words 0..j
148     vector<bool> where(nwords*nwords, false); // where[IDX(i,j)] == false iff
149     // we don't wrap after i-th word, i<=j
150     // when (optimally) printing words 0..j
151 
152     for (int j=0; j<nwords; ++j) {
153         if (cost[IDX(0,j)] >= 0.0) {
154             // no breaking needed: words 0..j fit in one line
155             f[j] = cost[IDX(0,j)];
156             continue;
157         }
158 
159         // let i = optimal way of printing of words 0..i + printing i+1..j
160         int i = 0;
161         while (i <= j) {
162             if (cost[IDX(i+1,j)] >= 0.0) break;
163             ++i;
164         }
165 
166         double best_i = f[i] + cost[IDX(i+1,j)];
167         for (int k=i+1; k<j; ++k) {
168             if (cost[IDX(k+1,j)] < 0.0) continue;
169             double best_cur = f[k] + cost[IDX(k+1,j)];
170             if (best_cur < best_i) {
171                 best_i = best_cur;
172                 i = k;
173             }
174         }
175         for (int k=0; k<i; ++k)
176             where[IDX(k,j)] = where[IDX(k,i)];
177         where[IDX(i,j)] = true;
178         f[j] = best_i;
179     }
180 
181     //result is in the last row of where...
182     for (int k=0; k<nwords; ++k)
183         if (where[IDX(k, nwords-1)])
184             wrap_after.push_back(k);
185 }
186 
187 
188 struct StriWrapLineStart {
189     std::string str;
190     R_len_t nbytes;
191     R_len_t count;
192     R_len_t width;
193 
StriWrapLineStartStriWrapLineStart194     StriWrapLineStart(const String8& s, R_len_t v) :
195         str(s.c_str()) {
196         nbytes  = s.length()+v;
197         count   = s.countCodePoints()+v;
198         width   = stri__width_string(s.c_str(), s.length())+v;
199         str.append(std::string(v, ' '));
200     }
201 };
202 
203 
204 /** Word wrap text
205  *
206  * @param str character vector
207  * @param width single integer
208  * @param cost_exponent single double
209  * @param indent single integer
210  * @param exdent single integer
211  * @param prefix single string
212  * @param initial single string
213  * @param locale locale identifier or NULL for default locale
214  * @param use_length single logical value
215  *
216  * @return list
217  *
218  * @version 0.1-?? (Bartek Tartanus)
219  *
220  * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
221  *          single function for wrap_greedy and wrap_dynamic
222  *          (dispatch inside);
223  *          use BreakIterator
224  *
225  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
226  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
227  *
228  * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
229  *    new args: indent, exdent, prefix, initial
230  *
231  * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
232  *    #133 allow width <= 0
233  *
234  * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
235  *    don't trim so many white spaces at the end of each word (normalize arg does that)
236  *    #139: allow a "whitespace" break iterator
237  *
238  * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
239  *    `use_length` arg added
240  *
241  *
242  * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
243  *    BIGSKIP: no more CHARSXP on out on "" input
244  */
stri_wrap(SEXP str,SEXP width,SEXP cost_exponent,SEXP indent,SEXP exdent,SEXP prefix,SEXP initial,SEXP whitespace_only,SEXP use_length,SEXP locale)245 SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
246                SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
247                SEXP use_length, SEXP locale)
248 {
249     bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
250     double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
251     bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");
252 
253     int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
254     if (width_val <= 0) width_val = 0;
255 
256     int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
257     if (indent_val < 0) Rf_error(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_POSITIVE, "indent");
258 
259     int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
260     if (exdent_val < 0) Rf_error(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_POSITIVE, "exdent");
261 
262 
263     const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
264     Locale loc = Locale::createFromName(qloc);
265     PROTECT(str     = stri__prepare_arg_string(str, "str"));
266     PROTECT(prefix  = stri__prepare_arg_string_1(prefix, "prefix"));
267     PROTECT(initial = stri__prepare_arg_string_1(initial, "initial"));
268 
269     BreakIterator* briter = NULL;
270     UText* str_text = NULL;
271 
272     STRI__ERROR_HANDLER_BEGIN(3)
273     UErrorCode status = U_ZERO_ERROR;
274     briter = BreakIterator::createLineInstance(loc, status);
275     STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
276 
277     R_len_t str_length = LENGTH(str);
278     StriContainerUTF8_indexable str_cont(str, str_length);
279     StriContainerUTF8 prefix_cont(prefix, 1);
280     StriContainerUTF8 initial_cont(initial, 1);
281 
282 
283     // prepare indent/exdent/prefix/initial stuff:
284     // 1st line, 1st para (i==0, u==0): initial+indent
285     // nth line, 1st para (i==0, u> 0): prefix +exdent
286     // 1st line, nth para (i> 0, u==0): prefix +indent
287     // nth line, nth para (i> 0, u> 0): prefix +exdent
288     StriWrapLineStart ii(initial_cont.get(0), indent_val);
289     StriWrapLineStart pi(prefix_cont.get(0), indent_val);
290     StriWrapLineStart pe(prefix_cont.get(0), exdent_val);
291 
292 
293     status = U_ZERO_ERROR;
294     //Unicode Newline Guidelines - Unicode Technical Report #13
295     UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
296     STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
297     uset_linebreaks.freeze();
298 
299     status = U_ZERO_ERROR;
300     UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
301     STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
302     uset_whitespaces.freeze();
303 
304     SEXP ret;
305     STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
306     for (R_len_t i = 0; i < str_length; ++i)
307     {
308         if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
309             SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
310             continue;
311         }
312 
313         status = U_ZERO_ERROR;
314         const char* str_cur_s = str_cont.get(i).c_str();
315         R_len_t str_cur_n = str_cont.get(i).length();
316         str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
317         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
318 
319         status = U_ZERO_ERROR;
320         briter->setText(str_text, status);
321         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
322 
323         // first generate a list of positions of line breaks
324         deque< R_len_t > occurrences_list; // this could be an R_len_t queue
325         R_len_t match = briter->first();
326         while (match != BreakIterator::DONE) {
327 
328             if (!whitespace_only_val)
329                 occurrences_list.push_back(match);
330             else {
331                 if (match > 0 && match < str_cur_n) {
332                     UChar32 c;
333                     U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
334                     if (uset_whitespaces.contains(c))
335                         occurrences_list.push_back(match);
336                 }
337                 else
338                     occurrences_list.push_back(match);
339             }
340 
341             match = briter->next();
342         }
343 
344         R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
345         if (noccurrences <= 1) { // no match (1 boundary == 0)
346             SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
347             continue;
348         }
349 
350         // the number of "words" is:
351         R_len_t nwords = noccurrences - 1;
352 
353         // convert occurrences_list to a vector
354         // in order to obtain end positions (in a string) of each "words",
355         // noting that occurrences_list.at(0) == 0
356 #ifndef NDEBUG
357         if (occurrences_list.at(0) != 0)
358             throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
359 #endif
360 
361         std::vector<R_len_t> end_pos_orig(nwords);
362         deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
363         for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
364             end_pos_orig[j] = (*iter); // this is a UTF-8 index
365         }
366 
367 
368         // now:
369         // we'll get the total widths/number of code points in each "word"
370         std::vector<R_len_t> widths_orig(nwords);
371         // we'll get the total widths/number of code points without trailing whitespaces
372         std::vector<R_len_t> widths_trim(nwords);
373         // we'll get the end positions without trailing whitespaces
374         std::vector<R_len_t> end_pos_trim(nwords);
375         // detect line endings (fail on a match)
376 
377         UChar32 p;
378         UChar32 c = 0;
379         bool reset = true;
380         R_len_t j = 0;
381         R_len_t cur_block = 0;
382         R_len_t cur_width_orig = 0;
383         R_len_t cur_width_trim = 0;
384         R_len_t cur_count_orig = 0;
385         R_len_t cur_count_trim = 0;
386         R_len_t cur_end_pos_trim = 0;
387         while (j < str_cur_n) {
388             R_len_t jlast = j;
389             p = c;
390             U8_NEXT(str_cur_s, j, str_cur_n, c);
391             if (c < 0) // invalid utf-8 sequence
392                 throw StriException(MSG__INVALID_UTF8);
393 
394             if (uset_linebreaks.contains(c))
395                 throw StriException(MSG__NEWLINE_FOUND);
396 
397             // OLD: cur_width_orig += stri__width_char(c);
398             cur_width_orig += stri__width_char_with_context(c, p, reset);
399             ++cur_count_orig;
400             if (uset_whitespaces.contains(c)) {
401 // OLD: trim all white spaces from the end:
402 //            ++cur_count_trim;
403 //           [we have the normalize arg for that]
404 
405 // NEW: trim just one white space at the end:
406                 // OLD: cur_width_trim = stri__width_char(c);
407                 cur_width_trim = stri__width_char_with_context(c, p, reset);
408                 cur_count_trim = 1;
409                 cur_end_pos_trim = jlast;
410             }
411             else {
412                 cur_width_trim = 0;
413                 cur_count_trim = 0;
414                 cur_end_pos_trim = j;
415             }
416 
417             if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
418                 // we'll start a new block in a moment
419                 if (use_length_val) {
420                     widths_orig[cur_block] = cur_count_orig;
421                     widths_trim[cur_block] = cur_count_orig-cur_count_trim;
422                 }
423                 else {
424                     widths_orig[cur_block] = cur_width_orig;
425                     widths_trim[cur_block] = cur_width_orig-cur_width_trim;
426                 }
427                 end_pos_trim[cur_block] = cur_end_pos_trim;
428                 cur_block++;
429                 cur_width_orig = 0;
430                 cur_width_trim = 0;
431                 cur_count_orig = 0;
432                 cur_count_trim = 0;
433                 cur_end_pos_trim = j;
434                 reset = true;
435             }
436         }
437 
438         // do wrap
439         std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
440         if (exponent_val <= 0.0) {
441             stri__wrap_greedy(wrap_after, nwords, width_val,
442                 widths_orig, widths_trim,
443                 (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
444                 (use_length_val)?pe.count:pe.width);
445         }
446         else {
447             stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
448                 widths_orig, widths_trim,
449                 (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
450                 (use_length_val)?pe.count:pe.width);
451         }
452 
453         // wrap_after.size() line breaks => wrap_after.size()+1 lines
454         R_len_t nlines = (R_len_t)wrap_after.size()+1;
455         R_len_t last_pos = 0;
456         SEXP ans;
457         STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
458         deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
459         for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
460             R_len_t wrap_after_cur = *iter_wrap;
461             R_len_t cur_pos = end_pos_trim[wrap_after_cur];
462 
463             std::string cs;
464             if (i == 0 && u == 0)     cs = ii.str;
465             else if (i > 0 && u == 0) cs = pi.str;
466             else                      cs = pe.str;
467             cs.append(str_cur_s+last_pos, cur_pos-last_pos);
468             SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));
469 
470             last_pos = end_pos_orig[wrap_after_cur];
471         }
472 
473         // last line goes here:
474         std::string cs;
475         if (i == 0 && nlines-1 == 0)     cs = ii.str;
476         else if (i > 0 && nlines-1 == 0) cs = pi.str;
477         else                             cs = pe.str;
478         cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
479         SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));
480 
481         SET_VECTOR_ELT(ret, i, ans);
482         STRI__UNPROTECT(1);
483     }
484 
485     if (briter) {
486         delete briter;
487         briter = NULL;
488     }
489     if (str_text) {
490         utext_close(str_text);
491         str_text = NULL;
492     }
493     STRI__UNPROTECT_ALL
494     return ret;
495     STRI__ERROR_HANDLER_END({
496         if (briter) {
497             delete briter;
498             briter = NULL;
499         }
500         if (str_text) {
501             utext_close(str_text);
502             str_text = NULL;
503         }
504     })
505 }
506