1 /*
2 ** Copyright 2011, Double Precision Inc.
3 **
4 ** See COPYING for distribution information.
5 */
6
7 #ifndef widechar_H
8 #define widechar_H
9
10 #include "../curses/curses_config.h"
11 #include <courier-unicode.h>
12 #include <string.h>
13
14 #include <wchar.h>
15 #include <vector>
16 #include <iterator>
17
/*
** Convert a char sequence defined by the beg_iter and end_iter iterators
** into a wchar_t sequence, written to the out_iter output iterator.
**
** The chars are converted with mbrtowc(), a small chunk at a time. A
** multibyte character that straddles two chunks is carried from one chunk
** to the next one in the mbstate_t conversion state.
**
** - An invalid multibyte sequence gets replaced by a '?'.
** - An embedded NUL char is passed through as a NUL wide character.
** - An incomplete multibyte character at the very end of the input is
**   dropped.
**
** Returns the output iterator, after the last wide character was written
** to it.
*/

template<typename input_iter, typename output_iter>
output_iter towidechar(input_iter beg_iter,
		       input_iter end_iter,
		       output_iter out_iter)
{
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));

	std::vector<char> cbuf;
	cbuf.reserve(MB_CUR_MAX*2);

	// How many more chars to collect before the chunk gets converted.
	size_t cnt=MB_CUR_MAX*2;

	while (1)
	{
		if (cnt == 0 || beg_iter == end_iter)
		{
			if (cbuf.size())
			{
				const char *b=&cbuf[0];
				const char *e=b+cbuf.size();

				while (b != e)
				{
					// A non-initial state here means that
					// ps carries something over from
					// earlier input, such as the start of
					// a character from the previous chunk.
					const bool carried=!mbsinit(&ps);

					wchar_t wc;
					size_t r=mbrtowc(&wc, b, e-b, &ps);

					if (r == (size_t)-1)
					{
						// Invalid sequence. The state
						// is undefined after an error,
						// so start over.
						memset(&ps, 0, sizeof(ps));
						wc='?';
						*out_iter++=wc;

						// If the bad sequence began
						// with carried-over state, the
						// char at b may be perfectly
						// valid on its own: retry it
						// with a clean state instead
						// of throwing it away.
						if (!carried)
							++b;
						continue;
					}

					if (r > (size_t)(e-b))
						// must be (size_t)-2 -
						// Incomplete multibyte
						// sequence, all of it is now
						// saved in ps.
						break;

					if (r == 0)
					{
						// mbrtowc() reports a NUL as
						// 0, it still uses up a char.
						wc=0;
						r=1;
					}

					*out_iter++=wc;

					b += r;
				}
			}
			cbuf.clear();
			cnt=MB_CUR_MAX;

			if (beg_iter == end_iter)
				break;
		}

		cbuf.push_back(*beg_iter++);
		--cnt;
	}

	return out_iter;
}
88
/*
** Convenience overload: replaces the contents of uc with the wide
** characters converted from the char sequence defined by beg_iter and
** end_iter.
*/

template<typename input_iter>
void towidechar(input_iter beg_iter,
		input_iter end_iter,
		std::vector<wchar_t> &uc)
{
	// Anything that was in uc before is discarded.
	uc.clear();

	towidechar(beg_iter, end_iter, std::back_inserter(uc));
}
103
/*
** Convert a wchar_t sequence defined by the b and e iterators into a char
** sequence, written to the out_iter output iterator.
**
** Each wide character is converted with wcrtomb(). Wide characters that
** wcrtomb() cannot convert are skipped. After the last wide character, a
** NUL is converted too, in order to pick up whatever wcrtomb() emits ahead
** of it; the trailing NUL char itself is not written out.
**
** Returns the output iterator, after the last char was written to it.
*/

template<typename input_iter,
	 typename output_iter>
output_iter fromwidechar(input_iter b, input_iter e, output_iter out_iter)
{
	// Scratch buffer that receives one converted character at a time.
	std::vector<char> mbchars(MB_CUR_MAX*2);

	mbstate_t shift_state;
	memset(&shift_state, 0, sizeof(shift_state));

	while (b != e)
	{
		const size_t len=wcrtomb(&mbchars[0], *b, &shift_state);

		++b;

		// (size_t)-1: no conversion for this one, leave it out.
		if (len != (size_t)-1)
			out_iter=std::copy(mbchars.begin(),
					   mbchars.begin()+len, out_iter);
	}

	// Finish off by converting a NUL.
	size_t tail=wcrtomb(&mbchars[0], 0, &shift_state);

	if (tail == (size_t)-1)
		return out_iter;

	// Everything except the NUL itself goes to the output.
	if (tail > 0 && mbchars[tail-1] == 0)
		--tail;

	return std::copy(mbchars.begin(), mbchars.begin()+tail, out_iter);
}
140
/*
** Convenience overload: converts the wchar_t sequence defined by beg_iter
** and end_iter, and returns the converted chars as a std::string.
*/

template<typename input_iter>
std::string fromwidechar(input_iter beg_iter,
			 input_iter end_iter)
{
	std::string converted;

	fromwidechar(beg_iter, end_iter, std::back_inserter(converted));

	return converted;
}
156
157 // A unicode string demarcated at grapheme boundaries
158
159 class widecharbuf {
160
161 public:
widecharbuf()162 widecharbuf() {}
163
164 // Initialize from a string
165
166 void init_string(const std::string &str);
167
168 // Given the maximum desired column width of the string,
169 // return a pair of string, and the width of the returned
170 // string.
171 std::pair<std::string, size_t>
172 get_string_truncated(size_t maxwidth, ssize_t atcol) const;
173
174 // Return as a unicode string
175 std::pair<std::u32string, size_t>
176 get_unicode_truncated(size_t maxwidth, ssize_t atcol) const;
177
178 // Return a substring
179 std::string get_substring(size_t first_grapheme,
180 size_t grapheme_cnt) const;
181
182 // Return a unicode substring
183 std::u32string get_unicode_substring(size_t first_grapheme,
184 size_t grapheme_cnt)
185 const;
186
187 // Return as a unicode string, truncated or padded to the given width
188
189 std::u32string get_unicode_fixedwidth(size_t width,
190 ssize_t atcol) const;
191
192 // Initialize from a beginning and ending iterator, iterating
193 // over char32_ts.
194
195 template<typename iter_type>
init_unicode(iter_type b,iter_type e)196 void init_unicode(iter_type b, iter_type e)
197 {
198 std::string s;
199 bool ignore;
200
201 unicode::iconvert::fromu::convert(b, e,
202 unicode_default_chset(), s,
203 ignore);
204
205 std::vector<wchar_t> wc;
206
207 towidechar(s.begin(), s.end(), wc);
208
209 s=fromwidechar(wc.begin(), wc.end());
210
211 unicode::iconvert::tou::convert(s.begin(), s.end(),
212 unicode_default_chset(),
213 ustring);
214 resetgraphemes();
215 }
216
217 // The unicode string
218
219 std::u32string ustring;
220
221 // A grapheme: a pointer somewhere in ustring, plus wchar count
222
223 class grapheme_t {
224
225 public:
226 const char32_t *uptr; // Offset into ustring
227 size_t cnt; // How many unicode chars in the grapheme
228
grapheme_t(const char32_t * uptrArg,size_t cntArg)229 grapheme_t(const char32_t *uptrArg, size_t cntArg)
230 : uptr(uptrArg), cnt(cntArg) {}
231
232 size_t wcwidth(ssize_t start_col) const;
233 };
234
235 std::vector<grapheme_t> graphemes;
236
237 // Reset wchar_t in each grapheme
238 void resetgraphemes();
239
clear()240 void clear()
241 {
242 ustring.clear();
243 graphemes.clear();
244 }
245
246 ssize_t expandtabs(ssize_t col);
247
248 // Append
249 widecharbuf &operator+=(const widecharbuf &);
250
251 // Replace
252
widecharbuf(const widecharbuf & o)253 widecharbuf(const widecharbuf &o)
254 {
255 operator=(o);
256 }
257
258 widecharbuf &operator=(const widecharbuf &o);
259
260 size_t wcwidth(ssize_t start_col) const;
261
262 void tounicode(std::u32string &text) const;
263
264 static size_t charwidth(wchar_t ch, ssize_t atcol);
265 };
266
267 //
268 // Output iterator for towidechar() that throws away the wide character,
269 // but keeps track of its width.
270 //
271 // widecharbuf::grapheme_t::wcwidth() is hot. Saving the output of
272 // towidechar() into a vector is very expensive.
273
274 class towidechar_wcwidth_iter
275 : public std::iterator<std::output_iterator_tag, void, void, void, void>
276 {
277
278 size_t col;
279 size_t w;
280
281 public:
282
towidechar_wcwidth_iter(size_t colArg)283 towidechar_wcwidth_iter(size_t colArg) : col(colArg), w(0) {}
284
285 towidechar_wcwidth_iter &operator++() { return *this; }
286 towidechar_wcwidth_iter &operator++(int) { return *this; }
287 towidechar_wcwidth_iter &operator*() { return *this; }
288
289 void operator=(wchar_t wc)
290 {
291 size_t ww=widecharbuf::charwidth(wc, col);
292
293 col += ww;
294 w += ww;
295 }
296
size_t()297 operator size_t() const { return w; }
298 };
299
300 // Editable wchar_ts, together with an insertion point.
301 //
302 // The wchar_ts get internally divided into three parts: before the
303 // insertion point, at the insertion point, and past the insertion
304 // point. Before and after contain valid graphemes. The current
305 // insertion point may or may not contain valid graphemes.
306 //
307 // set_contents() initializes wchar_ts before and after the
308 // insertion point, setting at the insertion point to an empty list.
309 //
// insert_char() adds a character at the insertion point.
311 //
312 // get_contents() retrieves the contents as unicode characters,
313 // combining the insertion point wchar_ts with wchar_ts before the
314 // insertion point.
315 //
316 // contents_cut() removes graphemes between the cut_pos and the
317 // insertion point. contents_cut() may only be invoked when the
318 // insertion point is empty. The removed graphemes get placed into
319 // cut_text.
320 //
321 // insert_to_before() calls get_contents() then set_contents(),
322 // essentially converting wchar_ts at the insertion point to
323 // valid graphemes, and moving them to before_insert.
324 //
// to_before() calls insert_to_before(), then moves after_insert
// to before_insert.
//
// to_after() calls insert_to_before(), then moves before_insert
// to after_insert.
330 //
331 // to_position() calls insert_to_before() then moves the insertion
332 // point to the given position.
333 //
// adjust_shift_pos() calculates horizontal scrolling. It takes
// the editable field's width, and a reference to the current
// horizontal shift position, which gets adjusted, as necessary, to
// keep the cursor position visible, and returns the cursor position.
338
339 class editablewidechar {
340
341 public:
342 widecharbuf before_insert;
343
344 std::u32string inserted;
345
346 widecharbuf after_insert;
347
348 void set_contents(const std::u32string &before,
349 const std::u32string &after);
350
351 void get_contents(std::u32string &before,
352 std::u32string &after) const;
353
354 void contents_cut(size_t cut_pos,
355 std::u32string &cut_text);
356
insert_char(char32_t wc)357 void insert_char(char32_t wc) { inserted.push_back(wc); }
358
359 void insert_to_before();
360
361 void to_before();
362
363 void to_after();
364
clear()365 void clear()
366 {
367 before_insert.clear();
368 after_insert.clear();
369 inserted.clear();
370 }
371
372 void to_position(size_t pos);
373
374 size_t adjust_shift_pos(size_t &shiftoffset, size_t width,
375
376 // wbefore and wafter: wide characters
377 // before and after cursor position,
378 // as computed by adjust_shift_pos().
379 // Useful to the caller
380 widecharbuf &wbefore,
381 widecharbuf &wafter);
382 };
383
384 /*
385 ** Helper class for the word-wrapping logic.
386 **
387 ** This class receives char32_t pieces, that can be broken. The collector
388 ** converts them to system wchars, measures each piece, and collects them
389 ** until the next piece can't fit within the alloted width.
390 */
391
392 template<typename output_sink>
393 class wordwrap_collector {
394
395 public:
396 /* Collected line */
397 std::u32string linebuf;
398
399 /* Width of the collected line */
400 size_t col;
401
402 /*
403 ** iter(line) gets invoked for each assembled line, where
404 ** line is a std::vector<wchar_t>.
405 */
406
407 output_sink &iter;
408
409 /*
410 ** Desired line width.
411 */
412
413 size_t towidth;
414
415 /*
416 ** If true, the trailing space on each line gets removed.
417 ** In all cases, each line is wrapped at towidth() characters,
418 ** where possible.
419 */
420
421 bool delsp;
422
wordwrap_collector(output_sink & iterArg,size_t towidthArg,bool delspArg)423 wordwrap_collector(output_sink &iterArg,
424 size_t towidthArg,
425 bool delspArg) : col(0), iter(iterArg),
426 towidth(towidthArg),
427 delsp(delspArg)
428 {
429 }
430
addsegment(const std::u32string & segment)431 void addsegment(const std::u32string &segment)
432 {
433 std::vector<wchar_t> wsegment;
434
435 {
436 std::string s=unicode::iconvert
437 ::convert(segment, unicode_default_chset());
438
439 towidechar(s.begin(), s.end(), wsegment);
440
441 if (wsegment.empty())
442 return;
443 }
444
445 size_t width=0;
446
447 for (std::vector<wchar_t>::const_iterator
448 b(wsegment.begin()),
449 e(wsegment.end()); b != e; ++b)
450 width += widecharbuf::charwidth(*b, col+width);
451
452 if (!(delsp && width + col == towidth+1 &&
453 *--wsegment.end() == ' '))
454 {
455 if (width + col > towidth && !linebuf.empty())
456 breakline();
457 }
458
459 linebuf.insert(linebuf.end(), segment.begin(), segment.end());
460 col += width;
461 }
462
breakline()463 void breakline()
464 {
465 if (delsp && !linebuf.empty() &&
466 linebuf[linebuf.size()-1] == ' ')
467 linebuf.pop_back();
468
469 *iter++=linebuf;
470 col=0;
471 linebuf.clear();
472 }
473 };
474
475
/*
** The default rewrap helper object, for text that is not being rewrapped.
*/

class unicoderewrapnone {

 public:

	/*
	** Called with a character offset.
	** NOTE(review): presumably always returns false, the definition
	** is not in this header -- confirm.
	*/
	bool operator()(size_t n) const;
};
485
486 /*
487 ** Unicode-based linewrapping logic.
488 **
489 ** This template defines an output iterator that takes in char32_ts.
490 ** The constructor receives a rewrap helper object reference, an output
491 ** iterator, the requested width, and whether to trim the trailing
492 ** space from each wrapped line.
493 **
494 ** As this iterator is iterated over char32_ts, it will iterate the
495 ** received output iterator over std::u32strings, representing
496 ** each line wrapped to the requested width.
497 **
498 ** The trim flag should normally be false. This properly preserves all
499 ** whitespace in the unicode character sequence. The trim flag may be true
500 ** only in contexts where the wrapped text will never be rewrapped again.
501 ** Removal of a trailing space on each line allows an extra character to be
502 ** present instead of the trailing space.
503 **
** The rewrap helper object instance must define a bool operator()(size_t n)
** const. n receives a character offset, and the operator should return true
** if character #n is the first character in an original line of text. If so,
** and the unicode word wrap algorithm does not indicate that there's a
** potential linebreak here, a space gets appended at this point. This is
** used to rewrap existing lines of text which may not end with a space
** character.
511 **
512 ** After the original unicode chars are iterated over, eof() must be
513 ** invoked in order to output any partially-wrapped content that's still
514 ** held internally in this iterator.
515 */
516
517 template<typename output_sink_t,
518 typename rewrap_helper_t=unicoderewrapnone> class unicodewordwrapper :
519 public std::iterator<std::output_iterator_tag, void, void, void, void> {
520
521 // State maintained by the iterator. This iterator is copyable.
522 // The state is associated with only one iterator instance. Copying
523 // an iterator copies the state from the original iterator into the
524 // new one.
525
526 class buffer : public unicode::linebreakc_callback_save_buf {
527
528 public:
529 std::u32string segment; // Current word
530 size_t cnt; // Counts characters that are being wrapped.
531
532 wordwrap_collector<output_sink_t> collector;
533 // The collector object.
534
535 const rewrap_helper_t &rewrapper;
536 // The rewrap helper object.
537
buffer(const rewrap_helper_t & rewrapperArg,output_sink_t & out_iter,size_t towidth,bool delsp)538 buffer(const rewrap_helper_t &rewrapperArg,
539 output_sink_t &out_iter,
540 size_t towidth,
541 bool delsp)
542 : cnt(0),
543 collector(out_iter, towidth, delsp),
544 rewrapper(rewrapperArg) {}
545
~buffer()546 virtual ~buffer() {}
547 };
548
549 mutable buffer *buf;
550
551 typedef unicodewordwrapper<output_sink_t, rewrap_helper_t> iter_t;
552
553 public:
554 // Iterator constructor
unicodewordwrapper(const rewrap_helper_t & rewrap_helper,output_sink_t & out_iter,size_t towidth,bool delsp)555 unicodewordwrapper(const rewrap_helper_t &rewrap_helper,
556 output_sink_t &out_iter,
557 size_t towidth,
558 bool delsp)
559 : buf(new buffer(rewrap_helper, out_iter, towidth, delsp))
560 {
561 buf->set_opts(UNICODE_LB_OPT_PRBREAK|
562 UNICODE_LB_OPT_SYBREAK|
563 UNICODE_LB_OPT_DASHWJ);
564 }
565
566 // End iterator constructor
unicodewordwrapper()567 unicodewordwrapper() : buf(NULL)
568 {
569 }
570
~unicodewordwrapper()571 ~unicodewordwrapper()
572 {
573 eof();
574 }
575
576 // Assignment operator moves the state
577
578 iter_t &operator=(const iter_t &o)
579 {
580 if (buf)
581 delete buf;
582
583 buf=o.buf;
584 o.buf=NULL;
585 return *this;
586 }
587
588 // Copy constructor moves the state
unicodewordwrapper(const iter_t & o)589 unicodewordwrapper(const iter_t &o) : buf(o.buf)
590 {
591 o.buf=NULL;
592 }
593
594 // Operator implementation
595
596 iter_t &operator++() { return *this; }
597 iter_t &operator++(int) { return *this; }
598 iter_t &operator*() { return *this; }
599
600 void operator=(char32_t ch)
601 {
602 if (!buf)
603 return;
604
605 // Feed into the linebreaking algorithm.
606
607 buf->operator<<(ch);
608
609 // Process linebreaking algorithm output.
610
611 while (!buf->lb_buf.empty())
612 {
613 std::pair<int, char32_t> ch(buf->lb_buf.front());
614 buf->lb_buf.pop_front();
615
616 // If text is being rewrapped, and the linebreaking
617 // algorithm prohibits a linebreak here, but this
618 // was the first character of the pre-wrapped line,
619 // then there must've been a space character here,
620 // which allows a break.
621
622 if (ch.first == UNICODE_LB_NONE && buf->cnt > 0 &&
623 (buf->rewrapper)(buf->cnt))
624 {
625 buf->segment.push_back(' ');
626 ch.first=UNICODE_LB_ALLOWED;
627 }
628 ++buf->cnt;
629
630 // Process a potential linebreak.
631
632 if (ch.first != UNICODE_LB_NONE)
633 {
634 buf->collector.addsegment(buf->segment);
635
636 if (ch.first == UNICODE_LB_MANDATORY)
637 buf->collector.breakline();
638 buf->segment.clear();
639 }
640
641 if (ch.second != '\r' && ch.second != '\n')
642 buf->segment.push_back(ch.second);
643 }
644 }
645
646 // Finish remaining content, and clean up.
647
eof()648 void eof()
649 {
650 if (buf)
651 {
652 buf->collector.addsegment(buf->segment);
653 if (!buf->collector.linebuf.empty())
654 buf->collector.breakline();
655
656 delete buf;
657 buf=NULL;
658 }
659 }
660 };
661
662 // A convenience function to iterate over an arbitrary sequence defined
663 // by a beginning and an ending iterator, and wrap it.
664
665 template<typename input_iter, typename output_sink,
666 typename rewrap_helper_t>
unicodewordwrap(input_iter beg_iter,input_iter end_iter,const rewrap_helper_t & rewrap_helper,output_sink & out_iter,size_t towidth,bool delsp)667 void unicodewordwrap(input_iter beg_iter,
668 input_iter end_iter,
669 const rewrap_helper_t &rewrap_helper,
670 output_sink &out_iter,
671 size_t towidth,
672 bool delsp)
673 {
674 unicodewordwrapper<output_sink, rewrap_helper_t>
675 iter(rewrap_helper, out_iter, towidth, delsp);
676
677 while (beg_iter != end_iter)
678 {
679 iter= *beg_iter;
680 ++beg_iter;
681 }
682 iter.eof();
683 }
684
685 #endif
686