1 /*
2 ** Copyright 2011, Double Precision Inc.
3 **
4 ** See COPYING for distribution information.
5 */
6 
7 #ifndef widechar_H
8 #define widechar_H
9 
10 #include "../curses/curses_config.h"
11 #include <courier-unicode.h>
12 #include <string.h>
13 
14 #include <wchar.h>
15 #include <vector>
16 #include <iterator>
17 
/*
** Convert a char sequence defined by beg_iter and end_iter iterators into
** a wchar_t sequence, written to the out_iter output iterator.
**
** The bytes are decoded with mbrtowc(), starting from a zeroed conversion
** state. Input is gathered into a small buffer (MB_CUR_MAX*2 bytes for the
** first chunk, MB_CUR_MAX bytes for each subsequent chunk), and each chunk
** is then decoded:
**
** - An invalid sequence produces a '?', resets the conversion state, and
**   skips one byte.
** - A decoded null character produces a 0 wchar_t and consumes one byte.
** - An incomplete sequence at the end of a chunk ends that chunk. mbrtowc()
**   has already absorbed those bytes into the conversion state, so the
**   next chunk continues the same character. An incomplete sequence at the
**   very end of the input produces no output.
**
** Returns the output iterator, advanced past the last written wchar_t.
*/

template<typename input_iter, typename output_iter>
output_iter towidechar(input_iter beg_iter,
		       input_iter end_iter,
		       output_iter out_iter)
{
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));

	std::vector<char> cbuf;
	cbuf.reserve(MB_CUR_MAX*2);

	// How many more bytes to collect before decoding what's in cbuf.
	int cnt=MB_CUR_MAX*2;

	while (1)
	{
		if (cnt == 0 || beg_iter == end_iter)
		{
			// The chunk is full, or the input is exhausted:
			// decode whatever has been collected.

			if (!cbuf.empty())
			{
				const char *b=&cbuf[0];
				const char *e=b+cbuf.size();

				while (b != e)
				{
					wchar_t wc;
					size_t r=mbrtowc(&wc, b, e-b, &ps);

					if (r == (size_t)-1)
					{
						// Invalid sequence: substitute
						// a '?', start over with a
						// clean conversion state, and
						// skip the offending byte.
						wc='?';
						memset(&ps, 0, sizeof(ps));
						r=1;
					}

					if (r > (size_t)(e-b))
						// must be (size_t)-2 -
						// Incomplete multibyte sequence.
						// Its bytes are already held
						// in ps.
						break;

					if (r == 0)
					{
						// Decoded a null character,
						// which occupies one byte.
						wc=0;
						r=1;
					}

					*out_iter++=wc;

					b += r;
				}
			}
			cbuf.clear();
			cnt=MB_CUR_MAX;

			if (beg_iter == end_iter)
				break;
		}

		cbuf.push_back(*beg_iter++);
		--cnt;
	}

	return out_iter;
}
88 
/*
** Convenience overload: replaces the contents of uc with the wide
** characters decoded from the char sequence defined by beg_iter and
** end_iter.
*/

template<typename input_iter>
void towidechar(input_iter beg_iter,
		input_iter end_iter,
		std::vector<wchar_t> &uc)
{
	// Whatever uc held before is discarded; the decoded characters
	// get appended to the now-empty vector.
	uc.clear();

	towidechar(beg_iter, end_iter, std::back_inserter(uc));
}
103 
/*
** Convert a wchar_t sequence defined by beg_iter and end_iter iterators into
** a char sequence, written to the out_iter output iterator.
**
** Each wide character is encoded with wcrtomb(), starting from a zeroed
** conversion state. A wide character that cannot be encoded is skipped,
** without producing any output.
**
** After the last wide character, a null wide character is encoded in order
** to flush any pending shift sequence; the terminating null byte itself is
** not written to the output.
**
** Returns the output iterator, advanced past the last written char.
*/

template<typename input_iter,
	 typename output_iter>
output_iter fromwidechar(input_iter b, input_iter e, output_iter out_iter)
{
	mbstate_t ps;

	memset(&ps, 0, sizeof(ps));

	std::vector<char> buf(MB_CUR_MAX*2);

	for (; b != e; ++b)
	{
		// The conversion state is unspecified after wcrtomb() fails.
		// Keep a snapshot, so that a failed conversion does not
		// affect how the following characters get encoded.
		const mbstate_t saved_ps=ps;

		size_t n=wcrtomb(&buf[0], *b, &ps);

		if (n == (size_t)-1)
		{
			// Cannot be encoded: skip this character.
			ps=saved_ps;
			continue;
		}

		out_iter=std::copy(buf.begin(), buf.begin()+n, out_iter);
	}

	// Encoding a null wide character emits whatever is needed to return
	// to the initial shift state, followed by a null byte.

	size_t n=wcrtomb(&buf[0], 0, &ps);

	if (n != (size_t)-1)
	{
		// Drop the trailing null byte, keep the shift sequence.
		if (n > 0 && buf[n-1] == 0)
			--n;
		out_iter=std::copy(buf.begin(), buf.begin()+n, out_iter);
	}
	return out_iter;
}
140 
/*
** Convenience overload: returns the char sequence converted from the
** wchar_t sequence defined by beg_iter and end_iter, as a std::string.
*/

template<typename input_iter>
std::string fromwidechar(input_iter beg_iter,
			 input_iter end_iter)
{
	std::string converted;

	fromwidechar(beg_iter, end_iter, std::back_inserter(converted));

	return converted;
}
156 
157 // A unicode string demarcated at grapheme boundaries
158 
159 class widecharbuf {
160 
161 public:
widecharbuf()162 	widecharbuf() {}
163 
164 	// Initialize from a string
165 
166 	void init_string(const std::string &str);
167 
168 	// Given the maximum desired column width of the string,
169 	// return a pair of string, and the width of the returned
170 	// string.
171 	std::pair<std::string, size_t>
172 	get_string_truncated(size_t maxwidth, ssize_t atcol) const;
173 
174 	// Return as a unicode string
175 	std::pair<std::u32string, size_t>
176 	get_unicode_truncated(size_t maxwidth, ssize_t atcol) const;
177 
178 	// Return a substring
179 	std::string get_substring(size_t first_grapheme,
180 				  size_t grapheme_cnt) const;
181 
182 	// Return a unicode substring
183 	std::u32string get_unicode_substring(size_t first_grapheme,
184 							size_t grapheme_cnt)
185 		const;
186 
187 	// Return as a unicode string, truncated or padded to the given width
188 
189 	std::u32string get_unicode_fixedwidth(size_t width,
190 							 ssize_t atcol) const;
191 
192 	// Initialize from a beginning and ending iterator, iterating
193 	// over char32_ts.
194 
195 	template<typename iter_type>
init_unicode(iter_type b,iter_type e)196 	void init_unicode(iter_type b, iter_type e)
197 	{
198 		std::string s;
199 		bool ignore;
200 
201 		unicode::iconvert::fromu::convert(b, e,
202 						  unicode_default_chset(), s,
203 						  ignore);
204 
205 		std::vector<wchar_t> wc;
206 
207 		towidechar(s.begin(), s.end(), wc);
208 
209 		s=fromwidechar(wc.begin(), wc.end());
210 
211 		unicode::iconvert::tou::convert(s.begin(), s.end(),
212 					     unicode_default_chset(),
213 					     ustring);
214 		resetgraphemes();
215 	}
216 
217 	// The unicode string
218 
219 	std::u32string ustring;
220 
221 	// A grapheme: a pointer somewhere in ustring, plus wchar count
222 
223 	class grapheme_t {
224 
225 	public:
226 		const char32_t *uptr; // Offset into ustring
227 		size_t cnt; // How many unicode chars in the grapheme
228 
grapheme_t(const char32_t * uptrArg,size_t cntArg)229 		grapheme_t(const char32_t *uptrArg, size_t cntArg)
230 			: uptr(uptrArg), cnt(cntArg) {}
231 
232 		size_t wcwidth(ssize_t start_col) const;
233 	};
234 
235 	std::vector<grapheme_t> graphemes;
236 
237 	// Reset wchar_t in each grapheme
238 	void resetgraphemes();
239 
clear()240 	void clear()
241 	{
242 		ustring.clear();
243 		graphemes.clear();
244 	}
245 
246 	ssize_t expandtabs(ssize_t col);
247 
248 	// Append
249 	widecharbuf &operator+=(const widecharbuf &);
250 
251 	// Replace
252 
widecharbuf(const widecharbuf & o)253 	widecharbuf(const widecharbuf &o)
254 	{
255 		operator=(o);
256 	}
257 
258 	widecharbuf &operator=(const widecharbuf &o);
259 
260 	size_t wcwidth(ssize_t start_col) const;
261 
262 	void tounicode(std::u32string &text) const;
263 
264 	static size_t charwidth(wchar_t ch, ssize_t atcol);
265 };
266 
267 //
268 // Output iterator for towidechar() that throws away the wide character,
269 // but keeps track of its width.
270 //
271 // widecharbuf::grapheme_t::wcwidth() is hot. Saving the output of
272 // towidechar() into a vector is very expensive.
273 
274 class towidechar_wcwidth_iter
275 	: public std::iterator<std::output_iterator_tag, void, void, void, void>
276 {
277 
278 	size_t col;
279 	size_t w;
280 
281 public:
282 
towidechar_wcwidth_iter(size_t colArg)283 	towidechar_wcwidth_iter(size_t colArg) : col(colArg), w(0) {}
284 
285 	towidechar_wcwidth_iter &operator++() { return *this; }
286 	towidechar_wcwidth_iter &operator++(int) { return *this; }
287 	towidechar_wcwidth_iter &operator*() { return *this; }
288 
289 	void operator=(wchar_t wc)
290 	{
291 		size_t ww=widecharbuf::charwidth(wc, col);
292 
293 		col += ww;
294 		w += ww;
295 	}
296 
size_t()297 	operator size_t() const { return w; }
298 };
299 
300 // Editable wchar_ts, together with an insertion point.
301 //
302 // The wchar_ts get internally divided into three parts: before the
303 // insertion point, at the insertion point, and past the insertion
304 // point. Before and after contain valid graphemes. The current
305 // insertion point may or may not contain valid graphemes.
306 //
307 // set_contents() initializes wchar_ts before and after the
308 // insertion point, setting at the insertion point to an empty list.
309 //
// insert_char() adds a char32_t at the insertion point.
311 //
312 // get_contents() retrieves the contents as unicode characters,
313 // combining the insertion point wchar_ts with wchar_ts before the
314 // insertion point.
315 //
316 // contents_cut() removes graphemes between the cut_pos and the
317 // insertion point. contents_cut() may only be invoked when the
318 // insertion point is empty. The removed graphemes get placed into
319 // cut_text.
320 //
321 // insert_to_before() calls get_contents() then set_contents(),
322 // essentially converting wchar_ts at the insertion point to
323 // valid graphemes, and moving them to before_insert.
324 //
// to_before() calls insert_to_before(), then moves after_insert
// to before_insert.
327 //
// to_after() calls insert_to_before(), then moves before_insert
// to after_insert.
330 //
331 // to_position() calls insert_to_before() then moves the insertion
332 // point to the given position.
333 //
// adjust_shift_pos() calculates horizontal scrolling. It takes
335 // the editable field's width, and a reference to the current
336 // horizontal shift position, which gets adjusted, as necessary, to
337 // keep the cursor position visible, and returns the cursor position.
338 
339 class editablewidechar {
340 
341 public:
342 	widecharbuf before_insert;
343 
344 	std::u32string inserted;
345 
346 	widecharbuf after_insert;
347 
348 	void set_contents(const std::u32string &before,
349 			  const std::u32string &after);
350 
351 	void get_contents(std::u32string &before,
352 			  std::u32string &after) const;
353 
354 	void contents_cut(size_t cut_pos,
355 			  std::u32string &cut_text);
356 
insert_char(char32_t wc)357 	void insert_char(char32_t wc) { inserted.push_back(wc); }
358 
359 	void insert_to_before();
360 
361 	void to_before();
362 
363 	void to_after();
364 
clear()365 	void clear()
366 	{
367 		before_insert.clear();
368 		after_insert.clear();
369 		inserted.clear();
370 	}
371 
372 	void to_position(size_t pos);
373 
374 	size_t adjust_shift_pos(size_t &shiftoffset, size_t width,
375 
376 				// wbefore and wafter: wide characters
377 				// before and after cursor position,
378 				// as computed by adjust_shift_pos().
379 				// Useful to the caller
380 				widecharbuf &wbefore,
381 				widecharbuf &wafter);
382 };
383 
384 /*
385 ** Helper class for the word-wrapping logic.
386 **
387 ** This class receives char32_t pieces, that can be broken. The collector
388 ** converts them to system wchars, measures each piece, and collects them
** until the next piece can't fit within the allotted width.
390 */
391 
392 template<typename output_sink>
393 class wordwrap_collector {
394 
395 public:
396 	/* Collected line */
397 	std::u32string linebuf;
398 
399 	/* Width of the collected line */
400 	size_t col;
401 
402 	/*
403 	** iter(line) gets invoked for each assembled line, where
404 	** line is a std::vector<wchar_t>.
405 	*/
406 
407 	output_sink &iter;
408 
409 	/*
410 	** Desired line width.
411 	*/
412 
413 	size_t towidth;
414 
415 	/*
416 	** If true, the trailing space on each line gets removed.
417 	** In all cases, each line is wrapped at towidth() characters,
418 	** where possible.
419 	*/
420 
421 	bool delsp;
422 
wordwrap_collector(output_sink & iterArg,size_t towidthArg,bool delspArg)423 	wordwrap_collector(output_sink &iterArg,
424 			   size_t towidthArg,
425 			   bool delspArg) : col(0), iter(iterArg),
426 					    towidth(towidthArg),
427 					    delsp(delspArg)
428 	{
429 	}
430 
addsegment(const std::u32string & segment)431 	void addsegment(const std::u32string &segment)
432 	{
433 		std::vector<wchar_t> wsegment;
434 
435 		{
436 			std::string s=unicode::iconvert
437 				::convert(segment, unicode_default_chset());
438 
439 			towidechar(s.begin(), s.end(), wsegment);
440 
441 			if (wsegment.empty())
442 				return;
443 		}
444 
445 		size_t width=0;
446 
447 		for (std::vector<wchar_t>::const_iterator
448 			     b(wsegment.begin()),
449 			     e(wsegment.end()); b != e; ++b)
450 			width += widecharbuf::charwidth(*b, col+width);
451 
452 		if (!(delsp && width + col == towidth+1 &&
453 		      *--wsegment.end() == ' '))
454 		{
455 			if (width + col > towidth && !linebuf.empty())
456 				breakline();
457 		}
458 
459 		linebuf.insert(linebuf.end(), segment.begin(), segment.end());
460 		col += width;
461 	}
462 
breakline()463 	void breakline()
464 	{
465 		if (delsp && !linebuf.empty() &&
466 		    linebuf[linebuf.size()-1] == ' ')
467 			linebuf.pop_back();
468 
469 		*iter++=linebuf;
470 		col=0;
471 		linebuf.clear();
472 	}
473 };
474 
475 
/*
** A default rewrap helper object that does not rewrap anything.
**
** This is the default rewrap_helper_t parameter of unicodewordwrapper,
** which calls operator() with a character offset. The operator is only
** declared here.
*/

class unicoderewrapnone {

public:
	bool operator()(size_t n) const;
};
485 
486 /*
487 ** Unicode-based linewrapping logic.
488 **
489 ** This template defines an output iterator that takes in char32_ts.
490 ** The constructor receives a rewrap helper object reference, an output
491 ** iterator, the requested width, and whether to trim the trailing
492 ** space from each wrapped line.
493 **
494 ** As this iterator is iterated over char32_ts, it will iterate the
495 ** received output iterator over std::u32strings, representing
496 ** each line wrapped to the requested width.
497 **
498 ** The trim flag should normally be false. This properly preserves all
499 ** whitespace in the unicode character sequence. The trim flag may be true
500 ** only in contexts where the wrapped text will never be rewrapped again.
501 ** Removal of a trailing space on each line allows an extra character to be
502 ** present instead of the trailing space.
503 **
504 ** The rewrap helper object instance must define a bool operator()(size_t n)
505 ** const. size_t receives a character offset, and should return true if
506 ** character #n is the first character in an original line of text. If so,
507 ** and the unicode word wrap algorithm does not indicate that there's a
508 ** potential linebreak here, a space gets appended at this point. This is
509 ** used to rewrap existing lines of text which may not end with a space
510 ** character.
511 **
512 ** After the original unicode chars are iterated over, eof() must be
513 ** invoked in order to output any partially-wrapped content that's still
514 ** held internally in this iterator.
515 */
516 
517 template<typename output_sink_t,
518 	 typename rewrap_helper_t=unicoderewrapnone> class unicodewordwrapper :
519 	public std::iterator<std::output_iterator_tag, void, void, void, void> {
520 
521 	// State maintained by the iterator. This iterator is copyable.
522 	// The state is associated with only one iterator instance. Copying
523 	// an iterator copies the state from the original iterator into the
524 	// new one.
525 
526 	class buffer : public unicode::linebreakc_callback_save_buf {
527 
528 	public:
529 		std::u32string segment; // Current word
530 		size_t cnt; // Counts characters that are being wrapped.
531 
532 		wordwrap_collector<output_sink_t> collector;
533 		// The collector object.
534 
535 		const rewrap_helper_t &rewrapper;
536 		// The rewrap helper object.
537 
buffer(const rewrap_helper_t & rewrapperArg,output_sink_t & out_iter,size_t towidth,bool delsp)538 		buffer(const rewrap_helper_t &rewrapperArg,
539 		       output_sink_t &out_iter,
540 		       size_t towidth,
541 		       bool delsp)
542 			: cnt(0),
543 			  collector(out_iter, towidth, delsp),
544 			  rewrapper(rewrapperArg) {}
545 
~buffer()546 		virtual ~buffer() {}
547 	};
548 
549 	mutable buffer *buf;
550 
551 	typedef unicodewordwrapper<output_sink_t, rewrap_helper_t> iter_t;
552 
553 public:
554 	// Iterator constructor
unicodewordwrapper(const rewrap_helper_t & rewrap_helper,output_sink_t & out_iter,size_t towidth,bool delsp)555 	unicodewordwrapper(const rewrap_helper_t &rewrap_helper,
556 			   output_sink_t &out_iter,
557 			   size_t towidth,
558 			   bool delsp)
559 		: buf(new buffer(rewrap_helper, out_iter, towidth, delsp))
560 	{
561 		buf->set_opts(UNICODE_LB_OPT_PRBREAK|
562 			      UNICODE_LB_OPT_SYBREAK|
563 			      UNICODE_LB_OPT_DASHWJ);
564 	}
565 
566 	// End iterator constructor
unicodewordwrapper()567 	unicodewordwrapper() : buf(NULL)
568 	{
569 	}
570 
~unicodewordwrapper()571 	~unicodewordwrapper()
572 	{
573 		eof();
574 	}
575 
576 	// Assignment operator moves the state
577 
578 	iter_t &operator=(const iter_t &o)
579 	{
580 		if (buf)
581 			delete buf;
582 
583 		buf=o.buf;
584 		o.buf=NULL;
585 		return *this;
586 	}
587 
588 	// Copy constructor moves the state
unicodewordwrapper(const iter_t & o)589 	unicodewordwrapper(const iter_t &o) : buf(o.buf)
590 	{
591 		o.buf=NULL;
592 	}
593 
594 	// Operator implementation
595 
596 	iter_t &operator++() { return *this; }
597 	iter_t &operator++(int) { return *this; }
598 	iter_t &operator*() { return *this; }
599 
600 	void operator=(char32_t ch)
601 	{
602 		if (!buf)
603 			return;
604 
605 		// Feed into the linebreaking algorithm.
606 
607 		buf->operator<<(ch);
608 
609 		// Process linebreaking algorithm output.
610 
611 		while (!buf->lb_buf.empty())
612 		{
613 			std::pair<int, char32_t> ch(buf->lb_buf.front());
614 			buf->lb_buf.pop_front();
615 
616 			// If text is being rewrapped, and the linebreaking
617 			// algorithm prohibits a linebreak here, but this
618 			// was the first character of the pre-wrapped line,
619 			// then there must've been a space character here,
620 			// which allows a break.
621 
622 			if (ch.first == UNICODE_LB_NONE && buf->cnt > 0 &&
623 			    (buf->rewrapper)(buf->cnt))
624 			{
625 				buf->segment.push_back(' ');
626 				ch.first=UNICODE_LB_ALLOWED;
627 			}
628 			++buf->cnt;
629 
630 			// Process a potential linebreak.
631 
632 			if (ch.first != UNICODE_LB_NONE)
633 			{
634 				buf->collector.addsegment(buf->segment);
635 
636 				if (ch.first == UNICODE_LB_MANDATORY)
637 					buf->collector.breakline();
638 				buf->segment.clear();
639 			}
640 
641 			if (ch.second != '\r' && ch.second != '\n')
642 				buf->segment.push_back(ch.second);
643 		}
644 	}
645 
646 	// Finish remaining content, and clean up.
647 
eof()648 	void eof()
649 	{
650 		if (buf)
651 		{
652 			buf->collector.addsegment(buf->segment);
653 			if (!buf->collector.linebuf.empty())
654 				buf->collector.breakline();
655 
656 			delete buf;
657 			buf=NULL;
658 		}
659 	}
660 };
661 
662 // A convenience function to iterate over an arbitrary sequence defined
663 // by a beginning and an ending iterator, and wrap it.
664 
665 template<typename input_iter, typename output_sink,
666 	 typename rewrap_helper_t>
unicodewordwrap(input_iter beg_iter,input_iter end_iter,const rewrap_helper_t & rewrap_helper,output_sink & out_iter,size_t towidth,bool delsp)667 void unicodewordwrap(input_iter beg_iter,
668 		     input_iter end_iter,
669 		     const rewrap_helper_t &rewrap_helper,
670 		     output_sink &out_iter,
671 		     size_t towidth,
672 		     bool delsp)
673 {
674 	unicodewordwrapper<output_sink, rewrap_helper_t>
675 		iter(rewrap_helper, out_iter, towidth, delsp);
676 
677 	while (beg_iter != end_iter)
678 	{
679 		iter= *beg_iter;
680 		++beg_iter;
681 	}
682 	iter.eof();
683 }
684 
685 #endif
686