1 /*
2  * This is an implementation of wcwidth() and wcswidth() (defined in
 * IEEE Std 1003.1-2001) for Unicode.
4  *
5  * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
6  * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
7  *
8  * In fixed-width output devices, Latin characters all occupy a single
9  * "cell" position of equal width, whereas ideographic CJK characters
10  * occupy two such cells. Interoperability between terminal-line
11  * applications and (teletype-style) character terminals using the
12  * UTF-8 encoding requires agreement on which character should advance
13  * the cursor by how many cell positions. No established formal
14  * standards exist at present on which Unicode character shall occupy
15  * how many cell positions on character terminals. These routines are
16  * a first attempt of defining such behavior based on simple rules
17  * applied to data provided by the Unicode Consortium.
18  *
19  * For some graphical characters, the Unicode standard explicitly
20  * defines a character-cell width via the definition of the East Asian
21  * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
22  * In all these cases, there is no ambiguity about which width a
23  * terminal shall use. For characters in the East Asian Ambiguous (A)
24  * class, the width choice depends purely on a preference of backward
25  * compatibility with either historic CJK or Western practice.
26  * Choosing single-width for these characters is easy to justify as
27  * the appropriate long-term solution, as the CJK practice of
28  * displaying these characters as double-width comes from historic
29  * implementation simplicity (8-bit encoded characters were displayed
30  * single-width and 16-bit ones double-width, even for Greek,
31  * Cyrillic, etc.) and not any typographic considerations.
32  *
33  * Much less clear is the choice of width for the Not East Asian
34  * (Neutral) class. Existing practice does not dictate a width for any
35  * of these characters. It would nevertheless make sense
36  * typographically to allocate two character cells to characters such
37  * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
38  * represented adequately with a single-width glyph. The following
39  * routines at present merely assign a single-cell width to all
40  * neutral characters, in the interest of simplicity. This is not
41  * entirely satisfactory and should be reconsidered before
42  * establishing a formal standard in this area. At the moment, the
43  * decision which Not East Asian (Neutral) characters should be
44  * represented by double-width glyphs cannot yet be answered by
45  * applying a simple rule from the Unicode database content. Setting
46  * up a proper standard for the behavior of UTF-8 character terminals
47  * will require a careful analysis not only of each Unicode character,
48  * but also of each presentation form, something the author of these
49  * routines has avoided to do so far.
50  *
51  * http://www.unicode.org/unicode/reports/tr11/
52  *
53  * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
54  *
55  * Permission to use, copy, modify, and distribute this software
56  * for any purpose and without fee is hereby granted. The author
57  * disclaims all warranties with regard to this software.
58  *
59  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60  */
61 
62 #include <wchar.h>
63 #include <string>
64 #include <memory>
65 
66 namespace replxx {
67 
/* One closed range of code points, [first, last]; both ends are inclusive.
 * The lookup tables in this file are arrays of these records, sorted in
 * ascending order so that bisearch() can binary-search them. */
struct interval {
	char32_t first; /* lowest code point covered by the range */
	char32_t last;  /* highest code point covered by the range */
};
72 
73 /* auxiliary function for binary search in interval table */
bisearch(char32_t ucs,const struct interval * table,int max)74 static int bisearch(char32_t ucs, const struct interval *table, int max) {
75 	int min = 0;
76 	int mid;
77 
78 	if (ucs < table[0].first || ucs > table[max].last)
79 		return 0;
80 	while (max >= min) {
81 		mid = (min + max) / 2;
82 		if (ucs > table[mid].last)
83 			min = mid + 1;
84 		else if (ucs < table[mid].first)
85 			max = mid - 1;
86 		else
87 			return 1;
88 	}
89 
90 	return 0;
91 }
92 
93 
94 /* The following two functions define the column width of an ISO 10646
95  * character as follows:
96  *
97  *		- The null character (U+0000) has a column width of 0.
98  *
99  *		- Other C0/C1 control characters and DEL will lead to a return
100  *			value of -1.
101  *
102  *		- Non-spacing and enclosing combining characters (general
103  *			category code Mn or Me in the Unicode database) have a
104  *			column width of 0.
105  *
106  *		- SOFT HYPHEN (U+00AD) has a column width of 1.
107  *
108  *		- Other format characters (general category code Cf in the Unicode
109  *			database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
110  *
111  *		- Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
112  *			have a column width of 0.
113  *
114  *		- Spacing characters in the East Asian Wide (W) or East Asian
115  *			Full-width (F) category as defined in Unicode Technical
116  *			Report #11 have a column width of 2.
117  *
118  *		- All remaining characters (including all printable
119  *			ISO 8859-1 and WGL4 characters, Unicode control characters,
120  *			etc.) have a column width of 1.
121  *
 * This implementation assumes that the char32_t argument is a code
 * point encoded in ISO 10646 (Unicode).
124  */
125 
mk_is_wide_char(char32_t ucs)126 int mk_is_wide_char(char32_t ucs) {
127   static const struct interval wide[] = {
128     {0x1100, 0x115f}, {0x231a, 0x231b}, {0x2329, 0x232a},
129     {0x23e9, 0x23ec}, {0x23f0, 0x23f0}, {0x23f3, 0x23f3},
130     {0x25fd, 0x25fe}, {0x2614, 0x2615}, {0x2648, 0x2653},
131     {0x267f, 0x267f}, {0x2693, 0x2693}, {0x26a1, 0x26a1},
132     {0x26aa, 0x26ab}, {0x26bd, 0x26be}, {0x26c4, 0x26c5},
133     {0x26ce, 0x26ce}, {0x26d4, 0x26d4}, {0x26ea, 0x26ea},
134     {0x26f2, 0x26f3}, {0x26f5, 0x26f5}, {0x26fa, 0x26fa},
135     {0x26fd, 0x26fd}, {0x2705, 0x2705}, {0x270a, 0x270b},
136     {0x2728, 0x2728}, {0x274c, 0x274c}, {0x274e, 0x274e},
137     {0x2753, 0x2755}, {0x2757, 0x2757}, {0x2795, 0x2797},
138     {0x27b0, 0x27b0}, {0x27bf, 0x27bf}, {0x2b1b, 0x2b1c},
139     {0x2b50, 0x2b50}, {0x2b55, 0x2b55}, {0x2e80, 0x2fdf},
140     {0x2ff0, 0x303e}, {0x3040, 0x3247}, {0x3250, 0x4dbf},
141     {0x4e00, 0xa4cf}, {0xa960, 0xa97f}, {0xac00, 0xd7a3},
142     {0xf900, 0xfaff}, {0xfe10, 0xfe19}, {0xfe30, 0xfe6f},
143     {0xff01, 0xff60}, {0xffe0, 0xffe6}, {0x16fe0, 0x16fe1},
144     {0x17000, 0x18aff}, {0x1b000, 0x1b12f}, {0x1b170, 0x1b2ff},
145     {0x1f004, 0x1f004}, {0x1f0cf, 0x1f0cf}, {0x1f18e, 0x1f18e},
146     {0x1f191, 0x1f19a}, {0x1f200, 0x1f202}, {0x1f210, 0x1f23b},
147     {0x1f240, 0x1f248}, {0x1f250, 0x1f251}, {0x1f260, 0x1f265},
148     {0x1f300, 0x1f320}, {0x1f32d, 0x1f335}, {0x1f337, 0x1f37c},
149     {0x1f37e, 0x1f393}, {0x1f3a0, 0x1f3ca}, {0x1f3cf, 0x1f3d3},
150     {0x1f3e0, 0x1f3f0}, {0x1f3f4, 0x1f3f4}, {0x1f3f8, 0x1f43e},
151     {0x1f440, 0x1f440}, {0x1f442, 0x1f4fc}, {0x1f4ff, 0x1f53d},
152     {0x1f54b, 0x1f54e}, {0x1f550, 0x1f567}, {0x1f57a, 0x1f57a},
153     {0x1f595, 0x1f596}, {0x1f5a4, 0x1f5a4}, {0x1f5fb, 0x1f64f},
154     {0x1f680, 0x1f6c5}, {0x1f6cc, 0x1f6cc}, {0x1f6d0, 0x1f6d2},
155     {0x1f6eb, 0x1f6ec}, {0x1f6f4, 0x1f6f8}, {0x1f910, 0x1f93e},
156     {0x1f940, 0x1f94c}, {0x1f950, 0x1f96b}, {0x1f980, 0x1f997},
157     {0x1f9c0, 0x1f9c0}, {0x1f9d0, 0x1f9e6}, {0x20000, 0x2fffd},
158     {0x30000, 0x3fffd},
159   };
160 
161   if ( bisearch(ucs, wide, sizeof(wide) / sizeof(struct interval) - 1) ) {
162     return 1;
163 	}
164 
165   return 0;
166 }
167 
mk_wcwidth(char32_t ucs)168 int mk_wcwidth(char32_t ucs) {
169 	/* sorted list of non-overlapping intervals of non-spacing characters */
170 	/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
171 	static const struct interval combining[] = {
172     {0x00ad, 0x00ad}, {0x0300, 0x036f}, {0x0483, 0x0489},
173     {0x0591, 0x05bd}, {0x05bf, 0x05bf}, {0x05c1, 0x05c2},
174     {0x05c4, 0x05c5}, {0x05c7, 0x05c7}, {0x0610, 0x061a},
175     {0x061c, 0x061c}, {0x064b, 0x065f}, {0x0670, 0x0670},
176     {0x06d6, 0x06dc}, {0x06df, 0x06e4}, {0x06e7, 0x06e8},
177     {0x06ea, 0x06ed}, {0x0711, 0x0711}, {0x0730, 0x074a},
178     {0x07a6, 0x07b0}, {0x07eb, 0x07f3}, {0x0816, 0x0819},
179     {0x081b, 0x0823}, {0x0825, 0x0827}, {0x0829, 0x082d},
180     {0x0859, 0x085b}, {0x08d4, 0x08e1}, {0x08e3, 0x0902},
181     {0x093a, 0x093a}, {0x093c, 0x093c}, {0x0941, 0x0948},
182     {0x094d, 0x094d}, {0x0951, 0x0957}, {0x0962, 0x0963},
183     {0x0981, 0x0981}, {0x09bc, 0x09bc}, {0x09c1, 0x09c4},
184     {0x09cd, 0x09cd}, {0x09e2, 0x09e3}, {0x0a01, 0x0a02},
185     {0x0a3c, 0x0a3c}, {0x0a41, 0x0a42}, {0x0a47, 0x0a48},
186     {0x0a4b, 0x0a4d}, {0x0a51, 0x0a51}, {0x0a70, 0x0a71},
187     {0x0a75, 0x0a75}, {0x0a81, 0x0a82}, {0x0abc, 0x0abc},
188     {0x0ac1, 0x0ac5}, {0x0ac7, 0x0ac8}, {0x0acd, 0x0acd},
189     {0x0ae2, 0x0ae3}, {0x0afa, 0x0aff}, {0x0b01, 0x0b01},
190     {0x0b3c, 0x0b3c}, {0x0b3f, 0x0b3f}, {0x0b41, 0x0b44},
191     {0x0b4d, 0x0b4d}, {0x0b56, 0x0b56}, {0x0b62, 0x0b63},
192     {0x0b82, 0x0b82}, {0x0bc0, 0x0bc0}, {0x0bcd, 0x0bcd},
193     {0x0c00, 0x0c00}, {0x0c3e, 0x0c40}, {0x0c46, 0x0c48},
194     {0x0c4a, 0x0c4d}, {0x0c55, 0x0c56}, {0x0c62, 0x0c63},
195     {0x0c81, 0x0c81}, {0x0cbc, 0x0cbc}, {0x0cbf, 0x0cbf},
196     {0x0cc6, 0x0cc6}, {0x0ccc, 0x0ccd}, {0x0ce2, 0x0ce3},
197     {0x0d00, 0x0d01}, {0x0d3b, 0x0d3c}, {0x0d41, 0x0d44},
198     {0x0d4d, 0x0d4d}, {0x0d62, 0x0d63}, {0x0dca, 0x0dca},
199     {0x0dd2, 0x0dd4}, {0x0dd6, 0x0dd6}, {0x0e31, 0x0e31},
200     {0x0e34, 0x0e3a}, {0x0e47, 0x0e4e}, {0x0eb1, 0x0eb1},
201     {0x0eb4, 0x0eb9}, {0x0ebb, 0x0ebc}, {0x0ec8, 0x0ecd},
202     {0x0f18, 0x0f19}, {0x0f35, 0x0f35}, {0x0f37, 0x0f37},
203     {0x0f39, 0x0f39}, {0x0f71, 0x0f7e}, {0x0f80, 0x0f84},
204     {0x0f86, 0x0f87}, {0x0f8d, 0x0f97}, {0x0f99, 0x0fbc},
205     {0x0fc6, 0x0fc6}, {0x102d, 0x1030}, {0x1032, 0x1037},
206     {0x1039, 0x103a}, {0x103d, 0x103e}, {0x1058, 0x1059},
207     {0x105e, 0x1060}, {0x1071, 0x1074}, {0x1082, 0x1082},
208     {0x1085, 0x1086}, {0x108d, 0x108d}, {0x109d, 0x109d},
209     {0x1160, 0x11ff}, {0x135d, 0x135f}, {0x1712, 0x1714},
210     {0x1732, 0x1734}, {0x1752, 0x1753}, {0x1772, 0x1773},
211     {0x17b4, 0x17b5}, {0x17b7, 0x17bd}, {0x17c6, 0x17c6},
212     {0x17c9, 0x17d3}, {0x17dd, 0x17dd}, {0x180b, 0x180e},
213     {0x1885, 0x1886}, {0x18a9, 0x18a9}, {0x1920, 0x1922},
214     {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193b},
215     {0x1a17, 0x1a18}, {0x1a1b, 0x1a1b}, {0x1a56, 0x1a56},
216     {0x1a58, 0x1a5e}, {0x1a60, 0x1a60}, {0x1a62, 0x1a62},
217     {0x1a65, 0x1a6c}, {0x1a73, 0x1a7c}, {0x1a7f, 0x1a7f},
218     {0x1ab0, 0x1abe}, {0x1b00, 0x1b03}, {0x1b34, 0x1b34},
219     {0x1b36, 0x1b3a}, {0x1b3c, 0x1b3c}, {0x1b42, 0x1b42},
220     {0x1b6b, 0x1b73}, {0x1b80, 0x1b81}, {0x1ba2, 0x1ba5},
221     {0x1ba8, 0x1ba9}, {0x1bab, 0x1bad}, {0x1be6, 0x1be6},
222     {0x1be8, 0x1be9}, {0x1bed, 0x1bed}, {0x1bef, 0x1bf1},
223     {0x1c2c, 0x1c33}, {0x1c36, 0x1c37}, {0x1cd0, 0x1cd2},
224     {0x1cd4, 0x1ce0}, {0x1ce2, 0x1ce8}, {0x1ced, 0x1ced},
225     {0x1cf4, 0x1cf4}, {0x1cf8, 0x1cf9}, {0x1dc0, 0x1df9},
226     {0x1dfb, 0x1dff}, {0x200b, 0x200f}, {0x202a, 0x202e},
227     {0x2060, 0x2064}, {0x2066, 0x206f}, {0x20d0, 0x20f0},
228     {0x2cef, 0x2cf1}, {0x2d7f, 0x2d7f}, {0x2de0, 0x2dff},
229     {0x302a, 0x302d}, {0x3099, 0x309a}, {0xa66f, 0xa672},
230     {0xa674, 0xa67d}, {0xa69e, 0xa69f}, {0xa6f0, 0xa6f1},
231     {0xa802, 0xa802}, {0xa806, 0xa806}, {0xa80b, 0xa80b},
232     {0xa825, 0xa826}, {0xa8c4, 0xa8c5}, {0xa8e0, 0xa8f1},
233     {0xa926, 0xa92d}, {0xa947, 0xa951}, {0xa980, 0xa982},
234     {0xa9b3, 0xa9b3}, {0xa9b6, 0xa9b9}, {0xa9bc, 0xa9bc},
235     {0xa9e5, 0xa9e5}, {0xaa29, 0xaa2e}, {0xaa31, 0xaa32},
236     {0xaa35, 0xaa36}, {0xaa43, 0xaa43}, {0xaa4c, 0xaa4c},
237     {0xaa7c, 0xaa7c}, {0xaab0, 0xaab0}, {0xaab2, 0xaab4},
238     {0xaab7, 0xaab8}, {0xaabe, 0xaabf}, {0xaac1, 0xaac1},
239     {0xaaec, 0xaaed}, {0xaaf6, 0xaaf6}, {0xabe5, 0xabe5},
240     {0xabe8, 0xabe8}, {0xabed, 0xabed}, {0xfb1e, 0xfb1e},
241     {0xfe00, 0xfe0f}, {0xfe20, 0xfe2f}, {0xfeff, 0xfeff},
242     {0xfff9, 0xfffb}, {0x101fd, 0x101fd}, {0x102e0, 0x102e0},
243     {0x10376, 0x1037a}, {0x10a01, 0x10a03}, {0x10a05, 0x10a06},
244     {0x10a0c, 0x10a0f}, {0x10a38, 0x10a3a}, {0x10a3f, 0x10a3f},
245     {0x10ae5, 0x10ae6}, {0x11001, 0x11001}, {0x11038, 0x11046},
246     {0x1107f, 0x11081}, {0x110b3, 0x110b6}, {0x110b9, 0x110ba},
247     {0x11100, 0x11102}, {0x11127, 0x1112b}, {0x1112d, 0x11134},
248     {0x11173, 0x11173}, {0x11180, 0x11181}, {0x111b6, 0x111be},
249     {0x111ca, 0x111cc}, {0x1122f, 0x11231}, {0x11234, 0x11234},
250     {0x11236, 0x11237}, {0x1123e, 0x1123e}, {0x112df, 0x112df},
251     {0x112e3, 0x112ea}, {0x11300, 0x11301}, {0x1133c, 0x1133c},
252     {0x11340, 0x11340}, {0x11366, 0x1136c}, {0x11370, 0x11374},
253     {0x11438, 0x1143f}, {0x11442, 0x11444}, {0x11446, 0x11446},
254     {0x114b3, 0x114b8}, {0x114ba, 0x114ba}, {0x114bf, 0x114c0},
255     {0x114c2, 0x114c3}, {0x115b2, 0x115b5}, {0x115bc, 0x115bd},
256     {0x115bf, 0x115c0}, {0x115dc, 0x115dd}, {0x11633, 0x1163a},
257     {0x1163d, 0x1163d}, {0x1163f, 0x11640}, {0x116ab, 0x116ab},
258     {0x116ad, 0x116ad}, {0x116b0, 0x116b5}, {0x116b7, 0x116b7},
259     {0x1171d, 0x1171f}, {0x11722, 0x11725}, {0x11727, 0x1172b},
260     {0x11a01, 0x11a06}, {0x11a09, 0x11a0a}, {0x11a33, 0x11a38},
261     {0x11a3b, 0x11a3e}, {0x11a47, 0x11a47}, {0x11a51, 0x11a56},
262     {0x11a59, 0x11a5b}, {0x11a8a, 0x11a96}, {0x11a98, 0x11a99},
263     {0x11c30, 0x11c36}, {0x11c38, 0x11c3d}, {0x11c3f, 0x11c3f},
264     {0x11c92, 0x11ca7}, {0x11caa, 0x11cb0}, {0x11cb2, 0x11cb3},
265     {0x11cb5, 0x11cb6}, {0x11d31, 0x11d36}, {0x11d3a, 0x11d3a},
266     {0x11d3c, 0x11d3d}, {0x11d3f, 0x11d45}, {0x11d47, 0x11d47},
267     {0x16af0, 0x16af4}, {0x16b30, 0x16b36}, {0x16f8f, 0x16f92},
268     {0x1bc9d, 0x1bc9e}, {0x1bca0, 0x1bca3}, {0x1d167, 0x1d169},
269     {0x1d173, 0x1d182}, {0x1d185, 0x1d18b}, {0x1d1aa, 0x1d1ad},
270     {0x1d242, 0x1d244}, {0x1da00, 0x1da36}, {0x1da3b, 0x1da6c},
271     {0x1da75, 0x1da75}, {0x1da84, 0x1da84}, {0x1da9b, 0x1da9f},
272     {0x1daa1, 0x1daaf}, {0x1e000, 0x1e006}, {0x1e008, 0x1e018},
273     {0x1e01b, 0x1e021}, {0x1e023, 0x1e024}, {0x1e026, 0x1e02a},
274     {0x1e8d0, 0x1e8d6}, {0x1e944, 0x1e94a}, {0xe0001, 0xe0001},
275     {0xe0020, 0xe007f}, {0xe0100, 0xe01ef},
276 	};
277 
278 	/* test for 8-bit control characters */
279 	if ( ucs == 0 ) {
280 		return 0;
281 	}
282 	if ( ( ucs < 32 ) || ( ( ucs >= 0x7f ) && ( ucs < 0xa0 ) ) ) {
283 		return -1;
284 	}
285 
286 	/* binary search in table of non-spacing characters */
287 	if ( bisearch( ucs, combining, sizeof( combining ) / sizeof( struct interval ) - 1 ) ) {
288 		return 0;
289 	}
290 
291 	/* if we arrive here, ucs is not a combining or C0/C1 control character */
292   return ( mk_is_wide_char( ucs ) ? 2 : 1 );
293 }
294 
295 }
296 
297