1 /* $XFree86: xc/programs/xterm/wcwidth.characters,v 1.9 2006/06/19 00:36:52 dickey Exp $ */
2
3 /*
 * This is an implementation of wcwidth() and wcswidth() (defined in
 * IEEE Std 1003.1-2001) for Unicode.
6 *
7 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
8 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
9 *
10 * In fixed-width output devices, Latin characters all occupy a single
11 * "cell" position of equal width, whereas ideographic CJK characters
12 * occupy two such cells. Interoperability between terminal-line
13 * applications and (teletype-style) character terminals using the
14 * UTF-8 encoding requires agreement on which character should advance
15 * the cursor by how many cell positions. No established formal
16 * standards exist at present on which Unicode character shall occupy
17 * how many cell positions on character terminals. These routines are
18 * a first attempt of defining such behavior based on simple rules
19 * applied to data provided by the Unicode Consortium.
20 *
21 * For some graphical characters, the Unicode standard explicitly
22 * defines a character-cell width via the definition of the East Asian
23 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
24 * In all these cases, there is no ambiguity about which width a
25 * terminal shall use. For characters in the East Asian Ambiguous (A)
26 * class, the width choice depends purely on a preference of backward
27 * compatibility with either historic CJK or Western practice.
28 * Choosing single-width for these characters is easy to justify as
29 * the appropriate long-term solution, as the CJK practice of
30 * displaying these characters as double-width comes from historic
31 * implementation simplicity (8-bit encoded characters were displayed
32 * single-width and 16-bit ones double-width, even for Greek,
33 * Cyrillic, etc.) and not any typographic considerations.
34 *
35 * Much less clear is the choice of width for the Not East Asian
36 * (Neutral) class. Existing practice does not dictate a width for any
37 * of these characters. It would nevertheless make sense
38 * typographically to allocate two character cells to characters such
39 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
40 * represented adequately with a single-width glyph. The following
41 * routines at present merely assign a single-cell width to all
42 * neutral characters, in the interest of simplicity. This is not
43 * entirely satisfactory and should be reconsidered before
44 * establishing a formal standard in this area. At the moment, the
45 * decision which Not East Asian (Neutral) characters should be
46 * represented by double-width glyphs cannot yet be answered by
47 * applying a simple rule from the Unicode database content. Setting
48 * up a proper standard for the behavior of UTF-8 character terminals
49 * will require a careful analysis not only of each Unicode character,
 * but also of each presentation form, something the author of these
 * routines has so far avoided doing.
52 *
53 * http://www.unicode.org/unicode/reports/tr11/
54 *
55 * Markus Kuhn -- 2007-05-25 (Unicode 5.0)
56 *
57 * Permission to use, copy, modify, and distribute this software
58 * for any purpose and without fee is hereby granted. The author
59 * disclaims all warranties with regard to this software.
60 *
61 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
62 */
63 /*
64 * Adaptions for KDE by Waldo Bastian <bastian@kde.org> and
65 * Francesco Cecconi <francesco.cecconi@gmail.com>
66 * See COPYING.Unicode for the license for the original wcwidth.c
67 */
68
69 // Own
70 #include "konsole_wcwidth.h"
71
/* One row of a lookup table: the inclusive code-point range [first, last]. */
struct interval {
    unsigned long first; /* lowest code point belonging to the range */
    unsigned long last;  /* highest code point belonging to the range */
};
76
77 /* auxiliary function for binary search in interval table */
bisearch(unsigned long ucs,const struct interval * table,int max)78 static int bisearch(unsigned long ucs, const struct interval* table, int max)
79 {
80 int min = 0;
81 int mid;
82
83 if (ucs < table[0].first || ucs > table[max].last)
84 return 0;
85 while (max >= min) {
86 mid = (min + max) / 2;
87 if (ucs > table[mid].last)
88 min = mid + 1;
89 else if (ucs < table[mid].first)
90 max = mid - 1;
91 else
92 return 1;
93 }
94
95 return 0;
96 }
97
98 /* The following functions define the column width of an ISO 10646
99 * character as follows:
100 *
101 * - The null character (U+0000) has a column width of 0.
102 *
103 * - Other C0/C1 control characters and DEL will lead to a return
104 * value of -1.
105 *
106 * - Non-spacing and enclosing combining characters (general
107 * category code Mn or Me in the Unicode database) have a
108 * column width of 0.
109 *
110 * - Other format characters (general category code Cf in the Unicode
111 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
112 *
113 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
114 * have a column width of 0.
115 *
116 * - Spacing characters in the East Asian Wide (W) or East Asian
117 * FullWidth (F) category as defined in Unicode Technical
118 * Report #11 have a column width of 2.
119 *
120 * - All remaining characters (including all printable
121 * ISO 8859-1 and WGL4 characters, Unicode control characters,
122 * etc.) have a column width of 1.
123 *
124 * This implementation assumes that quint16 characters are encoded
125 * in ISO 10646.
126 */
127
konsole_wcwidth(uint ucs)128 int konsole_wcwidth(uint ucs)
129 {
130 /* sorted list of non-overlapping intervals of non-spacing characters */
131 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
132 static const struct interval combining[] = {
133 { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
134 { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
135 { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 },
136 { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 },
137 { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
138 { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
139 { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 },
140 { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
141 { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
142 { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
143 { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C },
144 { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
145 { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
146 { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
147 { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
148 { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
149 { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
150 { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
151 { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC },
152 { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
153 { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
154 { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
155 { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
156 { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
157 { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
158 { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
159 { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
160 { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
161 { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
162 { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F },
163 { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
164 { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
165 { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
166 { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
167 { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
168 { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
169 { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
170 { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF },
171 { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 },
172 { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, { 0x302A, 0x302F },
173 { 0x3099, 0x309A }, { 0xA806, 0xA806 }, { 0xA80B, 0xA80B },
174 { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F },
175 { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB },
176 { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
177 { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 },
178 { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
179 { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F },
180 { 0xE0100, 0xE01EF }
181 };
182 static const struct interval emoji_width[] = {
183 {0x1f1e6, 0x1f1ff},
184 {0x1f321, 0x1f321},
185 {0x1f324, 0x1f32c},
186 {0x1f336, 0x1f336},
187 {0x1f37d, 0x1f37d},
188 {0x1f396, 0x1f397},
189 {0x1f399, 0x1f39b},
190 {0x1f39e, 0x1f39f},
191 {0x1f3cb, 0x1f3ce},
192 {0x1f3d4, 0x1f3df},
193 {0x1f3f3, 0x1f3f5},
194 {0x1f3f7, 0x1f3f7},
195 {0x1f43f, 0x1f43f},
196 {0x1f441, 0x1f441},
197 {0x1f4fd, 0x1f4fd},
198 {0x1f549, 0x1f54a},
199 {0x1f56f, 0x1f570},
200 {0x1f573, 0x1f579},
201 {0x1f587, 0x1f587},
202 {0x1f58a, 0x1f58d},
203 {0x1f590, 0x1f590},
204 {0x1f5a5, 0x1f5a5},
205 {0x1f5a8, 0x1f5a8},
206 {0x1f5b1, 0x1f5b2},
207 {0x1f5bc, 0x1f5bc},
208 {0x1f5c2, 0x1f5c4},
209 {0x1f5d1, 0x1f5d3},
210 {0x1f5dc, 0x1f5de},
211 {0x1f5e1, 0x1f5e1},
212 {0x1f5e3, 0x1f5e3},
213 {0x1f5e8, 0x1f5e8},
214 {0x1f5ef, 0x1f5ef},
215 {0x1f5f3, 0x1f5f3},
216 {0x1f5fa, 0x1f5fa},
217 {0x1f6cb, 0x1f6cf},
218 {0x1f6e0, 0x1f6e5},
219 {0x1f6e9, 0x1f6e9},
220 {0x1f6f0, 0x1f6f0},
221 {0x1f6f3, 0x1f6f3},
222 };
223 static const struct interval doublewidth[] = {
224 {0x1100, 0x115f},
225 {0x231a, 0x231b},
226 {0x2329, 0x232a},
227 {0x23e9, 0x23ec},
228 {0x23f0, 0x23f0},
229 {0x23f3, 0x23f3},
230 {0x25fd, 0x25fe},
231 {0x2614, 0x2615},
232 {0x2648, 0x2653},
233 {0x267f, 0x267f},
234 {0x2693, 0x2693},
235 {0x26a1, 0x26a1},
236 {0x26aa, 0x26ab},
237 {0x26bd, 0x26be},
238 {0x26c4, 0x26c5},
239 {0x26ce, 0x26ce},
240 {0x26d4, 0x26d4},
241 {0x26ea, 0x26ea},
242 {0x26f2, 0x26f3},
243 {0x26f5, 0x26f5},
244 {0x26fa, 0x26fa},
245 {0x26fd, 0x26fd},
246 {0x2705, 0x2705},
247 {0x270a, 0x270b},
248 {0x2728, 0x2728},
249 {0x274c, 0x274c},
250 {0x274e, 0x274e},
251 {0x2753, 0x2755},
252 {0x2757, 0x2757},
253 {0x2795, 0x2797},
254 {0x27b0, 0x27b0},
255 {0x27bf, 0x27bf},
256 {0x2b1b, 0x2b1c},
257 {0x2b50, 0x2b50},
258 {0x2b55, 0x2b55},
259 {0x2e80, 0x2e99},
260 {0x2e9b, 0x2ef3},
261 {0x2f00, 0x2fd5},
262 {0x2ff0, 0x2ffb},
263 {0x3000, 0x303e},
264 {0x3041, 0x3096},
265 {0x3099, 0x30ff},
266 {0x3105, 0x312f},
267 {0x3131, 0x318e},
268 {0x3190, 0x31ba},
269 {0x31c0, 0x31e3},
270 {0x31f0, 0x321e},
271 {0x3220, 0x3247},
272 {0x3250, 0x4dbf},
273 {0x4e00, 0xa48c},
274 {0xa490, 0xa4c6},
275 {0xa960, 0xa97c},
276 {0xac00, 0xd7a3},
277 {0xf900, 0xfaff},
278 {0xfe10, 0xfe19},
279 {0xfe30, 0xfe52},
280 {0xfe54, 0xfe66},
281 {0xfe68, 0xfe6b},
282 {0xff01, 0xff60},
283 {0xffe0, 0xffe6},
284 {0x16fe0, 0x16fe3},
285 {0x17000, 0x187f7},
286 {0x18800, 0x18af2},
287 {0x1b000, 0x1b11e},
288 {0x1b150, 0x1b152},
289 {0x1b164, 0x1b167},
290 {0x1b170, 0x1b2fb},
291 {0x1f004, 0x1f004},
292 {0x1f0cf, 0x1f0cf},
293 {0x1f18e, 0x1f18e},
294 {0x1f191, 0x1f19a},
295 {0x1f200, 0x1f202},
296 {0x1f210, 0x1f23b},
297 {0x1f240, 0x1f248},
298 {0x1f250, 0x1f251},
299 {0x1f260, 0x1f265},
300 {0x1f300, 0x1f320},
301 {0x1f32d, 0x1f335},
302 {0x1f337, 0x1f37c},
303 {0x1f37e, 0x1f393},
304 {0x1f3a0, 0x1f3ca},
305 {0x1f3cf, 0x1f3d3},
306 {0x1f3e0, 0x1f3f0},
307 {0x1f3f4, 0x1f3f4},
308 {0x1f3f8, 0x1f43e},
309 {0x1f440, 0x1f440},
310 {0x1f442, 0x1f4fc},
311 {0x1f4ff, 0x1f53d},
312 {0x1f54b, 0x1f54e},
313 {0x1f550, 0x1f567},
314 {0x1f57a, 0x1f57a},
315 {0x1f595, 0x1f596},
316 {0x1f5a4, 0x1f5a4},
317 {0x1f5fb, 0x1f64f},
318 {0x1f680, 0x1f6c5},
319 {0x1f6cc, 0x1f6cc},
320 {0x1f6d0, 0x1f6d2},
321 {0x1f6d5, 0x1f6d5},
322 {0x1f6eb, 0x1f6ec},
323 {0x1f6f4, 0x1f6fa},
324 {0x1f7e0, 0x1f7eb},
325 {0x1f90d, 0x1f971},
326 {0x1f973, 0x1f976},
327 {0x1f97a, 0x1f9a2},
328 {0x1f9a5, 0x1f9aa},
329 {0x1f9ae, 0x1f9ca},
330 {0x1f9cd, 0x1f9ff},
331 {0x1fa70, 0x1fa73},
332 {0x1fa78, 0x1fa7a},
333 {0x1fa80, 0x1fa82},
334 {0x1fa90, 0x1fa95},
335 {0x20000, 0x2fffd},
336 {0x30000, 0x3fffd},
337 };
338
339 /* test for 8-bit control characters */
340 if (ucs == 0)
341 return 0;
342 if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
343 return -1;
344
345 /* binary search in table of non-spacing characters */
346 if (bisearch(ucs, combining,
347 sizeof(combining) / sizeof(struct interval) - 1))
348 return 0;
349
350 /* binary search for known wide characters */
351 if (bisearch(ucs, doublewidth,
352 sizeof(doublewidth) / sizeof(struct interval) - 1))
353 return 2;
354
355 /* binary search for wide emoji */
356 if (bisearch(ucs, emoji_width,
357 sizeof(emoji_width) / sizeof(struct interval) - 1))
358 return 2;
359
360 return 1;
361 }
362
string_width(const QString & text)363 int string_width(const QString& text)
364 {
365 int w = 0;
366 for (int i = 0; i < text.length(); ++i)
367 w += konsole_wcwidth(text[i].unicode());
368 return w;
369 }
370
371