1 /* $XFree86: xc/programs/xterm/wcwidth.characters,v 1.9 2006/06/19 00:36:52 dickey Exp $ */
2 
3 /*
 * This is an implementation of wcwidth() and wcswidth() (defined in
 * IEEE Std 1003.1-2001) for Unicode.
6  *
7  * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
8  * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
9  *
10  * In fixed-width output devices, Latin characters all occupy a single
11  * "cell" position of equal width, whereas ideographic CJK characters
12  * occupy two such cells. Interoperability between terminal-line
13  * applications and (teletype-style) character terminals using the
14  * UTF-8 encoding requires agreement on which character should advance
15  * the cursor by how many cell positions. No established formal
16  * standards exist at present on which Unicode character shall occupy
17  * how many cell positions on character terminals. These routines are
 * a first attempt at defining such behavior based on simple rules
19  * applied to data provided by the Unicode Consortium.
20  *
21  * For some graphical characters, the Unicode standard explicitly
22  * defines a character-cell width via the definition of the East Asian
23  * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
24  * In all these cases, there is no ambiguity about which width a
25  * terminal shall use. For characters in the East Asian Ambiguous (A)
26  * class, the width choice depends purely on a preference of backward
27  * compatibility with either historic CJK or Western practice.
28  * Choosing single-width for these characters is easy to justify as
29  * the appropriate long-term solution, as the CJK practice of
30  * displaying these characters as double-width comes from historic
31  * implementation simplicity (8-bit encoded characters were displayed
32  * single-width and 16-bit ones double-width, even for Greek,
33  * Cyrillic, etc.) and not any typographic considerations.
34  *
35  * Much less clear is the choice of width for the Not East Asian
36  * (Neutral) class. Existing practice does not dictate a width for any
37  * of these characters. It would nevertheless make sense
38  * typographically to allocate two character cells to characters such
39  * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
40  * represented adequately with a single-width glyph. The following
41  * routines at present merely assign a single-cell width to all
42  * neutral characters, in the interest of simplicity. This is not
43  * entirely satisfactory and should be reconsidered before
44  * establishing a formal standard in this area. At the moment, the
45  * decision which Not East Asian (Neutral) characters should be
46  * represented by double-width glyphs cannot yet be answered by
47  * applying a simple rule from the Unicode database content. Setting
48  * up a proper standard for the behavior of UTF-8 character terminals
49  * will require a careful analysis not only of each Unicode character,
50  * but also of each presentation form, something the author of these
 * routines has so far avoided doing.
52  *
53  * http://www.unicode.org/unicode/reports/tr11/
54  *
55  * Markus Kuhn -- 2007-05-25 (Unicode 5.0)
56  *
57  * Permission to use, copy, modify, and distribute this software
58  * for any purpose and without fee is hereby granted. The author
59  * disclaims all warranties with regard to this software.
60  *
61  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
62  */
63 /*
64  *  Adaptions for KDE by Waldo Bastian <bastian@kde.org> and
65  *    Francesco Cecconi <francesco.cecconi@gmail.com>
66  *  See COPYING.Unicode for the license for the original wcwidth.c
67  */
68 
69 // Own
70 #include "konsole_wcwidth.h"
71 
/* Closed range [first, last] of code points; both ends are inclusive. */
struct interval {
    unsigned long first;
    unsigned long last;
};

/*
 * Auxiliary function: binary search for a code point in an interval table.
 *
 *   ucs    code point to look up
 *   table  intervals sorted in ascending order that do not overlap
 *   max    index of the last entry in table (entry count minus one);
 *          a negative value denotes an empty table
 *
 * Returns 1 if ucs lies inside one of the intervals, 0 otherwise.
 */
static int bisearch(unsigned long ucs, const struct interval* table, int max)
{
    int min = 0;

    /* an empty table has no table[0] or table[max] to look at */
    if (max < 0)
        return 0;

    /* cheap rejection of everything outside the span covered by the table */
    if (ucs < table[0].first || ucs > table[max].last)
        return 0;

    while (max >= min) {
        /* written this way so that the midpoint cannot overflow an int */
        const int mid = min + (max - min) / 2;

        if (ucs > table[mid].last)
            min = mid + 1;
        else if (ucs < table[mid].first)
            max = mid - 1;
        else
            return 1;
    }

    return 0;
}
97 
98 /* The following functions define the column width of an ISO 10646
99  * character as follows:
100  *
101  *    - The null character (U+0000) has a column width of 0.
102  *
103  *    - Other C0/C1 control characters and DEL will lead to a return
104  *      value of -1.
105  *
106  *    - Non-spacing and enclosing combining characters (general
107  *      category code Mn or Me in the Unicode database) have a
108  *      column width of 0.
109  *
110  *    - Other format characters (general category code Cf in the Unicode
111  *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
112  *
113  *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
114  *      have a column width of 0.
115  *
116  *    - Spacing characters in the East Asian Wide (W) or East Asian
117  *      FullWidth (F) category as defined in Unicode Technical
118  *      Report #11 have a column width of 2.
119  *
120  *    - All remaining characters (including all printable
121  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
122  *      etc.) have a column width of 1.
123  *
124  * This implementation assumes that quint16 characters are encoded
125  * in ISO 10646.
126  */
127 
konsole_wcwidth(uint ucs)128 int konsole_wcwidth(uint ucs)
129 {
130     /* sorted list of non-overlapping intervals of non-spacing characters */
131     /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
132     static const struct interval combining[] = {
133         { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
134         { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
135         { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 },
136         { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 },
137         { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
138         { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
139         { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 },
140         { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
141         { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
142         { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
143         { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C },
144         { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
145         { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
146         { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
147         { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
148         { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
149         { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
150         { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
151         { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC },
152         { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
153         { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
154         { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
155         { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
156         { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
157         { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
158         { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
159         { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
160         { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
161         { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
162         { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F },
163         { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
164         { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
165         { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
166         { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
167         { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
168         { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
169         { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
170         { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF },
171         { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 },
172         { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, { 0x302A, 0x302F },
173         { 0x3099, 0x309A }, { 0xA806, 0xA806 }, { 0xA80B, 0xA80B },
174         { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F },
175         { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB },
176         { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
177         { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 },
178         { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
179         { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F },
180         { 0xE0100, 0xE01EF }
181     };
182     static const struct interval emoji_width[] = {
183         {0x1f1e6, 0x1f1ff},
184         {0x1f321, 0x1f321},
185         {0x1f324, 0x1f32c},
186         {0x1f336, 0x1f336},
187         {0x1f37d, 0x1f37d},
188         {0x1f396, 0x1f397},
189         {0x1f399, 0x1f39b},
190         {0x1f39e, 0x1f39f},
191         {0x1f3cb, 0x1f3ce},
192         {0x1f3d4, 0x1f3df},
193         {0x1f3f3, 0x1f3f5},
194         {0x1f3f7, 0x1f3f7},
195         {0x1f43f, 0x1f43f},
196         {0x1f441, 0x1f441},
197         {0x1f4fd, 0x1f4fd},
198         {0x1f549, 0x1f54a},
199         {0x1f56f, 0x1f570},
200         {0x1f573, 0x1f579},
201         {0x1f587, 0x1f587},
202         {0x1f58a, 0x1f58d},
203         {0x1f590, 0x1f590},
204         {0x1f5a5, 0x1f5a5},
205         {0x1f5a8, 0x1f5a8},
206         {0x1f5b1, 0x1f5b2},
207         {0x1f5bc, 0x1f5bc},
208         {0x1f5c2, 0x1f5c4},
209         {0x1f5d1, 0x1f5d3},
210         {0x1f5dc, 0x1f5de},
211         {0x1f5e1, 0x1f5e1},
212         {0x1f5e3, 0x1f5e3},
213         {0x1f5e8, 0x1f5e8},
214         {0x1f5ef, 0x1f5ef},
215         {0x1f5f3, 0x1f5f3},
216         {0x1f5fa, 0x1f5fa},
217         {0x1f6cb, 0x1f6cf},
218         {0x1f6e0, 0x1f6e5},
219         {0x1f6e9, 0x1f6e9},
220         {0x1f6f0, 0x1f6f0},
221         {0x1f6f3, 0x1f6f3},
222     };
223     static const struct interval doublewidth[] = {
224         {0x1100, 0x115f},
225         {0x231a, 0x231b},
226         {0x2329, 0x232a},
227         {0x23e9, 0x23ec},
228         {0x23f0, 0x23f0},
229         {0x23f3, 0x23f3},
230         {0x25fd, 0x25fe},
231         {0x2614, 0x2615},
232         {0x2648, 0x2653},
233         {0x267f, 0x267f},
234         {0x2693, 0x2693},
235         {0x26a1, 0x26a1},
236         {0x26aa, 0x26ab},
237         {0x26bd, 0x26be},
238         {0x26c4, 0x26c5},
239         {0x26ce, 0x26ce},
240         {0x26d4, 0x26d4},
241         {0x26ea, 0x26ea},
242         {0x26f2, 0x26f3},
243         {0x26f5, 0x26f5},
244         {0x26fa, 0x26fa},
245         {0x26fd, 0x26fd},
246         {0x2705, 0x2705},
247         {0x270a, 0x270b},
248         {0x2728, 0x2728},
249         {0x274c, 0x274c},
250         {0x274e, 0x274e},
251         {0x2753, 0x2755},
252         {0x2757, 0x2757},
253         {0x2795, 0x2797},
254         {0x27b0, 0x27b0},
255         {0x27bf, 0x27bf},
256         {0x2b1b, 0x2b1c},
257         {0x2b50, 0x2b50},
258         {0x2b55, 0x2b55},
259         {0x2e80, 0x2e99},
260         {0x2e9b, 0x2ef3},
261         {0x2f00, 0x2fd5},
262         {0x2ff0, 0x2ffb},
263         {0x3000, 0x303e},
264         {0x3041, 0x3096},
265         {0x3099, 0x30ff},
266         {0x3105, 0x312f},
267         {0x3131, 0x318e},
268         {0x3190, 0x31ba},
269         {0x31c0, 0x31e3},
270         {0x31f0, 0x321e},
271         {0x3220, 0x3247},
272         {0x3250, 0x4dbf},
273         {0x4e00, 0xa48c},
274         {0xa490, 0xa4c6},
275         {0xa960, 0xa97c},
276         {0xac00, 0xd7a3},
277         {0xf900, 0xfaff},
278         {0xfe10, 0xfe19},
279         {0xfe30, 0xfe52},
280         {0xfe54, 0xfe66},
281         {0xfe68, 0xfe6b},
282         {0xff01, 0xff60},
283         {0xffe0, 0xffe6},
284         {0x16fe0, 0x16fe3},
285         {0x17000, 0x187f7},
286         {0x18800, 0x18af2},
287         {0x1b000, 0x1b11e},
288         {0x1b150, 0x1b152},
289         {0x1b164, 0x1b167},
290         {0x1b170, 0x1b2fb},
291         {0x1f004, 0x1f004},
292         {0x1f0cf, 0x1f0cf},
293         {0x1f18e, 0x1f18e},
294         {0x1f191, 0x1f19a},
295         {0x1f200, 0x1f202},
296         {0x1f210, 0x1f23b},
297         {0x1f240, 0x1f248},
298         {0x1f250, 0x1f251},
299         {0x1f260, 0x1f265},
300         {0x1f300, 0x1f320},
301         {0x1f32d, 0x1f335},
302         {0x1f337, 0x1f37c},
303         {0x1f37e, 0x1f393},
304         {0x1f3a0, 0x1f3ca},
305         {0x1f3cf, 0x1f3d3},
306         {0x1f3e0, 0x1f3f0},
307         {0x1f3f4, 0x1f3f4},
308         {0x1f3f8, 0x1f43e},
309         {0x1f440, 0x1f440},
310         {0x1f442, 0x1f4fc},
311         {0x1f4ff, 0x1f53d},
312         {0x1f54b, 0x1f54e},
313         {0x1f550, 0x1f567},
314         {0x1f57a, 0x1f57a},
315         {0x1f595, 0x1f596},
316         {0x1f5a4, 0x1f5a4},
317         {0x1f5fb, 0x1f64f},
318         {0x1f680, 0x1f6c5},
319         {0x1f6cc, 0x1f6cc},
320         {0x1f6d0, 0x1f6d2},
321         {0x1f6d5, 0x1f6d5},
322         {0x1f6eb, 0x1f6ec},
323         {0x1f6f4, 0x1f6fa},
324         {0x1f7e0, 0x1f7eb},
325         {0x1f90d, 0x1f971},
326         {0x1f973, 0x1f976},
327         {0x1f97a, 0x1f9a2},
328         {0x1f9a5, 0x1f9aa},
329         {0x1f9ae, 0x1f9ca},
330         {0x1f9cd, 0x1f9ff},
331         {0x1fa70, 0x1fa73},
332         {0x1fa78, 0x1fa7a},
333         {0x1fa80, 0x1fa82},
334         {0x1fa90, 0x1fa95},
335         {0x20000, 0x2fffd},
336         {0x30000, 0x3fffd},
337     };
338 
339     /* test for 8-bit control characters */
340     if (ucs == 0)
341         return 0;
342     if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
343         return -1;
344 
345     /* binary search in table of non-spacing characters */
346     if (bisearch(ucs, combining,
347                  sizeof(combining) / sizeof(struct interval) - 1))
348         return 0;
349 
350     /* binary search for known wide characters */
351     if (bisearch(ucs, doublewidth,
352                 sizeof(doublewidth) / sizeof(struct interval) - 1))
353         return 2;
354 
355     /* binary search for wide emoji */
356     if (bisearch(ucs, emoji_width,
357                  sizeof(emoji_width) / sizeof(struct interval) - 1))
358         return 2;
359 
360     return 1;
361 }
362 
string_width(const QString & text)363 int string_width(const QString& text)
364 {
365     int w = 0;
366     for (int i = 0; i < text.length(); ++i)
367         w += konsole_wcwidth(text[i].unicode());
368     return w;
369 }
370 
371