1 /* radare2 - LGPL - Copyright 2014-2018 - thelemon, kazarmy, pancake */
2
3 #include <r_types.h>
4 #include <r_util.h>
5
6 static const struct { ut32 from, to; } nonprintable_ranges[] = {
7 { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
8 { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
9 { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
10 { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
11 { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
12 { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
13 { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
14 { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
15 { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
16 { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
17 { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
18 { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
19 { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
20 { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
21 { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
22 { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
23 { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
24 { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
25 { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
26 { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
27 { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
28 { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
29 { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
30 { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
31 { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
32 { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
33 { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
34 { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
35 { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
36 { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
37 { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
38 { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
39 { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
40 { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
41 { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
42 { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
43 { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
44 { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
45 { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
46 { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
47 { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
48 { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
49 { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
50 { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
51 { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
52 { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
53 { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
54 { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
55 { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
56 { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
57 { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
58 { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
59 { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
60 { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
61 { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
62 { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
63 { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
64 { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
65 { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
66 { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
67 { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
68 { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
69 { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
70 { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
71 { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
72 { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
73 { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
74 { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
75 { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
76 { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
77 { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
78 { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
79 { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
80 { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
81 { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
82 { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
83 { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
84 { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
85 { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
86 { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
87 { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
88 { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
89 { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
90 { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
91 { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
92 { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
93 { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
94 { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
95 { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
96 { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
97 { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
98 { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
99 { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
100 { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
101 { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
102 { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
103 { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
104 { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
105 { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
106 { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
107 { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
108 { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
109 { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
110 { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
111 { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
112 { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
113 { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
114 { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
115 { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
116 { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
117 { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
118 { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
119 { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
120 { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
121 { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
122 { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
123 { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
124 { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
125 { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
126 { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
127 { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
128 { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
129 { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
130 { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
131 { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
132 { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
133 { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
134 { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
135 { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
136 { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
137 { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
138 { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
139 { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
140 { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
141 { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
142 { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
143 { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
144 { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
145 { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
146 { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
147 { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
148 { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
149 { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
150 { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
151 { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
152 { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
153 { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
154 { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
155 { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
156 { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
157 { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
158 { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
159 { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
160 { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
161 { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
162 { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
163 { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
164 { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
165 { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
166 { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
167 { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
168 { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
169 { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
170 { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
171 { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
172 { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
173 { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
174 { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
175 { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
176 { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
177 { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
178 { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
179 { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
180 { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
181 { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
182 { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
183 { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
184 { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
185 { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
186 { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
187 { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
188 { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
189 { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }, { 0x110000, 0xFFFFFFFF }
190 };
191 static const int nonprintable_ranges_count = sizeof (nonprintable_ranges) / sizeof (nonprintable_ranges[0]);
192
193 static const int lastUtfBlock = 281;
194
195 const RUtfBlock r_utf_blocks[] = {
196 { 0x0000, 0x007F, "Basic Latin" },
197 { 0x0080, 0x00FF, "Latin-1 Supplement" },
198 { 0x0100, 0x017F, "Latin Extended-A" },
199 { 0x0180, 0x024F, "Latin Extended-B" },
200 { 0x0250, 0x02AF, "IPA Extensions" },
201 { 0x02B0, 0x02FF, "Spacing Modifier Letters" },
202 { 0x0300, 0x036F, "Combining Diacritical Marks" },
203 { 0x0370, 0x03FF, "Greek and Coptic" },
204 { 0x0400, 0x04FF, "Cyrillic" },
205 { 0x0500, 0x052F, "Cyrillic Supplement" },
206 { 0x0530, 0x058F, "Armenian" },
207 { 0x0590, 0x05FF, "Hebrew" },
208 { 0x0600, 0x06FF, "Arabic" },
209 { 0x0700, 0x074F, "Syriac" },
210 { 0x0750, 0x077F, "Arabic Supplement" },
211 { 0x0780, 0x07BF, "Thaana" },
212 { 0x07C0, 0x07FF, "NKo" },
213 { 0x0800, 0x083F, "Samaritan" },
214 { 0x0840, 0x085F, "Mandaic" },
215 { 0x0860, 0x086F, "Syriac Supplement" },
216 { 0x08A0, 0x08FF, "Arabic Extended-A" },
217 { 0x0900, 0x097F, "Devanagari" },
218 { 0x0980, 0x09FF, "Bengali" },
219 { 0x0A00, 0x0A7F, "Gurmukhi" },
220 { 0x0A80, 0x0AFF, "Gujarati" },
221 { 0x0B00, 0x0B7F, "Oriya" },
222 { 0x0B80, 0x0BFF, "Tamil" },
223 { 0x0C00, 0x0C7F, "Telugu" },
224 { 0x0C80, 0x0CFF, "Kannada" },
225 { 0x0D00, 0x0D7F, "Malayalam" },
226 { 0x0D80, 0x0DFF, "Sinhala" },
227 { 0x0E00, 0x0E7F, "Thai" },
228 { 0x0E80, 0x0EFF, "Lao" },
229 { 0x0F00, 0x0FFF, "Tibetan" },
230 { 0x1000, 0x109F, "Myanmar" },
231 { 0x10A0, 0x10FF, "Georgian" },
232 { 0x1100, 0x11FF, "Hangul Jamo" },
233 { 0x1200, 0x137F, "Ethiopic" },
234 { 0x1380, 0x139F, "Ethiopic Supplement" },
235 { 0x13A0, 0x13FF, "Cherokee" },
236 { 0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics" },
237 { 0x1680, 0x169F, "Ogham" },
238 { 0x16A0, 0x16FF, "Runic" },
239 { 0x1700, 0x171F, "Tagalog" },
240 { 0x1720, 0x173F, "Hanunoo" },
241 { 0x1740, 0x175F, "Buhid" },
242 { 0x1760, 0x177F, "Tagbanwa" },
243 { 0x1780, 0x17FF, "Khmer" },
244 { 0x1800, 0x18AF, "Mongolian" },
245 { 0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended" },
246 { 0x1900, 0x194F, "Limbu" },
247 { 0x1950, 0x197F, "Tai Le" },
248 { 0x1980, 0x19DF, "New Tai Lue" },
249 { 0x19E0, 0x19FF, "Khmer Symbols" },
250 { 0x1A00, 0x1A1F, "Buginese" },
251 { 0x1A20, 0x1AAF, "Tai Tham" },
252 { 0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended" },
253 { 0x1B00, 0x1B7F, "Balinese" },
254 { 0x1B80, 0x1BBF, "Sundanese" },
255 { 0x1BC0, 0x1BFF, "Batak" },
256 { 0x1C00, 0x1C4F, "Lepcha" },
257 { 0x1C50, 0x1C7F, "Ol Chiki" },
258 { 0x1C80, 0x1C8F, "Cyrillic Extended-C" },
259 { 0x1CC0, 0x1CCF, "Sundanese Supplement" },
260 { 0x1CD0, 0x1CFF, "Vedic Extensions" },
261 { 0x1D00, 0x1D7F, "Phonetic Extensions" },
262 { 0x1D80, 0x1DBF, "Phonetic Extensions Supplement" },
263 { 0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement" },
264 { 0x1E00, 0x1EFF, "Latin Extended Additional" },
265 { 0x1F00, 0x1FFF, "Greek Extended" },
266 { 0x2000, 0x206F, "General Punctuation" },
267 { 0x2070, 0x209F, "Superscripts and Subscripts" },
268 { 0x20A0, 0x20CF, "Currency Symbols" },
269 { 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols" },
270 { 0x2100, 0x214F, "Letterlike Symbols" },
271 { 0x2150, 0x218F, "Number Forms" },
272 { 0x2190, 0x21FF, "Arrows" },
273 { 0x2200, 0x22FF, "Mathematical Operators" },
274 { 0x2300, 0x23FF, "Miscellaneous Technical" },
275 { 0x2400, 0x243F, "Control Pictures" },
276 { 0x2440, 0x245F, "Optical Character Recognition" },
277 { 0x2460, 0x24FF, "Enclosed Alphanumerics" },
278 { 0x2500, 0x257F, "Box Drawing" },
279 { 0x2580, 0x259F, "Block Elements" },
280 { 0x25A0, 0x25FF, "Geometric Shapes" },
281 { 0x2600, 0x26FF, "Miscellaneous Symbols" },
282 { 0x2700, 0x27BF, "Dingbats" },
283 { 0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A" },
284 { 0x27F0, 0x27FF, "Supplemental Arrows-A" },
285 { 0x2800, 0x28FF, "Braille Patterns" },
286 { 0x2900, 0x297F, "Supplemental Arrows-B" },
287 { 0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B" },
288 { 0x2A00, 0x2AFF, "Supplemental Mathematical Operators" },
289 { 0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows" },
290 { 0x2C00, 0x2C5F, "Glagolitic" },
291 { 0x2C60, 0x2C7F, "Latin Extended-C" },
292 { 0x2C80, 0x2CFF, "Coptic" },
293 { 0x2D00, 0x2D2F, "Georgian Supplement" },
294 { 0x2D30, 0x2D7F, "Tifinagh" },
295 { 0x2D80, 0x2DDF, "Ethiopic Extended" },
296 { 0x2DE0, 0x2DFF, "Cyrillic Extended-A" },
297 { 0x2E00, 0x2E7F, "Supplemental Punctuation" },
298 { 0x2E80, 0x2EFF, "CJK Radicals Supplement" },
299 { 0x2F00, 0x2FDF, "Kangxi Radicals" },
300 { 0x2FF0, 0x2FFF, "Ideographic Description Characters" },
301 { 0x3000, 0x303F, "CJK Symbols and Punctuation" },
302 { 0x3040, 0x309F, "Hiragana" },
303 { 0x30A0, 0x30FF, "Katakana" },
304 { 0x3100, 0x312F, "Bopomofo" },
305 { 0x3130, 0x318F, "Hangul Compatibility Jamo" },
306 { 0x3190, 0x319F, "Kanbun" },
307 { 0x31A0, 0x31BF, "Bopomofo Extended" },
308 { 0x31C0, 0x31EF, "CJK Strokes" },
309 { 0x31F0, 0x31FF, "Katakana Phonetic Extensions" },
310 { 0x3200, 0x32FF, "Enclosed CJK Letters and Months" },
311 { 0x3300, 0x33FF, "CJK Compatibility" },
312 { 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A" },
313 { 0x4DC0, 0x4DFF, "Yijing Hexagram Symbols" },
314 { 0x4E00, 0x9FFF, "CJK Unified Ideographs" },
315 { 0xA000, 0xA48F, "Yi Syllables" },
316 { 0xA490, 0xA4CF, "Yi Radicals" },
317 { 0xA4D0, 0xA4FF, "Lisu" },
318 { 0xA500, 0xA63F, "Vai" },
319 { 0xA640, 0xA69F, "Cyrillic Extended-B" },
320 { 0xA6A0, 0xA6FF, "Bamum" },
321 { 0xA700, 0xA71F, "Modifier Tone Letters" },
322 { 0xA720, 0xA7FF, "Latin Extended-D" },
323 { 0xA800, 0xA82F, "Syloti Nagri" },
324 { 0xA830, 0xA83F, "Common Indic Number Forms" },
325 { 0xA840, 0xA87F, "Phags-pa" },
326 { 0xA880, 0xA8DF, "Saurashtra" },
327 { 0xA8E0, 0xA8FF, "Devanagari Extended" },
328 { 0xA900, 0xA92F, "Kayah Li" },
329 { 0xA930, 0xA95F, "Rejang" },
330 { 0xA960, 0xA97F, "Hangul Jamo Extended-A" },
331 { 0xA980, 0xA9DF, "Javanese" },
332 { 0xA9E0, 0xA9FF, "Myanmar Extended-B" },
333 { 0xAA00, 0xAA5F, "Cham" },
334 { 0xAA60, 0xAA7F, "Myanmar Extended-A" },
335 { 0xAA80, 0xAADF, "Tai Viet" },
336 { 0xAAE0, 0xAAFF, "Meetei Mayek Extensions" },
337 { 0xAB00, 0xAB2F, "Ethiopic Extended-A" },
338 { 0xAB30, 0xAB6F, "Latin Extended-E" },
339 { 0xAB70, 0xABBF, "Cherokee Supplement" },
340 { 0xABC0, 0xABFF, "Meetei Mayek" },
341 { 0xAC00, 0xD7AF, "Hangul Syllables" },
342 { 0xD7B0, 0xD7FF, "Hangul Jamo Extended-B" },
343 { 0xD800, 0xDB7F, "High Surrogates" },
344 { 0xDB80, 0xDBFF, "High Private Use Surrogates" },
345 { 0xDC00, 0xDFFF, "Low Surrogates" },
346 { 0xE000, 0xF8FF, "Private Use Area" },
347 { 0xF900, 0xFAFF, "CJK Compatibility Ideographs" },
348 { 0xFB00, 0xFB4F, "Alphabetic Presentation Forms" },
349 { 0xFB50, 0xFDFF, "Arabic Presentation Forms-A" },
350 { 0xFE00, 0xFE0F, "Variation Selectors" },
351 { 0xFE10, 0xFE1F, "Vertical Forms" },
352 { 0xFE20, 0xFE2F, "Combining Half Marks" },
353 { 0xFE30, 0xFE4F, "CJK Compatibility Forms" },
354 { 0xFE50, 0xFE6F, "Small Form Variants" },
355 { 0xFE70, 0xFEFF, "Arabic Presentation Forms-B" },
356 { 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms" },
357 { 0xFFF0, 0xFFFF, "Specials" },
358 { 0x10000, 0x1007F, "Linear B Syllabary" },
359 { 0x10080, 0x100FF, "Linear B Ideograms" },
360 { 0x10100, 0x1013F, "Aegean Numbers" },
361 { 0x10140, 0x1018F, "Ancient Greek Numbers" },
362 { 0x10190, 0x101CF, "Ancient Symbols" },
363 { 0x101D0, 0x101FF, "Phaistos Disc" },
364 { 0x10280, 0x1029F, "Lycian" },
365 { 0x102A0, 0x102DF, "Carian" },
366 { 0x102E0, 0x102FF, "Coptic Epact Numbers" },
367 { 0x10300, 0x1032F, "Old Italic" },
368 { 0x10330, 0x1034F, "Gothic" },
369 { 0x10350, 0x1037F, "Old Permic" },
370 { 0x10380, 0x1039F, "Ugaritic" },
371 { 0x103A0, 0x103DF, "Old Persian" },
372 { 0x10400, 0x1044F, "Deseret" },
373 { 0x10450, 0x1047F, "Shavian" },
374 { 0x10480, 0x104AF, "Osmanya" },
375 { 0x104B0, 0x104FF, "Osage" },
376 { 0x10500, 0x1052F, "Elbasan" },
377 { 0x10530, 0x1056F, "Caucasian Albanian" },
378 { 0x10600, 0x1077F, "Linear A" },
379 { 0x10800, 0x1083F, "Cypriot Syllabary" },
380 { 0x10840, 0x1085F, "Imperial Aramaic" },
381 { 0x10860, 0x1087F, "Palmyrene" },
382 { 0x10880, 0x108AF, "Nabataean" },
383 { 0x108E0, 0x108FF, "Hatran" },
384 { 0x10900, 0x1091F, "Phoenician" },
385 { 0x10920, 0x1093F, "Lydian" },
386 { 0x10980, 0x1099F, "Meroitic Hieroglyphs" },
387 { 0x109A0, 0x109FF, "Meroitic Cursive" },
388 { 0x10A00, 0x10A5F, "Kharoshthi" },
389 { 0x10A60, 0x10A7F, "Old South Arabian" },
390 { 0x10A80, 0x10A9F, "Old North Arabian" },
391 { 0x10AC0, 0x10AFF, "Manichaean" },
392 { 0x10B00, 0x10B3F, "Avestan" },
393 { 0x10B40, 0x10B5F, "Inscriptional Parthian" },
394 { 0x10B60, 0x10B7F, "Inscriptional Pahlavi" },
395 { 0x10B80, 0x10BAF, "Psalter Pahlavi" },
396 { 0x10C00, 0x10C4F, "Old Turkic" },
397 { 0x10C80, 0x10CFF, "Old Hungarian" },
398 { 0x10E60, 0x10E7F, "Rumi Numeral Symbols" },
399 { 0x11000, 0x1107F, "Brahmi" },
400 { 0x11080, 0x110CF, "Kaithi" },
401 { 0x110D0, 0x110FF, "Sora Sompeng" },
402 { 0x11100, 0x1114F, "Chakma" },
403 { 0x11150, 0x1117F, "Mahajani" },
404 { 0x11180, 0x111DF, "Sharada" },
405 { 0x111E0, 0x111FF, "Sinhala Archaic Numbers" },
406 { 0x11200, 0x1124F, "Khojki" },
407 { 0x11280, 0x112AF, "Multani" },
408 { 0x112B0, 0x112FF, "Khudawadi" },
409 { 0x11300, 0x1137F, "Grantha" },
410 { 0x11400, 0x1147F, "Newa" },
411 { 0x11480, 0x114DF, "Tirhuta" },
412 { 0x11580, 0x115FF, "Siddham" },
413 { 0x11600, 0x1165F, "Modi" },
414 { 0x11660, 0x1167F, "Mongolian Supplement" },
415 { 0x11680, 0x116CF, "Takri" },
416 { 0x11700, 0x1173F, "Ahom" },
417 { 0x118A0, 0x118FF, "Warang Citi" },
418 { 0x11A00, 0x11A4F, "Zanabazar Square" },
419 { 0x11A50, 0x11AAF, "Soyombo" },
420 { 0x11AC0, 0x11AFF, "Pau Cin Hau" },
421 { 0x11C00, 0x11C6F, "Bhaiksuki" },
422 { 0x11C70, 0x11CBF, "Marchen" },
423 { 0x11D00, 0x11D5F, "Masaram Gondi" },
424 { 0x12000, 0x123FF, "Cuneiform" },
425 { 0x12400, 0x1247F, "Cuneiform Numbers and Punctuation" },
426 { 0x12480, 0x1254F, "Early Dynastic Cuneiform" },
427 { 0x13000, 0x1342F, "Egyptian Hieroglyphs" },
428 { 0x14400, 0x1467F, "Anatolian Hieroglyphs" },
429 { 0x16800, 0x16A3F, "Bamum Supplement" },
430 { 0x16A40, 0x16A6F, "Mro" },
431 { 0x16AD0, 0x16AFF, "Bassa Vah" },
432 { 0x16B00, 0x16B8F, "Pahawh Hmong" },
433 { 0x16F00, 0x16F9F, "Miao" },
434 { 0x16FE0, 0x16FFF, "Ideographic Symbols and Punctuation" },
435 { 0x17000, 0x187FF, "Tangut" },
436 { 0x18800, 0x18AFF, "Tangut Components" },
437 { 0x1B000, 0x1B0FF, "Kana Supplement" },
438 { 0x1B100, 0x1B12F, "Kana Extended-A" },
439 { 0x1B170, 0x1B2FF, "Nushu" },
440 { 0x1BC00, 0x1BC9F, "Duployan" },
441 { 0x1BCA0, 0x1BCAF, "Shorthand Format Controls" },
442 { 0x1D000, 0x1D0FF, "Byzantine Musical Symbols" },
443 { 0x1D100, 0x1D1FF, "Musical Symbols" },
444 { 0x1D200, 0x1D24F, "Ancient Greek Musical Notation" },
445 { 0x1D300, 0x1D35F, "Tai Xuan Jing Symbols" },
446 { 0x1D360, 0x1D37F, "Counting Rod Numerals" },
447 { 0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols" },
448 { 0x1D800, 0x1DAAF, "Sutton SignWriting" },
449 { 0x1E000, 0x1E02F, "Glagolitic Supplement" },
450 { 0x1E800, 0x1E8DF, "Mende Kikakui" },
451 { 0x1E900, 0x1E95F, "Adlam" },
452 { 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols" },
453 { 0x1F000, 0x1F02F, "Mahjong Tiles" },
454 { 0x1F030, 0x1F09F, "Domino Tiles" },
455 { 0x1F0A0, 0x1F0FF, "Playing Cards" },
456 { 0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement" },
457 { 0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement" },
458 { 0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs" },
459 { 0x1F600, 0x1F64F, "Emoticons" },
460 { 0x1F650, 0x1F67F, "Ornamental Dingbats" },
461 { 0x1F680, 0x1F6FF, "Transport and Map Symbols" },
462 { 0x1F700, 0x1F77F, "Alchemical Symbols" },
463 { 0x1F780, 0x1F7FF, "Geometric Shapes Extended" },
464 { 0x1F800, 0x1F8FF, "Supplemental Arrows-C" },
465 { 0x1F900, 0x1F9FF, "Supplemental Symbols and Pictographs" },
466 { 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B" },
467 { 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C" },
468 { 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D" },
469 { 0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E" },
470 { 0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F" },
471 { 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement" },
472 { 0xE0000, 0xE007F, "Tags" },
473 { 0xE0100, 0xE01EF, "Variation Selectors Supplement" },
474 { 0xF0000, 0xFFFFF, "Supplementary Private Use Area-A" },
475 { 0x100000, 0x10FFFF, "Supplementary Private Use Area-B" },
476 { 0x110000, 0xFFFFFFFF, "No_Block" }
477 };
478
r_utf_block_name(int idx)479 R_API const char *r_utf_block_name(int idx) {
480 if (idx < 0 || idx >= lastUtfBlock) {
481 return NULL;
482 }
483 return r_utf_blocks[idx].name;
484 }
485
486 #define r_utf_blocks_count (sizeof (r_utf_blocks) / sizeof (r_utf_blocks[0]))
487
488 /* Convert an UTF-8 buf into a unicode RRune */
r_utf8_decode(const ut8 * ptr,int ptrlen,RRune * ch)489 R_API int r_utf8_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
490 if (ptrlen < 1) {
491 return 0;
492 }
493 if (ptr[0] < 0x80) {
494 if (ch) {
495 *ch = (ut32)ptr[0];
496 }
497 return 1;
498 } else if (ptrlen>1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) {
499 if (ch) {
500 *ch = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f);
501 }
502 return 2;
503 } else if (ptrlen>2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) {
504 if (ch) {
505 *ch = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f);
506 }
507 return 3;
508 } else if (ptrlen>3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) {
509 if (ch) {
510 *ch = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f);
511 }
512 return 4;
513 }
514 return 0;
515 }
516
517 /* Convert a unicode RRune into an UTF-8 buf */
r_utf8_encode(ut8 * ptr,const RRune ch)518 R_API int r_utf8_encode(ut8 *ptr, const RRune ch) {
519 if (ch < 0x80) {
520 ptr[0] = (ut8)ch;
521 return 1;
522 }
523 else if (ch < 0x800) {
524 ptr[0] = 0xc0 | (ch >> 6);
525 ptr[1] = 0x80 | (ch & 0x3f);
526 return 2;
527 }
528 else if (ch < 0x10000) {
529 ptr[0] = 0xe0 | (ch >> 12);
530 ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
531 ptr[2] = 0x80 | (ch & 0x3f);
532 return 3;
533 }
534 else if (ch < 0x200000) {
535 ptr[0] = 0xf0 | (ch >> 18);
536 ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
537 ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
538 ptr[3] = 0x80 | (ch & 0x3f );
539 return 4;
540 }
541 return 0;
542 }
543
544 /* Convert a unicode RRune string into an utf-8 one */
r_utf8_encode_str(const RRune * str,ut8 * dst,const int dst_length)545 R_API int r_utf8_encode_str(const RRune *str, ut8 *dst, const int dst_length) {
546 int i, pos = 0;
547
548 if (!str || !dst) {
549 return -1;
550 }
551
552 for (i = 0; i < sizeof (str) - 1 && str[i] && pos < dst_length - 1; i++) {
553 pos += r_utf8_encode (&dst[pos], str[i]);
554 }
555
556 dst[pos++] = '\0';
557 return pos;
558 }
559
560 /* Returns the size in bytes of the utf-8 encoded char */
r_utf8_size(const ut8 * ptr)561 R_API int r_utf8_size(const ut8 *ptr) {
562 const int utf8_size[] = {
563 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
565 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
566 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
567 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
568 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
569 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
570 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0-0xFF
571 };
572 return (ptr[0]&0x80) ? utf8_size[ptr[0]^0x80] : 1;
573 }
574
r_utf8_strlen(const ut8 * str)575 R_API int r_utf8_strlen(const ut8 *str) {
576 int i, len = 0;
577
578 for (i = 0; str[i]; i++) {
579 if ((str[i] & 0xc0) != 0x80) {
580 len++;
581 }
582 }
583
584 return len;
585 }
586
r_isprint(const RRune c)587 R_API int r_isprint(const RRune c) {
588 // RRunes are most commonly single byte... We can early out with this common case.
589 if (c < 0x34F) {
590 /*
591 manually copied from top, please update if this ever changes
592 { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
593 could do a linear search, but that's a lot slower than a few compare
594 */
595 return !( c <= 0x1F || ( c >= 0x7F && c <= 0x9F));
596 }
597
598 const int last = nonprintable_ranges_count;
599
600 int low = 0;
601 int hi = last - 1;
602
603 do {
604 int mid = (low + hi) >> 1;
605 if (c >= nonprintable_ranges[mid].from && c <= nonprintable_ranges[mid].to) {
606 return false;
607 }
608 if (mid < last && c > nonprintable_ranges[mid].to) {
609 low = mid + 1;
610 }
611 if (mid < last && c < nonprintable_ranges[mid].from) {
612 hi = mid - 1;
613 }
614 } while (low <= hi);
615
616 return true;
617 }
618
619 #if __WINDOWS__
r_utf16_to_utf8_l(const wchar_t * wc,int len)620 R_API char *r_utf16_to_utf8_l(const wchar_t *wc, int len) {
621 if (!wc || !len || len < -1) {
622 return NULL;
623 }
624 char *rutf8 = NULL;
625 int csize;
626
627 if ((csize = WideCharToMultiByte (CP_UTF8, 0, wc, len, NULL, 0, NULL, NULL))) {
628 ++csize;
629 if ((rutf8 = malloc (csize))) {
630 WideCharToMultiByte (CP_UTF8, 0, wc, len, rutf8, csize, NULL, NULL);
631 if (len != -1) {
632 rutf8[csize - 1] = '\0';
633 }
634 }
635 }
636 return rutf8;
637 }
638
r_utf8_to_utf16_l(const char * cstring,int len)639 R_API wchar_t *r_utf8_to_utf16_l(const char *cstring, int len) {
640 if (!cstring || !len || len < -1) {
641 return NULL;
642 }
643 wchar_t *rutf16 = NULL;
644 int wcsize;
645
646 if ((wcsize = MultiByteToWideChar (CP_UTF8, 0, cstring, len, NULL, 0))) {
647 ++wcsize;
648 if ((rutf16 = (wchar_t *) calloc (wcsize, sizeof (wchar_t)))) {
649 MultiByteToWideChar (CP_UTF8, 0, cstring, len, rutf16, wcsize);
650 if (len != -1) {
651 rutf16[wcsize - 1] = L'\0';
652 }
653 }
654 }
655 return rutf16;
656 }
657
r_utf8_to_acp_l(const char * str,int len)658 R_API char *r_utf8_to_acp_l(const char *str, int len) {
659 if (!str || !len || len < -1) {
660 return NULL;
661 }
662 char *acp = NULL;
663 int wcsize, csize;
664 if ((wcsize = MultiByteToWideChar (CP_UTF8, 0, str, len, NULL, 0))) {
665 wchar_t *rutf16;
666 ++wcsize;
667 if ((rutf16 = (wchar_t *)calloc (wcsize, sizeof (wchar_t)))) {
668 MultiByteToWideChar (CP_UTF8, 0, str, len, rutf16, wcsize);
669 if (len != -1) {
670 rutf16[wcsize - 1] = L'\0';
671 }
672 if ((csize = WideCharToMultiByte (CP_ACP, 0, rutf16, wcsize, NULL, 0, NULL, NULL))) {
673 ++csize;
674 if ((acp = malloc (csize))) {
675 WideCharToMultiByte (CP_ACP, 0, rutf16, wcsize, acp, csize, NULL, NULL);
676 if (len != -1) {
677 acp[csize - 1] = '\0';
678 }
679 }
680 }
681 free (rutf16);
682 }
683 }
684 return acp;
685 }
686
r_acp_to_utf8_l(const char * str,int len)687 R_API char *r_acp_to_utf8_l(const char *str, int len) {
688 if (!str || !len || len < -1) {
689 return NULL;
690 }
691 int wcsize;
692 if ((wcsize = MultiByteToWideChar (CP_ACP, 0, str, len, NULL, 0))) {
693 wchar_t *rutf16;
694 ++wcsize;
695 if ((rutf16 = (wchar_t *) calloc (wcsize, sizeof (wchar_t)))) {
696 MultiByteToWideChar (CP_ACP, 0, str, len, rutf16, wcsize);
697 if (len != -1) {
698 rutf16[wcsize - 1] = L'\0';
699 }
700 char *ret = r_utf16_to_utf8_l (rutf16, wcsize);
701 free (rutf16);
702 return ret;
703 }
704 }
705 return NULL;
706 }
707
708 #endif // __WINDOWS__
709
r_utf_block_idx(RRune ch)710 R_API int r_utf_block_idx(RRune ch) {
711 const int last = r_utf_blocks_count;
712 int low, hi, mid;
713
714 low = 0;
715 hi = last - 1;
716
717 do {
718 mid = (low + hi) >> 1;
719 if (ch >= r_utf_blocks[mid].from && ch <= r_utf_blocks[mid].to) {
720 return mid;
721 }
722 if (mid < last && ch > r_utf_blocks[mid].to) {
723 low = mid + 1;
724 }
725 if (mid < last && ch < r_utf_blocks[mid].from) {
726 hi = mid - 1;
727 }
728 } while (low <= hi);
729
730 return r_utf_blocks_count - 1; /* index for "No_Block" */
731 }
732
733 /* str must be UTF8-encoded */
r_utf_block_list(const ut8 * str,int len,int ** freq_list)734 R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) {
735 if (!str) {
736 return NULL;
737 }
738 if (len < 0) {
739 len = strlen ((const char *)str);
740 }
741 static int block_freq[r_utf_blocks_count] = {0};
742 int *list = R_NEWS (int, len + 1);
743 if (!list) {
744 return NULL;
745 }
746 int *freq_list_ptr = NULL;
747 if (freq_list) {
748 *freq_list = R_NEWS (int, len + 1);
749 if (!*freq_list) {
750 free (list);
751 return NULL;
752 }
753 freq_list_ptr = *freq_list;
754 }
755 int *list_ptr = list;
756 const ut8 *str_ptr = str;
757 const ut8 *str_end = str + len;
758 RRune ch;
759 while (str_ptr < str_end) {
760 int block_idx;
761 int ch_bytes = r_utf8_decode (str_ptr, str_end - str_ptr, &ch);
762 if (!ch_bytes) {
763 block_idx = r_utf_blocks_count - 1;
764 ch_bytes = 1;
765 } else {
766 block_idx = r_utf_block_idx (ch);
767 }
768 if (!block_freq[block_idx]) {
769 *list_ptr = block_idx;
770 list_ptr++;
771 }
772 block_freq[block_idx]++;
773 str_ptr += ch_bytes;
774 }
775 *list_ptr = -1;
776 if (freq_list_ptr) {
777 for (list_ptr = list; *list_ptr != -1; list_ptr++) {
778 *freq_list_ptr = block_freq[*list_ptr];
779 freq_list_ptr++;
780 }
781 *freq_list_ptr = -1;
782 }
783 for (list_ptr = list; *list_ptr != -1; list_ptr++) {
784 block_freq[*list_ptr] = 0;
785 }
786 return list;
787 }
788
r_utf_bom_encoding(const ut8 * ptr,int ptrlen)789 R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen) {
790 if (ptrlen > 3) {
791 if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
792 return R_STRING_ENC_UTF32LE;
793 }
794 if (!ptr[0] && !ptr[1] && ptr[2] == 0xfe && ptr[3] == 0xff) {
795 return R_STRING_ENC_UTF32BE;
796 }
797 }
798 if (ptrlen > 2) {
799 if (ptr[0] == 0xef && ptr[1] == 0xbb && ptr[2] == 0xbf) {
800 return R_STRING_ENC_UTF8;
801 }
802 }
803 if (ptrlen > 1) {
804 if (ptr[0] == 0xff && ptr[1] == 0xfe) {
805 return R_STRING_ENC_UTF16LE;
806 }
807 if (ptr[0] == 0xfe && ptr[1] == 0xff) {
808 return R_STRING_ENC_UTF16BE;
809 }
810 }
811 return R_STRING_ENC_GUESS;
812 }
813