1 /* radare2 - LGPL - Copyright 2014-2018 - thelemon, kazarmy, pancake */
2 
3 #include <r_types.h>
4 #include <r_util.h>
5 
/*
 * Inclusive [from, to] code point ranges that r_isprint() reports as
 * non-printable.
 *
 * r_isprint() binary-searches this table, so the entries must stay sorted in
 * ascending order and must not overlap. The leading ranges (everything below
 * 0x034F) are additionally hand-copied into r_isprint()'s fast path; keep
 * both places in sync when editing the start of the table.
 *
 * The final entry covers every value from 0x110000 up to 0xFFFFFFFF.
 */
static const struct { ut32 from, to; } nonprintable_ranges[] = {
	{ 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
	{ 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
	{ 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
	{ 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
	{ 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
	{ 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
	{ 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
	{ 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
	{ 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
	{ 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
	{ 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
	{ 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
	{ 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
	{ 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
	{ 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
	{ 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
	{ 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
	{ 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
	{ 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
	{ 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
	{ 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
	{ 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
	{ 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
	{ 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
	{ 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
	{ 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
	{ 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
	{ 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
	{ 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
	{ 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
	{ 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
	{ 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
	{ 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
	{ 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
	{ 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
	{ 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
	{ 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
	{ 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
	{ 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
	{ 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
	{ 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
	{ 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
	{ 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
	{ 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
	{ 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
	{ 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
	{ 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
	{ 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
	{ 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
	{ 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
	{ 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
	{ 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
	{ 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
	{ 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
	{ 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
	{ 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
	{ 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
	{ 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
	{ 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
	{ 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
	{ 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
	{ 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
	{ 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
	{ 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
	{ 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
	{ 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
	{ 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
	{ 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
	{ 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
	{ 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
	{ 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
	{ 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
	{ 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
	{ 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
	{ 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
	{ 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
	{ 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
	{ 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
	{ 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
	{ 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
	{ 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
	{ 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
	{ 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
	{ 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
	{ 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
	{ 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
	{ 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
	{ 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
	{ 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
	{ 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
	{ 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
	{ 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
	{ 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
	{ 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
	{ 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
	{ 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
	{ 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
	{ 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
	{ 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
	{ 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
	{ 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
	{ 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
	{ 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
	{ 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
	{ 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
	{ 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
	{ 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
	{ 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
	{ 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
	{ 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
	{ 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
	{ 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
	{ 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
	{ 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
	{ 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
	{ 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
	{ 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
	{ 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
	{ 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
	{ 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
	{ 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
	{ 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
	{ 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
	{ 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
	{ 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
	{ 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
	{ 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
	{ 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
	{ 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
	{ 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
	{ 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
	{ 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
	{ 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
	{ 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
	{ 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
	{ 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
	{ 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
	{ 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
	{ 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
	{ 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
	{ 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
	{ 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
	{ 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
	{ 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
	{ 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
	{ 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
	{ 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
	{ 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
	{ 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
	{ 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
	{ 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
	{ 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
	{ 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
	{ 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
	{ 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
	{ 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
	{ 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
	{ 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
	{ 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
	{ 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
	{ 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
	{ 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
	{ 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
	{ 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
	{ 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
	{ 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
	{ 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
	{ 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
	{ 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
	{ 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
	{ 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
	{ 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
	{ 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
	{ 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
	{ 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
	{ 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
	{ 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
	{ 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
	{ 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
	{ 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
	{ 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
	{ 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
	{ 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }, { 0x110000, 0xFFFFFFFF }
};
/* Number of entries in nonprintable_ranges, derived from the array size. */
static const int nonprintable_ranges_count = sizeof (nonprintable_ranges) / sizeof (nonprintable_ranges[0]);
192 
/*
 * Exclusive upper bound on the indices r_utf_block_name() accepts.
 * Hard-coded: it matches the 281 entries currently in r_utf_blocks and has
 * to be updated by hand whenever entries are added to or removed from it.
 */
static const int lastUtfBlock = 281;
194 
/*
 * Unicode block table: each entry is an inclusive code point range followed
 * by the block's name.
 *
 * r_utf_block_idx() binary-searches this table, so entries must stay sorted
 * by range and must not overlap. Gaps between blocks are allowed: a code
 * point that falls in a gap resolves to the last entry, "No_Block", which
 * must therefore remain the final element.
 */
const RUtfBlock r_utf_blocks[] = {
	{ 0x0000, 0x007F, "Basic Latin" },
	{ 0x0080, 0x00FF, "Latin-1 Supplement" },
	{ 0x0100, 0x017F, "Latin Extended-A" },
	{ 0x0180, 0x024F, "Latin Extended-B" },
	{ 0x0250, 0x02AF, "IPA Extensions" },
	{ 0x02B0, 0x02FF, "Spacing Modifier Letters" },
	{ 0x0300, 0x036F, "Combining Diacritical Marks" },
	{ 0x0370, 0x03FF, "Greek and Coptic" },
	{ 0x0400, 0x04FF, "Cyrillic" },
	{ 0x0500, 0x052F, "Cyrillic Supplement" },
	{ 0x0530, 0x058F, "Armenian" },
	{ 0x0590, 0x05FF, "Hebrew" },
	{ 0x0600, 0x06FF, "Arabic" },
	{ 0x0700, 0x074F, "Syriac" },
	{ 0x0750, 0x077F, "Arabic Supplement" },
	{ 0x0780, 0x07BF, "Thaana" },
	{ 0x07C0, 0x07FF, "NKo" },
	{ 0x0800, 0x083F, "Samaritan" },
	{ 0x0840, 0x085F, "Mandaic" },
	{ 0x0860, 0x086F, "Syriac Supplement" },
	{ 0x08A0, 0x08FF, "Arabic Extended-A" },
	{ 0x0900, 0x097F, "Devanagari" },
	{ 0x0980, 0x09FF, "Bengali" },
	{ 0x0A00, 0x0A7F, "Gurmukhi" },
	{ 0x0A80, 0x0AFF, "Gujarati" },
	{ 0x0B00, 0x0B7F, "Oriya" },
	{ 0x0B80, 0x0BFF, "Tamil" },
	{ 0x0C00, 0x0C7F, "Telugu" },
	{ 0x0C80, 0x0CFF, "Kannada" },
	{ 0x0D00, 0x0D7F, "Malayalam" },
	{ 0x0D80, 0x0DFF, "Sinhala" },
	{ 0x0E00, 0x0E7F, "Thai" },
	{ 0x0E80, 0x0EFF, "Lao" },
	{ 0x0F00, 0x0FFF, "Tibetan" },
	{ 0x1000, 0x109F, "Myanmar" },
	{ 0x10A0, 0x10FF, "Georgian" },
	{ 0x1100, 0x11FF, "Hangul Jamo" },
	{ 0x1200, 0x137F, "Ethiopic" },
	{ 0x1380, 0x139F, "Ethiopic Supplement" },
	{ 0x13A0, 0x13FF, "Cherokee" },
	{ 0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics" },
	{ 0x1680, 0x169F, "Ogham" },
	{ 0x16A0, 0x16FF, "Runic" },
	{ 0x1700, 0x171F, "Tagalog" },
	{ 0x1720, 0x173F, "Hanunoo" },
	{ 0x1740, 0x175F, "Buhid" },
	{ 0x1760, 0x177F, "Tagbanwa" },
	{ 0x1780, 0x17FF, "Khmer" },
	{ 0x1800, 0x18AF, "Mongolian" },
	{ 0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended" },
	{ 0x1900, 0x194F, "Limbu" },
	{ 0x1950, 0x197F, "Tai Le" },
	{ 0x1980, 0x19DF, "New Tai Lue" },
	{ 0x19E0, 0x19FF, "Khmer Symbols" },
	{ 0x1A00, 0x1A1F, "Buginese" },
	{ 0x1A20, 0x1AAF, "Tai Tham" },
	{ 0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended" },
	{ 0x1B00, 0x1B7F, "Balinese" },
	{ 0x1B80, 0x1BBF, "Sundanese" },
	{ 0x1BC0, 0x1BFF, "Batak" },
	{ 0x1C00, 0x1C4F, "Lepcha" },
	{ 0x1C50, 0x1C7F, "Ol Chiki" },
	{ 0x1C80, 0x1C8F, "Cyrillic Extended-C" },
	{ 0x1CC0, 0x1CCF, "Sundanese Supplement" },
	{ 0x1CD0, 0x1CFF, "Vedic Extensions" },
	{ 0x1D00, 0x1D7F, "Phonetic Extensions" },
	{ 0x1D80, 0x1DBF, "Phonetic Extensions Supplement" },
	{ 0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement" },
	{ 0x1E00, 0x1EFF, "Latin Extended Additional" },
	{ 0x1F00, 0x1FFF, "Greek Extended" },
	{ 0x2000, 0x206F, "General Punctuation" },
	{ 0x2070, 0x209F, "Superscripts and Subscripts" },
	{ 0x20A0, 0x20CF, "Currency Symbols" },
	{ 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols" },
	{ 0x2100, 0x214F, "Letterlike Symbols" },
	{ 0x2150, 0x218F, "Number Forms" },
	{ 0x2190, 0x21FF, "Arrows" },
	{ 0x2200, 0x22FF, "Mathematical Operators" },
	{ 0x2300, 0x23FF, "Miscellaneous Technical" },
	{ 0x2400, 0x243F, "Control Pictures" },
	{ 0x2440, 0x245F, "Optical Character Recognition" },
	{ 0x2460, 0x24FF, "Enclosed Alphanumerics" },
	{ 0x2500, 0x257F, "Box Drawing" },
	{ 0x2580, 0x259F, "Block Elements" },
	{ 0x25A0, 0x25FF, "Geometric Shapes" },
	{ 0x2600, 0x26FF, "Miscellaneous Symbols" },
	{ 0x2700, 0x27BF, "Dingbats" },
	{ 0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A" },
	{ 0x27F0, 0x27FF, "Supplemental Arrows-A" },
	{ 0x2800, 0x28FF, "Braille Patterns" },
	{ 0x2900, 0x297F, "Supplemental Arrows-B" },
	{ 0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B" },
	{ 0x2A00, 0x2AFF, "Supplemental Mathematical Operators" },
	{ 0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows" },
	{ 0x2C00, 0x2C5F, "Glagolitic" },
	{ 0x2C60, 0x2C7F, "Latin Extended-C" },
	{ 0x2C80, 0x2CFF, "Coptic" },
	{ 0x2D00, 0x2D2F, "Georgian Supplement" },
	{ 0x2D30, 0x2D7F, "Tifinagh" },
	{ 0x2D80, 0x2DDF, "Ethiopic Extended" },
	{ 0x2DE0, 0x2DFF, "Cyrillic Extended-A" },
	{ 0x2E00, 0x2E7F, "Supplemental Punctuation" },
	{ 0x2E80, 0x2EFF, "CJK Radicals Supplement" },
	{ 0x2F00, 0x2FDF, "Kangxi Radicals" },
	{ 0x2FF0, 0x2FFF, "Ideographic Description Characters" },
	{ 0x3000, 0x303F, "CJK Symbols and Punctuation" },
	{ 0x3040, 0x309F, "Hiragana" },
	{ 0x30A0, 0x30FF, "Katakana" },
	{ 0x3100, 0x312F, "Bopomofo" },
	{ 0x3130, 0x318F, "Hangul Compatibility Jamo" },
	{ 0x3190, 0x319F, "Kanbun" },
	{ 0x31A0, 0x31BF, "Bopomofo Extended" },
	{ 0x31C0, 0x31EF, "CJK Strokes" },
	{ 0x31F0, 0x31FF, "Katakana Phonetic Extensions" },
	{ 0x3200, 0x32FF, "Enclosed CJK Letters and Months" },
	{ 0x3300, 0x33FF, "CJK Compatibility" },
	{ 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A" },
	{ 0x4DC0, 0x4DFF, "Yijing Hexagram Symbols" },
	{ 0x4E00, 0x9FFF, "CJK Unified Ideographs" },
	{ 0xA000, 0xA48F, "Yi Syllables" },
	{ 0xA490, 0xA4CF, "Yi Radicals" },
	{ 0xA4D0, 0xA4FF, "Lisu" },
	{ 0xA500, 0xA63F, "Vai" },
	{ 0xA640, 0xA69F, "Cyrillic Extended-B" },
	{ 0xA6A0, 0xA6FF, "Bamum" },
	{ 0xA700, 0xA71F, "Modifier Tone Letters" },
	{ 0xA720, 0xA7FF, "Latin Extended-D" },
	{ 0xA800, 0xA82F, "Syloti Nagri" },
	{ 0xA830, 0xA83F, "Common Indic Number Forms" },
	{ 0xA840, 0xA87F, "Phags-pa" },
	{ 0xA880, 0xA8DF, "Saurashtra" },
	{ 0xA8E0, 0xA8FF, "Devanagari Extended" },
	{ 0xA900, 0xA92F, "Kayah Li" },
	{ 0xA930, 0xA95F, "Rejang" },
	{ 0xA960, 0xA97F, "Hangul Jamo Extended-A" },
	{ 0xA980, 0xA9DF, "Javanese" },
	{ 0xA9E0, 0xA9FF, "Myanmar Extended-B" },
	{ 0xAA00, 0xAA5F, "Cham" },
	{ 0xAA60, 0xAA7F, "Myanmar Extended-A" },
	{ 0xAA80, 0xAADF, "Tai Viet" },
	{ 0xAAE0, 0xAAFF, "Meetei Mayek Extensions" },
	{ 0xAB00, 0xAB2F, "Ethiopic Extended-A" },
	{ 0xAB30, 0xAB6F, "Latin Extended-E" },
	{ 0xAB70, 0xABBF, "Cherokee Supplement" },
	{ 0xABC0, 0xABFF, "Meetei Mayek" },
	{ 0xAC00, 0xD7AF, "Hangul Syllables" },
	{ 0xD7B0, 0xD7FF, "Hangul Jamo Extended-B" },
	{ 0xD800, 0xDB7F, "High Surrogates" },
	{ 0xDB80, 0xDBFF, "High Private Use Surrogates" },
	{ 0xDC00, 0xDFFF, "Low Surrogates" },
	{ 0xE000, 0xF8FF, "Private Use Area" },
	{ 0xF900, 0xFAFF, "CJK Compatibility Ideographs" },
	{ 0xFB00, 0xFB4F, "Alphabetic Presentation Forms" },
	{ 0xFB50, 0xFDFF, "Arabic Presentation Forms-A" },
	{ 0xFE00, 0xFE0F, "Variation Selectors" },
	{ 0xFE10, 0xFE1F, "Vertical Forms" },
	{ 0xFE20, 0xFE2F, "Combining Half Marks" },
	{ 0xFE30, 0xFE4F, "CJK Compatibility Forms" },
	{ 0xFE50, 0xFE6F, "Small Form Variants" },
	{ 0xFE70, 0xFEFF, "Arabic Presentation Forms-B" },
	{ 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms" },
	{ 0xFFF0, 0xFFFF, "Specials" },
	{ 0x10000, 0x1007F, "Linear B Syllabary" },
	{ 0x10080, 0x100FF, "Linear B Ideograms" },
	{ 0x10100, 0x1013F, "Aegean Numbers" },
	{ 0x10140, 0x1018F, "Ancient Greek Numbers" },
	{ 0x10190, 0x101CF, "Ancient Symbols" },
	{ 0x101D0, 0x101FF, "Phaistos Disc" },
	{ 0x10280, 0x1029F, "Lycian" },
	{ 0x102A0, 0x102DF, "Carian" },
	{ 0x102E0, 0x102FF, "Coptic Epact Numbers" },
	{ 0x10300, 0x1032F, "Old Italic" },
	{ 0x10330, 0x1034F, "Gothic" },
	{ 0x10350, 0x1037F, "Old Permic" },
	{ 0x10380, 0x1039F, "Ugaritic" },
	{ 0x103A0, 0x103DF, "Old Persian" },
	{ 0x10400, 0x1044F, "Deseret" },
	{ 0x10450, 0x1047F, "Shavian" },
	{ 0x10480, 0x104AF, "Osmanya" },
	{ 0x104B0, 0x104FF, "Osage" },
	{ 0x10500, 0x1052F, "Elbasan" },
	{ 0x10530, 0x1056F, "Caucasian Albanian" },
	{ 0x10600, 0x1077F, "Linear A" },
	{ 0x10800, 0x1083F, "Cypriot Syllabary" },
	{ 0x10840, 0x1085F, "Imperial Aramaic" },
	{ 0x10860, 0x1087F, "Palmyrene" },
	{ 0x10880, 0x108AF, "Nabataean" },
	{ 0x108E0, 0x108FF, "Hatran" },
	{ 0x10900, 0x1091F, "Phoenician" },
	{ 0x10920, 0x1093F, "Lydian" },
	{ 0x10980, 0x1099F, "Meroitic Hieroglyphs" },
	{ 0x109A0, 0x109FF, "Meroitic Cursive" },
	{ 0x10A00, 0x10A5F, "Kharoshthi" },
	{ 0x10A60, 0x10A7F, "Old South Arabian" },
	{ 0x10A80, 0x10A9F, "Old North Arabian" },
	{ 0x10AC0, 0x10AFF, "Manichaean" },
	{ 0x10B00, 0x10B3F, "Avestan" },
	{ 0x10B40, 0x10B5F, "Inscriptional Parthian" },
	{ 0x10B60, 0x10B7F, "Inscriptional Pahlavi" },
	{ 0x10B80, 0x10BAF, "Psalter Pahlavi" },
	{ 0x10C00, 0x10C4F, "Old Turkic" },
	{ 0x10C80, 0x10CFF, "Old Hungarian" },
	{ 0x10E60, 0x10E7F, "Rumi Numeral Symbols" },
	{ 0x11000, 0x1107F, "Brahmi" },
	{ 0x11080, 0x110CF, "Kaithi" },
	{ 0x110D0, 0x110FF, "Sora Sompeng" },
	{ 0x11100, 0x1114F, "Chakma" },
	{ 0x11150, 0x1117F, "Mahajani" },
	{ 0x11180, 0x111DF, "Sharada" },
	{ 0x111E0, 0x111FF, "Sinhala Archaic Numbers" },
	{ 0x11200, 0x1124F, "Khojki" },
	{ 0x11280, 0x112AF, "Multani" },
	{ 0x112B0, 0x112FF, "Khudawadi" },
	{ 0x11300, 0x1137F, "Grantha" },
	{ 0x11400, 0x1147F, "Newa" },
	{ 0x11480, 0x114DF, "Tirhuta" },
	{ 0x11580, 0x115FF, "Siddham" },
	{ 0x11600, 0x1165F, "Modi" },
	{ 0x11660, 0x1167F, "Mongolian Supplement" },
	{ 0x11680, 0x116CF, "Takri" },
	{ 0x11700, 0x1173F, "Ahom" },
	{ 0x118A0, 0x118FF, "Warang Citi" },
	{ 0x11A00, 0x11A4F, "Zanabazar Square" },
	{ 0x11A50, 0x11AAF, "Soyombo" },
	{ 0x11AC0, 0x11AFF, "Pau Cin Hau" },
	{ 0x11C00, 0x11C6F, "Bhaiksuki" },
	{ 0x11C70, 0x11CBF, "Marchen" },
	{ 0x11D00, 0x11D5F, "Masaram Gondi" },
	{ 0x12000, 0x123FF, "Cuneiform" },
	{ 0x12400, 0x1247F, "Cuneiform Numbers and Punctuation" },
	{ 0x12480, 0x1254F, "Early Dynastic Cuneiform" },
	{ 0x13000, 0x1342F, "Egyptian Hieroglyphs" },
	{ 0x14400, 0x1467F, "Anatolian Hieroglyphs" },
	{ 0x16800, 0x16A3F, "Bamum Supplement" },
	{ 0x16A40, 0x16A6F, "Mro" },
	{ 0x16AD0, 0x16AFF, "Bassa Vah" },
	{ 0x16B00, 0x16B8F, "Pahawh Hmong" },
	{ 0x16F00, 0x16F9F, "Miao" },
	{ 0x16FE0, 0x16FFF, "Ideographic Symbols and Punctuation" },
	{ 0x17000, 0x187FF, "Tangut" },
	{ 0x18800, 0x18AFF, "Tangut Components" },
	{ 0x1B000, 0x1B0FF, "Kana Supplement" },
	{ 0x1B100, 0x1B12F, "Kana Extended-A" },
	{ 0x1B170, 0x1B2FF, "Nushu" },
	{ 0x1BC00, 0x1BC9F, "Duployan" },
	{ 0x1BCA0, 0x1BCAF, "Shorthand Format Controls" },
	{ 0x1D000, 0x1D0FF, "Byzantine Musical Symbols" },
	{ 0x1D100, 0x1D1FF, "Musical Symbols" },
	{ 0x1D200, 0x1D24F, "Ancient Greek Musical Notation" },
	{ 0x1D300, 0x1D35F, "Tai Xuan Jing Symbols" },
	{ 0x1D360, 0x1D37F, "Counting Rod Numerals" },
	{ 0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols" },
	{ 0x1D800, 0x1DAAF, "Sutton SignWriting" },
	{ 0x1E000, 0x1E02F, "Glagolitic Supplement" },
	{ 0x1E800, 0x1E8DF, "Mende Kikakui" },
	{ 0x1E900, 0x1E95F, "Adlam" },
	{ 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols" },
	{ 0x1F000, 0x1F02F, "Mahjong Tiles" },
	{ 0x1F030, 0x1F09F, "Domino Tiles" },
	{ 0x1F0A0, 0x1F0FF, "Playing Cards" },
	{ 0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement" },
	{ 0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement" },
	{ 0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs" },
	{ 0x1F600, 0x1F64F, "Emoticons" },
	{ 0x1F650, 0x1F67F, "Ornamental Dingbats" },
	{ 0x1F680, 0x1F6FF, "Transport and Map Symbols" },
	{ 0x1F700, 0x1F77F, "Alchemical Symbols" },
	{ 0x1F780, 0x1F7FF, "Geometric Shapes Extended" },
	{ 0x1F800, 0x1F8FF, "Supplemental Arrows-C" },
	{ 0x1F900, 0x1F9FF, "Supplemental Symbols and Pictographs" },
	{ 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B" },
	{ 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C" },
	{ 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D" },
	{ 0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E" },
	{ 0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F" },
	{ 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement" },
	{ 0xE0000, 0xE007F, "Tags" },
	{ 0xE0100, 0xE01EF, "Variation Selectors Supplement" },
	{ 0xF0000, 0xFFFFF, "Supplementary Private Use Area-A" },
	{ 0x100000, 0x10FFFF, "Supplementary Private Use Area-B" },
	{ 0x110000, 0xFFFFFFFF, "No_Block" }
};
478 
r_utf_block_name(int idx)479 R_API const char *r_utf_block_name(int idx) {
480 	if (idx < 0 || idx >= lastUtfBlock) {
481 		return NULL;
482 	}
483 	return r_utf_blocks[idx].name;
484 }
485 
/* Number of entries in r_utf_blocks, computed from the array size (a size_t expression). */
#define r_utf_blocks_count (sizeof (r_utf_blocks) / sizeof (r_utf_blocks[0]))
487 
488 /* Convert an UTF-8 buf into a unicode RRune */
r_utf8_decode(const ut8 * ptr,int ptrlen,RRune * ch)489 R_API int r_utf8_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
490 	if (ptrlen < 1) {
491 		return 0;
492 	}
493 	if (ptr[0] < 0x80) {
494 		if (ch) {
495 			*ch = (ut32)ptr[0];
496 		}
497 		return 1;
498 	} else if (ptrlen>1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) {
499 		if (ch) {
500 			*ch = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f);
501 		}
502 		return 2;
503 	} else if (ptrlen>2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) {
504 		if (ch) {
505 			*ch = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f);
506 		}
507 		return 3;
508 	} else if (ptrlen>3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) {
509 		if (ch) {
510 			*ch = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f);
511 		}
512 		return 4;
513 	}
514 	return 0;
515 }
516 
517 /* Convert a unicode RRune into an UTF-8 buf */
r_utf8_encode(ut8 * ptr,const RRune ch)518 R_API int r_utf8_encode(ut8 *ptr, const RRune ch) {
519 	if (ch < 0x80) {
520 		ptr[0] = (ut8)ch;
521 		return 1;
522 	}
523 	else if (ch < 0x800) {
524 		ptr[0] = 0xc0 | (ch >> 6);
525 		ptr[1] = 0x80 | (ch & 0x3f);
526 		return 2;
527 	}
528 	else if (ch < 0x10000) {
529 		ptr[0] = 0xe0 | (ch >> 12);
530 		ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
531 		ptr[2] = 0x80 | (ch & 0x3f);
532 		return 3;
533 	}
534 	else if (ch < 0x200000) {
535 		ptr[0] = 0xf0 | (ch >> 18);
536 		ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
537 		ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
538 		ptr[3] = 0x80 | (ch & 0x3f );
539 		return 4;
540 	}
541 	return 0;
542 }
543 
544 /* Convert a unicode RRune string into an utf-8 one */
r_utf8_encode_str(const RRune * str,ut8 * dst,const int dst_length)545 R_API int r_utf8_encode_str(const RRune *str, ut8 *dst, const int dst_length) {
546 	int i, pos = 0;
547 
548 	if (!str || !dst) {
549 		return -1;
550 	}
551 
552 	for (i = 0; i < sizeof (str) - 1 && str[i] && pos < dst_length - 1; i++) {
553 		pos += r_utf8_encode (&dst[pos], str[i]);
554 	}
555 
556 	dst[pos++] = '\0';
557 	return pos;
558 }
559 
560 /* Returns the size in bytes of the utf-8 encoded char */
r_utf8_size(const ut8 * ptr)561 R_API int r_utf8_size(const ut8 *ptr) {
562 	const int utf8_size[] = {
563 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
565 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
566 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
567 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
568 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
569 		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
570 		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0-0xFF
571 	};
572 	return (ptr[0]&0x80) ? utf8_size[ptr[0]^0x80] : 1;
573 }
574 
r_utf8_strlen(const ut8 * str)575 R_API int r_utf8_strlen(const ut8 *str) {
576 	int i, len = 0;
577 
578 	for (i = 0; str[i]; i++) {
579 		if ((str[i] & 0xc0) != 0x80) {
580 			len++;
581 		}
582 	}
583 
584 	return len;
585 }
586 
r_isprint(const RRune c)587 R_API int r_isprint(const RRune c) {
588 	// RRunes are most commonly single byte... We can early out with this common case.
589 	if (c < 0x34F) {
590 		/*
591 		manually copied from top, please update if this ever changes
592 		{ 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
593 		could do a linear search, but that's a lot slower than a few compare
594 		*/
595 		return !( c <= 0x1F || ( c >= 0x7F && c <= 0x9F));
596 	}
597 
598 	const int last = nonprintable_ranges_count;
599 
600 	int low = 0;
601 	int hi = last - 1;
602 
603 	do {
604 		int mid = (low + hi) >> 1;
605 		if (c >= nonprintable_ranges[mid].from && c <= nonprintable_ranges[mid].to) {
606 			return false;
607 		}
608 		if (mid < last && c > nonprintable_ranges[mid].to) {
609 			low = mid + 1;
610 		}
611 		if (mid < last && c < nonprintable_ranges[mid].from) {
612 			hi = mid - 1;
613 		}
614 	} while (low <= hi);
615 
616 	return true;
617 }
618 
619 #if __WINDOWS__
r_utf16_to_utf8_l(const wchar_t * wc,int len)620 R_API char *r_utf16_to_utf8_l(const wchar_t *wc, int len) {
621 	if (!wc || !len || len < -1) {
622 		return NULL;
623 	}
624 	char *rutf8 = NULL;
625 	int csize;
626 
627 	if ((csize = WideCharToMultiByte (CP_UTF8, 0, wc, len, NULL, 0, NULL, NULL))) {
628 		++csize;
629 		if ((rutf8 = malloc (csize))) {
630 			WideCharToMultiByte (CP_UTF8, 0, wc, len, rutf8, csize, NULL, NULL);
631 			if (len != -1) {
632 				rutf8[csize - 1] = '\0';
633 			}
634 		}
635 	}
636 	return rutf8;
637 }
638 
r_utf8_to_utf16_l(const char * cstring,int len)639 R_API wchar_t *r_utf8_to_utf16_l(const char *cstring, int len) {
640 	if (!cstring || !len || len < -1) {
641 		return NULL;
642 	}
643 	wchar_t *rutf16 = NULL;
644 	int wcsize;
645 
646 	if ((wcsize = MultiByteToWideChar (CP_UTF8, 0, cstring, len, NULL, 0))) {
647 		++wcsize;
648 		if ((rutf16 = (wchar_t *) calloc (wcsize, sizeof (wchar_t)))) {
649 			MultiByteToWideChar (CP_UTF8, 0, cstring, len, rutf16, wcsize);
650 			if (len != -1) {
651 				rutf16[wcsize - 1] = L'\0';
652 			}
653 		}
654 	}
655 	return rutf16;
656 }
657 
r_utf8_to_acp_l(const char * str,int len)658 R_API char *r_utf8_to_acp_l(const char *str, int len) {
659 	if (!str || !len || len < -1) {
660 		return NULL;
661 	}
662 	char *acp = NULL;
663 	int wcsize, csize;
664 	if ((wcsize = MultiByteToWideChar (CP_UTF8, 0, str, len, NULL, 0))) {
665 		wchar_t *rutf16;
666 		++wcsize;
667 		if ((rutf16 = (wchar_t *)calloc (wcsize, sizeof (wchar_t)))) {
668 			MultiByteToWideChar (CP_UTF8, 0, str, len, rutf16, wcsize);
669 			if (len != -1) {
670 				rutf16[wcsize - 1] = L'\0';
671 			}
672 			if ((csize = WideCharToMultiByte (CP_ACP, 0, rutf16, wcsize, NULL, 0, NULL, NULL))) {
673 				++csize;
674 				if ((acp = malloc (csize))) {
675 					WideCharToMultiByte (CP_ACP, 0, rutf16, wcsize, acp, csize, NULL, NULL);
676 					if (len != -1) {
677 						acp[csize - 1] = '\0';
678 					}
679 				}
680 			}
681 			free (rutf16);
682 		}
683 	}
684 	return acp;
685 }
686 
r_acp_to_utf8_l(const char * str,int len)687 R_API char *r_acp_to_utf8_l(const char *str, int len) {
688 	if (!str || !len || len < -1) {
689 		return NULL;
690 	}
691 	int wcsize;
692 	if ((wcsize = MultiByteToWideChar (CP_ACP, 0, str, len, NULL, 0))) {
693 		wchar_t *rutf16;
694 		++wcsize;
695 		if ((rutf16 = (wchar_t *) calloc (wcsize, sizeof (wchar_t)))) {
696 			MultiByteToWideChar (CP_ACP, 0, str, len, rutf16, wcsize);
697 			if (len != -1) {
698 				rutf16[wcsize - 1] = L'\0';
699 			}
700 			char *ret = r_utf16_to_utf8_l (rutf16, wcsize);
701 			free (rutf16);
702 			return ret;
703 		}
704 	}
705 	return NULL;
706 }
707 
708 #endif // __WINDOWS__
709 
r_utf_block_idx(RRune ch)710 R_API int r_utf_block_idx(RRune ch) {
711 	const int last = r_utf_blocks_count;
712 	int low, hi, mid;
713 
714 	low = 0;
715 	hi = last - 1;
716 
717 	do {
718 		mid = (low + hi) >> 1;
719 		if (ch >= r_utf_blocks[mid].from && ch <= r_utf_blocks[mid].to) {
720 			return mid;
721 		}
722 		if (mid < last && ch > r_utf_blocks[mid].to) {
723 			low = mid + 1;
724 		}
725 		if (mid < last && ch < r_utf_blocks[mid].from) {
726 			hi = mid - 1;
727 		}
728 	} while (low <= hi);
729 
730 	return r_utf_blocks_count - 1; /* index for "No_Block" */
731 }
732 
733 /* str must be UTF8-encoded */
r_utf_block_list(const ut8 * str,int len,int ** freq_list)734 R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) {
735 	if (!str) {
736 		return NULL;
737 	}
738 	if (len < 0) {
739 		len = strlen ((const char *)str);
740 	}
741 	static int block_freq[r_utf_blocks_count] = {0};
742 	int *list = R_NEWS (int, len + 1);
743 	if (!list) {
744 		return NULL;
745 	}
746 	int *freq_list_ptr = NULL;
747 	if (freq_list) {
748 		*freq_list = R_NEWS (int, len + 1);
749 		if (!*freq_list) {
750 			free (list);
751 			return NULL;
752 		}
753 		freq_list_ptr = *freq_list;
754 	}
755 	int *list_ptr = list;
756 	const ut8 *str_ptr = str;
757 	const ut8 *str_end = str + len;
758 	RRune ch;
759 	while (str_ptr < str_end) {
760 		int block_idx;
761 		int ch_bytes = r_utf8_decode (str_ptr, str_end - str_ptr, &ch);
762 		if (!ch_bytes) {
763 			block_idx = r_utf_blocks_count - 1;
764 			ch_bytes = 1;
765 		} else {
766 			block_idx = r_utf_block_idx (ch);
767 		}
768 		if (!block_freq[block_idx]) {
769 			*list_ptr = block_idx;
770 			list_ptr++;
771 		}
772 		block_freq[block_idx]++;
773 		str_ptr += ch_bytes;
774 	}
775 	*list_ptr = -1;
776 	if (freq_list_ptr) {
777 		for (list_ptr = list; *list_ptr != -1; list_ptr++) {
778 			*freq_list_ptr = block_freq[*list_ptr];
779 			freq_list_ptr++;
780 		}
781 		*freq_list_ptr = -1;
782 	}
783 	for (list_ptr = list; *list_ptr != -1; list_ptr++) {
784 		block_freq[*list_ptr] = 0;
785 	}
786 	return list;
787 }
788 
r_utf_bom_encoding(const ut8 * ptr,int ptrlen)789 R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen) {
790 	if (ptrlen > 3) {
791 		if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
792 			return R_STRING_ENC_UTF32LE;
793 		}
794 		if (!ptr[0] && !ptr[1] && ptr[2] == 0xfe && ptr[3] == 0xff) {
795 			return R_STRING_ENC_UTF32BE;
796 		}
797 	}
798 	if (ptrlen > 2) {
799 		if (ptr[0] == 0xef && ptr[1] == 0xbb && ptr[2] == 0xbf) {
800 			return R_STRING_ENC_UTF8;
801 		}
802 	}
803 	if (ptrlen > 1) {
804 		if (ptr[0] == 0xff && ptr[1] == 0xfe) {
805 			return R_STRING_ENC_UTF16LE;
806 		}
807 		if (ptr[0] == 0xfe && ptr[1] == 0xff) {
808 			return R_STRING_ENC_UTF16BE;
809 		}
810 	}
811 	return R_STRING_ENC_GUESS;
812 }
813