1 // Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
2 //
3 // Permission to use, copy, modify, and distribute this software for any
4 // purpose with or without fee is hereby granted, provided that the above
5 // copyright notice and this permission notice appear in all copies.
6 //
7 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14
15 /// @file charset_6937.cpp
16 /// @brief A charset converter for ISO-6937-2
17 /// @ingroup libaegisub
18
19 #include "charset_6937.h"
20
21 #include <cerrno>
22 #include <iconv.h>
23
24 #include <boost/range/algorithm/lower_bound.hpp>
25
26 namespace {
27
// ISO-6937-2 values for unicode codepoints U+0000 through U+017E (383
// entries), indexed directly by codepoint.
//
// A value of 0xFF or less is a single output byte. A larger value packs a
// two-byte sequence with the first byte in the high 8 bits, e.g. 0xC141 for
// U+00C0. Apart from index 0, an entry of 0x00 marks a codepoint which has
// no mapping.
const int iso6937_codepoints[] = {
    /* U+0000 */ 0x00,   0x01,   0x02,   0x03,   0x04,   0x05,   0x06,   0x07,
    /* U+0008 */ 0x08,   0x09,   0x0A,   0x0B,   0x0C,   0x0D,   0x0E,   0x0F,
    /* U+0010 */ 0x10,   0x11,   0x12,   0x13,   0x14,   0x15,   0x16,   0x17,
    /* U+0018 */ 0x18,   0x19,   0x1A,   0x1B,   0x1C,   0x1D,   0x1E,   0x1F,
    /* U+0020 */ 0x20,   0x21,   0x22,   0x23,   0x24,   0x25,   0x26,   0x27,
    /* U+0028 */ 0x28,   0x29,   0x2A,   0x2B,   0x2C,   0x2D,   0x2E,   0x2F,
    /* U+0030 */ 0x30,   0x31,   0x32,   0x33,   0x34,   0x35,   0x36,   0x37,
    /* U+0038 */ 0x38,   0x39,   0x3A,   0x3B,   0x3C,   0x3D,   0x3E,   0x3F,
    /* U+0040 */ 0x40,   0x41,   0x42,   0x43,   0x44,   0x45,   0x46,   0x47,
    /* U+0048 */ 0x48,   0x49,   0x4A,   0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
    /* U+0050 */ 0x50,   0x51,   0x52,   0x53,   0x54,   0x55,   0x56,   0x57,
    /* U+0058 */ 0x58,   0x59,   0x5A,   0x5B,   0x5C,   0x5D,   0x5E,   0x5F,
    /* U+0060 */ 0x60,   0x61,   0x62,   0x63,   0x64,   0x65,   0x66,   0x67,
    /* U+0068 */ 0x68,   0x69,   0x6A,   0x6B,   0x6C,   0x6D,   0x6E,   0x6F,
    /* U+0070 */ 0x70,   0x71,   0x72,   0x73,   0x74,   0x75,   0x76,   0x77,
    /* U+0078 */ 0x78,   0x79,   0x7A,   0x7B,   0x7C,   0x7D,   0x7E,   0x7F,
    /* U+0080 */ 0x80,   0x81,   0x82,   0x83,   0x84,   0x85,   0x86,   0x87,
    /* U+0088 */ 0x88,   0x89,   0x8A,   0x8B,   0x8C,   0x8D,   0x8E,   0x8F,
    /* U+0090 */ 0x90,   0x91,   0x92,   0x93,   0x94,   0x95,   0x96,   0x97,
    /* U+0098 */ 0x98,   0x99,   0x9A,   0x9B,   0x9C,   0x9D,   0x9E,   0x9F,
    /* U+00A0 */ 0xA0,   0xA1,   0xA2,   0xA3,   0xA8,   0xA5,   0x00,   0xA7,
    /* U+00A8 */ 0xC820, 0xD3,   0xE3,   0xAB,   0x00,   0x00,   0xD2,   0xC520,
    /* U+00B0 */ 0xB0,   0xB1,   0xB2,   0xB3,   0xC220, 0xB5,   0xB6,   0xB7,
    /* U+00B8 */ 0xCB20, 0xD1,   0xEB,   0xBB,   0xBC,   0xBD,   0xBE,   0xBF,
    /* U+00C0 */ 0xC141, 0xC241, 0xC341, 0xC441, 0xC841, 0xCA41, 0xE1,   0xCB43,
    /* U+00C8 */ 0xC145, 0xC245, 0xC345, 0xC845, 0xC149, 0xC249, 0xC349, 0xC849,
    /* U+00D0 */ 0xE2,   0xC44E, 0xC14F, 0xC24F, 0xC34F, 0xC44F, 0xC84F, 0xB4,
    /* U+00D8 */ 0xE9,   0xC155, 0xC255, 0xC355, 0xC855, 0xC259, 0xEC,   0xFB,
    /* U+00E0 */ 0xC161, 0xC261, 0xC361, 0xC461, 0xC861, 0xCA61, 0xF1,   0xCB63,
    /* U+00E8 */ 0xC165, 0xC265, 0xC365, 0xC865, 0xC169, 0xC269, 0xC369, 0xC869,
    /* U+00F0 */ 0xF3,   0xC46E, 0xC16F, 0xC26F, 0xC36F, 0xC46F, 0xC86F, 0xB8,
    /* U+00F8 */ 0xF9,   0xC175, 0xC275, 0xC375, 0xC875, 0xC279, 0xFC,   0xC879,
    /* U+0100 */ 0xC541, 0xC561, 0xC641, 0xC661, 0xCE41, 0xCE61, 0xC243, 0xC263,
    /* U+0108 */ 0xC343, 0xC363, 0xC743, 0xC763, 0xCF43, 0xCF63, 0xCF44, 0xCF64,
    /* U+0110 */ 0x00,   0xF2,   0xC545, 0xC565, 0x00,   0x00,   0xC745, 0xC765,
    /* U+0118 */ 0xCE45, 0xCE65, 0xCF45, 0xCF65, 0xC347, 0xC367, 0xC647, 0xC667,
    /* U+0120 */ 0xC747, 0xC767, 0xCB47, 0xCB67, 0xC348, 0xC368, 0xE4,   0xF4,
    /* U+0128 */ 0xC449, 0xC469, 0xC549, 0xC569, 0x00,   0x00,   0xCE49, 0xCE69,
    /* U+0130 */ 0xC749, 0xF5,   0xE6,   0xF6,   0xC34A, 0xC36A, 0xCB4B, 0xCB6B,
    /* U+0138 */ 0xF0,   0xC24C, 0xC26C, 0xCB4C, 0xCB6C, 0xCF4C, 0xCF6C, 0xE7,
    /* U+0140 */ 0xF7,   0xE8,   0xF8,   0xC24E, 0xC26E, 0xCB4E, 0xCB6E, 0xCF4E,
    /* U+0148 */ 0xCF6E, 0xEF,   0xEE,   0xFE,   0xC54F, 0xC56F, 0x00,   0x00,
    /* U+0150 */ 0xCD4F, 0xCD6F, 0xEA,   0xFA,   0xC252, 0xC272, 0xCB52, 0xCB72,
    /* U+0158 */ 0xCF52, 0xCF72, 0xC253, 0xC273, 0xC353, 0xC373, 0xCB53, 0xCB73,
    /* U+0160 */ 0xCF53, 0xCF73, 0xCB54, 0xCB74, 0xCF54, 0xCF74, 0xED,   0xFD,
    /* U+0168 */ 0xC455, 0xC475, 0xC555, 0xC575, 0xC655, 0xC675, 0xCA55, 0xCA75,
    /* U+0170 */ 0xCD55, 0xCD75, 0xCE55, 0xCE75, 0xC357, 0xC377, 0xC359, 0xC379,
    /* U+0178 */ 0xC859, 0xC25A, 0xC27A, 0xC75A, 0xC77A, 0xCF5A, 0xCF7A
};

// The table is indexed by codepoint, so a dropped or duplicated entry would
// silently shift every mapping after it.
static_assert(sizeof(iso6937_codepoints) / sizeof(iso6937_codepoints[0]) == 383,
    "iso6937_codepoints must cover exactly U+0000..U+017E");
108
/// One entry of a sparse codepoint -> ISO-6937-2 mapping table
struct extended_range {
    int const codepoint; ///< Unicode codepoint being mapped
    int const value;     ///< ISO-6937-2 value for that codepoint
};

#if defined(_MSC_VER)
// These two overloads exist only to satisfy msvc's debug assertions
bool operator<(extended_range const& lhs, extended_range const& rhs) {
    return rhs.codepoint > lhs.codepoint;
}

bool operator<(int lhs, extended_range const& rhs) {
    return rhs.codepoint > lhs;
}
#endif

/// Order a table entry against a bare codepoint so that the table can be
/// searched by codepoint
bool operator<(extended_range const& lhs, int rhs) {
    return rhs > lhs.codepoint;
}
128
129 // ISO-6937-2 values for codepoints that don't come in a nice contiguous block
130 const extended_range iso6937_extended_codepoints[] = {
131 { 0x02C7, 0xCF20 },
132 { 0x02D8, 0xC620 },
133 { 0x02D9, 0xC720 },
134 { 0x02DA, 0xCA20 },
135 { 0x02DB, 0xCE20 },
136 { 0x02DD, 0xCD20 },
137 { 0x2014, 0xD0 },
138 { 0x2018, 0xA9 },
139 { 0x2019, 0xB9 },
140 { 0x201C, 0xAA },
141 { 0x201D, 0xBA },
142 { 0x2022, 0xD4 },
143 { 0x20AC, 0xA4 }, // ETSI EN 300 468 extension: euro sign at A4
144 { 0x2126, 0xE0 },
145 { 0x215B, 0xDC },
146 { 0x215C, 0xDD },
147 { 0x215D, 0xDE },
148 { 0x2190, 0xAC },
149 { 0x2191, 0xAD },
150 { 0x2192, 0xAE },
151 { 0x2193, 0xAF },
152 { 0x266A, 0xD5 }
153 };
154
155 #define countof(array) (sizeof(array) / sizeof((array)[0]))
156
157 /// Get the ISO-6937-2 value for the given unicode codepoint or 0 if it cannot be mapped
get_iso6937(int codepoint)158 int get_iso6937(int codepoint) {
159 if (static_cast<size_t>(codepoint) < countof(iso6937_codepoints))
160 return iso6937_codepoints[codepoint];
161
162 auto ext = boost::lower_bound(iso6937_extended_codepoints, codepoint);
163 if (ext == std::end(iso6937_extended_codepoints) || ext->codepoint != codepoint)
164 return 0;
165 return ext->value;
166 }
167
168 } // namespace {
169
170 namespace agi { namespace charset {
171
172 #ifdef _LIBICONV_VERSION
173 #define INTERNAL_CHARSET "UCS-4-INTERNAL"
174 #else
175 #define INTERNAL_CHARSET "WCHAR_T"
176 #endif
177
Converter6937(bool subst,const char * src)178 Converter6937::Converter6937(bool subst, const char *src)
179 : to_ucs4(new IconvWrapper(src, INTERNAL_CHARSET))
180 , subst(subst)
181 {
182 }
183
Convert(const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)184 size_t Converter6937::Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) {
185 // No state to reset
186 if (!inbuf || !inbytesleft)
187 return 0;
188
189 size_t bytes_written = 0;
190
191 while (*inbytesleft > 0) {
192 int in_val = 0;
193
194 // Copy inbuf/inbytesleft so that we don't update them if the
195 // conversion fails (due to not enough space or a bad sequence)
196 const char *inbuftmp = *inbuf;
197 size_t inbyteslefttmp = *inbytesleft;
198
199 char *val_buf = reinterpret_cast<char *>(&in_val);
200 size_t val_buf_size = sizeof(in_val);
201
202 // Get the next unicode character from the input
203 size_t ret = to_ucs4->Convert(&inbuftmp, &inbyteslefttmp, &val_buf, &val_buf_size);
204 if (ret == (size_t)-1 && errno != E2BIG)
205 return ret;
206
207 // And convert that to ISO-6937-2
208 int val = get_iso6937(in_val);
209 if (!val && in_val) {
210 if (subst) {
211 val = '?';
212 }
213 else {
214 errno = EILSEQ;
215 return (size_t)-1;
216 }
217 }
218
219 if (*outbytesleft < 1 || (val > 255 && *outbytesleft < 2)) {
220 errno = E2BIG;
221 return (size_t)-1;
222 }
223
224 #define WRITE_BYTE(b) \
225 do { \
226 *(*outbuf)++ = (b); \
227 --*outbytesleft; \
228 ++bytes_written; \
229 } while(0)
230
231 if (val <= 255)
232 WRITE_BYTE(val);
233 else {
234 WRITE_BYTE((val >> 8) & 0xFF);
235 WRITE_BYTE(val & 0xFF);
236 }
237
238 // Update the input pointers now that the conversion has succeeded
239 *inbuf = inbuftmp;
240 *inbytesleft = inbyteslefttmp;
241 }
242
243 return bytes_written;
244 }
245
246 } } // namespace agi::charset
247