// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 
/// @file charset_6937.cpp
/// @brief A charset converter for ISO-6937-2
/// @ingroup libaegisub
18 
#include "charset_6937.h"

#include <algorithm>
#include <cerrno>
#include <cstddef>
#include <iterator>

#include <iconv.h>

#include <boost/range/algorithm/lower_bound.hpp>
25 
namespace {

/// Number of elements in a built-in array, as a compile-time constant.
///
/// Unlike a sizeof-based macro this refuses to compile when handed a pointer.
template<typename T, std::size_t N>
constexpr std::size_t countof(T const (&)[N]) {
	return N;
}

/// ISO-6937-2 values for the first 383 Unicode codepoints (U+0000..U+017E),
/// indexed directly by codepoint.
///
/// Values up to 0xFF are single-byte characters. Larger values are two-byte
/// sequences, emitted high byte first by the converter: in ISO 6937 the
/// non-spacing diacritic (0xC1..0xCF) precedes the base letter, and 0x20 is
/// used as the base for a stand-alone diacritic. A zero entry (other than
/// index 0 itself) marks a codepoint which has no mapping.
const int iso6937_codepoints[] = {
	// U+0000..U+009F are mapped to the same byte value
	0x00,   0x01,   0x02,   0x03,   0x04,
	0x05,   0x06,   0x07,   0x08,   0x09,
	0x0A,   0x0B,   0x0C,   0x0D,   0x0E,
	0x0F,   0x10,   0x11,   0x12,   0x13,
	0x14,   0x15,   0x16,   0x17,   0x18,
	0x19,   0x1A,   0x1B,   0x1C,   0x1D,
	0x1E,   0x1F,   0x20,   0x21,   0x22,
	0x23,   0x24,   0x25,   0x26,   0x27,
	0x28,   0x29,   0x2A,   0x2B,   0x2C,
	0x2D,   0x2E,   0x2F,   0x30,   0x31,
	0x32,   0x33,   0x34,   0x35,   0x36,
	0x37,   0x38,   0x39,   0x3A,   0x3B,
	0x3C,   0x3D,   0x3E,   0x3F,   0x40,
	0x41,   0x42,   0x43,   0x44,   0x45,
	0x46,   0x47,   0x48,   0x49,   0x4A,
	0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
	0x50,   0x51,   0x52,   0x53,   0x54,
	0x55,   0x56,   0x57,   0x58,   0x59,
	0x5A,   0x5B,   0x5C,   0x5D,   0x5E,
	0x5F,   0x60,   0x61,   0x62,   0x63,
	0x64,   0x65,   0x66,   0x67,   0x68,
	0x69,   0x6A,   0x6B,   0x6C,   0x6D,
	0x6E,   0x6F,   0x70,   0x71,   0x72,
	0x73,   0x74,   0x75,   0x76,   0x77,
	0x78,   0x79,   0x7A,   0x7B,   0x7C,
	0x7D,   0x7E,   0x7F,   0x80,   0x81,
	0x82,   0x83,   0x84,   0x85,   0x86,
	0x87,   0x88,   0x89,   0x8A,   0x8B,
	0x8C,   0x8D,   0x8E,   0x8F,   0x90,
	0x91,   0x92,   0x93,   0x94,   0x95,
	0x96,   0x97,   0x98,   0x99,   0x9A,
	0x9B,   0x9C,   0x9D,   0x9E,   0x9F,
	// U+00A0 onwards: remapped, composed with a diacritic prefix, or unmapped
	0xA0,   0xA1,   0xA2,   0xA3,   0xA8,
	0xA5,   0x00,   0xA7,   0xC820, 0xD3,
	0xE3,   0xAB,   0x00,   0x00,   0xD2,
	0xC520, 0xB0,   0xB1,   0xB2,   0xB3,
	0xC220, 0xB5,   0xB6,   0xB7,   0xCB20,
	0xD1,   0xEB,   0xBB,   0xBC,   0xBD,
	0xBE,   0xBF,   0xC141, 0xC241, 0xC341,
	0xC441, 0xC841, 0xCA41, 0xE1,   0xCB43,
	0xC145, 0xC245, 0xC345, 0xC845, 0xC149,
	0xC249, 0xC349, 0xC849, 0xE2,   0xC44E,
	0xC14F, 0xC24F, 0xC34F, 0xC44F, 0xC84F,
	0xB4,   0xE9,   0xC155, 0xC255, 0xC355,
	0xC855, 0xC259, 0xEC,   0xFB,   0xC161,
	0xC261, 0xC361, 0xC461, 0xC861, 0xCA61,
	0xF1,   0xCB63, 0xC165, 0xC265, 0xC365,
	0xC865, 0xC169, 0xC269, 0xC369, 0xC869,
	0xF3,   0xC46E, 0xC16F, 0xC26F, 0xC36F,
	0xC46F, 0xC86F, 0xB8,   0xF9,   0xC175,
	0xC275, 0xC375, 0xC875, 0xC279, 0xFC,
	0xC879, 0xC541, 0xC561, 0xC641, 0xC661,
	0xCE41, 0xCE61, 0xC243, 0xC263, 0xC343,
	0xC363, 0xC743, 0xC763, 0xCF43, 0xCF63,
	0xCF44, 0xCF64, 0x00,   0xF2,   0xC545,
	0xC565, 0x00,   0x00,   0xC745, 0xC765,
	0xCE45, 0xCE65, 0xCF45, 0xCF65, 0xC347,
	0xC367, 0xC647, 0xC667, 0xC747, 0xC767,
	0xCB47, 0xCB67, 0xC348, 0xC368, 0xE4,
	0xF4,   0xC449, 0xC469, 0xC549, 0xC569,
	0x00,   0x00,   0xCE49, 0xCE69, 0xC749,
	0xF5,   0xE6,   0xF6,   0xC34A, 0xC36A,
	0xCB4B, 0xCB6B, 0xF0,   0xC24C, 0xC26C,
	0xCB4C, 0xCB6C, 0xCF4C, 0xCF6C, 0xE7,
	0xF7,   0xE8,   0xF8,   0xC24E, 0xC26E,
	0xCB4E, 0xCB6E, 0xCF4E, 0xCF6E, 0xEF,
	0xEE,   0xFE,   0xC54F, 0xC56F, 0x00,
	0x00,   0xCD4F, 0xCD6F, 0xEA,   0xFA,
	0xC252, 0xC272, 0xCB52, 0xCB72, 0xCF52,
	0xCF72, 0xC253, 0xC273, 0xC353, 0xC373,
	0xCB53, 0xCB73, 0xCF53, 0xCF73, 0xCB54,
	0xCB74, 0xCF54, 0xCF74, 0xED,   0xFD,
	0xC455, 0xC475, 0xC555, 0xC575, 0xC655,
	0xC675, 0xCA55, 0xCA75, 0xCD55, 0xCD75,
	0xCE55, 0xCE75, 0xC357, 0xC377, 0xC359,
	0xC379, 0xC859, 0xC25A, 0xC27A, 0xC75A,
	0xC77A, 0xCF5A, 0xCF7A
};

// get_iso6937() indexes the table above by codepoint, so a dropped or
// duplicated entry would silently shift every later mapping.
static_assert(countof(iso6937_codepoints) == 383,
	"iso6937_codepoints must cover exactly U+0000..U+017E");

/// A single Unicode codepoint -> ISO-6937-2 value mapping
struct extended_range {
	const int codepoint; ///< Unicode codepoint (the lookup key)
	const int value;     ///< ISO-6937-2 value, encoded as in iso6937_codepoints
};

#ifdef _MSC_VER
// Needed for msvc's debug assertions
bool operator<(extended_range const& lft, extended_range const& rgt) {
	return lft.codepoint < rgt.codepoint;
}

bool operator<(int lft, extended_range const& rgt) {
	return lft < rgt.codepoint;
}
#endif

/// Ordering of a table entry against a bare codepoint; this is the comparison
/// the binary search in get_iso6937() performs
bool operator<(extended_range const& lft, int rgt) {
	return lft.codepoint < rgt;
}

/// ISO-6937-2 values for codepoints that don't come in a nice contiguous block
///
/// This table is binary-searched, so it must stay sorted by ascending
/// codepoint when entries are added.
const extended_range iso6937_extended_codepoints[] = {
	{ 0x02C7, 0xCF20 },
	{ 0x02D8, 0xC620 },
	{ 0x02D9, 0xC720 },
	{ 0x02DA, 0xCA20 },
	{ 0x02DB, 0xCE20 },
	{ 0x02DD, 0xCD20 },
	{ 0x2014, 0xD0 },
	{ 0x2018, 0xA9 },
	{ 0x2019, 0xB9 },
	{ 0x201C, 0xAA },
	{ 0x201D, 0xBA },
	{ 0x2022, 0xD4 },
	{ 0x20AC, 0xA4 }, // ETSI EN 300 468 extension: euro sign at A4
	{ 0x2126, 0xE0 },
	{ 0x215B, 0xDC },
	{ 0x215C, 0xDD },
	{ 0x215D, 0xDE },
	{ 0x2190, 0xAC },
	{ 0x2191, 0xAD },
	{ 0x2192, 0xAE },
	{ 0x2193, 0xAF },
	{ 0x266A, 0xD5 }
};

/// Get the ISO-6937-2 value for the given unicode codepoint or 0 if it cannot be mapped
/// @param codepoint Unicode codepoint to look up
/// @return The one- or two-byte ISO-6937-2 value, or 0 when there is no
///         mapping. U+0000 itself also maps to 0, so callers that need to
///         tell the two apart must check the input.
int get_iso6937(int codepoint) {
	// The cast turns negative inputs into huge unsigned values, so they fail
	// this bounds check instead of indexing before the start of the table
	if (static_cast<std::size_t>(codepoint) < countof(iso6937_codepoints))
		return iso6937_codepoints[codepoint];

	// Everything else lives in the sparse table, sorted by codepoint
	auto const last = std::end(iso6937_extended_codepoints);
	auto const ext = std::lower_bound(std::begin(iso6937_extended_codepoints), last, codepoint);
	if (ext == last || ext->codepoint != codepoint)
		return 0;
	return ext->value;
}

} // namespace {
169 
170 namespace agi { namespace charset {
171 
172 #ifdef _LIBICONV_VERSION
173 #define INTERNAL_CHARSET "UCS-4-INTERNAL"
174 #else
175 #define INTERNAL_CHARSET "WCHAR_T"
176 #endif
177 
Converter6937(bool subst,const char * src)178 Converter6937::Converter6937(bool subst, const char *src)
179 : to_ucs4(new IconvWrapper(src, INTERNAL_CHARSET))
180 , subst(subst)
181 {
182 }
183 
Convert(const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)184 size_t Converter6937::Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) {
185 	// No state to reset
186 	if (!inbuf || !inbytesleft)
187 		return 0;
188 
189 	size_t bytes_written = 0;
190 
191 	while (*inbytesleft > 0) {
192 		int in_val = 0;
193 
194 		// Copy inbuf/inbytesleft so that we don't update them if the
195 		// conversion fails (due to not enough space or a bad sequence)
196 		const char *inbuftmp = *inbuf;
197 		size_t inbyteslefttmp = *inbytesleft;
198 
199 		char *val_buf = reinterpret_cast<char *>(&in_val);
200 		size_t val_buf_size = sizeof(in_val);
201 
202 		// Get the next unicode character from the input
203 		size_t ret = to_ucs4->Convert(&inbuftmp, &inbyteslefttmp, &val_buf, &val_buf_size);
204 		if (ret == (size_t)-1 && errno != E2BIG)
205 			return ret;
206 
207 		// And convert that to ISO-6937-2
208 		int val = get_iso6937(in_val);
209 		if (!val && in_val) {
210 			if (subst) {
211 				val = '?';
212 			}
213 			else {
214 				errno = EILSEQ;
215 				return (size_t)-1;
216 			}
217 		}
218 
219 		if (*outbytesleft < 1 || (val > 255 && *outbytesleft < 2)) {
220 			errno = E2BIG;
221 			return (size_t)-1;
222 		}
223 
224 #define WRITE_BYTE(b) \
225 		do { \
226 			*(*outbuf)++ = (b); \
227 			--*outbytesleft; \
228 			++bytes_written; \
229 		} while(0)
230 
231 		if (val <= 255)
232 			WRITE_BYTE(val);
233 		else {
234 			WRITE_BYTE((val >> 8) & 0xFF);
235 			WRITE_BYTE(val & 0xFF);
236 		}
237 
238 		// Update the input pointers now that the conversion has succeeded
239 		*inbuf = inbuftmp;
240 		*inbytesleft = inbyteslefttmp;
241 	}
242 
243 	return bytes_written;
244 }
245 
246 } } // namespace agi::charset
247