1 /**********************************************************************
2 windows_1257.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regenc.h"
31 #include "iso_8859.h"
32
33 /*
34 * Name: windows-1257
35 * MIBenum: 2257
36 * Link: http://www.iana.org/assignments/character-sets
37 * Link: http://www.microsoft.com/globaldev/reference/sbcs/1257.mspx
38 * Link: http://en.wikipedia.org/wiki/Windows-1257
39 */
40
/*
 * Table accessors.  Despite the "CP1252" in the identifiers, these are the
 * tables used by the Windows-1257 encoding defined in this file.
 */

/* Lower-case counterpart of byte value `c`, looked up in the 256-entry table. */
#define ENC_CP1252_TO_LOWER_CASE(c) (EncCP1252_ToLowerCaseTable[(c)])

/* Non-zero when the ctype entry for byte value `code` has the bit for `ctype`. */
#define ENC_IS_CP1252_CTYPE(code, ctype) \
  (0 != (EncCP1252_CtypeTable[(code)] & CTYPE_TO_BIT(ctype)))
44
/*
 * Byte -> lower-case byte mapping, indexed by the byte value (8 entries per
 * row, so row k covers 0x08*k .. 0x08*k+7).
 *
 * Entries that differ from their own index:
 *   0x41-0x5A -> 0x61-0x7A
 *   0xA8 -> 0xB8, 0xAA -> 0xBA, 0xAF -> 0xBF
 *   0xC0-0xD6 -> 0xE0-0xF6 and 0xD8-0xDE -> 0xF8-0xFE
 * Every other byte, including 0xD7 and 0xDF, maps to itself.
 */
static const UChar EncCP1252_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  /* 0x40-0x5F: 0x41-0x5A are folded onto 0x61-0x7A */
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  /* 0x80-0x9F: identity */
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  /* 0xA8-0xAF: 0xA8, 0xAA and 0xAF are moved up by 0x10 */
  '\270', '\251', '\272', '\253', '\254', '\255', '\256', '\277',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  /* 0xC0-0xDF: moved up by 0x20, except 0xD7 and 0xDF which stay put */
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  /* 0xE0-0xFF: identity */
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
79
/*
 * Character-class bit sets, indexed by byte value (8 entries per row).
 *
 * The entries are tested with CTYPE_TO_BIT(ctype) through
 * ENC_IS_CP1252_CTYPE, and with BIT_CTYPE_UPPER / BIT_CTYPE_LOWER in
 * case_map.  The meaning of each individual bit is defined outside this
 * file.
 */
static const unsigned short EncCP1252_CtypeTable[256] = {
  /* 0x00-0x1F */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20-0x3F */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40-0x5F */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60-0x7F */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  /* 0x80-0x9F */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xA0-0xBF */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
  0x34a2, 0x00a0, 0x34a2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x34a2,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x01a0, 0x30e2, 0x00a0, 0x01a0,
  0x30e2, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x30e2,
  /* 0xC0-0xDF */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  /* 0xE0-0xFF */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x01a0
};
114
115 static int
mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower,OnigEncoding enc ARG_UNUSED)116 mbc_case_fold(OnigCaseFoldType flag,
117 const UChar** pp, const UChar* end ARG_UNUSED,
118 UChar* lower, OnigEncoding enc ARG_UNUSED)
119 {
120 const UChar* p = *pp;
121
122 if (*p == SHARP_s && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
123 *lower++ = 's';
124 *lower = 's';
125 (*pp)++;
126 return 2;
127 }
128
129 *lower = ENC_CP1252_TO_LOWER_CASE(*p);
130 (*pp)++;
131 return 1;
132 }
133
#if 0
/*
 * Disabled code: everything between #if 0 and #endif is compiled out.
 *
 * As written, it consumes one byte from *pp and reports TRUE/FALSE for
 * whether that byte is case-ambiguous:
 *   - SHARP_s with INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR set -> TRUE
 *   - 0xb5 -> FALSE
 *   - otherwise -> TRUE (see the review note below)
 *
 * `end` is not used.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  int v;
  const UChar* p = *pp;

  if (*p == SHARP_s && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    (*pp)++;
    return TRUE;
  }

  (*pp)++;
  /* Keep only the upper/lower bits of the ctype entry for this byte. */
  v = (EncCP1252_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  /*
   * NOTE(review): this ORs with BIT_CTYPE_LOWER, so the test is true for
   * every byte if BIT_CTYPE_LOWER is a non-zero mask, and the final return
   * below is never reached.  `&` was presumably intended -- confirm before
   * re-enabling this function.
   */
  if ((v | BIT_CTYPE_LOWER) != 0) {
    /* 0xdf and 0xb5 are lower-case letters but cannot be converted. */
    /* NOTE(review): only 0xb5 is actually excluded here; 0xdf is not. */
    if (*p == 0xb5)
      return FALSE;
    else
      return TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
159
160 static int
is_code_ctype(OnigCodePoint code,unsigned int ctype,OnigEncoding enc ARG_UNUSED)161 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
162 {
163 if (code < 256)
164 return ENC_IS_CP1252_CTYPE(code, ctype);
165 else
166 return FALSE;
167 }
168
169 static const OnigPairCaseFoldCodes CaseFoldMap[] = {
170 { 0xa8, 0xb8 },
171 { 0xaa, 0xba },
172 { 0xaf, 0xbf },
173
174 { 0xc0, 0xe0 },
175 { 0xc1, 0xe1 },
176 { 0xc2, 0xe2 },
177 { 0xc3, 0xe3 },
178 { 0xc4, 0xe4 },
179 { 0xc5, 0xe5 },
180 { 0xc6, 0xe6 },
181 { 0xc7, 0xe7 },
182 { 0xc8, 0xe8 },
183 { 0xc9, 0xe9 },
184 { 0xca, 0xea },
185 { 0xcb, 0xeb },
186 { 0xcc, 0xec },
187 { 0xcd, 0xed },
188 { 0xce, 0xee },
189 { 0xcf, 0xef },
190
191 { 0xd0, 0xf0 },
192 { 0xd1, 0xf1 },
193 { 0xd2, 0xf2 },
194 { 0xd3, 0xf3 },
195 { 0xd4, 0xf4 },
196 { 0xd5, 0xf5 },
197 { 0xd6, 0xf6 },
198 { 0xd8, 0xf8 },
199 { 0xd9, 0xf9 },
200 { 0xda, 0xfa },
201 { 0xdb, 0xfb },
202 { 0xdc, 0xfc },
203 { 0xdd, 0xfd },
204 { 0xde, 0xfe }
205 };
206
207 static int
apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg,OnigEncoding enc ARG_UNUSED)208 apply_all_case_fold(OnigCaseFoldType flag,
209 OnigApplyAllCaseFoldFunc f, void* arg,
210 OnigEncoding enc ARG_UNUSED)
211 {
212 return onigenc_apply_all_case_fold_with_map(
213 numberof(CaseFoldMap), CaseFoldMap, 1,
214 flag, f, arg);
215 }
216
217 static int
get_case_fold_codes_by_str(OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[],OnigEncoding enc ARG_UNUSED)218 get_case_fold_codes_by_str(OnigCaseFoldType flag,
219 const OnigUChar* p, const OnigUChar* end,
220 OnigCaseFoldCodeItem items[],
221 OnigEncoding enc ARG_UNUSED)
222 {
223 return onigenc_get_case_fold_codes_by_str_with_map(
224 numberof(CaseFoldMap), CaseFoldMap, 1,
225 flag, p, end, items);
226 }
227
228 #define DOTLESS_i (0xB9)
229 #define I_WITH_DOT_ABOVE (0xA9)
230 static int
case_map(OnigCaseFoldType * flagP,const OnigUChar ** pp,const OnigUChar * end,OnigUChar * to,OnigUChar * to_end,const struct OnigEncodingTypeST * enc)231 case_map(OnigCaseFoldType* flagP, const OnigUChar** pp,
232 const OnigUChar* end, OnigUChar* to, OnigUChar* to_end,
233 const struct OnigEncodingTypeST* enc)
234 {
235 OnigCodePoint code;
236 OnigUChar *to_start = to;
237 OnigCaseFoldType flags = *flagP;
238
239 while (*pp < end && to < to_end) {
240 code = *(*pp)++;
241 if (code == SHARP_s) {
242 if (flags & ONIGENC_CASE_UPCASE) {
243 flags |= ONIGENC_CASE_MODIFIED;
244 *to++ = 'S';
245 code = (flags & ONIGENC_CASE_TITLECASE) ? 's' : 'S';
246 }
247 else if (flags & ONIGENC_CASE_FOLD) {
248 flags |= ONIGENC_CASE_MODIFIED;
249 *to++ = 's';
250 code = 's';
251 }
252 }
253 else if (code == 0xB5)
254 ;
255 else if ((EncCP1252_CtypeTable[code] & BIT_CTYPE_UPPER)
256 && (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
257 flags |= ONIGENC_CASE_MODIFIED;
258 if (code == 'I')
259 code = flags & ONIGENC_CASE_FOLD_TURKISH_AZERI ? DOTLESS_i : 'i';
260 else
261 code = ENC_CP1252_TO_LOWER_CASE(code);
262 }
263 else if ((EncCP1252_CtypeTable[code]&BIT_CTYPE_LOWER)
264 && (flags & ONIGENC_CASE_UPCASE)) {
265 flags |= ONIGENC_CASE_MODIFIED;
266 if (code == 'i')
267 code = flags & ONIGENC_CASE_FOLD_TURKISH_AZERI ? I_WITH_DOT_ABOVE : 'I';
268 else if (code == DOTLESS_i)
269 code = 'I';
270 else if (code >= 0xB0 && code <= 0xBF)
271 code -= 0x10;
272 else
273 code -= 0x20;
274 }
275 *to++ = code;
276 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
277 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
278 }
279 *flagP = flags;
280 return (int )(to - to_start);
281 }
282
/*
 * Encoding descriptor for "Windows-1257", a single-byte encoding (max and
 * min encoded length are both 1).
 *
 * The initializer is positional.  It combines the generic onigenc_*
 * single-byte helpers with the functions defined in this file:
 * mbc_case_fold, apply_all_case_fold, get_case_fold_codes_by_str,
 * is_code_ctype and case_map.  The order of the entries must match the
 * descriptor structure, which is declared outside this file.
 */
OnigEncodingDefine(windows_1257, Windows_1257) = {
  onigenc_single_byte_mbc_enc_len,
  "Windows-1257", /* name */
  1, /* max enc length */
  1, /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  onigenc_single_byte_mbc_to_code,
  onigenc_single_byte_code_to_mbclen,
  onigenc_single_byte_code_to_mbc,
  mbc_case_fold,                /* defined in this file */
  apply_all_case_fold,          /* defined in this file */
  get_case_fold_codes_by_str,   /* defined in this file */
  onigenc_minimum_property_name_to_ctype,
  is_code_ctype,                /* defined in this file */
  onigenc_not_support_get_ctype_code_range,
  onigenc_single_byte_left_adjust_char_head,
  onigenc_always_true_is_allowed_reverse_match,
  case_map,                     /* defined in this file */
  0,
  ONIGENC_FLAG_NONE,
};
/* Registers "CP1257" as an alternative name for "Windows-1257". */
ENC_ALIAS("CP1257", "Windows-1257")
305