1 /*
2 * $Id: charsets.c,v 1.80 2018/10/21 19:18:36 tom Exp $
3 *
4 * see
5 http://msdn.microsoft.com/library/default.asp?url=/library/en-us/intl/unicode_42jv.asp
6 http://en.wikipedia.org/wiki/Byte_Order_Mark
7 http://en.wikipedia.org/wiki/UTF-16
8 */
9
10 #include <estruct.h>
11 #include <chgdfunc.h>
12 #include <edef.h>
13 #include <nefsms.h>
14
15 #if OPT_ICONV_FUNCS
16 #include <iconv.h>
17 #include <locale.h>
18 #endif
19 /* *INDENT-OFF* */
20 static const UCHAR mark_NONE[] = { 0x00 };
21 static const UCHAR mark_UTF8[] = { 0xef, 0xbb, 0xbf };
22 static const UCHAR mark_UTF16LE[] = { 0xff, 0xfe };
23 static const UCHAR mark_UTF16BE[] = { 0xfe, 0xff };
24 static const UCHAR mark_UTF32LE[] = { 0xff, 0xfe, 0x00, 0x00 };
25 static const UCHAR mark_UTF32BE[] = { 0x00, 0x00, 0xfe, 0xff };
26
27 #define IsNonNull(code) (code == 0xff)
28
29 typedef struct {
30 BOM_CODES code;
31 const UCHAR *mark;
32 size_t size;
33 } BOM_TABLE;
34
35 #define DATA(name) { bom_##name, mark_##name, sizeof(mark_##name) }
36 static const BOM_TABLE bom_table[] = {
37 { bom_NONE, mark_NONE, 0 },
38 DATA(UTF8),
39 DATA(UTF32LE), /* must be before UTF-16 entries */
40 DATA(UTF32BE),
41 DATA(UTF16LE),
42 DATA(UTF16BE),
43 };
44 #undef DATA
45 /* *INDENT-ON* */
46
47 /******************************************************************************/
48
49 static int
allow_decoder(BUFFER * bp,size_t need)50 allow_decoder(BUFFER *bp, size_t need)
51 {
52 if (need > bp->decode_utf_len) {
53 bp->decode_utf_len = (need + 1) * 2;
54 safe_typereallocn(UINT, bp->decode_utf_buf, bp->decode_utf_len);
55 }
56 return (bp->decode_utf_buf != 0);
57 }
58
59 static int
allow_encoder(BUFFER * bp,size_t need)60 allow_encoder(BUFFER *bp, size_t need)
61 {
62 if (need > bp->encode_utf_len) {
63 bp->encode_utf_len = (need + 1) * 2;
64 safe_typereallocn(char, bp->encode_utf_buf, bp->encode_utf_len);
65 }
66 return (bp->encode_utf_buf != 0);
67 }
68
69 int
vl_conv_to_utf8(UCHAR * target,UINT source,B_COUNT limit)70 vl_conv_to_utf8(UCHAR * target, UINT source, B_COUNT limit)
71 {
72 #define CH(n) (UCHAR)((source) >> ((n) * 8))
73 int rc = 0;
74
75 if (source <= 0x0000007f)
76 rc = 1;
77 else if (source <= 0x000007ff)
78 rc = 2;
79 else if (source <= 0x0000ffff)
80 rc = 3;
81 else if (source <= 0x001fffff)
82 rc = 4;
83 else if (source <= 0x03ffffff)
84 rc = 5;
85 else /* (source <= 0x7fffffff) */
86 rc = 6;
87
88 if ((B_COUNT) rc > limit) { /* whatever it is, we cannot decode it */
89 TRACE2(("limit failed in vl_conv_to_utf8 %d/%ld %#06x\n",
90 rc, limit, source));
91 rc = 0;
92 }
93
94 if (target != 0) {
95 switch (rc) {
96 case 1:
97 target[0] = (UCHAR) CH(0);
98 break;
99
100 case 2:
101 target[1] = (UCHAR) (0x80 | (CH(0) & 0x3f));
102 target[0] = (UCHAR) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2));
103 break;
104
105 case 3:
106 target[2] = (UCHAR) (0x80 | (CH(0) & 0x3f));
107 target[1] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
108 target[0] = (UCHAR) (0xe0 | ((int) (CH(1) & 0xf0) >> 4));
109 break;
110
111 case 4:
112 target[3] = (UCHAR) (0x80 | (CH(0) & 0x3f));
113 target[2] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
114 target[1] = (UCHAR) (0x80 |
115 ((int) (CH(1) & 0xf0) >> 4) |
116 ((int) (CH(2) & 0x03) << 4));
117 target[0] = (UCHAR) (0xf0 | ((int) (CH(2) & 0x1f) >> 2));
118 break;
119
120 case 5:
121 target[4] = (UCHAR) (0x80 | (CH(0) & 0x3f));
122 target[3] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
123 target[2] = (UCHAR) (0x80 |
124 ((int) (CH(1) & 0xf0) >> 4) |
125 ((int) (CH(2) & 0x03) << 4));
126 target[1] = (UCHAR) (0x80 | (CH(2) >> 2));
127 target[0] = (UCHAR) (0xf8 | (CH(3) & 0x03));
128 break;
129
130 case 6:
131 target[5] = (UCHAR) (0x80 | (CH(0) & 0x3f));
132 target[4] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
133 target[3] = (UCHAR) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4));
134 target[2] = (UCHAR) (0x80 | (CH(2) >> 2));
135 target[1] = (UCHAR) (0x80 | (CH(3) & 0x3f));
136 target[0] = (UCHAR) (0xfc | ((int) (CH(3) & 0x40) >> 6));
137 break;
138 }
139 TRACE2(("decode %#08x %02X.%02X.%02X.%02X %d:%.*s\n", source,
140 CH(3), CH(2), CH(1), CH(0), rc, rc, target));
141 }
142
143 return rc; /* number of bytes needed in target */
144 #undef CH
145 }
146
147 int
vl_check_utf8(const char * source,B_COUNT limit)148 vl_check_utf8(const char *source, B_COUNT limit)
149 {
150 int rc = 0;
151 int j;
152
153 /*
154 * Find the number of bytes we will need from the source.
155 */
156 if ((*source & 0x80) == 0) {
157 rc = 1;
158 } else if ((*source & 0xe0) == 0xc0) {
159 rc = 2;
160 } else if ((*source & 0xf0) == 0xe0) {
161 rc = 3;
162 } else if ((*source & 0xf8) == 0xf0) {
163 rc = 4;
164 } else if ((*source & 0xfc) == 0xf8) {
165 rc = 5;
166 } else if ((*source & 0xfe) == 0xfc) {
167 rc = 6;
168 }
169
170 /*
171 * sanity-check.
172 */
173 if (rc > 1) {
174 int have = rc;
175
176 if ((int) limit < have)
177 have = (int) limit;
178
179 for (j = 1; j < have; j++) {
180 if ((source[j] & 0xc0) != 0x80)
181 break;
182 }
183 if (j != have) {
184 TRACE2(("check failed %d/%d in vl_check_utf8\n", j, rc));
185 rc = 0;
186 }
187 }
188 return rc;
189 }
190
191 int
vl_conv_to_utf32(UINT * target,const char * source,B_COUNT limit)192 vl_conv_to_utf32(UINT * target, const char *source, B_COUNT limit)
193 {
194 #define CH(n) (UCHAR)((*target) >> ((n) * 8))
195 int rc = vl_check_utf8(source, limit);
196
197 if ((B_COUNT) rc > limit) { /* whatever it is, we cannot decode it */
198 TRACE2(("limit failed %d/%ld in vl_conv_to_utf32\n", rc, limit));
199 rc = 0;
200 }
201
202 if (target != 0) {
203 UINT mask = 0;
204 int j;
205 int shift = 0;
206 *target = 0;
207
208 switch (rc) {
209 case 1:
210 mask = (UINT) * source;
211 break;
212 case 2:
213 mask = (UINT) (*source & 0x1f);
214 break;
215 case 3:
216 mask = (UINT) (*source & 0x0f);
217 break;
218 case 4:
219 mask = (UINT) (*source & 0x07);
220 break;
221 case 5:
222 mask = (UINT) (*source & 0x03);
223 break;
224 case 6:
225 mask = (UINT) (*source & 0x01);
226 break;
227 default:
228 mask = 0;
229 break;
230 }
231
232 for (j = 1; j < rc; j++) {
233 *target |= (UINT) (source[rc - j] & 0x3f) << shift;
234 shift += 6;
235 }
236 *target |= mask << shift;
237
238 TRACE2(("encode %2d:%.*s -> %#08x %02X.%02X.%02X.%02X\n",
239 rc, rc, source,
240 *target,
241 CH(3), CH(2), CH(1), CH(0)));
242 }
243 return rc;
244 #undef CH
245 }
246
247 static const BOM_TABLE *
find_mark_info(BOM_CODES code)248 find_mark_info(BOM_CODES code)
249 {
250 const BOM_TABLE *result = 0;
251 unsigned n;
252
253 for (n = 0; n < TABLESIZE(bom_table); ++n) {
254 const BOM_TABLE *mp = bom_table + n;
255 if (mp->code == code) {
256 result = mp;
257 break;
258 }
259 }
260 return result;
261 }
262
263 static BOM_CODES
get_bom(BUFFER * bp)264 get_bom(BUFFER *bp)
265 {
266 BOM_CODES rc = (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK);
267 if (rc == bom_NONE)
268 rc = bp->implied_BOM;
269 return rc;
270 }
271
272 /*
273 * If we read a file without a byteorder-mark, it is using one of the
274 * le-assumed or be-assumed values for the corresponding mode. See what the
275 * file-encoding is, and choose a specific byteorder-mark, needed when we
276 * write the data to a file.
277 */
278 static BOM_CODES
inferred_bom2(BUFFER * bp,BOM_CODES code)279 inferred_bom2(BUFFER *bp, BOM_CODES code)
280 {
281 BOM_CODES result = code;
282
283 switch (result) {
284 case bom_LE_ASSUMED:
285 switch (b_val(bp, VAL_FILE_ENCODING)) {
286 case enc_UTF16:
287 result = bom_UTF16LE;
288 break;
289 case enc_UTF32:
290 result = bom_UTF32LE;
291 break;
292 default:
293 result = bom_NONE;
294 break;
295 }
296 break;
297 case bom_BE_ASSUMED:
298 switch (b_val(bp, VAL_FILE_ENCODING)) {
299 case enc_UTF16:
300 result = bom_UTF16BE;
301 break;
302 case enc_UTF32:
303 result = bom_UTF32BE;
304 break;
305 default:
306 result = bom_NONE;
307 break;
308 }
309 break;
310 case bom_NONE:
311 switch (b_val(bp, VAL_FILE_ENCODING)) {
312 case enc_UTF16:
313 result = bom_UTF16LE;
314 break;
315 case enc_UTF32:
316 result = bom_UTF32LE;
317 break;
318 }
319 break;
320 default:
321 break;
322 }
323 #if OPT_TRACE > 1
324 if (result != code)
325 TRACE2(("inferred_bom(%s) %s\n",
326 byteorder2s(code),
327 byteorder2s(result)));
328 #endif
329 return result;
330 }
331
332 static BOM_CODES
inferred_bom(BUFFER * bp,const BOM_TABLE * mp)333 inferred_bom(BUFFER *bp, const BOM_TABLE * mp)
334 {
335 return inferred_bom2(bp, mp->code);
336 }
337
338 /*
339 * If the buffer has no explicit byteorder-mark, but the encoding is UTF-16
340 * or UTF-32, we still need to know the assumed or implicit byteorder-mark.
341 *
342 * When reading the buffer, and no byteorder-mark is found for UTF-16/-32,
343 * we store an implied BOM in the buffer attributes, and set an assumed
344 * byteorder-mark that the user can see/modify. If the assumed BOM is reset,
345 * e.g., to auto or none, we can still use the implied BOM, e.g., for writing
346 * the file.
347 */
348 static const BOM_TABLE *
find_mark_info2(BUFFER * bp)349 find_mark_info2(BUFFER *bp)
350 {
351 const BOM_TABLE *mp = find_mark_info(get_bom(bp));
352
353 if ((mp != 0) &&
354 (mp->size == 0) &&
355 (b_val(bp, VAL_FILE_ENCODING) > enc_UTF8)) {
356 mp = find_mark_info(inferred_bom(bp, mp));
357 } else if (mp == 0 &&
358 (b_val(bp, VAL_FILE_ENCODING) > enc_UTF8)) {
359 mp = find_mark_info(inferred_bom2(bp, (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK)));
360 }
361 return mp;
362 }
363
364 static int
line_has_mark(const BOM_TABLE * mp,UCHAR * buffer,B_COUNT length)365 line_has_mark(const BOM_TABLE * mp, UCHAR * buffer, B_COUNT length)
366 {
367 int result = FALSE;
368
369 if (length >= mp->size
370 && mp->size != 0
371 && memcmp(buffer, mp->mark, mp->size) == 0) {
372 result = TRUE;
373 }
374 return result;
375 }
376
377 static int
dump_as_utfXX(BUFFER * bp,const char * buf,int nbuf,const char * ending)378 dump_as_utfXX(BUFFER *bp, const char *buf, int nbuf, const char *ending)
379 {
380 #define BYTE_OF(k,n) (char) (bp->decode_utf_buf[k] >> ((n) * 8))
381 int rc = 0;
382 const BOM_TABLE *mp = find_mark_info2(bp);
383
384 if (mp != 0 && mp->size > 1) {
385 size_t j = 0;
386 size_t k = 0;
387 size_t need = (size_t) nbuf + strlen(ending);
388 size_t lend = strlen(ending);
389
390 if (!allow_encoder(bp, need * mp->size))
391 goto finish;
392 if (!allow_decoder(bp, need))
393 goto finish;
394
395 while (j < (unsigned) nbuf) {
396 int skip = vl_conv_to_utf32(bp->decode_utf_buf + k++,
397 buf + j,
398 (B_COUNT) ((UINT) nbuf - j));
399 if (skip == 0)
400 goto finish;
401 j += (UINT) skip;
402 }
403 while (*ending != 0) {
404 int skip = vl_conv_to_utf32(bp->decode_utf_buf + k++,
405 ending++,
406 (B_COUNT) (lend--));
407 if (skip == 0)
408 goto finish;
409 }
410 need = k;
411
412 for (j = k = 0; k < need; j += mp->size, ++k) {
413 switch (mp->code) {
414 case bom_NONE:
415 /* FALLTHRU */
416 case bom_UTF8:
417 /* FALLTHRU */
418 case bom_LE_ASSUMED:
419 /* FALLTHRU */
420 case bom_BE_ASSUMED:
421 /* ignored */
422 break;
423 case bom_UTF16LE:
424 bp->encode_utf_buf[j + 0] = BYTE_OF(k, 0);
425 bp->encode_utf_buf[j + 1] = BYTE_OF(k, 1);
426 break;
427 case bom_UTF16BE:
428 bp->encode_utf_buf[j + 1] = BYTE_OF(k, 0);
429 bp->encode_utf_buf[j + 0] = BYTE_OF(k, 1);
430 break;
431 case bom_UTF32LE:
432 bp->encode_utf_buf[j + 0] = BYTE_OF(k, 0);
433 bp->encode_utf_buf[j + 1] = BYTE_OF(k, 1);
434 bp->encode_utf_buf[j + 2] = BYTE_OF(k, 2);
435 bp->encode_utf_buf[j + 3] = BYTE_OF(k, 3);
436 break;
437 case bom_UTF32BE:
438 bp->encode_utf_buf[j + 3] = BYTE_OF(k, 0);
439 bp->encode_utf_buf[j + 2] = BYTE_OF(k, 1);
440 bp->encode_utf_buf[j + 1] = BYTE_OF(k, 2);
441 bp->encode_utf_buf[j + 0] = BYTE_OF(k, 3);
442 break;
443 }
444 }
445 rc = (int) j;
446 }
447 finish:
448 return rc;
449 #undef BYTE_OF
450 }
451
452 static void
set_byteorder_mark(BUFFER * bp,int value)453 set_byteorder_mark(BUFFER *bp, int value)
454 {
455 if (value != ENUM_UNKNOWN
456 && value != global_b_val(VAL_BYTEORDER_MARK)) {
457 set_local_b_val(bp, VAL_BYTEORDER_MARK, value);
458
459 TRACE(("set_byteorder_mark for '%s' to %s\n",
460 bp->b_bname,
461 byteorder2s(b_val(bp, VAL_BYTEORDER_MARK))));
462 }
463 }
464
465 static void
set_encoding(BUFFER * bp,int value)466 set_encoding(BUFFER *bp, int value)
467 {
468 if (value != ENUM_UNKNOWN
469 && value != global_b_val(VAL_FILE_ENCODING)) {
470 set_local_b_val(bp, VAL_FILE_ENCODING, value);
471
472 TRACE(("set_encoding for '%s' to %s\n",
473 bp->b_bname,
474 encoding2s(b_val(bp, VAL_FILE_ENCODING))));
475 }
476 }
477
478 static int
load_as_utf8(BUFFER * bp,LINE * lp)479 load_as_utf8(BUFFER *bp, LINE *lp)
480 {
481 #define CH(n) ((UCHAR)(lgetc(lp, n)))
482 int rc = FALSE;
483 const BOM_TABLE *mp = find_mark_info2(bp);
484
485 if (mp != 0 && mp->size > 1) {
486 int pass;
487 size_t j, k;
488 size_t need = (size_t) llength(lp);
489 size_t used;
490
491 TRACE2(("load_as_utf8:%d:%s\n", need, lp_visible(lp)));
492 if (allow_decoder(bp, need)) {
493 rc = TRUE;
494 if (need) {
495 for (j = k = 0; j < need; ++k) {
496 UCHAR ch = CH(j);
497 if (ch == '\r' || ch == '\n') {
498 bp->decode_utf_buf[k] = ch;
499 ++j; /* see remove_crlf_nulls() */
500 continue;
501 }
502 switch (inferred_bom(bp, mp)) {
503 case bom_NONE:
504 /* FALLTHRU */
505 case bom_UTF8:
506 /* FALLTHRU */
507 case bom_LE_ASSUMED:
508 /* FALLTHRU */
509 case bom_BE_ASSUMED:
510 /* ignored */
511 break;
512 case bom_UTF16LE:
513 bp->decode_utf_buf[k] = (CH(j)
514 + (UINT) (CH(j + 1) << 8));
515 break;
516 case bom_UTF16BE:
517 bp->decode_utf_buf[k] = (CH(j + 1)
518 + (UINT) (CH(j) << 8));
519 break;
520 case bom_UTF32LE:
521 bp->decode_utf_buf[k] = (CH(j + 0)
522 + (UINT) (CH(j + 1) << 8)
523 + (UINT) (CH(j + 2) << 16)
524 + (UINT) (CH(j + 3) << 24));
525 break;
526 case bom_UTF32BE:
527 bp->decode_utf_buf[k] = (CH(j + 3)
528 + (UINT) (CH(j + 2) << 8)
529 + (UINT) (CH(j + 1) << 16)
530 + (UINT) (CH(j + 0) << 24));
531 break;
532 }
533 j += (UINT) mp->size;
534 }
535 used = k;
536
537 for (pass = 1; pass <= 2; ++pass) {
538 UCHAR *buffer = (pass == 1) ? 0 : (UCHAR *) lvalue(lp);
539 for (j = k = 0; j < used; ++j) {
540 int nn = vl_conv_to_utf8(buffer,
541 bp->decode_utf_buf[j],
542 (B_COUNT) (used + 1 - j));
543 if (buffer != 0)
544 buffer += nn;
545 k += (UINT) nn;
546 }
547 if (pass == 1) {
548 TRACE2(("need %d, have %d\n", k, lp->l_size));
549 if ((int) k > llength(lp)) {
550 char *ntext;
551
552 /*
553 * We are doing this conversion on the initial load
554 * of the buffer, do not want to allow undo. Just
555 * go ahead and reallocate the line's text buffer.
556 */
557 if ((ntext = castalloc(char, (k + 1))) == NULL) {
558 rc = FALSE;
559 break;
560 }
561 ltextfree(lp, bp);
562 lvalue(lp) = ntext;
563 lp->l_size = k;
564 llength(lp) = (int) k;
565 } else {
566 llength(lp) = (int) k;
567 }
568 }
569 }
570 }
571 } else {
572 bp->decode_utf_len = 0;
573 }
574 }
575 return rc;
576 #undef CH
577 }
578
579 /*
580 * Remove the extra nulls (if any - according to the encoding) after
581 * \r and \n bytes. This is done to make the existing logic for checking
582 * recordseparator work without change.
583 */
584 static void
remove_crlf_nulls(BUFFER * bp,UCHAR * buffer,B_COUNT * length)585 remove_crlf_nulls(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
586 {
587 const BOM_TABLE *mp = find_mark_info2(bp);
588 UCHAR mark_cr[4];
589 UCHAR mark_lf[4];
590 size_t marklen = 0;
591
592 if (mp != 0) {
593 memset(mark_cr, 0, sizeof(mark_cr));
594 memset(mark_lf, 0, sizeof(mark_lf));
595
596 switch (mp->code) {
597 case bom_NONE:
598 /* FALLTHRU */
599 case bom_UTF8:
600 /* FALLTHRU */
601 case bom_LE_ASSUMED:
602 /* FALLTHRU */
603 case bom_BE_ASSUMED:
604 /* ignored */
605 break;
606 case bom_UTF16LE:
607 marklen = 2;
608 mark_cr[0] = '\r';
609 mark_lf[0] = '\n';
610 break;
611 case bom_UTF16BE:
612 marklen = 2;
613 mark_cr[1] = '\r';
614 mark_lf[1] = '\n';
615 break;
616 case bom_UTF32LE:
617 marklen = 4;
618 mark_cr[0] = '\r';
619 mark_lf[0] = '\n';
620 break;
621 case bom_UTF32BE:
622 marklen = 4;
623 mark_cr[3] = '\r';
624 mark_lf[3] = '\n';
625 break;
626 }
627 if (marklen != 0) {
628 B_COUNT dst = 0;
629 B_COUNT src = 0;
630 char skip = 0;
631 while (src < *length) {
632 if (!memcmp(mark_cr, buffer + src, marklen))
633 skip = '\r';
634 else if (!memcmp(mark_lf, buffer + src, marklen))
635 skip = '\n';
636 if (skip) {
637 buffer[dst++] = (UCHAR) skip;
638 skip = 0;
639 } else {
640 memcpy(buffer + dst, buffer + src, marklen);
641 dst += (B_COUNT) marklen;
642 }
643 src += (B_COUNT) marklen;
644 }
645 *length = dst;
646 }
647 }
648 }
649
650 /*
651 * Returns a percentage for the number of cells in the buffer which are
652 * explained by interpreting them according to the given byte-order mark
653 * pattern with the assumption that most of the content is ASCII or ISO-8859-1
654 * (8-bits).
655 */
656 static int
riddled_buffer(const BOM_TABLE * mp,UCHAR * buffer,B_COUNT length)657 riddled_buffer(const BOM_TABLE * mp, UCHAR * buffer, B_COUNT length)
658 {
659 int result = 0;
660 B_COUNT total = 0;
661 size_t offset = 0;
662 size_t j, k;
663
664 if (mp->size && !(mp->size % 2)) {
665 TRACE(("checking if %s / %u-byte\n",
666 byteorder2s(mp->code),
667 (UINT) mp->size));
668
669 /* Check the line-length. If it is not a multiple of the pattern
670 * size, just give up.
671 */
672 if ((length + offset) % mp->size) {
673 TRACE(("length %ld vs pattern %u - give up\n",
674 length,
675 (UINT) mp->size));
676 } else {
677 /*
678 * Now walk through the line and measure the pattern against it.
679 */
680 for (j = offset; j < (unsigned) length; j += mp->size) {
681 int found = 1;
682 for (k = 0; k < mp->size; ++k) {
683 UCHAR have = buffer[j + k];
684 UCHAR want = (UCHAR) IsNonNull(mp->mark[k]);
685 if (!have ^ !want) {
686 found = 0;
687 break;
688 }
689 }
690 if (found) {
691 total += (B_COUNT) mp->size;
692 }
693 }
694 }
695 result = (int) (length
696 ? (((100.0 * (double) total) / (double) length))
697 : 0);
698
699 TRACE(("...%ld/%ld ->%d%%\n", total, length, result));
700 }
701 return result;
702 }
703
704 static void
set_encoding_from_bom(BUFFER * bp,BOM_CODES bom_value)705 set_encoding_from_bom(BUFFER *bp, BOM_CODES bom_value)
706 {
707 const BOM_TABLE *mp;
708 int result;
709
710 if (bom_value > bom_NONE
711 && (mp = find_mark_info(bom_value)) != 0) {
712
713 switch (mp->code) {
714 case bom_UTF8:
715 result = enc_UTF8;
716 break;
717 case bom_UTF16LE:
718 case bom_UTF16BE:
719 result = enc_UTF16;
720 break;
721 case bom_UTF32LE:
722 case bom_UTF32BE:
723 result = enc_UTF32;
724 break;
725 default:
726 result = ENUM_UNKNOWN;
727 break;
728 }
729 TRACE(("set_encoding_from_bom(%s) ->%s\n",
730 byteorder2s(mp->code),
731 encoding2s(result)));
732 set_encoding(bp, result);
733 }
734 }
735
736 static void
set_bom_from_encoding(BUFFER * bp,int enc_value)737 set_bom_from_encoding(BUFFER *bp, int enc_value)
738 {
739 int result = b_val(bp, VAL_BYTEORDER_MARK);
740
741 if (result > bom_NONE) {
742 switch (enc_value) {
743 case enc_UTF8:
744 case enc_UTF16:
745 case enc_UTF32:
746 break;
747 default:
748 if (result != ENUM_UNKNOWN
749 && result != global_b_val(VAL_BYTEORDER_MARK)) {
750 set_local_b_val(bp, VAL_BYTEORDER_MARK, bom_NONE);
751 }
752 break;
753 }
754 }
755 }
756
757 /******************************************************************************/
758
759 int
aligned_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length)760 aligned_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
761 {
762 int rc = FALSE;
763 const BOM_TABLE *mp = find_mark_info(get_bom(bp));
764
765 (void) buffer;
766 if (mp != 0 && mp->size > 1) {
767 rc = !(*length % mp->size);
768 }
769 return rc;
770 }
771
772 int
cleanup_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length)773 cleanup_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
774 {
775 remove_crlf_nulls(bp, buffer, length);
776 return TRUE;
777 }
778
779 /*
780 * Call this once after reading the buffer (or the first line).
781 * But do it before deducing the majormode (to avoid conflict with "preamble").
782 *
783 * It checks if the byteorder-mark is "auto", and if so, looks at the
784 * line to determine what value to use. It sets the local buffer mode
785 * for the result.
786 *
787 * Having a value other than "none", it then modifies the first line,
788 * stripping the BOM bytes.
789 */
790 int
decode_bom(BUFFER * bp,UCHAR * buffer,B_COUNT * length)791 decode_bom(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
792 {
793 const BOM_TABLE *mp;
794 int code = FALSE;
795 int result;
796 unsigned n;
797
798 TRACE((T_CALLED "decode_bom(%s) length %ld\n", bp->b_bname, *length));
799
800 if (b_val(bp, VAL_BYTEORDER_MARK) == ENUM_UNKNOWN) {
801 result = bom_NONE;
802 for (n = 0; n < TABLESIZE(bom_table); ++n) {
803 mp = bom_table + n;
804 if (line_has_mark(mp, buffer, *length)) {
805 result = mp->code;
806 TRACE(("...matched %d\n", result));
807 break;
808 }
809 }
810 set_byteorder_mark(bp, result);
811 }
812
813 if (b_val(bp, VAL_BYTEORDER_MARK) > bom_NONE
814 && (mp = find_mark_info((BOM_CODES) b_val(bp,
815 VAL_BYTEORDER_MARK))) != 0
816 && line_has_mark(mp, buffer, *length)) {
817 for (n = 0; n < *length - mp->size; ++n) {
818 buffer[n] = buffer[n + mp->size];
819 }
820 while (n < *length) {
821 buffer[n++] = 0;
822 }
823 *length -= (B_COUNT) mp->size;
824
825 set_encoding_from_bom(bp, (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK));
826 code = TRUE;
827 }
828 returnCode(code);
829 }
830
831 /*
832 * Rewrite the line from UTF-16 or UTF-32 into UTF-8.
833 * That may increase the number of bytes used to store the data.
834 */
835 int
decode_charset(BUFFER * bp,LINE * lp)836 decode_charset(BUFFER *bp, LINE *lp)
837 {
838 int rc = FALSE;
839
840 if (b_val(bp, VAL_FILE_ENCODING) == enc_UTF16
841 || b_val(bp, VAL_FILE_ENCODING) == enc_UTF32) {
842 rc = load_as_utf8(bp, lp);
843 }
844 return rc;
845 }
846
847 /*
848 * Check if we have an explicit encoding. If not, inspect the buffer contents
849 * to decide what encoding to use.
850 *
851 * By observation, some UTF-16 files written by other editors have no BOM. It
852 * is possible that UTF-32 files may be missing a BOM as well. We can
853 * determine this by seeing if the file is riddled with nulls (in the right
854 * pattern of course). If we find a match for one of these, recode the buffer
855 * into UTF-8.
856 *
857 * If the encoding is unknown or 8-bit, we can inspect the buffer to see if it
858 * makes more sense as UTF-8.
859 */
860 int
deduce_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length,int always)861 deduce_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length, int always)
862 {
863 int rc = FALSE;
864
865 TRACE((T_CALLED "deduce_charset(%s) bom:%s, encoding:%s\n",
866 bp->b_bname,
867 byteorder2s(b_val(bp, VAL_BYTEORDER_MARK)),
868 encoding2s(b_val(bp, VAL_FILE_ENCODING))));
869
870 bp->implied_BOM = bom_NONE;
871 if (b_is_enc_AUTO(bp)) {
872 unsigned n;
873 int match = 0;
874 int found = -1;
875
876 for (n = 0; n < TABLESIZE(bom_table); ++n) {
877 int check = riddled_buffer(&bom_table[n], buffer, *length);
878 if (check > match) {
879 match = check;
880 found = (int) n;
881 }
882 }
883 if (found > 0 && match >= b_val(bp, VAL_PERCENT_UTF8)) {
884 bp->implied_BOM = bom_table[found].code;
885 set_encoding_from_bom(bp, bp->implied_BOM);
886 TRACE(("...found_charset %s\n",
887 byteorder2s(bp->implied_BOM)));
888
889 switch (bp->implied_BOM) {
890 case bom_UTF16BE:
891 case bom_UTF32BE:
892 set_byteorder_mark(bp, bom_BE_ASSUMED);
893 break;
894 case bom_UTF16LE:
895 case bom_UTF32LE:
896 set_byteorder_mark(bp, bom_LE_ASSUMED);
897 break;
898 default:
899 break;
900 }
901 rc = TRUE;
902 } else if (always) {
903 TRACE(("...try looking for UTF-8\n"));
904 if (check_utf8(buffer, *length) == TRUE)
905 found_utf8(bp);
906 }
907 } else {
908 rc = TRUE;
909 }
910 remove_crlf_nulls(bp, buffer, length);
911 returnCode(rc);
912 }
913
914 /*
915 * Check if the given buffer should be treated as UTF-8.
916 * For UTF-8, we have to have _some_ UTF-8 encoding, and _all_
917 * of the buffer has to match the pattern.
918 */
919 int
check_utf8(UCHAR * buffer,B_COUNT length)920 check_utf8(UCHAR * buffer, B_COUNT length)
921 {
922 B_COUNT n;
923 int check = TRUE;
924 int skip = 0;
925 int found;
926 UINT target;
927
928 for (n = 0, found = 0; n < length - 1; n += (B_COUNT) skip) {
929 skip = vl_conv_to_utf32(&target,
930 (char *) (buffer + n),
931 length - n);
932 if (skip == 0) {
933 check = FALSE;
934 break;
935 } else if (skip > 1) {
936 found = 1;
937 }
938 }
939 return ((check && found)
940 ? TRUE
941 : (check
942 ? SORTOFTRUE
943 : FALSE));
944 }
945
946 /*
947 * If we found UTF-8 encoding, set the buffer to match.
948 */
949 void
found_utf8(BUFFER * bp)950 found_utf8(BUFFER *bp)
951 {
952 TRACE(("...found UTF-8\n"));
953 bp->implied_BOM = bom_UTF8;
954 set_encoding_from_bom(bp, bp->implied_BOM);
955 }
956
957 /*
958 * encode BOM while writing file, without modifying the buffer.
959 */
960 int
write_bom(BUFFER * bp)961 write_bom(BUFFER *bp)
962 {
963 const BOM_TABLE *mp;
964 int status = FIOSUC;
965
966 if ((mp = find_mark_info((BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK))) != 0) {
967 status = ffputline((const char *) mp->mark, (int) mp->size, NULL);
968 }
969 return status;
970 }
971
972 /*
973 * encode the UTF-8 text into UTF-16 or UTF-32, according to the buffer's
974 * file-encoding mode.
975 */
976 int
encode_charset(BUFFER * bp,const char * buf,int nbuf,const char * ending)977 encode_charset(BUFFER *bp, const char *buf, int nbuf, const char *ending)
978 {
979 int rc = 0;
980
981 if (b_val(bp, VAL_FILE_ENCODING) == enc_UTF16
982 || b_val(bp, VAL_FILE_ENCODING) == enc_UTF32) {
983 rc = dump_as_utfXX(bp, buf, nbuf, ending ? ending : "");
984 }
985 return rc;
986 }
987
988 /*
989 * if byteorder mark changes, ensure that file-encoding is set compatibly.
990 */
991 int
chgd_byteorder(BUFFER * bp,VALARGS * args,int glob_vals,int testing)992 chgd_byteorder(BUFFER *bp,
993 VALARGS * args,
994 int glob_vals,
995 int testing)
996 {
997 if (!testing && !glob_vals) {
998 set_encoding_from_bom(bp, (BOM_CODES) args->local->vp->i);
999 }
1000 return TRUE;
1001 }
1002
1003 /*
1004 * If file-encoding changes to non-UTF-8, set byteorder-mark to none.
1005 * Only keep it set if changing from one UTF-encoding to another.
1006 */
1007 int
chgd_fileencode(BUFFER * bp,VALARGS * args,int glob_vals,int testing)1008 chgd_fileencode(BUFFER *bp,
1009 VALARGS * args,
1010 int glob_vals,
1011 int testing)
1012 {
1013 if (testing) {
1014 ;
1015 } else {
1016 int new_encoding = args->local->vp->i;
1017 if (glob_vals) {
1018 if (new_encoding == enc_POSIX) {
1019 rebuild_charclasses(0, 127);
1020 } else {
1021 rebuild_charclasses(global_g_val(GVAL_PRINT_LOW),
1022 global_g_val(GVAL_PRINT_HIGH));
1023 }
1024 } else {
1025 set_bom_from_encoding(bp, new_encoding);
1026 }
1027 set_bufflags(glob_vals, WFHARD | WFMODE);
1028 }
1029 return TRUE;
1030 }
1031
1032 const char *
byteorder2s(int code)1033 byteorder2s(int code)
1034 {
1035 return choice_to_name(&fsm_byteorder_mark_blist, code);
1036 }
1037
1038 const char *
encoding2s(int code)1039 encoding2s(int code)
1040 {
1041 return choice_to_name(&fsm_file_encoding_blist, code);
1042 }
1043