1 /*
2  * $Id: charsets.c,v 1.80 2018/10/21 19:18:36 tom Exp $
3  *
4  * see
5  http://msdn.microsoft.com/library/default.asp?url=/library/en-us/intl/unicode_42jv.asp
6  http://en.wikipedia.org/wiki/Byte_Order_Mark
7  http://en.wikipedia.org/wiki/UTF-16
8  */
9 
10 #include <estruct.h>
11 #include <chgdfunc.h>
12 #include <edef.h>
13 #include <nefsms.h>
14 
15 #if OPT_ICONV_FUNCS
16 #include <iconv.h>
17 #include <locale.h>
18 #endif
19 /* *INDENT-OFF* */
20 static const UCHAR mark_NONE[]    = { 0x00 };
21 static const UCHAR mark_UTF8[]    = { 0xef, 0xbb, 0xbf };
22 static const UCHAR mark_UTF16LE[] = { 0xff, 0xfe };
23 static const UCHAR mark_UTF16BE[] = { 0xfe, 0xff };
24 static const UCHAR mark_UTF32LE[] = { 0xff, 0xfe, 0x00, 0x00 };
25 static const UCHAR mark_UTF32BE[] = { 0x00, 0x00, 0xfe, 0xff };
26 
27 #define IsNonNull(code) (code == 0xff)
28 
29 typedef struct {
30     BOM_CODES code;
31     const UCHAR *mark;
32     size_t size;
33 } BOM_TABLE;
34 
35 #define DATA(name) { bom_##name, mark_##name, sizeof(mark_##name) }
36 static const BOM_TABLE bom_table[] = {
37     { bom_NONE, mark_NONE, 0 },
38     DATA(UTF8),
39     DATA(UTF32LE),	/* must be before UTF-16 entries */
40     DATA(UTF32BE),
41     DATA(UTF16LE),
42     DATA(UTF16BE),
43 };
44 #undef DATA
45 /* *INDENT-ON* */
46 
47 /******************************************************************************/
48 
49 static int
allow_decoder(BUFFER * bp,size_t need)50 allow_decoder(BUFFER *bp, size_t need)
51 {
52     if (need > bp->decode_utf_len) {
53 	bp->decode_utf_len = (need + 1) * 2;
54 	safe_typereallocn(UINT, bp->decode_utf_buf, bp->decode_utf_len);
55     }
56     return (bp->decode_utf_buf != 0);
57 }
58 
59 static int
allow_encoder(BUFFER * bp,size_t need)60 allow_encoder(BUFFER *bp, size_t need)
61 {
62     if (need > bp->encode_utf_len) {
63 	bp->encode_utf_len = (need + 1) * 2;
64 	safe_typereallocn(char, bp->encode_utf_buf, bp->encode_utf_len);
65     }
66     return (bp->encode_utf_buf != 0);
67 }
68 
69 int
vl_conv_to_utf8(UCHAR * target,UINT source,B_COUNT limit)70 vl_conv_to_utf8(UCHAR * target, UINT source, B_COUNT limit)
71 {
72 #define CH(n) (UCHAR)((source) >> ((n) * 8))
73     int rc = 0;
74 
75     if (source <= 0x0000007f)
76 	rc = 1;
77     else if (source <= 0x000007ff)
78 	rc = 2;
79     else if (source <= 0x0000ffff)
80 	rc = 3;
81     else if (source <= 0x001fffff)
82 	rc = 4;
83     else if (source <= 0x03ffffff)
84 	rc = 5;
85     else			/* (source <= 0x7fffffff) */
86 	rc = 6;
87 
88     if ((B_COUNT) rc > limit) {	/* whatever it is, we cannot decode it */
89 	TRACE2(("limit failed in vl_conv_to_utf8 %d/%ld %#06x\n",
90 		rc, limit, source));
91 	rc = 0;
92     }
93 
94     if (target != 0) {
95 	switch (rc) {
96 	case 1:
97 	    target[0] = (UCHAR) CH(0);
98 	    break;
99 
100 	case 2:
101 	    target[1] = (UCHAR) (0x80 | (CH(0) & 0x3f));
102 	    target[0] = (UCHAR) (0xc0 | (CH(0) >> 6) | ((CH(1) & 0x07) << 2));
103 	    break;
104 
105 	case 3:
106 	    target[2] = (UCHAR) (0x80 | (CH(0) & 0x3f));
107 	    target[1] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
108 	    target[0] = (UCHAR) (0xe0 | ((int) (CH(1) & 0xf0) >> 4));
109 	    break;
110 
111 	case 4:
112 	    target[3] = (UCHAR) (0x80 | (CH(0) & 0x3f));
113 	    target[2] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
114 	    target[1] = (UCHAR) (0x80 |
115 				 ((int) (CH(1) & 0xf0) >> 4) |
116 				 ((int) (CH(2) & 0x03) << 4));
117 	    target[0] = (UCHAR) (0xf0 | ((int) (CH(2) & 0x1f) >> 2));
118 	    break;
119 
120 	case 5:
121 	    target[4] = (UCHAR) (0x80 | (CH(0) & 0x3f));
122 	    target[3] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
123 	    target[2] = (UCHAR) (0x80 |
124 				 ((int) (CH(1) & 0xf0) >> 4) |
125 				 ((int) (CH(2) & 0x03) << 4));
126 	    target[1] = (UCHAR) (0x80 | (CH(2) >> 2));
127 	    target[0] = (UCHAR) (0xf8 | (CH(3) & 0x03));
128 	    break;
129 
130 	case 6:
131 	    target[5] = (UCHAR) (0x80 | (CH(0) & 0x3f));
132 	    target[4] = (UCHAR) (0x80 | (CH(0) >> 6) | ((CH(1) & 0x0f) << 2));
133 	    target[3] = (UCHAR) (0x80 | (CH(1) >> 4) | ((CH(2) & 0x03) << 4));
134 	    target[2] = (UCHAR) (0x80 | (CH(2) >> 2));
135 	    target[1] = (UCHAR) (0x80 | (CH(3) & 0x3f));
136 	    target[0] = (UCHAR) (0xfc | ((int) (CH(3) & 0x40) >> 6));
137 	    break;
138 	}
139 	TRACE2(("decode %#08x %02X.%02X.%02X.%02X %d:%.*s\n", source,
140 		CH(3), CH(2), CH(1), CH(0), rc, rc, target));
141     }
142 
143     return rc;			/* number of bytes needed in target */
144 #undef CH
145 }
146 
147 int
vl_check_utf8(const char * source,B_COUNT limit)148 vl_check_utf8(const char *source, B_COUNT limit)
149 {
150     int rc = 0;
151     int j;
152 
153     /*
154      * Find the number of bytes we will need from the source.
155      */
156     if ((*source & 0x80) == 0) {
157 	rc = 1;
158     } else if ((*source & 0xe0) == 0xc0) {
159 	rc = 2;
160     } else if ((*source & 0xf0) == 0xe0) {
161 	rc = 3;
162     } else if ((*source & 0xf8) == 0xf0) {
163 	rc = 4;
164     } else if ((*source & 0xfc) == 0xf8) {
165 	rc = 5;
166     } else if ((*source & 0xfe) == 0xfc) {
167 	rc = 6;
168     }
169 
170     /*
171      * sanity-check.
172      */
173     if (rc > 1) {
174 	int have = rc;
175 
176 	if ((int) limit < have)
177 	    have = (int) limit;
178 
179 	for (j = 1; j < have; j++) {
180 	    if ((source[j] & 0xc0) != 0x80)
181 		break;
182 	}
183 	if (j != have) {
184 	    TRACE2(("check failed %d/%d in vl_check_utf8\n", j, rc));
185 	    rc = 0;
186 	}
187     }
188     return rc;
189 }
190 
191 int
vl_conv_to_utf32(UINT * target,const char * source,B_COUNT limit)192 vl_conv_to_utf32(UINT * target, const char *source, B_COUNT limit)
193 {
194 #define CH(n) (UCHAR)((*target) >> ((n) * 8))
195     int rc = vl_check_utf8(source, limit);
196 
197     if ((B_COUNT) rc > limit) {	/* whatever it is, we cannot decode it */
198 	TRACE2(("limit failed %d/%ld in vl_conv_to_utf32\n", rc, limit));
199 	rc = 0;
200     }
201 
202     if (target != 0) {
203 	UINT mask = 0;
204 	int j;
205 	int shift = 0;
206 	*target = 0;
207 
208 	switch (rc) {
209 	case 1:
210 	    mask = (UINT) * source;
211 	    break;
212 	case 2:
213 	    mask = (UINT) (*source & 0x1f);
214 	    break;
215 	case 3:
216 	    mask = (UINT) (*source & 0x0f);
217 	    break;
218 	case 4:
219 	    mask = (UINT) (*source & 0x07);
220 	    break;
221 	case 5:
222 	    mask = (UINT) (*source & 0x03);
223 	    break;
224 	case 6:
225 	    mask = (UINT) (*source & 0x01);
226 	    break;
227 	default:
228 	    mask = 0;
229 	    break;
230 	}
231 
232 	for (j = 1; j < rc; j++) {
233 	    *target |= (UINT) (source[rc - j] & 0x3f) << shift;
234 	    shift += 6;
235 	}
236 	*target |= mask << shift;
237 
238 	TRACE2(("encode %2d:%.*s -> %#08x %02X.%02X.%02X.%02X\n",
239 		rc, rc, source,
240 		*target,
241 		CH(3), CH(2), CH(1), CH(0)));
242     }
243     return rc;
244 #undef CH
245 }
246 
247 static const BOM_TABLE *
find_mark_info(BOM_CODES code)248 find_mark_info(BOM_CODES code)
249 {
250     const BOM_TABLE *result = 0;
251     unsigned n;
252 
253     for (n = 0; n < TABLESIZE(bom_table); ++n) {
254 	const BOM_TABLE *mp = bom_table + n;
255 	if (mp->code == code) {
256 	    result = mp;
257 	    break;
258 	}
259     }
260     return result;
261 }
262 
263 static BOM_CODES
get_bom(BUFFER * bp)264 get_bom(BUFFER *bp)
265 {
266     BOM_CODES rc = (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK);
267     if (rc == bom_NONE)
268 	rc = bp->implied_BOM;
269     return rc;
270 }
271 
272 /*
273  * If we read a file without a byteorder-mark, it is using one of the
274  * le-assumed or be-assumed values for the corresponding mode.  See what the
275  * file-encoding is, and choose a specific byteorder-mark, needed when we
276  * write the data to a file.
277  */
278 static BOM_CODES
inferred_bom2(BUFFER * bp,BOM_CODES code)279 inferred_bom2(BUFFER *bp, BOM_CODES code)
280 {
281     BOM_CODES result = code;
282 
283     switch (result) {
284     case bom_LE_ASSUMED:
285 	switch (b_val(bp, VAL_FILE_ENCODING)) {
286 	case enc_UTF16:
287 	    result = bom_UTF16LE;
288 	    break;
289 	case enc_UTF32:
290 	    result = bom_UTF32LE;
291 	    break;
292 	default:
293 	    result = bom_NONE;
294 	    break;
295 	}
296 	break;
297     case bom_BE_ASSUMED:
298 	switch (b_val(bp, VAL_FILE_ENCODING)) {
299 	case enc_UTF16:
300 	    result = bom_UTF16BE;
301 	    break;
302 	case enc_UTF32:
303 	    result = bom_UTF32BE;
304 	    break;
305 	default:
306 	    result = bom_NONE;
307 	    break;
308 	}
309 	break;
310     case bom_NONE:
311 	switch (b_val(bp, VAL_FILE_ENCODING)) {
312 	case enc_UTF16:
313 	    result = bom_UTF16LE;
314 	    break;
315 	case enc_UTF32:
316 	    result = bom_UTF32LE;
317 	    break;
318 	}
319 	break;
320     default:
321 	break;
322     }
323 #if OPT_TRACE > 1
324     if (result != code)
325 	TRACE2(("inferred_bom(%s) %s\n",
326 		byteorder2s(code),
327 		byteorder2s(result)));
328 #endif
329     return result;
330 }
331 
332 static BOM_CODES
inferred_bom(BUFFER * bp,const BOM_TABLE * mp)333 inferred_bom(BUFFER *bp, const BOM_TABLE * mp)
334 {
335     return inferred_bom2(bp, mp->code);
336 }
337 
338 /*
339  * If the buffer has no explicit byteorder-mark, but the encoding is UTF-16
340  * or UTF-32, we still need to know the assumed or implicit byteorder-mark.
341  *
342  * When reading the buffer, and no byteorder-mark is found for UTF-16/-32,
343  * we store an implied BOM in the buffer attributes, and set an assumed
344  * byteorder-mark that the user can see/modify.  If the assumed BOM is reset,
345  * e.g., to auto or none, we can still use the implied BOM, e.g., for writing
346  * the file.
347  */
348 static const BOM_TABLE *
find_mark_info2(BUFFER * bp)349 find_mark_info2(BUFFER *bp)
350 {
351     const BOM_TABLE *mp = find_mark_info(get_bom(bp));
352 
353     if ((mp != 0) &&
354 	(mp->size == 0) &&
355 	(b_val(bp, VAL_FILE_ENCODING) > enc_UTF8)) {
356 	mp = find_mark_info(inferred_bom(bp, mp));
357     } else if (mp == 0 &&
358 	       (b_val(bp, VAL_FILE_ENCODING) > enc_UTF8)) {
359 	mp = find_mark_info(inferred_bom2(bp, (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK)));
360     }
361     return mp;
362 }
363 
364 static int
line_has_mark(const BOM_TABLE * mp,UCHAR * buffer,B_COUNT length)365 line_has_mark(const BOM_TABLE * mp, UCHAR * buffer, B_COUNT length)
366 {
367     int result = FALSE;
368 
369     if (length >= mp->size
370 	&& mp->size != 0
371 	&& memcmp(buffer, mp->mark, mp->size) == 0) {
372 	result = TRUE;
373     }
374     return result;
375 }
376 
377 static int
dump_as_utfXX(BUFFER * bp,const char * buf,int nbuf,const char * ending)378 dump_as_utfXX(BUFFER *bp, const char *buf, int nbuf, const char *ending)
379 {
380 #define BYTE_OF(k,n) (char) (bp->decode_utf_buf[k] >> ((n) * 8))
381     int rc = 0;
382     const BOM_TABLE *mp = find_mark_info2(bp);
383 
384     if (mp != 0 && mp->size > 1) {
385 	size_t j = 0;
386 	size_t k = 0;
387 	size_t need = (size_t) nbuf + strlen(ending);
388 	size_t lend = strlen(ending);
389 
390 	if (!allow_encoder(bp, need * mp->size))
391 	    goto finish;
392 	if (!allow_decoder(bp, need))
393 	    goto finish;
394 
395 	while (j < (unsigned) nbuf) {
396 	    int skip = vl_conv_to_utf32(bp->decode_utf_buf + k++,
397 					buf + j,
398 					(B_COUNT) ((UINT) nbuf - j));
399 	    if (skip == 0)
400 		goto finish;
401 	    j += (UINT) skip;
402 	}
403 	while (*ending != 0) {
404 	    int skip = vl_conv_to_utf32(bp->decode_utf_buf + k++,
405 					ending++,
406 					(B_COUNT) (lend--));
407 	    if (skip == 0)
408 		goto finish;
409 	}
410 	need = k;
411 
412 	for (j = k = 0; k < need; j += mp->size, ++k) {
413 	    switch (mp->code) {
414 	    case bom_NONE:
415 		/* FALLTHRU */
416 	    case bom_UTF8:
417 		/* FALLTHRU */
418 	    case bom_LE_ASSUMED:
419 		/* FALLTHRU */
420 	    case bom_BE_ASSUMED:
421 		/* ignored */
422 		break;
423 	    case bom_UTF16LE:
424 		bp->encode_utf_buf[j + 0] = BYTE_OF(k, 0);
425 		bp->encode_utf_buf[j + 1] = BYTE_OF(k, 1);
426 		break;
427 	    case bom_UTF16BE:
428 		bp->encode_utf_buf[j + 1] = BYTE_OF(k, 0);
429 		bp->encode_utf_buf[j + 0] = BYTE_OF(k, 1);
430 		break;
431 	    case bom_UTF32LE:
432 		bp->encode_utf_buf[j + 0] = BYTE_OF(k, 0);
433 		bp->encode_utf_buf[j + 1] = BYTE_OF(k, 1);
434 		bp->encode_utf_buf[j + 2] = BYTE_OF(k, 2);
435 		bp->encode_utf_buf[j + 3] = BYTE_OF(k, 3);
436 		break;
437 	    case bom_UTF32BE:
438 		bp->encode_utf_buf[j + 3] = BYTE_OF(k, 0);
439 		bp->encode_utf_buf[j + 2] = BYTE_OF(k, 1);
440 		bp->encode_utf_buf[j + 1] = BYTE_OF(k, 2);
441 		bp->encode_utf_buf[j + 0] = BYTE_OF(k, 3);
442 		break;
443 	    }
444 	}
445 	rc = (int) j;
446     }
447   finish:
448     return rc;
449 #undef BYTE_OF
450 }
451 
452 static void
set_byteorder_mark(BUFFER * bp,int value)453 set_byteorder_mark(BUFFER *bp, int value)
454 {
455     if (value != ENUM_UNKNOWN
456 	&& value != global_b_val(VAL_BYTEORDER_MARK)) {
457 	set_local_b_val(bp, VAL_BYTEORDER_MARK, value);
458 
459 	TRACE(("set_byteorder_mark for '%s' to %s\n",
460 	       bp->b_bname,
461 	       byteorder2s(b_val(bp, VAL_BYTEORDER_MARK))));
462     }
463 }
464 
465 static void
set_encoding(BUFFER * bp,int value)466 set_encoding(BUFFER *bp, int value)
467 {
468     if (value != ENUM_UNKNOWN
469 	&& value != global_b_val(VAL_FILE_ENCODING)) {
470 	set_local_b_val(bp, VAL_FILE_ENCODING, value);
471 
472 	TRACE(("set_encoding for '%s' to %s\n",
473 	       bp->b_bname,
474 	       encoding2s(b_val(bp, VAL_FILE_ENCODING))));
475     }
476 }
477 
478 static int
load_as_utf8(BUFFER * bp,LINE * lp)479 load_as_utf8(BUFFER *bp, LINE *lp)
480 {
481 #define CH(n) ((UCHAR)(lgetc(lp, n)))
482     int rc = FALSE;
483     const BOM_TABLE *mp = find_mark_info2(bp);
484 
485     if (mp != 0 && mp->size > 1) {
486 	int pass;
487 	size_t j, k;
488 	size_t need = (size_t) llength(lp);
489 	size_t used;
490 
491 	TRACE2(("load_as_utf8:%d:%s\n", need, lp_visible(lp)));
492 	if (allow_decoder(bp, need)) {
493 	    rc = TRUE;
494 	    if (need) {
495 		for (j = k = 0; j < need; ++k) {
496 		    UCHAR ch = CH(j);
497 		    if (ch == '\r' || ch == '\n') {
498 			bp->decode_utf_buf[k] = ch;
499 			++j;	/* see remove_crlf_nulls() */
500 			continue;
501 		    }
502 		    switch (inferred_bom(bp, mp)) {
503 		    case bom_NONE:
504 			/* FALLTHRU */
505 		    case bom_UTF8:
506 			/* FALLTHRU */
507 		    case bom_LE_ASSUMED:
508 			/* FALLTHRU */
509 		    case bom_BE_ASSUMED:
510 			/* ignored */
511 			break;
512 		    case bom_UTF16LE:
513 			bp->decode_utf_buf[k] = (CH(j)
514 						 + (UINT) (CH(j + 1) << 8));
515 			break;
516 		    case bom_UTF16BE:
517 			bp->decode_utf_buf[k] = (CH(j + 1)
518 						 + (UINT) (CH(j) << 8));
519 			break;
520 		    case bom_UTF32LE:
521 			bp->decode_utf_buf[k] = (CH(j + 0)
522 						 + (UINT) (CH(j + 1) << 8)
523 						 + (UINT) (CH(j + 2) << 16)
524 						 + (UINT) (CH(j + 3) << 24));
525 			break;
526 		    case bom_UTF32BE:
527 			bp->decode_utf_buf[k] = (CH(j + 3)
528 						 + (UINT) (CH(j + 2) << 8)
529 						 + (UINT) (CH(j + 1) << 16)
530 						 + (UINT) (CH(j + 0) << 24));
531 			break;
532 		    }
533 		    j += (UINT) mp->size;
534 		}
535 		used = k;
536 
537 		for (pass = 1; pass <= 2; ++pass) {
538 		    UCHAR *buffer = (pass == 1) ? 0 : (UCHAR *) lvalue(lp);
539 		    for (j = k = 0; j < used; ++j) {
540 			int nn = vl_conv_to_utf8(buffer,
541 						 bp->decode_utf_buf[j],
542 						 (B_COUNT) (used + 1 - j));
543 			if (buffer != 0)
544 			    buffer += nn;
545 			k += (UINT) nn;
546 		    }
547 		    if (pass == 1) {
548 			TRACE2(("need %d, have %d\n", k, lp->l_size));
549 			if ((int) k > llength(lp)) {
550 			    char *ntext;
551 
552 			    /*
553 			     * We are doing this conversion on the initial load
554 			     * of the buffer, do not want to allow undo.  Just
555 			     * go ahead and reallocate the line's text buffer.
556 			     */
557 			    if ((ntext = castalloc(char, (k + 1))) == NULL) {
558 				rc = FALSE;
559 				break;
560 			    }
561 			    ltextfree(lp, bp);
562 			    lvalue(lp) = ntext;
563 			    lp->l_size = k;
564 			    llength(lp) = (int) k;
565 			} else {
566 			    llength(lp) = (int) k;
567 			}
568 		    }
569 		}
570 	    }
571 	} else {
572 	    bp->decode_utf_len = 0;
573 	}
574     }
575     return rc;
576 #undef CH
577 }
578 
579 /*
580  * Remove the extra nulls (if any - according to the encoding) after
581  * \r and \n bytes.  This is done to make the existing logic for checking
582  * recordseparator work without change.
583  */
584 static void
remove_crlf_nulls(BUFFER * bp,UCHAR * buffer,B_COUNT * length)585 remove_crlf_nulls(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
586 {
587     const BOM_TABLE *mp = find_mark_info2(bp);
588     UCHAR mark_cr[4];
589     UCHAR mark_lf[4];
590     size_t marklen = 0;
591 
592     if (mp != 0) {
593 	memset(mark_cr, 0, sizeof(mark_cr));
594 	memset(mark_lf, 0, sizeof(mark_lf));
595 
596 	switch (mp->code) {
597 	case bom_NONE:
598 	    /* FALLTHRU */
599 	case bom_UTF8:
600 	    /* FALLTHRU */
601 	case bom_LE_ASSUMED:
602 	    /* FALLTHRU */
603 	case bom_BE_ASSUMED:
604 	    /* ignored */
605 	    break;
606 	case bom_UTF16LE:
607 	    marklen = 2;
608 	    mark_cr[0] = '\r';
609 	    mark_lf[0] = '\n';
610 	    break;
611 	case bom_UTF16BE:
612 	    marklen = 2;
613 	    mark_cr[1] = '\r';
614 	    mark_lf[1] = '\n';
615 	    break;
616 	case bom_UTF32LE:
617 	    marklen = 4;
618 	    mark_cr[0] = '\r';
619 	    mark_lf[0] = '\n';
620 	    break;
621 	case bom_UTF32BE:
622 	    marklen = 4;
623 	    mark_cr[3] = '\r';
624 	    mark_lf[3] = '\n';
625 	    break;
626 	}
627 	if (marklen != 0) {
628 	    B_COUNT dst = 0;
629 	    B_COUNT src = 0;
630 	    char skip = 0;
631 	    while (src < *length) {
632 		if (!memcmp(mark_cr, buffer + src, marklen))
633 		    skip = '\r';
634 		else if (!memcmp(mark_lf, buffer + src, marklen))
635 		    skip = '\n';
636 		if (skip) {
637 		    buffer[dst++] = (UCHAR) skip;
638 		    skip = 0;
639 		} else {
640 		    memcpy(buffer + dst, buffer + src, marklen);
641 		    dst += (B_COUNT) marklen;
642 		}
643 		src += (B_COUNT) marklen;
644 	    }
645 	    *length = dst;
646 	}
647     }
648 }
649 
650 /*
651  * Returns a percentage for the number of cells in the buffer which are
652  * explained by interpreting them according to the given byte-order mark
653  * pattern with the assumption that most of the content is ASCII or ISO-8859-1
654  * (8-bits).
655  */
656 static int
riddled_buffer(const BOM_TABLE * mp,UCHAR * buffer,B_COUNT length)657 riddled_buffer(const BOM_TABLE * mp, UCHAR * buffer, B_COUNT length)
658 {
659     int result = 0;
660     B_COUNT total = 0;
661     size_t offset = 0;
662     size_t j, k;
663 
664     if (mp->size && !(mp->size % 2)) {
665 	TRACE(("checking if %s / %u-byte\n",
666 	       byteorder2s(mp->code),
667 	       (UINT) mp->size));
668 
669 	/* Check the line-length.  If it is not a multiple of the pattern
670 	 * size, just give up.
671 	 */
672 	if ((length + offset) % mp->size) {
673 	    TRACE(("length %ld vs pattern %u - give up\n",
674 		   length,
675 		   (UINT) mp->size));
676 	} else {
677 	    /*
678 	     * Now walk through the line and measure the pattern against it.
679 	     */
680 	    for (j = offset; j < (unsigned) length; j += mp->size) {
681 		int found = 1;
682 		for (k = 0; k < mp->size; ++k) {
683 		    UCHAR have = buffer[j + k];
684 		    UCHAR want = (UCHAR) IsNonNull(mp->mark[k]);
685 		    if (!have ^ !want) {
686 			found = 0;
687 			break;
688 		    }
689 		}
690 		if (found) {
691 		    total += (B_COUNT) mp->size;
692 		}
693 	    }
694 	}
695 	result = (int) (length
696 			? (((100.0 * (double) total) / (double) length))
697 			: 0);
698 
699 	TRACE(("...%ld/%ld ->%d%%\n", total, length, result));
700     }
701     return result;
702 }
703 
704 static void
set_encoding_from_bom(BUFFER * bp,BOM_CODES bom_value)705 set_encoding_from_bom(BUFFER *bp, BOM_CODES bom_value)
706 {
707     const BOM_TABLE *mp;
708     int result;
709 
710     if (bom_value > bom_NONE
711 	&& (mp = find_mark_info(bom_value)) != 0) {
712 
713 	switch (mp->code) {
714 	case bom_UTF8:
715 	    result = enc_UTF8;
716 	    break;
717 	case bom_UTF16LE:
718 	case bom_UTF16BE:
719 	    result = enc_UTF16;
720 	    break;
721 	case bom_UTF32LE:
722 	case bom_UTF32BE:
723 	    result = enc_UTF32;
724 	    break;
725 	default:
726 	    result = ENUM_UNKNOWN;
727 	    break;
728 	}
729 	TRACE(("set_encoding_from_bom(%s) ->%s\n",
730 	       byteorder2s(mp->code),
731 	       encoding2s(result)));
732 	set_encoding(bp, result);
733     }
734 }
735 
736 static void
set_bom_from_encoding(BUFFER * bp,int enc_value)737 set_bom_from_encoding(BUFFER *bp, int enc_value)
738 {
739     int result = b_val(bp, VAL_BYTEORDER_MARK);
740 
741     if (result > bom_NONE) {
742 	switch (enc_value) {
743 	case enc_UTF8:
744 	case enc_UTF16:
745 	case enc_UTF32:
746 	    break;
747 	default:
748 	    if (result != ENUM_UNKNOWN
749 		&& result != global_b_val(VAL_BYTEORDER_MARK)) {
750 		set_local_b_val(bp, VAL_BYTEORDER_MARK, bom_NONE);
751 	    }
752 	    break;
753 	}
754     }
755 }
756 
757 /******************************************************************************/
758 
759 int
aligned_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length)760 aligned_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
761 {
762     int rc = FALSE;
763     const BOM_TABLE *mp = find_mark_info(get_bom(bp));
764 
765     (void) buffer;
766     if (mp != 0 && mp->size > 1) {
767 	rc = !(*length % mp->size);
768     }
769     return rc;
770 }
771 
772 int
cleanup_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length)773 cleanup_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
774 {
775     remove_crlf_nulls(bp, buffer, length);
776     return TRUE;
777 }
778 
779 /*
780  * Call this once after reading the buffer (or the first line).
781  * But do it before deducing the majormode (to avoid conflict with "preamble").
782  *
783  * It checks if the byteorder-mark is "auto", and if so, looks at the
784  * line to determine what value to use.  It sets the local buffer mode
785  * for the result.
786  *
787  * Having a value other than "none", it then modifies the first line,
788  * stripping the BOM bytes.
789  */
790 int
decode_bom(BUFFER * bp,UCHAR * buffer,B_COUNT * length)791 decode_bom(BUFFER *bp, UCHAR * buffer, B_COUNT * length)
792 {
793     const BOM_TABLE *mp;
794     int code = FALSE;
795     int result;
796     unsigned n;
797 
798     TRACE((T_CALLED "decode_bom(%s) length %ld\n", bp->b_bname, *length));
799 
800     if (b_val(bp, VAL_BYTEORDER_MARK) == ENUM_UNKNOWN) {
801 	result = bom_NONE;
802 	for (n = 0; n < TABLESIZE(bom_table); ++n) {
803 	    mp = bom_table + n;
804 	    if (line_has_mark(mp, buffer, *length)) {
805 		result = mp->code;
806 		TRACE(("...matched %d\n", result));
807 		break;
808 	    }
809 	}
810 	set_byteorder_mark(bp, result);
811     }
812 
813     if (b_val(bp, VAL_BYTEORDER_MARK) > bom_NONE
814 	&& (mp = find_mark_info((BOM_CODES) b_val(bp,
815 						  VAL_BYTEORDER_MARK))) != 0
816 	&& line_has_mark(mp, buffer, *length)) {
817 	for (n = 0; n < *length - mp->size; ++n) {
818 	    buffer[n] = buffer[n + mp->size];
819 	}
820 	while (n < *length) {
821 	    buffer[n++] = 0;
822 	}
823 	*length -= (B_COUNT) mp->size;
824 
825 	set_encoding_from_bom(bp, (BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK));
826 	code = TRUE;
827     }
828     returnCode(code);
829 }
830 
831 /*
832  * Rewrite the line from UTF-16 or UTF-32 into UTF-8.
833  * That may increase the number of bytes used to store the data.
834  */
835 int
decode_charset(BUFFER * bp,LINE * lp)836 decode_charset(BUFFER *bp, LINE *lp)
837 {
838     int rc = FALSE;
839 
840     if (b_val(bp, VAL_FILE_ENCODING) == enc_UTF16
841 	|| b_val(bp, VAL_FILE_ENCODING) == enc_UTF32) {
842 	rc = load_as_utf8(bp, lp);
843     }
844     return rc;
845 }
846 
847 /*
848  * Check if we have an explicit encoding.  If not, inspect the buffer contents
849  * to decide what encoding to use.
850  *
851  * By observation, some UTF-16 files written by other editors have no BOM.  It
852  * is possible that UTF-32 files may be missing a BOM as well.  We can
853  * determine this by seeing if the file is riddled with nulls (in the right
854  * pattern of course).  If we find a match for one of these, recode the buffer
855  * into UTF-8.
856  *
857  * If the encoding is unknown or 8-bit, we can inspect the buffer to see if it
858  * makes more sense as UTF-8.
859  */
860 int
deduce_charset(BUFFER * bp,UCHAR * buffer,B_COUNT * length,int always)861 deduce_charset(BUFFER *bp, UCHAR * buffer, B_COUNT * length, int always)
862 {
863     int rc = FALSE;
864 
865     TRACE((T_CALLED "deduce_charset(%s) bom:%s, encoding:%s\n",
866 	   bp->b_bname,
867 	   byteorder2s(b_val(bp, VAL_BYTEORDER_MARK)),
868 	   encoding2s(b_val(bp, VAL_FILE_ENCODING))));
869 
870     bp->implied_BOM = bom_NONE;
871     if (b_is_enc_AUTO(bp)) {
872 	unsigned n;
873 	int match = 0;
874 	int found = -1;
875 
876 	for (n = 0; n < TABLESIZE(bom_table); ++n) {
877 	    int check = riddled_buffer(&bom_table[n], buffer, *length);
878 	    if (check > match) {
879 		match = check;
880 		found = (int) n;
881 	    }
882 	}
883 	if (found > 0 && match >= b_val(bp, VAL_PERCENT_UTF8)) {
884 	    bp->implied_BOM = bom_table[found].code;
885 	    set_encoding_from_bom(bp, bp->implied_BOM);
886 	    TRACE(("...found_charset %s\n",
887 		   byteorder2s(bp->implied_BOM)));
888 
889 	    switch (bp->implied_BOM) {
890 	    case bom_UTF16BE:
891 	    case bom_UTF32BE:
892 		set_byteorder_mark(bp, bom_BE_ASSUMED);
893 		break;
894 	    case bom_UTF16LE:
895 	    case bom_UTF32LE:
896 		set_byteorder_mark(bp, bom_LE_ASSUMED);
897 		break;
898 	    default:
899 		break;
900 	    }
901 	    rc = TRUE;
902 	} else if (always) {
903 	    TRACE(("...try looking for UTF-8\n"));
904 	    if (check_utf8(buffer, *length) == TRUE)
905 		found_utf8(bp);
906 	}
907     } else {
908 	rc = TRUE;
909     }
910     remove_crlf_nulls(bp, buffer, length);
911     returnCode(rc);
912 }
913 
914 /*
915  * Check if the given buffer should be treated as UTF-8.
916  * For UTF-8, we have to have _some_ UTF-8 encoding, and _all_
917  * of the buffer has to match the pattern.
918  */
919 int
check_utf8(UCHAR * buffer,B_COUNT length)920 check_utf8(UCHAR * buffer, B_COUNT length)
921 {
922     B_COUNT n;
923     int check = TRUE;
924     int skip = 0;
925     int found;
926     UINT target;
927 
928     for (n = 0, found = 0; n < length - 1; n += (B_COUNT) skip) {
929 	skip = vl_conv_to_utf32(&target,
930 				(char *) (buffer + n),
931 				length - n);
932 	if (skip == 0) {
933 	    check = FALSE;
934 	    break;
935 	} else if (skip > 1) {
936 	    found = 1;
937 	}
938     }
939     return ((check && found)
940 	    ? TRUE
941 	    : (check
942 	       ? SORTOFTRUE
943 	       : FALSE));
944 }
945 
946 /*
947  * If we found UTF-8 encoding, set the buffer to match.
948  */
949 void
found_utf8(BUFFER * bp)950 found_utf8(BUFFER *bp)
951 {
952     TRACE(("...found UTF-8\n"));
953     bp->implied_BOM = bom_UTF8;
954     set_encoding_from_bom(bp, bp->implied_BOM);
955 }
956 
957 /*
958  * encode BOM while writing file, without modifying the buffer.
959  */
960 int
write_bom(BUFFER * bp)961 write_bom(BUFFER *bp)
962 {
963     const BOM_TABLE *mp;
964     int status = FIOSUC;
965 
966     if ((mp = find_mark_info((BOM_CODES) b_val(bp, VAL_BYTEORDER_MARK))) != 0) {
967 	status = ffputline((const char *) mp->mark, (int) mp->size, NULL);
968     }
969     return status;
970 }
971 
972 /*
973  * encode the UTF-8 text into UTF-16 or UTF-32, according to the buffer's
974  * file-encoding mode.
975  */
976 int
encode_charset(BUFFER * bp,const char * buf,int nbuf,const char * ending)977 encode_charset(BUFFER *bp, const char *buf, int nbuf, const char *ending)
978 {
979     int rc = 0;
980 
981     if (b_val(bp, VAL_FILE_ENCODING) == enc_UTF16
982 	|| b_val(bp, VAL_FILE_ENCODING) == enc_UTF32) {
983 	rc = dump_as_utfXX(bp, buf, nbuf, ending ? ending : "");
984     }
985     return rc;
986 }
987 
988 /*
989  * if byteorder mark changes, ensure that file-encoding is set compatibly.
990  */
991 int
chgd_byteorder(BUFFER * bp,VALARGS * args,int glob_vals,int testing)992 chgd_byteorder(BUFFER *bp,
993 	       VALARGS * args,
994 	       int glob_vals,
995 	       int testing)
996 {
997     if (!testing && !glob_vals) {
998 	set_encoding_from_bom(bp, (BOM_CODES) args->local->vp->i);
999     }
1000     return TRUE;
1001 }
1002 
1003 /*
1004  * If file-encoding changes to non-UTF-8, set byteorder-mark to none.
1005  * Only keep it set if changing from one UTF-encoding to another.
1006  */
1007 int
chgd_fileencode(BUFFER * bp,VALARGS * args,int glob_vals,int testing)1008 chgd_fileencode(BUFFER *bp,
1009 		VALARGS * args,
1010 		int glob_vals,
1011 		int testing)
1012 {
1013     if (testing) {
1014 	;
1015     } else {
1016 	int new_encoding = args->local->vp->i;
1017 	if (glob_vals) {
1018 	    if (new_encoding == enc_POSIX) {
1019 		rebuild_charclasses(0, 127);
1020 	    } else {
1021 		rebuild_charclasses(global_g_val(GVAL_PRINT_LOW),
1022 				    global_g_val(GVAL_PRINT_HIGH));
1023 	    }
1024 	} else {
1025 	    set_bom_from_encoding(bp, new_encoding);
1026 	}
1027 	set_bufflags(glob_vals, WFHARD | WFMODE);
1028     }
1029     return TRUE;
1030 }
1031 
1032 const char *
byteorder2s(int code)1033 byteorder2s(int code)
1034 {
1035     return choice_to_name(&fsm_byteorder_mark_blist, code);
1036 }
1037 
1038 const char *
encoding2s(int code)1039 encoding2s(int code)
1040 {
1041     return choice_to_name(&fsm_file_encoding_blist, code);
1042 }
1043