1 /*
2 * translat.c: Stuff for handling different encodings
3 * and a digraph entry facility. Support an international IRC!
4 *
5 * <subliminal message> you start using utf-8 and
6 * discard all legacy encodings</subliminal message>
7 *
8 * Joel Yliluoma.
9 *
10 * $Id: translat.c,v 1.8 2007-03-31 10:56:17 f Exp $
11 *
12 */
13
14 #include "irc.h"
15
16 #ifdef HAVE_ICONV_H
17 #include <iconv.h>
18 #endif /* HAVE_ICONV_H */
19
20 #include "vars.h"
21 #include "translat.h"
22 #include "ircaux.h"
23 #include "window.h"
24 #include "screen.h"
25 #include "output.h"
26
/* Forward declaration: parses one /DIGRAPH argument word into a character. */
static char my_getarg(char **);

/* Globals */

/*
 * Set to 1 whenever /DIGRAPH modifies the tables; save_digraphs() writes
 * nothing while it is still 0.
 */
char digraph_changed = 0;

#ifdef HAVE_ICONV_OPEN
/*
 * Encoding names stored by set_irc_encoding(), set_display_encoding() and
 * set_input_encoding().  NULL until a supported value has been set.
 */
char *irc_encoding = NULL;
char *display_encoding = NULL;
char *input_encoding = NULL;
#endif /* HAVE_ICONV_OPEN */
37
38
/*
 * dig_table_lo[] and dig_table_hi[] contain the character pair that
 * will result in the digraph in dig_table_di[].  To avoid searching
 * both tables, I take the lower character of the pair, and only
 * search dig_table_lo[].  Thus, dig_table_lo[] must always contain
 * the lower character of the pair.
 *
 * The digraph tables are based on those in the excellent editor Elvis,
 * with some additions for those, like me, who are used to VT320 or
 * VT420 terminals.
 *
 * "digraph.inc" is included three times below.  Before each inclusion
 * exactly one of DiLo()/DiHi()/DiDi() is defined to expand to its
 * argument (followed by a comma) while the other two expand to nothing,
 * so each array picks up one column of the same list.  Every array is
 * closed with an explicit 0, which the code in this file uses as the
 * end-of-table marker.
 */

#define DiLo(x) x,
#define DiHi(x)
#define DiDi(x)

/*
 * Digraph tables.  Note that, when adding a new digraph, the character
 * of the pair with the lowest value, *must* be in the DiLo column.
 * The higher of the pair goes in DiHi, and the digraph itself in DiDi.
 */

/* Lower character of each pair (the search key). */
char dig_table_lo[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};


#undef DiLo
#undef DiHi
#undef DiDi
#define DiLo(x)
#define DiHi(x) x,
#define DiDi(x)

/* Higher character of each pair. */
char dig_table_hi[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};


#undef DiLo
#undef DiHi
#undef DiDi
#define DiLo(x)
#define DiHi(x)
#define DiDi(x) x,

/* The resulting digraph character for each pair. */
char dig_table_di[DIG_TABLE_SIZE] =
{
#include "digraph.inc"
	0
};
94
95
96 /*
97 * enter_digraph: The BIND function ENTER_DIGRAPH.
98 */
99 void
enter_digraph(key,str)100 enter_digraph(key, str)
101 u_int key;
102 char *str;
103 {
104
105 current_screen->digraph_hit = 1; /* Just stuff away first character. */
106 }
107
108 /*
109 * get_digraph: Called by edit_char() when a digraph entry is activated.
110 * Looks up a digraph given u_char c1 and the global u_char
111 * current_screen->digraph_hit.
112 */
113 char
get_digraph(ic1)114 get_digraph(ic1)
115 u_int ic1;
116 {
117 int i = 0;
118 char c,
119 c2 = current_screen->digraph_first,
120 c1 = (u_char)ic1;
121
122 current_screen->digraph_hit = 0;
123 if (c1 > c2) /* Make sure we have the lowest one in c1. */
124 c = c1, c1 = c2, c2 = c;
125 while (dig_table_lo[i])
126 { /* Find digraph and return it. */
127 if ((dig_table_lo[i] == c1) && (dig_table_hi[i] == c2))
128 return dig_table_di[i];
129 i++;
130 }
131 return 0; /* Failed lookup. */
132 }
133
134
135 /*
136 * digraph: The /DIGRAPH command with facilities.
137 * This routine is *NOT* finished yet.
138 */
139
140 void
digraph(command,args,subargs)141 digraph(command, args, subargs)
142 char *command,
143 *args,
144 *subargs;
145 {
146 char *arg;
147 char c1,
148 c2 = '\0',
149 c3 = '\0';
150 int i;
151 size_t len;
152
153 if ((arg = next_arg(args, &args)) && (*arg == '-'))
154 {
155 char *cmd = (char *) 0;
156
157 arg++;
158 if ((len = strlen(arg)) == 0)
159 {
160 say("Unknown or missing flag.");
161 return;
162 }
163 malloc_strcpy(&cmd, arg);
164 lower(cmd);
165 if (strncmp(cmd, "add", len) == 0)
166 {
167 /*
168 * Add a digraph to the table.
169 * I *know*. This *is* a kludge.
170 */
171 if ((i = strlen(dig_table_lo)) ==
172 DIG_TABLE_SIZE - 1)
173 say("Sorry, digraph table full.");
174 else
175 {
176 while ((c1 = my_getarg(&args)) &&
177 (c2 = my_getarg(&args)) &&
178 (c3 = my_getarg(&args)))
179 {
180 /* Pass c1 to get_digraph() */
181 current_screen->digraph_first = c1;
182 if (get_digraph(c2) == 0)
183 {
184 dig_table_di[i] = c3;
185 /* Make sure c1 <= c2 */
186 if (c1 > c2)
187 c3 = c1, c1 = c2, c2 = c3;
188 dig_table_lo[i] = c1;
189 dig_table_hi[i] = c2;
190 i++;
191 dig_table_lo[i] =
192 dig_table_hi[i] =
193 dig_table_di[i] =
194 (u_char) 0;
195 digraph_changed = 1;
196 say("Digraph added to table.");
197 }
198 else
199 {
200 say("Digraph already defined in table.");
201 break;
202 }
203 }
204 if (!c2 || !c3)
205 say("Unknown or missing argument.");
206 }
207
208 }
209 else if (strncmp(cmd, "remove", len) == 0)
210 {
211
212 /* Remove a digraph from the table. */
213 if ((i = strlen(dig_table_lo)) == 0)
214 say("Digraph table is already empty.");
215 else
216 {
217 if ((c1 = my_getarg(&args)) &&
218 (c2 = my_getarg(&args)))
219 {
220 i = 0;
221 if (c1 > c2)
222 c3 = c1, c1 = c2, c2 = c3;
223 while (dig_table_lo[i])
224 {
225 if ((dig_table_lo[i] == c1) &&
226 (dig_table_hi[i] == c2))
227 /*
228 * FIXME: strcpy() is not guaranteed for
229 * overlapping copying, but this one
230 * is high -> low. Ought to be fixed.
231 */
232 /* re-indent this block - phone, jan 1993. */
233 {
234 strcpy(dig_table_lo + i, dig_table_lo + i + 1);
235 strcpy(dig_table_hi + i, dig_table_hi + i + 1);
236 strcpy(dig_table_di + i, dig_table_di + i + 1);
237 digraph_changed = 1;
238 put_it("Digraph removed from table.");
239 return;
240 }
241 /* much better */
242 i++;
243 }
244 say("Digraph not found.");
245 }
246 }
247 }
248 else if (strncmp(cmd, "clear", len) == 0)
249 {
250
251 /* Clear digraph table. */
252 dig_table_lo[0] = dig_table_hi[0] = dig_table_di[0] =
253 (u_char) 0;
254 digraph_changed = 1;
255 say("Digraph table cleared.");
256
257 }
258 else
259 say("Unknown flag.");
260 }
261 else
262 {
263
264 /* Display digraph table. */
265 u_char buffer1[8];
266 u_char buffer2[192];
267
268 say("Digraph table:");
269 buffer2[0] = (u_char) 0;
270 i = 0;
271 while(dig_table_lo[i])
272 {
273 snprintf(CP(buffer1), sizeof buffer1, "%c%c %c ", dig_table_lo[i],
274 dig_table_hi[i], dig_table_di[i]);
275 strcat(buffer2, buffer1);
276 if ((++i % 10) == 0)
277 {
278 put_it(CP(buffer2));
279 buffer2[0] = (u_char) 0;
280 }
281 }
282 if (buffer2[0])
283 put_it(CP(buffer2));
284 snprintf(CP(buffer2), sizeof buffer2, "%d digraphs listed.", i);
285 say(CP(buffer2));
286 }
287 }
288
289 static char
my_getarg(args)290 my_getarg(args)
291 char **args;
292 {
293 char *arg;
294
295 arg = (char *)next_arg(*args, args);
296 if (!args || !*args || !arg)
297 return '\0';
298 /* Don't trust isdigit() with 8 bits. */
299 if ((*arg <= '9') && (*arg >= '0'))
300 {
301 u_char i = *arg & 0x0f;
302 while ( *(++arg) )
303 i = (i * 10) + (*arg & 0x0f);
304 return i;
305 }
306 else if ( (*arg == '!') && (*(arg + 1)) )
307 return *(arg + 1) | 0x80;
308 return *arg;
309 }
310
311 void
save_digraphs(fp)312 save_digraphs(fp)
313 FILE *fp;
314 {
315 if (digraph_changed)
316 {
317
318 int i = 0;
319 char *command = "\nDIGRAPH -ADD ";
320
321 fprintf(fp, "DIGRAPH -CLEAR");
322 fprintf(fp, "%s", command);
323 while(1)
324 {
325 fprintf(fp, "%d %d %d ", dig_table_lo[i],
326 dig_table_hi[i], dig_table_di[i]);
327 if (!dig_table_lo[++i])
328 break;
329 if (!(i % 5))
330 fprintf(fp, "%s", command);
331 }
332 fputc('\n', fp);
333
334 }
335 }
336
/*
 * displayable_unival: tell whether a unicode value can be shown as is.
 *
 * Returns 0 for control values (see below).  When iconv support is
 * compiled in and conv_out is non-NULL, also returns 0 if iconv()
 * reports failure converting the character through conv_out.
 * Returns 1 otherwise.
 */
char
displayable_unival(unsigned unival, iconv_t conv_out)
{
	/*
	 * Rule out control characters first.
	 *
	 * Range 0x80..0x9F is used in some character sets (such as cp850),
	 * but they are assigned different positions in unicode, and the
	 * values handled here are unicode positions.  In unicode,
	 * 0x80..0x9F are not used because some programs might still
	 * blindly assume 7-bitness and take those as control characters.
	 * 0x7F is delete/backspace.
	 * 0xFFF0..0xFFFF is the unicode control range; it contains a
	 * signature token, an illegal character token and so on.
	 */
	if (unival < 0x20 || unival == 0x7F)
		return 0;
	if (unival >= 0x80 && unival < 0xA0)
		return 0;
	if (unival >= 0xFFF0 && unival <= 0xFFFF)
		return 0;

#ifdef HAVE_ICONV_OPEN
	if (conv_out)
	{
		u_char	seq[8];
		u_char	converted[256];
		u_char	*src = seq;
		u_char	*dst = converted;
		size_t	src_left;
		size_t	dst_left = sizeof converted;

		/*
		 * Sequence the character into a buffer and let iconv
		 * say whether it can be displayed.
		 */
		utf8_sequence(unival, seq);
		src_left = strlen((char *)seq);

		/* reset the converter */
		iconv(conv_out, NULL, 0, NULL, 0);

		if (iconv(conv_out,
		    (iconv_const char**)&src, &src_left,
		    (char **)&dst, &dst_left) == (size_t)-1)
			return 0;
		return 1;
	}
#endif /* HAVE_ICONV_OPEN */
	return 1;
}
396
/*
 * calc_unival_width: number of display columns taken by a unicode value.
 *
 * Returns 0 for the combining ranges listed below, 2 for the range
 * treated as CJK (0x3000..0xFEFF), and 1 for everything else.
 *
 * FIXME: Should we use some kind of database here?
 * FIXME: Combining marks support is completely untested
 */
unsigned
calc_unival_width(unsigned unival)
{
	/*
	 * The zero-width ranges are tested first: the combining half-marks
	 * lie inside the broad CJK range below and would otherwise never
	 * be reached.
	 */

	/* combining diacritical marks (the block ends at 0x036F; 0x0370 and up is Greek) */
	if (unival >= 0x0300 && unival < 0x0370)
		return 0;
	/* combining diacritical marks for symbols */
	if (unival >= 0x20D0 && unival < 0x2100)
		return 0;
	/* combining half-marks */
	if (unival >= 0xFE20 && unival < 0xFE30)
		return 0;
	/* chinese, japanese, korean */
	if (unival >= 0x3000 && unival < 0xFF00)
		return 2;
	/* everything else */
	return 1;
}
419
/*
 * calc_unival_length: number of bytes in the utf-8 sequence that
 * starts at str, judged from the upper four bits of the first byte:
 *
 *   0..7    1 byte   (0..7F)
 *   C, D    2 bytes  (80..7FF)
 *   E       3 bytes  (800..FFFF)
 *   F       4 bytes  (10000..1FFFFF)
 *   8..B    0        (invalid: these can not begin a sequence)
 *
 * If utf8 is some day extended to use 5-byte codings, the first byte
 * has to be examined one bit further (shift by 3 instead of 4), and
 * utf8_sequence() and calc_unival() need to be modified as well.
 *
 * Today, it seems unlikely that these encodings will be needed in
 * practical applications such as an irc client.  2-3 -byte encodings
 * are in daily use everywhere.
 */
unsigned
calc_unival_length(const u_char* str)
{
	unsigned	lead = *str >> 4;

	if (lead < 0x8)
		return 1;
	if (lead < 0xC)
		return 0;
	if (lead < 0xE)
		return 2;
	return (lead == 0xE) ? 3 : 4;
}
454
/*
 * calc_unival: unicode value of the utf-8 sequence starting at utfbuf.
 * This is the reverse of utf8_sequence().
 *
 * The sequence length comes from calc_unival_length(); only that many
 * bytes are read.  A length other than 2, 3 or 4 (including the 0
 * reported for an invalid first byte) yields the low seven bits of
 * the first byte.  Continuation bytes are not validated.
 */
unsigned
calc_unival(const u_char *utfbuf)
{
	unsigned	len = calc_unival_length(utfbuf);
	unsigned	value;
	unsigned	k;

	/* Payload bits carried by the first byte. */
	switch (len)
	{
	case 2:
		value = utfbuf[0] & 31;
		break;
	case 3:
		value = utfbuf[0] & 15;
		break;
	case 4:
		value = utfbuf[0] & 7;
		break;
	default:
		return utfbuf[0] & 127;
	}

	/* Every following byte contributes six more bits. */
	for (k = 1; k < len; k++)
		value = (value << 6) | (utfbuf[k] & 63);
	return value;
}
478
/*
 * utf8_sequence: write unival to utfbuf as a zero-terminated utf-8
 * sequence of one to four bytes.  This is the reverse of calc_unival().
 *
 * The output buffer should have 5 bytes of space.
 */
void
utf8_sequence(unsigned unival, u_char* utfbuf)
{
	unsigned	n = 0;

	if (unival < 0x80)		/* <=7 bits */
	{
		utfbuf[n++] = (char)unival;
	}
	else if (unival < 0x800)	/* <=11 bits */
	{
		utfbuf[n++] = (char)(0xC0 + (unival>>6));
		utfbuf[n++] = (char)(0x80 + (unival&63));
	}
	else if (unival < 0x10000)	/* <=16 bits */
	{
		utfbuf[n++] = (char)(0xE0 + (unival>>12));
		utfbuf[n++] = (char)(0x80 + ((unival>>6)&63));
		utfbuf[n++] = (char)(0x80 + (unival&63));
	}
	else				/* <=21 bits */
	{
		utfbuf[n++] = (char)(0xF0 + (unival>>18));
		utfbuf[n++] = (char)(0x80 + ((unival>>12)&63));
		utfbuf[n++] = (char)(0x80 + ((unival>>6)&63));
		utfbuf[n++] = (char)(0x80 + (unival&63));
	}

	/* Last put a zero-terminator. */
	utfbuf[n] = '\0';
}
511
512 void
mbdata_init(struct mb_data * d,const char * enc)513 mbdata_init(struct mb_data *d, const char *enc)
514 {
515 bzero(d, sizeof(*d));
516
517 #ifdef HAVE_ICONV_OPEN
518 d->enc = enc;
519 if (!d->conv_in && !d->conv_out && d->enc && display_encoding)
520 {
521 /* New encoding, reinitialize converters */
522
523 if (!d->conv_in)
524 d->conv_in = iconv_open("UTF-8", d->enc);
525 if (!d->conv_out)
526 d->conv_out = iconv_open(display_encoding, "UTF-8");
527
528 if (d->conv_in == (iconv_t)(-1))
529 {
530 iconv_close(d->conv_in);
531 d->conv_in = NULL;
532 }
533 if (d->conv_out == (iconv_t)(-1))
534 {
535 iconv_close(d->conv_out);
536 d->conv_out = NULL;
537 }
538 }
539 #endif /* HAVE_ICONV_OPEN */
540 }
541
542 void
mbdata_done(struct mb_data * d)543 mbdata_done(struct mb_data* d)
544 {
545 #ifdef HAVE_ICONV_OPEN
546 if (d->conv_in)
547 iconv_close(d->conv_in);
548 if (d->conv_out)
549 iconv_close(d->conv_out);
550 #endif /* HAVE_ICONV_OPEN */
551 bzero(d, sizeof(*d));
552 }
553
554 void
decode_mb(ptr,dest,data)555 decode_mb(ptr, dest, data)
556 u_char *ptr; /* Source, encoded in whatever */
557 u_char *dest; /* Target, encoded in utf-8 - NULL is allowed */
558 mb_data *data; /* Populated with data*/
559 {
560 #ifdef HAVE_ICONV_OPEN
561 /* If iconv has now been initialized, use it. */
562 if (data->conv_in && data->conv_out)
563 {
564 /* Task:
565 * Eat input byte by byte
566 * Until either
567 * - the input is exhausted
568 * - conv_in creates a character
569 * When conv_in creates a character,
570 * - feed the character to conv_out
571 * - if conv_out says dame desu yo
572 * - we have an invalid character
573 * - otherwise, analyze the unicode value
574 * - For values 0000..001F: add 40, invert (invalid)
575 * - For values 0080..009F: dec 40, invert (invalid)
576 * - For values 3000..FEFF: (CJK) width=2
577 */
578 u_char utfbuf[8], *utfptr = utfbuf;
579 size_t utfspace = sizeof(utfbuf);
580 unsigned unival;
581 int error = 0;
582 size_t retval = 0;
583
584 data->input_bytes = 0;
585 data->output_bytes = 0;
586 data->num_columns = 0;
587
588 *utfptr = '\0';
589
590 while (*ptr != '\0')
591 {
592 unsigned gave;
593 size_t is = 1;
594 u_char *cptr, *cutfptr;
595
596 retry:
597 gave = is;
598 cptr = (char *)ptr;
599 cutfptr = (char *)utfptr;
600 retval = iconv(data->conv_in,
601 (iconv_const char**)&ptr, &is,
602 (char **)&utfptr, &utfspace);
603
604 data->input_bytes += gave-is;
605
606 if (retval == (size_t)-1)
607 {
608 switch (errno)
609 {
610 case EINVAL:
611 /* We didn't give enough bytes. Must give more */
612 is = gave;
613 if (ptr[is] != '\0')
614 {
615 ++is;
616 goto retry;
617 }
618 /* It's an undecodable input. */
619 error = 1;
620 data->input_bytes = 1;
621 ++ptr;
622 goto endloop;
623 case EILSEQ:
624 /* Ignore invalid byte, continue loop. */
625 error = 1;
626 if (*ptr != '\0')
627 {
628 ++ptr;
629 ++data->input_bytes;
630 }
631 continue;
632 }
633 }
634
635 if (utfptr > utfbuf)
636 {
637 /* An UTF-8 character was created! */
638 data->output_bytes += utfptr-utfbuf;
639 *utfptr = '\0';
640 endloop:
641 break;
642 }
643 break;
644 }
645
646 if (data->output_bytes == 0 && !error)
647 {
648 /* Nothing was produced, no errors. */
649 return;
650 }
651
652 unival = 0;
653
654 if (data->output_bytes > 0)
655 {
656 /* Calculate the unicode value of the utf8 character */
657 unival = calc_unival(utfbuf);
658 }
659
660 if (!displayable_unival(unival, data->conv_out))
661 {
662 /* The character could not be expressed in display encoding
663 * or would be a control character
664 */
665 data->num_columns = data->input_bytes;
666 data->output_bytes = data->input_bytes;
667 if (data->output_bytes > 0)
668 data->output_bytes += 2;
669 if (dest)
670 {
671 unsigned n = data->input_bytes;
672 if (n > 0)
673 {
674 ptr -= n;
675 *dest++ = REV_TOG;
676 /* we assume ascii always works */
677 while (n-- > 0)
678 *dest++ = (*ptr++ & 127) | 64;
679 *dest++ = REV_TOG;
680 }
681 }
682 return;
683 }
684
685 data->num_columns = calc_unival_width(unival);
686
687 if (dest)
688 {
689 memcpy(dest, utfbuf, data->output_bytes);
690 }
691 return;
692 }
693 #endif /* HAVE_ICONV_OPEN */
694 /* No usable iconv (maybe csets were invalid), assume ISO-8859-1 in */
695 data->input_bytes = 1;
696 data->num_columns = 1;
697
698 if (!displayable_unival(*ptr, NULL))
699 {
700 data->output_bytes = 3;
701 if (dest)
702 {
703 *dest++ = REV_TOG;
704 *dest++ = (*ptr & 127) | 64;
705 *dest++ = REV_TOG;
706 }
707 }
708 else
709 {
710 unsigned unival = *ptr;
711
712 if (unival < 0x80)
713 data->output_bytes = 1;
714 else if (unival < 0x800)
715 data->output_bytes = 2;
716 else if (unival < 0x10000)
717 data->output_bytes = 3;
718 else
719 data->output_bytes = 4;
720 if (dest)
721 utf8_sequence(unival, dest);
722 }
723 }
724
/*
 * set_irc_encoding: store enc as the IRC_ENCODING value.
 *
 * The name is accepted only if iconv_open() can build a converter from
 * it to UTF-8; otherwise a message is shown and irc_encoding is left
 * unchanged.  Without iconv support only a notice is printed.
 *
 * NOTE(review): a NULL enc (presumably what arrives when the variable
 * is unset - confirm against the caller) is reported as unsupported
 * instead of being passed to iconv_open().
 */
void
set_irc_encoding(char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t	probe;

	if (enc == NULL)
	{
		say("IRC_ENCODING value %s is not supported by this system", "(none)");
		return;
	}

	/* Open a throw-away converter just to validate the name. */
	probe = iconv_open("UTF-8", enc);
	if (probe == NULL || probe == (iconv_t)(-1))
	{
		say("IRC_ENCODING value %s is not supported by this system", enc);
		return;
	}
	iconv_close(probe);
	malloc_strcpy(&irc_encoding, enc);
#else
	say("IRC_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}
742
/*
 * set_display_encoding: store enc as the DISPLAY_ENCODING value.
 *
 * The name is accepted only if iconv_open() can build a converter from
 * UTF-8 to it; otherwise a message is shown and display_encoding is
 * left unchanged.  Without iconv support only a notice is printed.
 *
 * NOTE(review): a NULL enc (presumably what arrives when the variable
 * is unset - confirm against the caller) is reported as unsupported
 * instead of being passed to iconv_open().
 */
void
set_display_encoding(char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t	probe;

	if (enc == NULL)
	{
		say("DISPLAY_ENCODING value %s is not supported by this system", "(none)");
		return;
	}

	/* Open a throw-away converter just to validate the name. */
	probe = iconv_open(enc, "UTF-8");
	if (probe == NULL || probe == (iconv_t)(-1))
	{
		say("DISPLAY_ENCODING value %s is not supported by this system", enc);
		return;
	}
	iconv_close(probe);
	malloc_strcpy(&display_encoding, enc);
#else
	say("DISPLAY_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}
760
/*
 * set_input_encoding: store enc as the INPUT_ENCODING value.
 *
 * The name is accepted only if iconv_open() can build a converter from
 * it to UTF-8; otherwise a message is shown and input_encoding is left
 * unchanged.  Without iconv support only a notice is printed.
 *
 * NOTE(review): a NULL enc (presumably what arrives when the variable
 * is unset - confirm against the caller) is reported as unsupported
 * instead of being passed to iconv_open().
 */
void
set_input_encoding(char *enc)
{
#ifdef HAVE_ICONV_OPEN
	iconv_t	probe;

	if (enc == NULL)
	{
		say("INPUT_ENCODING value %s is not supported by this system", "(none)");
		return;
	}

	/* Open a throw-away converter just to validate the name. */
	probe = iconv_open("UTF-8", enc);
	if (probe == NULL || probe == (iconv_t)(-1))
	{
		say("INPUT_ENCODING value %s is not supported by this system", enc);
		return;
	}
	iconv_close(probe);
	malloc_strcpy(&input_encoding, enc);
#else
	say("INPUT_ENCODING has no effect - this version was compiled without iconv support");
#endif /* HAVE_ICONV_OPEN */
}
778