1 /* format.c
2 * Format text, establish line breaks, manage whitespace.
3 * This file is part of the edbrowse project, released under GPL.
4 */
5
6 #include "eb.h"
7
8 /*********************************************************************
9 Prepare html for text processing.
10 Change nulls to spaces.
11 Make sure it doesn't already contain my magic code,
12 The one I use to indicate a tag.
13 If it does, well, change them to something else.
14 I can only hope this doesn't screw up some embedded javascript.
15 *********************************************************************/
16
prepareForBrowse(char * h,int h_len)17 void prepareForBrowse(char *h, int h_len)
18 {
19 int i, j;
20
21 for (i = j = 0; i < h_len; ++i) {
22 if (h[i] == 0)
23 h[i] = ' ';
24 if (h[i] == '\b') {
25 if (i && !strchr("\n\b<>'\"&", h[i - 1]))
26 --j;
27 continue;
28 }
29 if (h[i] == InternalCodeChar)
30 h[i] = InternalCodeCharAlternate;
31 h[j++] = h[i];
32 }
33 h[j] = 0; /* now it's a string */
34
35 /* undos the file */
36 for (i = j = 0; h[i]; ++i) {
37 if (h[i] == '\r' && h[i + 1] == '\n')
38 continue;
39 h[j++] = h[i];
40 }
41 h[j] = 0;
42 } /* prepareForBrowse */
43
44 /* An input field cannot contain newline, null, or the InternalCodeChar */
45 // Revised June 2018, maybe newline is ok. We need it for textarea.
prepareForField(char * h)46 void prepareForField(char *h)
47 {
48 while (*h) {
49 if (*h == 0)
50 *h = ' ';
51 if (*h == InternalCodeChar)
52 *h = InternalCodeCharAlternate;
53 ++h;
54 }
55 } /* prepareForField */
56
57 /*********************************************************************
58 The primary goal of this routine is to turn
59 Hey,{ click here } for more information
60 into
61 Hey, {click here} for more information
62 But of course we won't do that if the section is preformatted.
63 Nor can we muck with the whitespace that might be present in an input field <>.
64 Also swap 32* whitespace, pushing invisible anchors forward.
65 If a change is made, the procedure is run again,
66 kinda like bubble sort.
67 It has the potential to be terribly inefficient,
68 but that doesn't seem to happen in practice.
69 Use cnt to count the iterations, just for debugging.
70 | is considered a whitespace character. Why is that?
71 Html tables are mostly used for visual layout, but sometimes not.
72 I use | to separate the cells of a table, but if there's nothing in them,
73 or at least no text, then I get rid of the pipes.
74 But every cell is going to have an invisible anchor from <td>, so that js can,
75 perhaps, set innerHTML inside this cell.
76 So there's something there, but nothing there.
77 I push these tags past pipes, so I can clear it all away.
78 One web page in ten thousand will actually set html inside a cell,
79 after the fact, and when that happens the text won't be in the right place,
80 it won't have the pipes around it that it should.
81 I'm willing to accept that for now.
82 *********************************************************************/
83
cellDelimiters(char * buf)84 static void cellDelimiters(char *buf)
85 {
86 char *lastcell = 0;
87 int cellcount = 0;
88 char *s;
89
90 for (s = buf; *s; ++s) {
91 if (*s == TableCellChar) {
92 *s = '|';
93 lastcell = s;
94 ++cellcount;
95 continue;
96 }
97 if (!strchr("\f\r\n", *s))
98 continue;
99 /* newline here, if just one cell delimiter then blank it out */
100 if (cellcount == 1)
101 *lastcell = ' ';
102 cellcount = 0;
103 }
104 } /* cellDelimiters */
105
anchorSwap(char * buf)106 static void anchorSwap(char *buf)
107 {
108 char c, d, *s, *ss, *w, *a;
109 bool pretag; // <pre>
110 bool premode; // inside <pre> </pre>
111 bool inputmode; // inside an input field
112 bool slash; // closing tag
113 bool change; // made a swap somewhere
114 bool strong; // strong whitespace, newline or paragraph
115 int n, cnt;
116 char tag[20];
117
118 static const char from[] =
119 "\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85";
120 static const char becomes[] = "_*'`'`'`' ----";
121 /* I use to convert a6 and c2 to hyphen space, not sure why */
122
123 /* Transliterate a few characters. One of them is 0xa0 to space,
124 * so we need to do this now, before the anchors swap with whitespace.
125 * Watch out for utf8 - don't translate the a0 in c3a0. That is a grave.
126 * But a0 by itself is breakspace; turn it into space.
127 * And c2a0 is a0 is breakspace.
128 * Don't do any of these transliterations in an input field. */
129
130 inputmode = false;
131 for (s = w = buf; (c = *s); ++s) {
132 d = s[1];
133 if (c == InternalCodeChar && isdigitByte(d)) {
134 strtol(s + 1, &ss, 10);
135 if (*ss == '<')
136 inputmode = true;
137 if (*ss == '>')
138 inputmode = false;
139 ++ss;
140 n = ss - s;
141 memmove(w, s, n);
142 w += n;
143 s = ss - 1;
144 continue;
145 }
146
147 if (inputmode)
148 goto put1;
149
150 /* utf8 test */
151 if ((c & 0xc0) == 0xc0 && (d & 0xc0) == 0x80) {
152 unsigned int uni = 0;
153 if ((c & 0x3c) == 0) {
154 /* fits in 8 bits */
155 uni = ((uchar) c << 6) | (d & 0x3f);
156 ss = strchr(from, (char)uni);
157 if (ss) {
158 c = becomes[ss - from];
159 ++s;
160 goto put1;
161 }
162 }
163 /* copy the utf8 sequence as is */
164 *w++ = c;
165 ++s;
166 c <<= 1;
167 while ((c & 0x80) && ((d = *s) & 0xc0) == 0x80) {
168 *w++ = d;
169 ++s;
170 }
171 --s;
172 continue;
173 }
174
175 /* Now assuming iso8859-1, which is practically deprecated */
176 ss = strchr(from, c);
177 if (ss)
178 c = becomes[ss - from];
179
180 #if 0
181 // Should we modify empty anchors in any way?
182 if (c != InternalCodeChar)
183 goto put1;
184 if (!isdigitByte(s[1]))
185 goto put1;
186 for (a = s + 2; isdigitByte(*a); ++a) ;
187 if (*a != '{')
188 goto put1;
189 for (++a; *a == ' '; ++a) ;
190 if (a[0] != InternalCodeChar || a[1] != '0' || a[2] != '}')
191 goto put1;
192 // do something with empty {} here.
193 // Following code just skips it, but we likely shouldn't do that.
194 s = a + 2;
195 continue;
196 #endif
197
198 put1:
199 *w++ = c;
200 }
201 *w = 0;
202
203 /* anchor whitespace swap preserves the length of the string */
204 cnt = 0;
205 change = true;
206 while (change) {
207 change = false;
208 ++cnt;
209 premode = false;
210 /* w represents the state of whitespace */
211 w = NULL;
212 /* a points to the prior anchor, which is swappable with following whitespace */
213 a = NULL;
214
215 for (s = buf; (c = *s); ++s) {
216 if (isspaceByte(c) || c == '|') {
217 if (c == '\t' && !premode)
218 *s = ' ';
219 if (!w)
220 w = s;
221 continue;
222 }
223
224 /* end of white space, should we swap it with prior tag? */
225 if (w && a) {
226 memmove(a, w, s - w);
227 memmove(a + (s - w), tag, n);
228 change = true;
229 w = NULL;
230 }
231
232 /* prior anchor has no significance */
233 a = NULL;
234
235 if (c != InternalCodeChar)
236 goto normalChar;
237 /* some conditions that should never happen */
238 if (!isdigitByte(s[1]))
239 goto normalChar;
240 n = strtol(s + 1, &ss, 10);
241 preFormatCheck(n, &pretag, &slash);
242 d = *ss;
243 if (!strchr("{}<>*", d))
244 goto normalChar;
245 n = ss + 1 - s;
246 memcpy(tag, s, n);
247 tag[n] = 0;
248
249 if (pretag) {
250 w = 0;
251 premode = !slash;
252 s = ss;
253 continue;
254 }
255
256 /* We have a tag, should we swap it with prior whitespace? */
257 if (w && !premode && d == '}') {
258 memmove(w + n, w, s - w);
259 memcpy(w, tag, n);
260 change = true;
261 w += n;
262 s = ss;
263 continue;
264 }
265
266 if ((d == '*' || d == '{') && !premode)
267 a = s;
268 s = ss;
269
270 normalChar:
271 w = 0; /* no more whitespace */
272 /* end of loop over the chars in the buffer */
273 }
274 /* end of loop making changes */
275 }
276 debugPrint(4, "anchorSwap %d", cnt);
277
278 /* Framing characters like [] around an anchor are unnecessary here,
279 * because we already frame it in braces.
280 * Get rid of these characters, even in premode. */
281 for (s = w = buf; (c = *s); ++s) {
282 char open, close, linkchar;
283 if (!strchr("{[(<", c))
284 goto putc;
285 if (s[1] != InternalCodeChar)
286 goto putc;
287 if (!isdigitByte(s[2]))
288 goto putc;
289 for (a = s + 3; isdigitByte(*a); ++a) ;
290 linkchar = 0;
291 if (*a == '{')
292 linkchar = '}';
293 if (*a == '<')
294 linkchar = '>';
295 if (!linkchar)
296 goto putc;
297 open = c;
298 close = 0;
299 if (open == '{')
300 close = '}';
301 if (open == '[')
302 close = ']';
303 if (open == '(')
304 close = ')';
305 if (open == '<')
306 close = '>';
307 n = 1;
308 while (n < 120) {
309 d = a[n++];
310 if (!d)
311 break;
312 if (d != InternalCodeChar)
313 continue;
314 while (isdigitByte(a[n]))
315 ++n;
316 d = a[n++];
317 if (!d)
318 break; /* should never happen */
319 if (strchr("{}<>", d))
320 break;
321 }
322 if (n >= 120)
323 goto putc;
324 if (d != linkchar)
325 goto putc;
326 a += n;
327 if (*a != close)
328 goto putc;
329 ++s;
330 memmove(w, s, a - s);
331 w += a - s;
332 s = a;
333 continue;
334 putc:
335 *w++ = c;
336 } /* loop over buffer */
337 *w = 0;
338 debugPrint(4, "anchors unframed");
339
340 /* Now compress the implied linebreaks into one. */
341 premode = false;
342 ss = 0;
343 for (s = buf; (c = *s); ++s) {
344 if (c == InternalCodeChar && isdigitByte(s[1])) {
345 n = strtol(s + 1, &s, 10);
346 if (*s == '*') {
347 preFormatCheck(n, &pretag, &slash);
348 if (pretag)
349 premode = !slash;
350 }
351 }
352 if (!isspaceByte(c))
353 continue;
354 strong = false;
355 a = 0;
356 for (w = s; isspaceByte(*w); ++w) {
357 if (*w == '\n' || *w == '\f')
358 strong = true;
359 if (*w == '\r' && !a)
360 a = w;
361 }
362 ss = s, s = w - 1;
363 if (!a)
364 continue;
365 if (premode)
366 continue;
367 if (strong) {
368 for (w = ss; w <= s; ++w)
369 if (*w == '\r')
370 *w = ' ';
371 continue;
372 }
373 for (w = ss; w <= s; ++w)
374 if (*w == '\r' && w != a)
375 *w = ' ';
376 } /* loop over buffer */
377 debugPrint(4, "whitespace combined");
378
379 /* Due to the anchor swap, the buffer could end in whitespace
380 * followed by several anchors. Trim these off. */
381 s = buf + strlen(buf);
382 while (s > buf + 1 && s[-1] == '*' && isdigitByte(s[-2])) {
383 for (w = s - 3; w >= buf && isdigitByte(*w); --w) ;
384 if (w < buf || *w != InternalCodeChar)
385 break;
386 s = w;
387 }
388 *s = 0;
389
390 /*********************************************************************
391 Some hyperlinks are multiline, due to some html inside, and our interpretation
392 of said html. This is just annoying, so pull it back down to one line.
393 Same goes for <button>, but other input fields must remain as they are.
394 Even submit, as shown by jsrt, if you submit the form it says b1=Send%20Message
395 hence it would send a newline if there was one.
396 *********************************************************************/
397
398 for (s = buf; (c = *s); ++s) {
399 if (c != InternalCodeChar)
400 continue;
401 n = strtol(s + 1, &s, 10);
402 if (*s == '<') {
403 if (!stringEqual(tagList[n]->info->name, "button"))
404 continue;
405 } else if (*s != '{')
406 continue;
407 for (a = s + 1; (c = *a); ++a) {
408 if (c == InternalCodeChar && a[1] == '0')
409 break;
410 if (c == '\n' || c == '\f')
411 *a = ' ';
412 }
413 s = a;
414 }
415 } /* anchorSwap */
416
/*********************************************************************
Format text, and break lines at sentence/phrase boundaries.
The prefix bl means breakline.
These variables are the shared state of the routines below,
used by both breakLine() and htmlReformat().
*********************************************************************/

/* The output buffer: its start, the next byte to write, and the limit.
 * appendOneChar() refuses to write once bl_cursor reaches bl_end. */
static char *bl_start, *bl_cursor, *bl_end;
/* Set by appendOneChar() when a character had to be dropped. */
static bool bl_overflow;
/* This is a virtual column number, extra spaces for tab,
 * one space for emoji, and skipping over invisible anchors. */
static int colno;
int formatLineLength = 80;	// for html formatting or the bl command
/* When set, appendSpaceChunk() may tack a short fragment back onto the
 * previous line. It is only read, never set, in this part of the file. */
bool formatOverflow;
static const int cutLineAfter = 36;	/* cut sentence after this column */
static const int paraLine = 120;	/* paragraph in a line */
/* longcut: offset in the buffer of the last newline that was inserted at
 * something other than a period, or 0.
 * pre_cr: newlines owed before the next space chunk is processed. */
static int longcut, pre_cr;
/* 0 after printable text, 1 after a space, 2 after a newline.
 * breakLine() also uses 4 as a continuation code between calls. */
static int lspace;		/* last space value, 3 = paragraph */
/* Location of period comma rightparen or any word.
 * Question mark is equivalent to period etc.
 * Other things being equal, we break at period, rather than comma, etc.
 * First the column numbers, then the index into the string. */
static int lperiod, lcomma, lright, lany;
static int idxperiod, idxcomma, idxright, idxany;
439
debugChunk(const char * chunk,int len)440 static void debugChunk(const char *chunk, int len)
441 {
442 int i;
443 FILE *f;
444 if (debugLevel < 7)
445 return;
446 f = debugFile ? debugFile : stdout;
447 fprintf(f, "chunk<");
448 for (i = 0; i < len; ++i) {
449 char c = chunk[i];
450 if (c == '\t') {
451 fprintf(f, "\\t");
452 continue;
453 }
454 if (c == '\n') {
455 fprintf(f, "\\n");
456 continue;
457 }
458 if (c == '\f') {
459 fprintf(f, "\\f");
460 continue;
461 }
462 if (c == '\r') {
463 fprintf(f, "\\r");
464 continue;
465 }
466 if (c == '\0') {
467 fprintf(f, "\\0");
468 continue;
469 }
470 fprintf(f, "%c", c);
471 }
472 fprintf(f, ">%d.%d\n", colno, lspace);
473 } /* debugChunk */
474
appendOneChar(char c)475 static void appendOneChar(char c)
476 {
477 if (bl_cursor == bl_end)
478 bl_overflow = true;
479 else
480 *bl_cursor++ = c;
481 } /* appendOneChar */
482
spaceNotInInput(void)483 static bool spaceNotInInput(void)
484 {
485 char *t = bl_cursor;
486 char c;
487 for (--t; t >= bl_start; --t) {
488 c = *t;
489 if (c == '\n' || c == '\r')
490 return true;
491 if (c == '>' && t >= bl_start + 2 &&
492 t[-1] == '0' && t[-2] == InternalCodeChar)
493 return true;
494 if (c != '<')
495 continue;
496 while (t > bl_start && isdigitByte(t[-1]))
497 --t;
498 if (*t == '<')
499 continue;
500 if (t > bl_start && t[-1] == InternalCodeChar)
501 return false;
502 }
503 return true;
504 } /* spaceNotInInput */
505
/* Append a chunk of whitespace to the output buffer.
 * chunk/len is the run of whitespace; premode is true inside <pre>.
 * Outside of premode the run collapses to a line break, a paragraph break,
 * or one space, and the current position is recorded as a possible place
 * to cut the line later (see appendPrintableChunk).
 * In premode, the spaces and tabs after the last line break are kept.
 * Updates colno, lspace, longcut, and the l* / idx* break markers. */
static void appendSpaceChunk(const char *chunk, int len, bool premode)
{
	int nlc = pre_cr;	/* newline count */
	int spc = 0;		/* space count */
	int i, j;
	char c, d, e;

	if (!len)
		return;
	/* Count the line breaks; formfeed counts as two.
	 * spc winds up as the number of other characters after the last one. */
	for (i = 0; i < len; ++i) {
		c = chunk[i];
		if (c == '\n' || c == '\r') {
			++nlc, spc = 0;
			continue;
		}
		if (c == '\f') {
			nlc += 2, spc = 0;
			continue;
		}
		++spc;
	}

	if (!premode && spaceNotInInput()) {
		/* l is the offset of this whitespace in the output buffer */
		int l = bl_cursor - bl_start;
		/* d is the last character written, c the one before that */
		c = d = ' ';
		if (l)
			d = bl_cursor[-1];
		if (l > 1)
			c = bl_cursor[-2];
		/* look through a closing paren, quote, pipe, or brace */
		e = d;
		if (strchr(")\"|}", d))
			e = c;
		if (strchr(".?!:", e)) {
			bool ok = true;
			/* Check for Mr. Mrs. and others. */
			if (e == '.' && bl_cursor - bl_start > 10) {
				static const char *const prefix[] =
				    { "mr.", "mrs.", "sis.", "ms.", 0 };
				char trailing[12];
				/* lower case copy of the last 6 characters written */
				for (i = 0; i < 6; ++i) {
					c = bl_cursor[i - 6];
					if (isupperByte(c))
						c = tolower(c);
					trailing[i] = c;
				}
				trailing[i] = 0;
				for (i = 0; prefix[i]; ++i)
					if (strstr(trailing, prefix[i]))
						ok = false;
				/* Check for John C. Calhoon */
				if (isupperByte(bl_cursor[-2])
				    && isspaceByte(bl_cursor[-3]))
					ok = false;
			}
			if (ok)
				lperiod = colno, idxperiod = l;
		}
		e = d;
		if (strchr(")\"|", d))
			e = c;
		if (strchr("-,;", e))
			lcomma = colno, idxcomma = l;
		if (strchr(")\"|", d))
			lright = colno, idxright = l;
		/* any whitespace at all is a possible break */
		lany = colno, idxany = l;
		if (formatOverflow) {
			/* tack a short fragment onto the previous line. */
			if (longcut && colno <= 15 && (nlc || lperiod == colno)) {
				/* undo the earlier cut */
				bl_start[longcut] = ' ';
				/* and break the line here instead */
				if (!nlc)
					len = spc = 0, nlc = 1;
			}	/* pasting small fragment onto previous line */
		}
	}			/* allowing line breaks */
	/* already at a paragraph break, no more newlines are needed */
	if (lspace == 3)
		nlc = 0;
	if (nlc) {
		/* already at the start of a line, add at most one newline */
		if (lspace == 2)
			nlc = 1;
		appendOneChar('\n');
		if (nlc > 1)
			appendOneChar('\n');
		/* new line, forget the old break points */
		colno = 1;
		longcut = lperiod = lcomma = lright = lany = 0;
		if (lspace >= 2 || nlc > 1)
			lspace = 3;
		if (lspace < 2)
			lspace = 2;
		if (!premode)
			return;
	}
	if (!spc)
		return;
	if (!premode) {
		/* if the first char of the text to be reformatted is space,
		 * then we will wind up here, with lspace = 3. */
		if (lspace == 3)
			return;
		appendOneChar(' ');
		++colno;
		lspace = 1;
		return;
	}
	/* premode: find the last line break in the chunk */
	j = -1;
	for (i = 0; i < len; ++i) {
		c = chunk[i];
		if (c == '\n' || c == '\r' || c == '\f')
			j = i;
	}
	i = j + 1;
	if (i)
		colno = 1;
	/* and copy the whitespace that follows it as is */
	for (; i < len; ++i) {
		c = chunk[i];
		if (c == 0)
			c = ' ';
		appendOneChar(c);
		if (c == ' ')
			++colno;
		if (c == '\t')
			colno += 4;
	}
	lspace = 1;
}				/* appendSpaceChunk */
630
appendPrintableChunk(const char * chunk,int len,bool premode)631 static void appendPrintableChunk(const char *chunk, int len, bool premode)
632 {
633 int i, j;
634 bool visible = true;
635
636 for (i = 0; i < len; ++i) {
637 char c = chunk[i];
638 appendOneChar(c);
639 if (c == InternalCodeChar) {
640 visible = false;
641 continue;
642 }
643 if (visible) {
644 // each foreign char or emoji counts as one.
645 // Ignore all but the first byte of a utf8.
646 if ((char)c >= 0 // ascii
647 || (c & 0x40) == 0x40)
648 ++colno;
649 continue;
650 }
651 if (isdigitByte(c))
652 continue;
653 /* end of the tag */
654 visible = true;
655 if (c != '*')
656 ++colno;
657 }
658
659 lspace = 0;
660 if (premode)
661 return;
662 if (colno <= formatLineLength)
663 return;
664 /* Oops, line is getting long. Let's see where we can cut it. */
665 i = j = 0;
666 if (lperiod > cutLineAfter)
667 i = lperiod, j = idxperiod;
668 else if (lcomma > cutLineAfter)
669 i = lcomma, j = idxcomma;
670 else if (lright > cutLineAfter)
671 i = lright, j = idxright;
672 else if (lany > cutLineAfter)
673 i = lany, j = idxany;
674 if (!j)
675 return; /* nothing we can do about it */
676 longcut = 0;
677 if (i != lperiod)
678 longcut = j;
679 bl_start[j] = '\n';
680 colno -= i;
681 lperiod -= i;
682 lcomma -= i;
683 lright -= i;
684 lany -= i;
685 } /* appendPrintableChunk */
686
/* Break up a line using the above routines.
 * The new lines are put in a fixed array.
 * Return false (fail) if we ran out of room.
 * This function is called from buffers.c, implementing the bl command,
 * and is only in this file because it shares the above routines and variables
 * with the html reformatting, which really has to be here. */

/* The output of the last call to breakLine().
 * breakLine() frees the old result and allocates a new one on every call. */
char *breakLineResult;
/* Bytes allocated beyond the length of the input,
 * in breakLine() and in htmlReformat(). */
#define REFORMAT_EXTRA 400
696
/* Return the number of formfeeds in the first len bytes of buf.
 * Each formfeed expands to \n\n when the text is reformatted,
 * making the string longer, so the caller allocates for them.
 * Null bytes are not special. A len of 0 or less yields 0. */
static int formfeedCount(const char *buf, int len)
{
	int total = 0;
	const char *p = buf;
	int left;

	for (left = len; left > 0; --left, ++p)
		total += (*p == '\f');
	return total;
}				/* formfeedCount */
707
breakLine(const char * line,int len,int * newlen)708 bool breakLine(const char *line, int len, int *newlen)
709 {
710 char c, state, newstate;
711 int i, last, extra;
712
713 pre_cr = 0;
714 if (len && line[len - 1] == '\r')
715 --len;
716 if (lspace == 4) {
717 /* special continuation code from the previous invokation */
718 lspace = 2;
719 if (line[0])
720 ++pre_cr;
721 }
722 if (len > paraLine)
723 ++pre_cr;
724 if (lspace < 2)
725 lspace = 2; /* should never happen */
726 if (!len + pre_cr)
727 lspace = 3;
728
729 nzFree(breakLineResult);
730 extra = REFORMAT_EXTRA + formfeedCount(line, len);
731 breakLineResult = allocMem(len + extra);
732 bl_start = bl_cursor = breakLineResult;
733 bl_end = breakLineResult + len + extra - 8;
734 bl_overflow = false;
735
736 colno = 1;
737 longcut = lperiod = lcomma = lright = lany = 0;
738 last = 0;
739 state = 0;
740 if (pre_cr)
741 state = 1;
742
743 for (i = 0; i < len; ++i) {
744 c = line[i];
745 newstate = 2;
746 if (!c || strchr(" \t\n\r\f", c))
747 newstate = 1;
748 if (state == newstate)
749 continue;
750 if (!state) {
751 state = newstate;
752 continue;
753 }
754
755 /* state change here */
756 debugChunk(line + last, i - last);
757 if (state == 1)
758 appendSpaceChunk(line + last, i - last, false);
759 else
760 appendPrintableChunk(line + last, i - last, false);
761 last = i;
762 state = newstate;
763 pre_cr = 0;
764 }
765
766 if (state) { /* last token */
767 debugChunk(line + last, len - last);
768 if (state == 1)
769 appendSpaceChunk(line + last, len - last, false);
770 else
771 appendPrintableChunk(line + last, len - last, false);
772 }
773
774 if (lspace < 2) { /* line didn't have a \r at the end */
775 appendSpaceChunk("\n", 1, false);
776 }
777 if (bl_cursor - bl_start > paraLine)
778 lspace = 4;
779 debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);
780 *newlen = bl_cursor - bl_start;
781 return !bl_overflow;
782 } /* breakLine */
783
/* Reset the state that breakLine() carries from one call to the next.
 * 3 is the paragraph value of lspace. */
void breakLineSetup(void)
{
	lspace = 3;
}
788
htmlReformat(char * buf)789 char *htmlReformat(char *buf)
790 {
791 const char *h, *nh, *s;
792 char c;
793 bool premode = false;
794 bool pretag, slash;
795 char *new;
796 int l, tagno, extra;
797 char *fmark; /* mark the start of a frame */
798
799 cellDelimiters(buf);
800
801 anchorSwap(buf);
802
803 longcut = lperiod = lcomma = lright = lany = 0;
804 colno = 1;
805 pre_cr = 0;
806 lspace = 3;
807
808 l = strlen(buf);
809 /* Only a pathological web page gets longer after reformatting.
810 * Those with paragraphs and nothing else to compress or remove.
811 * Thus I allocate for the formfeeds, which correspond to paragraphs,
812 * and are replaced with \n\n.
813 * Plus some extra bytes for slop.
814 * If you still overflow, even beyond the EXTRA,
815 * it won't seg fault, you'll just lose some text. */
816 extra = REFORMAT_EXTRA + formfeedCount(buf, l);
817 new = allocMem(l + extra);
818 bl_start = bl_cursor = new;
819 bl_end = new + l + extra - 20;
820 bl_overflow = false;
821
822 for (h = buf; (c = *h); h = nh) {
823 if (isspaceByte(c)) {
824 for (s = h + 1; isspaceByte(*s); ++s) ;
825 nh = s;
826 appendSpaceChunk(h, nh - h, premode);
827 if (lspace == 3) {
828 longcut = lperiod = lcomma = lright = lany = 0;
829 colno = 1;
830 }
831 continue;
832 }
833
834 if (c != InternalCodeChar) {
835 for (s = h + 1; *s; ++s)
836 if (isspaceByte(*s) || *s == InternalCodeChar)
837 break;
838 nh = s;
839 appendPrintableChunk(h, nh - h, premode);
840 continue;
841 }
842
843 /* It's a tag */
844 tagno = strtol(h + 1, (char **)&nh, 10);
845 c = *nh++;
846 if (!c || !strchr("{}<>*", c)) {
847 // this should never happen!
848 i_printf(MSG_BadTagCode, tagno, c);
849 appendOneChar('@');
850 nh = h + 1;
851 continue;
852 }
853 appendPrintableChunk(h, nh - h, premode);
854 preFormatCheck(tagno, &pretag, &slash);
855 if (pretag) {
856 premode = !slash;
857 if (!premode) {
858 /* This forces a new paragraph, so it last char was nl, erase it. */
859 char *w = bl_cursor - 1;
860 while (*w != InternalCodeChar)
861 --w;
862 if (w > bl_start && w[-1] == '\n') {
863 memmove(w - 1, w, bl_cursor - w);
864 --bl_cursor;
865 }
866 }
867 }
868 } /* loop over text */
869
870 /* close off the last line */
871 if (lspace < 2)
872 appendSpaceChunk("\n", 1, true);
873 *bl_cursor = 0;
874 l = bl_cursor - bl_start;
875 /* Get rid of last space. */
876 if (l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')
877 new[l - 2] = '\n', new[--l] = 0;
878 /* Don't need empty lines at the end. */
879 while (l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')
880 --l;
881 new[l] = 0;
882 /* Don't allow an empty buffer */
883 if (!l)
884 new[0] = '\n', new[1] = 0, l = 1;
885
886 if (bl_overflow) {
887 /* we should print a more helpful error message here */
888 strcpy(new + l, "\n???");
889 l += 4;
890 }
891
892 /* It's a little thing really, but the blank line at the top of each frame annoys me */
893 fmark = new;
894 while ((fmark = strstr(fmark + 1, "*`--\n\n"))) {
895 if (isdigit(fmark[-1]))
896 strmove(fmark + 5, fmark + 6);
897 }
898
899 return new;
900 } /* htmlReformat */
901
/*********************************************************************
Crunch a to-list or a copy-to-list down to its email addresses,
in place. Each address in the result is followed by a comma.
"Smith, John" <jsmith@whatever.com>
becomes
jsmith@whatever.com,
Text in double quotes is discarded, and when an entry has an address
in angle brackets, that address replaces the rest of the entry.
NOTE(review): if nothing is removed, the final comma makes the string
one byte longer than it was; presumably the caller allows for this.
*********************************************************************/

void extractEmailAddresses(char *line)
{
	char *in, *out;
	char *entry;		/* where the current entry starts in the output */
	char quote = 0;		/* 0, or the " or < that we are inside of */
	char *p;

	for (in = out = entry = line; *in; ++in) {
		char ch = *in;

		switch (ch) {
		case '"':
			if (!quote)
				quote = '"';
			else if (quote == '"')
				quote = 0;
			/* don't think you can quote in an email address */
			continue;

		case '<':
			if (!quote) {
				quote = '<';
				/* throw away the name that came before */
				out = entry;
			}
			continue;

		case '>':
			if (quote == '<')
				quote = 0;
			continue;

		case ',':
			if (!quote) {
				/* end of this entry */
				entry = out + 1;
				*out++ = ' ';
				continue;
			}
			break;

		default:
			break;
		}

		if (quote == '"')
			continue;

		if (ch < ' ')
			ch = ' ';
		/* an address has to stay in one piece */
		if (ch == ' ' && quote == '<')
			ch = '_';
		*out++ = ch;
	}
	*out = 0;

	spaceCrunch(line, true, false);
	/* whatever spaces remain become the delimiters */
	for (p = line; *p; ++p) {
		if (*p == ' ')
			*p = ',';
	}
	if (line[0] != 0)
		strcat(line, ",");
}				/* extractEmailAddresses */
966
/* Remove from line every entry that matches the address dup,
 * which is duplen bytes long, as judged by memEqualCI().
 * line is a list of addresses, each one followed by a comma. */
static void cutDuplicateEmail(char *line, const char *dup, int duplen)
{
	char *comma;

	for (; *line;) {
		comma = strchr(line, ',');
		if (comma == NULL)
			return;	/* should never happen */
		if (comma - line == duplen && memEqualCI(line, dup, duplen))
			/* pull the rest of the list down over this entry */
			strmove(line, comma + 1);
		else
			line = comma + 1;
	}
}				/* cutDuplicateEmail */
982
cutDuplicateEmails(char * tolist,char * cclist,const char * reply)983 void cutDuplicateEmails(char *tolist, char *cclist, const char *reply)
984 {
985 int len;
986 char *s, *t;
987
988 len = strlen(reply);
989 if (len) {
990 cutDuplicateEmail(tolist, reply, len);
991 cutDuplicateEmail(cclist, reply, len);
992 }
993
994 s = tolist;
995 while (*s) {
996 t = strchr(s, ',');
997 if (!t)
998 break; /* should never happen */
999 len = t - s;
1000 ++t;
1001 cutDuplicateEmail(t, s, len);
1002 cutDuplicateEmail(cclist, s, len);
1003 s = t;
1004 }
1005
1006 s = cclist;
1007 while (*s) {
1008 t = strchr(s, ',');
1009 if (!t)
1010 break; /* should never happen */
1011 len = t - s;
1012 ++t;
1013 cutDuplicateEmail(t, s, len);
1014 s = t;
1015 }
1016
1017 /* If your email address is on the to or cc list, drop it.
1018 * But retain it if it is the reply, in case you sent mail to yourself. */
1019 if (reply[0]) {
1020 struct MACCOUNT *m = accounts;
1021 int i;
1022 for (i = 0; i < maxAccount; ++i, ++m) {
1023 const char *r = m->reply;
1024 if (!r)
1025 continue;
1026 len = strlen(r);
1027 cutDuplicateEmail(tolist, r, len);
1028 cutDuplicateEmail(cclist, r, len);
1029 }
1030 }
1031 } /* cutDuplicateEmails */
1032
/* Does this string look like an email address?
 * Returns false for a null pointer or an empty string.
 * Otherwise the string must be ascii, with no spaces or control characters,
 * and must have something before the @
 * What follows the @ may contain only letters, digits, hyphens, and dots.
 * It must contain a dot, and no dot may be first, last, or next to another dot. */
bool isEmailAddress(const char *s)
{
	bool atfound = false, dotfound = false;
	const char *start = s;

	if (!s || !*s)
		return false;
	for (; *s; ++s) {
		/* unsigned, so the tests below don't depend on
		 * whether plain char is signed on this machine */
		unsigned char c = (unsigned char)*s;
		if (c >= 0x80)	// nonascii
			return false;
		if (atfound) {
			/* in the domain; a second @ is rejected here */
			if (!isalnum(c) && c != '.' && c != '-')
				return false;
			if (c == '.') {
				if (s[1] == '.' || s[1] == 0 || s[-1] == '.'
				    || s[-1] == '@')
					return false;
				dotfound = true;
			}
			continue;
		}
		// I think anything is ok before the @, except space.
		if (c <= ' ')
			return false;
		if (c == '@') {
			/* but there has to be something before the @ */
			if (s == start)
				return false;
			atfound = true;
		}
	}
	return atfound && dotfound;
}
1061
1062 /* return 1 for utf16, 2 for utf32, ored with 4 for big endian */
byteOrderMark(const uchar * buf,int buflen)1063 int byteOrderMark(const uchar * buf, int buflen)
1064 {
1065 if (buflen < 2)
1066 return 0;
1067 if (buf[0] == 0xfe && buf[1] == 0xff)
1068 return 5;
1069 if (buf[0] == 0xff && buf[1] == 0xfe) {
1070 if (buflen >= 4 && buf[2] == 0 && buf[3] == 0)
1071 return 2;
1072 return 1;
1073 }
1074 if (buflen >= 4 && !memcmp(buf, "\x0\x0\xfe\xff", 4))
1075 return 6;
1076 return 0;
1077 } /* byteOrderMark */
1078
1079 /*********************************************************************
1080 We got some data from a file or from the internet.
1081 Count the binary characters and decide if this is, on the whole,
1082 binary or text. I allow some nonascii chars,
1083 like you might see in Spanish or German, and still call it text,
1084 but if there's too many such chars, I call it binary.
1085 It's not an exact science.
1086 utf8 sequences are considered text characters.
1087 If there is a leading byte order mark as per the previous routine, it's text.
1088 *********************************************************************/
1089
looksBinary(const uchar * buf,int buflen)1090 bool looksBinary(const uchar * buf, int buflen)
1091 {
1092 int i, j, bincount = 0, charcount = 0, nullcount = 0;
1093 uchar c;
1094 uchar seed;
1095
1096 if (byteOrderMark(buf, buflen))
1097 return false;
1098
1099 for (i = 0; i < buflen; ++i, ++charcount) {
1100 c = buf[i];
1101 // 0 is ascii, but not really text, and very common in binary files.
1102 if (c == 0) {
1103 if (++nullcount >= 10)
1104 return true;
1105 }
1106 if (c < 0x80)
1107 continue;
1108 // could represent a utf8 character
1109 seed = c;
1110 if ((seed & 0xfe) == 0xfe || (seed & 0xc0) == 0x80) {
1111 binchar:
1112 ++bincount;
1113 continue;
1114 }
1115 seed <<= 1;
1116 j = 1;
1117 while (seed & 0x80 && i + j < buflen
1118 && (buf[i + j] & 0xc0) == 0x80)
1119 seed <<= 1, ++j;
1120 if (seed & 0x80)
1121 goto binchar;
1122 // this is valid utf8 char, don't treat it as binary.
1123 i += j - 1;
1124 }
1125
1126 return (bincount * 8 - 16 >= charcount);
1127 } /* looksBinary */
1128
looks_8859_utf8(const uchar * buf,int buflen,bool * iso_p,bool * utf8_p)1129 void looks_8859_utf8(const uchar * buf, int buflen, bool * iso_p, bool * utf8_p)
1130 {
1131 int utfcount = 0, isocount = 0;
1132 int i, j, bothcount;
1133
1134 for (i = 0; i < buflen; ++i) {
1135 uchar c = buf[i];
1136 if (c < 0x80)
1137 continue;
1138 /* This is the start of the nonascii sequence. */
1139 /* No second bit, it has to be iso. */
1140 if (!(c & 0x40)) {
1141 isogo:
1142 ++isocount;
1143 continue;
1144 }
1145 /* Next byte has to start with 10 to be utf8, else it's iso */
1146 if ((buf[i + 1] & 0xc0) != 0x80)
1147 goto isogo;
1148 c <<= 2;
1149 for (j = i + 2; c < 0; ++j, c <<= 1)
1150 if ((buf[j] & 0xc0) != 0x80)
1151 goto isogo;
1152 ++utfcount;
1153 i = j - 1;
1154 }
1155
1156 *iso_p = *utf8_p = false;
1157
1158 bothcount = isocount + utfcount;
1159 if (!bothcount)
1160 return; /* ascii */
1161 bothcount *= 6;
1162 if (utfcount * 7 >= bothcount)
1163 *utf8_p = true;
1164 if (isocount * 7 >= bothcount)
1165 *iso_p = true;
1166 } /* looks_8859_utf8 */
1167
1168 /*********************************************************************
1169 Convert a string from iso 8859 to utf8, or vice versa.
1170 In each case a new string is allocated.
1171 Don't forget to free it when you're done.
1172 *********************************************************************/
1173
/* only 8859-1 and 8859-2 so far */
/* Unicode values for the upper half of each character set.
 * The first array is for 8859-1 and the second for 8859-2;
 * iso2utf() selects the array with type8859 - 1.
 * Each array has 128 entries, and where the character set agrees with
 * unicode, entry k holds 0x80 + k, so the index is presumably the byte
 * value minus 0x80. */
static const int iso_unicodes[2][128] = {
	{
/*********************************************************************
The first 32 nonascii chars in iso8859-1 are control characters,
and almost never used.
Much more common are the cp1252 characters, introduced by Microsoft.
I'm gonna go with those, and hope I'm right more often than wrong.
*********************************************************************/
#define CP1252 1
#if CP1252
	 /* bytes 80 through 9f as cp1252 */
	 0x20AC, 0x81, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021,
	 0x2C6, 0x2030, 0x160, 0x2039, 0x152, 0x8d, 0x17D, 0x8f,
	 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	 0x2DC, 0x2122, 0x161, 0x203A, 0x153, 0x9d, 0x17E, 0x178,
#else
	 /* bytes 80 through 9f as the control characters */
	 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
	 0x8c, 0x8d, 0x8e, 0x8f,
	 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
	 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
#endif
	 /* bytes a0 through ff map to themselves */
	 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa,
	 0xab, 0xac, 0xad, 0xae, 0xaf,
	 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
	 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca,
	 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
	 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
	 0xeb, 0xec, 0xed, 0xee, 0xef,
	 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa,
	 0xfb, 0xfc, 0xfd, 0xfe, 0xff},
	/* 8859-2 */
	{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
	 0x8c, 0x8d, 0x8e, 0x8f,
	 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
	 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	 0xa0, 0x104, 0x2d8, 0x141, 0xa4, 0x13d, 0x15a, 0xa7, 0xa8, 0x160,
	 0x15e, 0x164, 0x179, 0xad, 0x17d, 0x17b,
	 0xb0, 0x105, 0x2db, 0x142, 0xb4, 0x13e, 0x15b, 0x2c7, 0xb8, 0x161,
	 0x15f, 0x165, 0x17a, 0x2dd, 0x17e, 0x17c,
	 0x154, 0xc1, 0xc2, 0x102, 0xc4, 0x139, 0x106, 0xc7, 0x10c, 0xc9,
	 0x118, 0xcb, 0x11a, 0xcd, 0xce, 0x10e,
	 0x110, 0x143, 0x147, 0xd3, 0xd4, 0x150, 0xd6, 0xd7, 0x158, 0x16e,
	 0xda, 0x170, 0xdc, 0xdd, 0x162, 0xdf,
	 0x155, 0xe1, 0xe2, 0x103, 0xe4, 0x13a, 0x107, 0xe7, 0x10d, 0xe9,
	 0x119, 0xeb, 0x11b, 0xed, 0xee, 0x10f,
	 0x111, 0x144, 0x148, 0xf3, 0xf4, 0x151, 0xf6, 0xf7, 0x159, 0x16f,
	 0xfa, 0x171, 0xfc, 0xfd, 0x163, 0x2d9},
};
1224
/*
 * Convert iso8859 text (the page selected by type8859) into utf8.
 * inbuf / inbuflen: the input bytes.
 * *outbuf_p / *outbuflen_p: receive the null terminated result and its
 * length, not counting the terminator.
 * The result is allocated, except for empty input, where it is emptyString.
 */
void iso2utf(const uchar * inbuf, int inbuflen, uchar ** outbuf_p,
	     int *outbuflen_p)
{
	const int *table = iso_unicodes[type8859 - 1];
	const uchar *src;
	const uchar *src_end;
	uchar *result;
	uchar *dst;
	int extra = 0;		/* bytes needed beyond one per input byte */

	if (!inbuflen) {
		*outbuf_p = (uchar *) emptyString;
		*outbuflen_p = 0;
		return;
	}

	src_end = inbuf + inbuflen;

	/* First pass: measure how much the high bytes will grow. */
	for (src = inbuf; src < src_end; ++src) {
		if (*src < 0x80)
			continue;
		extra += strlen(uni2utf8(table[*src & 0x7f])) - 1;
	}

	result = allocMem(inbuflen + extra + 1);
	dst = result;

	/* Second pass: ascii is copied, everything else is expanded. */
	for (src = inbuf; src < src_end; ++src) {
		if (*src >= 0x80) {
			const char *encoded = uni2utf8(table[*src & 0x7f]);
			strcpy((char *)dst, encoded);
			dst += strlen(encoded);
		} else {
			*dst++ = *src;
		}
	}
	*dst = 0;

	*outbuf_p = result;
	*outbuflen_p = dst - result;
}				/* iso2utf */
1270
/*
 * Convert utf8 text into iso8859 (the page selected by type8859).
 * inbuf / inbuflen: the input bytes.
 * *outbuf_p / *outbuflen_p: receive the null terminated result and its
 * length, not counting the terminator.
 * The result is allocated, except for empty input, where it is emptyString.
 *
 * Ascii bytes, and stray bytes 0x80 - 0xbf that are not part of a utf8
 * sequence, pass through unchanged.  A 2 or 3 byte sequence whose unicode
 * appears in the iso table becomes the matching high byte.  Any other
 * sequence, complete or truncated, becomes a single star.
 * The output is never longer than the input.
 */
void utf2iso(const uchar * inbuf, int inbuflen, uchar ** outbuf_p,
	     int *outbuflen_p)
{
	int i, j, k;
	uchar c;
	uchar *outbuf;
	const int *isoarray = iso_unicodes[type8859 - 1];
	int ucode;

	if (!inbuflen) {
		*outbuf_p = (uchar *) emptyString;
		*outbuflen_p = 0;
		return;
	}

	outbuf = allocMem(inbuflen + 1);
	i = j = 0;
	while (i < inbuflen) {
		int seqlen = 0;	/* length of a decodable sequence, 0 if none */
		int need = 0;	/* continuation bytes announced by the lead byte */
		uchar mask;

		c = inbuf[i];

		/* regular chars and nonascii chars that aren't utf8 pass through. */
		/* There shouldn't be any of the latter */
		if ((c & 0xc0) != 0xc0) {
			outbuf[j++] = c;
			++i;
			continue;
		}

		ucode = 0;
		if ((c & 0xe0) == 0xc0 && i + 1 < inbuflen &&
		    (inbuf[i + 1] & 0xc0) == 0x80) {
			/* Convertable into 11 bit */
			ucode = c & 0x1f;
			ucode <<= 6;
			ucode |= (inbuf[i + 1] & 0x3f);
			seqlen = 2;
		} else if ((c & 0xf0) == 0xe0 && i + 2 < inbuflen &&
			   (inbuf[i + 1] & 0xc0) == 0x80 &&
			   (inbuf[i + 2] & 0xc0) == 0x80) {
			/* Convertable into 16 bit */
			ucode = c & 0xf;
			ucode <<= 6;
			ucode |= (inbuf[i + 1] & 0x3f);
			ucode <<= 6;
			ucode |= (inbuf[i + 2] & 0x3f);
			seqlen = 3;
		}

		if (seqlen) {
			for (k = 0; k < 128; ++k)
				if (isoarray[k] == ucode)
					break;
			if (k < 128) {
				outbuf[j++] = k | 0x80;
				i += seqlen;
				continue;
			}
		}

		/* unicodes not found in our iso class are converted into stars */
		/* Count the 1 bits after the top bit of the lead byte, */
		/* that's how many continuation bytes should follow. */
		for (mask = 0x40; mask && (c & mask); mask >>= 1)
			++need;
		++i;
		/* Swallow the continuation bytes that are really there, */
		/* reading from the input, and never past the end of it. */
		while (need && i < inbuflen && (inbuf[i] & 0xc0) == 0x80) {
			++i;
			--need;
		}
		outbuf[j++] = '*';
	}
	outbuf[j] = 0;

	*outbuf_p = outbuf;
	*outbuflen_p = j;
}				/* utf2iso */
1346
1347 /*********************************************************************
1348 Convert the current line in buffer, which is either iso8859-1 or utf8,
1349 into utf16 or utf32, big or little endian.
1350 The returned string is allocated, though not really a string,
1351 since it will contain nulls, plenty of them in the case of utf32.
1352 *********************************************************************/
1353
utfHigh(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p,bool inutf8,bool out32,bool outbig)1354 void utfHigh(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p,
1355 bool inutf8, bool out32, bool outbig)
1356 {
1357 uchar *outbuf;
1358 unsigned int unicode;
1359 uchar c;
1360 int i, j;
1361
1362 if (!inbuflen) {
1363 *outbuf_p = emptyString;
1364 *outbuflen_p = 0;
1365 return;
1366 }
1367
1368 outbuf = allocMem(inbuflen * 4); // worst case
1369
1370 i = j = 0;
1371 while (i < inbuflen) {
1372 c = (uchar) inbuf[i];
1373 if (!inutf8 || ((c & 0xc0) != 0xc0 && (c & 0xfe) != 0xfe)) {
1374 unicode = c; // that was easy
1375 ++i;
1376 } else {
1377 uchar mask = 0x20;
1378 int k = 1;
1379 ++i;
1380 while (c & mask)
1381 ++k, mask >>= 1;
1382 c &= (mask - 1);
1383 unicode = ((unsigned int)c) << (6 * k);
1384 while (i < inbuflen && k) {
1385 c = (uchar) inbuf[i];
1386 if ((c & 0xc0) != 0x80)
1387 break;
1388 ++i, --k;
1389 c &= 0x3f;
1390 unicode |= (((unsigned int)c) << (6 * k));
1391 }
1392 }
1393
1394 if (out32) {
1395 if (outbig) {
1396 outbuf[j++] = ((unicode >> 24) & 0xff);
1397 outbuf[j++] = ((unicode >> 16) & 0xff);
1398 outbuf[j++] = ((unicode >> 8) & 0xff);
1399 outbuf[j++] = (unicode & 0xff);
1400 } else {
1401 outbuf[j++] = (unicode & 0xff);
1402 outbuf[j++] = ((unicode >> 8) & 0xff);
1403 outbuf[j++] = ((unicode >> 16) & 0xff);
1404 outbuf[j++] = ((unicode >> 24) & 0xff);
1405 }
1406 continue;
1407 }
1408 // utf16, a bit trickier but not too bad.
1409 if (unicode <= 0xd7ff
1410 || (unicode >= 0xe000 && unicode <= 0xffff)) {
1411 if (outbig) {
1412 outbuf[j++] = ((unicode >> 8) & 0xff);
1413 outbuf[j++] = (unicode & 0xff);
1414 } else {
1415 outbuf[j++] = (unicode & 0xff);
1416 outbuf[j++] = ((unicode >> 8) & 0xff);
1417 }
1418 continue;
1419 }
1420
1421 if (unicode >= 0x10000 && unicode <= 0x10ffff) {
1422 // surrogate pairs
1423 unsigned int pair1, pair2;
1424 unicode -= 0x10000;
1425 pair1 = 0xd800 + ((unicode >> 10) & 0x3ff);
1426 pair2 = 0xdc00 + (unicode & 0x3ff);
1427 if (outbig) {
1428 outbuf[j++] = ((pair1 >> 8) & 0xff);
1429 outbuf[j++] = (pair1 & 0xff);
1430 outbuf[j++] = ((pair2 >> 8) & 0xff);
1431 outbuf[j++] = (pair2 & 0xff);
1432 } else {
1433 outbuf[j++] = (pair1 & 0xff);
1434 outbuf[j++] = ((pair1 >> 8) & 0xff);
1435 outbuf[j++] = (pair2 & 0xff);
1436 outbuf[j++] = ((pair2 >> 8) & 0xff);
1437 }
1438 continue;
1439 }
1440
1441 }
1442
1443 *outbuf_p = (char *)outbuf;
1444 *outbuflen_p = j;
1445 } /* utfHigh */
1446
1447 /* convert a 32 bit unicode character into utf8 */
uni2utf8(unsigned int unichar)1448 char *uni2utf8(unsigned int unichar)
1449 {
1450 static uchar outbuf[12];
1451 int n = 0;
1452
1453 if (unichar <= 0x7f) {
1454 outbuf[n++] = unichar;
1455 } else if (unichar <= 0x7ff) {
1456 outbuf[n++] = 0xc0 | ((unichar >> 6) & 0x1f);
1457 outbuf[n++] = 0x80 | (unichar & 0x3f);
1458 } else if (unichar <= 0xffff) {
1459 outbuf[n++] = 0xe0 | ((unichar >> 12) & 0xf);
1460 outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1461 outbuf[n++] = 0x80 | (unichar & 0x3f);
1462 } else if (unichar <= 0x1fffff) {
1463 outbuf[n++] = 0xf0 | ((unichar >> 18) & 7);
1464 outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1465 outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1466 outbuf[n++] = 0x80 | (unichar & 0x3f);
1467 } else if (unichar <= 0x3ffffff) {
1468 outbuf[n++] = 0xf8 | ((unichar >> 24) & 3);
1469 outbuf[n++] = 0x80 | ((unichar >> 18) & 0x3f);
1470 outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1471 outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1472 outbuf[n++] = 0x80 | (unichar & 0x3f);
1473 } else if (unichar <= 0x7fffffff) {
1474 outbuf[n++] = 0xfc | ((unichar >> 30) & 1);
1475 outbuf[n++] = 0x80 | ((unichar >> 24) & 0x3f);
1476 outbuf[n++] = 0x80 | ((unichar >> 18) & 0x3f);
1477 outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1478 outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1479 outbuf[n++] = 0x80 | (unichar & 0x3f);
1480 }
1481
1482 outbuf[n] = 0;
1483 return (char *)outbuf;
1484 } /* uni2utf8 */
1485
utfLow(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p,int bom)1486 void utfLow(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p,
1487 int bom)
1488 {
1489 char *obuf;
1490 int obuf_l;
1491 unsigned int unicode;
1492 int isbig;
1493 int k, l;
1494 const int *isoarray = iso_unicodes[type8859 - 1];
1495
1496 if (!inbuflen) {
1497 *outbuf_p = emptyString;
1498 *outbuflen_p = 0;
1499 return;
1500 }
1501
1502 obuf = initString(&obuf_l);
1503 isbig = (bom & 4);
1504 bom &= 3;
1505 l = bom * 2; // skip past byte order mark
1506
1507 while (l < inbuflen) {
1508 if (bom == 2) {
1509 if (l + 4 > inbuflen) {
1510 unicode = '?';
1511 l = inbuflen;
1512 } else if (isbig) {
1513 unicode = (uchar) inbuf[l];
1514 unicode <<= 8;
1515 unicode |= (uchar) inbuf[l + 1];
1516 unicode <<= 8;
1517 unicode |= (uchar) inbuf[l + 2];
1518 unicode <<= 8;
1519 unicode |= (uchar) inbuf[l + 3];
1520 l += 4;
1521 } else {
1522 unicode = (uchar) inbuf[l + 3];
1523 unicode <<= 8;
1524 unicode |= (uchar) inbuf[l + 2];
1525 unicode <<= 8;
1526 unicode |= (uchar) inbuf[l + 1];
1527 unicode <<= 8;
1528 unicode |= (uchar) inbuf[l];
1529 l += 4;
1530 }
1531 } else {
1532 if (l + 2 > inbuflen) {
1533 unicode = '?';
1534 l = inbuflen;
1535 } else if (isbig) {
1536 unicode = (uchar) inbuf[l];
1537 unicode <<= 8;
1538 unicode |= (uchar) inbuf[l + 1];
1539 l += 2;
1540 } else {
1541 unicode = (uchar) inbuf[l + 1];
1542 unicode <<= 8;
1543 unicode |= (uchar) inbuf[l];
1544 l += 2;
1545 }
1546 if (unicode >= 0xd800 && unicode <= 0xdbff
1547 && l + 2 <= inbuflen) {
1548 unsigned int pair1, pair2;
1549 pair1 = unicode - 0xd800;
1550 if (isbig) {
1551 pair2 = (uchar) inbuf[l];
1552 pair2 <<= 8;
1553 pair2 |= (uchar) inbuf[l + 1];
1554 } else {
1555 pair2 = (uchar) inbuf[l + 1];
1556 pair2 <<= 8;
1557 pair2 |= (uchar) inbuf[l];
1558 }
1559 if (pair2 >= 0xdc00 && pair2 <= 0xdfff) {
1560 pair2 -= 0xdc00;
1561 l += 2;
1562 unicode = pair1;
1563 unicode <<= 10;
1564 unicode |= pair2;
1565 }
1566 }
1567 }
1568
1569 // ok we got the unicode.
1570 // It now becomes utf8 or iso8859-x
1571 if (cons_utf8) {
1572 stringAndString(&obuf, &obuf_l, uni2utf8(unicode));
1573 continue;
1574 }
1575 // iso8859-x here, practically deprecated
1576 if (unicode <= 127) { // ascii
1577 stringAndChar(&obuf, &obuf_l, (char)unicode);
1578 continue;
1579 }
1580
1581 for (k = 0; k < 128; ++k)
1582 if (isoarray[k] == unicode)
1583 break;
1584 if (k < 128)
1585 unicode = k | 0x80;
1586 else
1587 unicode = '?';
1588 stringAndChar(&obuf, &obuf_l, (char)unicode);
1589 }
1590
1591 // The input string is a file or url and has 2 extra bytes after it.
1592 // After reformatting it should still have two extra bytes after it.
1593 stringAndString(&obuf, &obuf_l, " ");
1594
1595 *outbuf_p = obuf;
1596 *outbuflen_p = obuf_l - 2;
1597 } /* utfLow */
1598
1599 // Convert from whatever it is to utf8, for javascript and css.
1600 // Result parameter is the new string, or null if no conversion.
1601 // But, if the original string is utf8, I remove the bom.
1602 // Also turn \0 into spaces.
force_utf8(char * buf,int buflen)1603 char *force_utf8(char *buf, int buflen)
1604 {
1605 char *tbuf, *s;
1606 int bom = byteOrderMark((const uchar *)buf, buflen);
1607 if (bom) {
1608 debugPrint(3, "text type is %s%s",
1609 ((bom & 4) ? "big " : ""),
1610 ((bom & 2) ? "utf32" : "utf16"));
1611 if (debugLevel >= 3)
1612 i_puts(MSG_ConvUtf8);
1613 utfLow(buf, buflen, &tbuf, &buflen, bom);
1614 // get rid of \0
1615 for (s = tbuf; s < tbuf + buflen; ++s)
1616 if (!*s)
1617 *s = ' ';
1618 *s = 0;
1619 return tbuf;
1620 }
1621 // Strip off the leading bom, if any, and no we're not going to put it back.
1622 if (buflen >= 3 && !memcmp(buf, "\xef\xbb\xbf", 3)) {
1623 buflen -= 3;
1624 memmove(buf, buf + 3, buflen);
1625 buf[buflen] = 0;
1626 }
1627 for (s = buf; s < buf + buflen; ++s)
1628 if (!*s)
1629 *s = ' ';
1630 return NULL;
1631 }
1632
1633 static const char base64_chars[] =
1634 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1635
1636 /*
1637 * Encode some data in base64.
1638 * inbuf points to the data
1639 * inlen is the length of the data
1640 * lines is a boolean, indicating whether to add newlines to the output.
1641 * If true, newlines will be added after each group of 72 output bytes.
1642 * Returns: A freshly-allocated NUL-terminated string, containing the
1643 * base64 representation of the data. */
base64Encode(const char * inbuf,int inlen,bool lines)1644 char *base64Encode(const char *inbuf, int inlen, bool lines)
1645 {
1646 char *out, *outstr;
1647 uchar *in = (uchar *) inbuf;
1648 int colno;
1649 int outlen = ((inlen / 3) + 1) * 4;
1650 ++outlen; /* zero on the end */
1651 if (lines)
1652 outlen += (inlen / 54) + 1;
1653 outstr = out = allocMem(outlen);
1654 colno = 0;
1655 while (inlen >= 3) {
1656 *out++ = base64_chars[(int)(*in >> 2)];
1657 *out++ = base64_chars[(int)((*in << 4 | *(in + 1) >> 4) & 63)];
1658 *out++ =
1659 base64_chars[(int)((*(in + 1) << 2 | *(in + 2) >> 6) & 63)];
1660 *out++ = base64_chars[(int)(*(in + 2) & 63)];
1661 inlen -= 3;
1662 in += 3;
1663 if (!lines)
1664 continue;
1665 colno += 4;
1666 if (colno < 72)
1667 continue;
1668 *out++ = '\n';
1669 colno = 0;
1670 }
1671 if (inlen == 1) {
1672 *out++ = base64_chars[(int)(*in >> 2)];
1673 *out++ = base64_chars[(int)(*in << 4 & 63)];
1674 *out++ = '=';
1675 *out++ = '=';
1676 colno += 4;
1677 }
1678 if (inlen == 2) {
1679 *out++ = base64_chars[(int)(*in >> 2)];
1680 *out++ = base64_chars[(int)((*in << 4 | *(in + 1) >> 4) & 63)];
1681 *out++ = base64_chars[(int)((*(in + 1) << 2) & 63)];
1682 *out++ = '=';
1683 colno += 4;
1684 }
1685 /* finish the last line */
1686 if (lines && colno)
1687 *out++ = '\n';
1688 *out = 0;
1689 return outstr;
1690 } /* base64Encode */
1691
base64Bits(char c)1692 uchar base64Bits(char c)
1693 {
1694 if (isupperByte(c))
1695 return c - 'A';
1696 if (islowerByte(c))
1697 return c - ('a' - 26);
1698 if (isdigitByte(c))
1699 return c - ('0' - 52);
1700 if (c == '+')
1701 return 62;
1702 if (c == '/')
1703 return 63;
1704 return 64; /* error */
1705 } /* base64Bits */
1706
1707 /*********************************************************************
1708 Decode some data in base64.
1709 This function operates on the data in-line. It does not allocate a fresh
1710 string to hold the decoded data. Since the data will be smaller than
1711 the base64 encoded representation, this cannot overflow.
1712 If you need to preserve the input, copy it first.
1713 start points to the start of the input
1714 *end initially points to the byte just after the end of the input
1715 Returns: GOOD_BASE64_DECODE on success, BAD_BASE64_DECODE or
1716 EXTRA_CHARS_BASE64_DECODE on error.
1717 When the function returns success, *end points to the end of the decoded
1718 data. On failure, end points to the byte just past the end of
1719 what was successfully decoded.
1720 *********************************************************************/
1721
base64Decode(char * start,char ** end)1722 int base64Decode(char *start, char **end)
1723 {
1724 char *b64_end = *end;
1725 uchar val, leftover, mod;
1726 bool equals;
1727 int ret = GOOD_BASE64_DECODE;
1728 char c, *q, *r;
1729 mod = 0;
1730 equals = false;
1731 for (q = r = start; q < b64_end; ++q) {
1732 c = *q;
1733 if (isspaceByte(c))
1734 continue;
1735 if (equals) {
1736 if (c == '=')
1737 continue;
1738 ret = EXTRA_CHARS_BASE64_DECODE;
1739 break;
1740 }
1741 if (c == '=') {
1742 equals = true;
1743 continue;
1744 }
1745 val = base64Bits(c);
1746 if (val & 64) {
1747 ret = BAD_BASE64_DECODE;
1748 break;
1749 }
1750 if (mod == 0) {
1751 leftover = val << 2;
1752 } else if (mod == 1) {
1753 *r++ = (leftover | (val >> 4));
1754 leftover = val << 4;
1755 } else if (mod == 2) {
1756 *r++ = (leftover | (val >> 2));
1757 leftover = val << 6;
1758 } else {
1759 *r++ = (leftover | val);
1760 }
1761 ++mod;
1762 mod &= 3;
1763 }
1764 *end = r;
1765 return ret;
1766 } /* base64Decode */
1767
1768 void
iuReformat(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p)1769 iuReformat(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)
1770 {
1771 bool is8859, isutf8;
1772
1773 *outbuf_p = 0;
1774 *outbuflen_p = 0;
1775 if (!iuConvert)
1776 return;
1777
1778 looks_8859_utf8((uchar *) inbuf, inbuflen, &is8859, &isutf8);
1779 if (cons_utf8 && is8859) {
1780 debugPrint(3, "converting to utf8");
1781 iso2utf((uchar *) inbuf, inbuflen, (uchar **) outbuf_p,
1782 outbuflen_p);
1783 }
1784 if (!cons_utf8 && isutf8) {
1785 debugPrint(3, "converting to iso8859");
1786 utf2iso((uchar *) inbuf, inbuflen, (uchar **) outbuf_p,
1787 outbuflen_p);
1788 }
1789 } /* iuReformat */
1790
parseDataURI(const char * uri,char ** mediatype,char ** data,int * data_l)1791 bool parseDataURI(const char *uri, char **mediatype, char **data, int *data_l)
1792 {
1793 bool base64 = false;
1794 const char *mediatype_start;
1795 const char *data_sep;
1796 const char *cp;
1797 size_t encoded_len;
1798
1799 *data = *mediatype = emptyString;
1800 *data_l = 0;
1801
1802 if (!isDataURI(uri))
1803 return false;
1804
1805 mediatype_start = uri + 5;
1806 data_sep = strchr(mediatype_start, ',');
1807
1808 if (!data_sep)
1809 return false;
1810
1811 for (cp = data_sep - 1; (cp >= mediatype_start && *cp != ';'); cp--) ;
1812
1813 if (cp >= mediatype_start && memEqualCI(cp, ";base64,", 8)) {
1814 base64 = true;
1815 *mediatype = pullString1(mediatype_start, cp);
1816 } else {
1817 *mediatype = pullString1(mediatype_start, data_sep);
1818 }
1819
1820 encoded_len = strlen(data_sep + 1);
1821 *data = pullString(data_sep + 1, encoded_len);
1822 unpercentString(*data);
1823
1824 if (!base64) {
1825 *data_l = strlen(*data);
1826 } else {
1827 char *data_end = *data + strlen(*data);
1828 int unpack_ret = base64Decode(*data, &data_end);
1829 if (unpack_ret != GOOD_BASE64_DECODE) {
1830 nzFree(*mediatype);
1831 *mediatype = emptyString;
1832 nzFree(*data);
1833 *data = emptyString;
1834 return false;
1835 }
1836 *data_end = '\0';
1837 *data_l = data_end - *data;
1838 }
1839
1840 return true;
1841 } /* parseDataURI */
1842
fromHex(char d,char e)1843 uchar fromHex(char d, char e)
1844 {
1845 d |= 0x20, e |= 0x20;
1846 if (d >= 'a')
1847 d -= ('a' - '9' - 1);
1848 if (e >= 'a')
1849 e -= ('a' - '9' - 1);
1850 d -= '0', e -= '0';
1851 return ((((uchar) d) << 4) | (uchar) e);
1852 } /* fromHex */
1853
1854 // find the color closest to the rgb value.
1855 // Input string is allocated; return is either the einput string
1856 // or another allocated string.
closeColor(const char * s)1857 char *closeColor(const char *s)
1858 {
1859 // indent formats an array of structures really weird; not like I would.
1860 const struct reserved {
1861 const char *name;
1862 uchar r, g, b;
1863 } colorlist[] = {
1864 {
1865 "aliceblue", 0xf0, 0xf8, 0xff}, {
1866 "antiquewhite", 0xfa, 0xeb, 0xd7}, {
1867 "aqua", 0x00, 0xff, 0xff}, {
1868 "aquamarine", 0x7f, 0xff, 0xd4}, {
1869 "azure", 0xf0, 0xff, 0xff}, {
1870 "beige", 0xf5, 0xf5, 0xdc}, {
1871 "bisque", 0xff, 0xe4, 0xc4}, {
1872 "black", 0x00, 0x00, 0x00}, {
1873 "blanchedalmond", 0xff, 0xeb, 0xcd}, {
1874 "blue", 0x00, 0x00, 0xff}, {
1875 "blueviolet", 0x8a, 0x2b, 0xe2}, {
1876 "brown", 0xa5, 0x2a, 0x2a}, {
1877 "burlywood", 0xde, 0xb8, 0x87}, {
1878 "cadetblue", 0x5f, 0x9e, 0xa0}, {
1879 "chartreuse", 0x7f, 0xff, 0x00}, {
1880 "chocolate", 0xd2, 0x69, 0x1e}, {
1881 "coral", 0xff, 0x7f, 0x50}, {
1882 "cornflowerblue", 0x64, 0x95, 0xed}, {
1883 "cornsilk", 0xff, 0xf8, 0xdc}, {
1884 "crimson", 0xdc, 0x14, 0x3c}, {
1885 "cyan", 0x00, 0xff, 0xff}, {
1886 "darkblue", 0x00, 0x00, 0x8b}, {
1887 "darkcyan", 0x00, 0x8b, 0x8b}, {
1888 "darkgoldenrod", 0xb8, 0x86, 0x0b}, {
1889 "darkgray", 0xa9, 0xa9, 0xa9}, {
1890 "darkgreen", 0x00, 0x64, 0x00}, {
1891 "darkkhaki", 0xbd, 0xb7, 0x6b}, {
1892 "darkmagenta", 0x8b, 0x00, 0x8b}, {
1893 "darkolivegreen", 0x55, 0x6b, 0x2f}, {
1894 "darkorange", 0xff, 0x8c, 0x00}, {
1895 "darkorchid", 0x99, 0x32, 0xcc}, {
1896 "darkred", 0x8b, 0x00, 0x00}, {
1897 "darksalmon", 0xe9, 0x96, 0x7a}, {
1898 "darkseagreen", 0x8f, 0xbc, 0x8f}, {
1899 "darkslateblue", 0x48, 0x3d, 0x8b}, {
1900 "darkslategray", 0x2f, 0x4f, 0x4f}, {
1901 "darkturquoise", 0x00, 0xce, 0xd1}, {
1902 "darkviolet", 0x94, 0x00, 0xd3}, {
1903 "deeppink", 0xff, 0x14, 0x93}, {
1904 "deepskyblue", 0x00, 0xbf, 0xff}, {
1905 "dimgray", 0x69, 0x69, 0x69}, {
1906 "dodgerblue", 0x1e, 0x90, 0xff}, {
1907 "feldspar", 0xd1, 0x92, 0x75}, {
1908 "firebrick", 0xb2, 0x22, 0x22}, {
1909 "floralwhite", 0xff, 0xfa, 0xf0}, {
1910 "forestgreen", 0x22, 0x8b, 0x22}, {
1911 "fuchsia", 0xff, 0x00, 0xff}, {
1912 "gainsboro", 0xdc, 0xdc, 0xdc}, {
1913 "ghostwhite", 0xf8, 0xf8, 0xff}, {
1914 "gold", 0xff, 0xd7, 0x00}, {
1915 "goldenrod", 0xda, 0xa5, 0x20}, {
1916 "gray", 0x80, 0x80, 0x80}, {
1917 "green", 0x00, 0x80, 0x00}, {
1918 "greenyellow", 0xad, 0xff, 0x2f}, {
1919 "honeydew", 0xf0, 0xff, 0xf0}, {
1920 "hotpink", 0xff, 0x69, 0xb4}, {
1921 "indianred", 0xcd, 0x5c, 0x5c}, {
1922 "indigo", 0x4b, 0x00, 0x82}, {
1923 "ivory", 0xff, 0xff, 0xf0}, {
1924 "khaki", 0xf0, 0xe6, 0x8c}, {
1925 "lavender", 0xe6, 0xe6, 0xfa}, {
1926 "lavenderblush", 0xff, 0xf0, 0xf5}, {
1927 "lawngreen", 0x7c, 0xfc, 0x00}, {
1928 "lemonchiffon", 0xff, 0xfa, 0xcd}, {
1929 "lightblue", 0xad, 0xd8, 0xe6}, {
1930 "lightcoral", 0xf0, 0x80, 0x80}, {
1931 "lightcyan", 0xe0, 0xff, 0xff}, {
1932 "lightgoldenrodyellow", 0xfa, 0xfa, 0xd2}, {
1933 "lightgrey", 0xd3, 0xd3, 0xd3}, {
1934 "lightgreen", 0x90, 0xee, 0x90}, {
1935 "lightpink", 0xff, 0xb6, 0xc1}, {
1936 "lightsalmon", 0xff, 0xa0, 0x7a}, {
1937 "lightseagreen", 0x20, 0xb2, 0xaa}, {
1938 "lightskyblue", 0x87, 0xce, 0xfa}, {
1939 "lightslateblue", 0x84, 0x70, 0xff}, {
1940 "lightslategray", 0x77, 0x88, 0x99}, {
1941 "lightsteelblue", 0xb0, 0xc4, 0xde}, {
1942 "lightyellow", 0xff, 0xff, 0xe0}, {
1943 "lime", 0x00, 0xff, 0x00}, {
1944 "limegreen", 0x32, 0xcd, 0x32}, {
1945 "linen", 0xfa, 0xf0, 0xe6}, {
1946 "magenta", 0xff, 0x00, 0xff}, {
1947 "maroon", 0x80, 0x00, 0x00}, {
1948 "mediumaquamarine", 0x66, 0xcd, 0xaa}, {
1949 "mediumblue", 0x00, 0x00, 0xcd}, {
1950 "mediumorchid", 0xba, 0x55, 0xd3}, {
1951 "mediumpurple", 0x93, 0x70, 0xd8}, {
1952 "mediumseagreen", 0x3c, 0xb3, 0x71}, {
1953 "mediumslateblue", 0x7b, 0x68, 0xee}, {
1954 "mediumspringgreen", 0x00, 0xfa, 0x9a}, {
1955 "mediumturquoise", 0x48, 0xd1, 0xcc}, {
1956 "mediumvioletred", 0xc7, 0x15, 0x85}, {
1957 "midnightblue", 0x19, 0x19, 0x70}, {
1958 "mintcream", 0xf5, 0xff, 0xfa}, {
1959 "mistyrose", 0xff, 0xe4, 0xe1}, {
1960 "moccasin", 0xff, 0xe4, 0xb5}, {
1961 "navajowhite", 0xff, 0xde, 0xad}, {
1962 "navy", 0x00, 0x00, 0x80}, {
1963 "oldlace", 0xfd, 0xf5, 0xe6}, {
1964 "olive", 0x80, 0x80, 0x00}, {
1965 "olivedrab", 0x6b, 0x8e, 0x23}, {
1966 "orange", 0xff, 0xa5, 0x00}, {
1967 "orangered", 0xff, 0x45, 0x00}, {
1968 "orchid", 0xda, 0x70, 0xd6}, {
1969 "palegoldenrod", 0xee, 0xe8, 0xaa}, {
1970 "palegreen", 0x98, 0xfb, 0x98}, {
1971 "paleturquoise", 0xaf, 0xee, 0xee}, {
1972 "palevioletred", 0xd8, 0x70, 0x93}, {
1973 "papayawhip", 0xff, 0xef, 0xd5}, {
1974 "peachpuff", 0xff, 0xda, 0xb9}, {
1975 "peru", 0xcd, 0x85, 0x3f}, {
1976 "pink", 0xff, 0xc0, 0xcb}, {
1977 "plum", 0xdd, 0xa0, 0xdd}, {
1978 "powderblue", 0xb0, 0xe0, 0xe6}, {
1979 "purple", 0x80, 0x00, 0x80}, {
1980 "red", 0xff, 0x00, 0x00}, {
1981 "rosybrown", 0xbc, 0x8f, 0x8f}, {
1982 "royalblue", 0x41, 0x69, 0xe1}, {
1983 "saddlebrown", 0x8b, 0x45, 0x13}, {
1984 "salmon", 0xfa, 0x80, 0x72}, {
1985 "sandybrown", 0xf4, 0xa4, 0x60}, {
1986 "seagreen", 0x2e, 0x8b, 0x57}, {
1987 "seashell", 0xff, 0xf5, 0xee}, {
1988 "sienna", 0xa0, 0x52, 0x2d}, {
1989 "silver", 0xc0, 0xc0, 0xc0}, {
1990 "skyblue", 0x87, 0xce, 0xeb}, {
1991 "slateblue", 0x6a, 0x5a, 0xcd}, {
1992 "slategray", 0x70, 0x80, 0x90}, {
1993 "snow", 0xff, 0xfa, 0xfa}, {
1994 "springgreen", 0x00, 0xff, 0x7f}, {
1995 "steelblue", 0x46, 0x82, 0xb4}, {
1996 "tan", 0xd2, 0xb4, 0x8c}, {
1997 "teal", 0x00, 0x80, 0x80}, {
1998 "thistle", 0xd8, 0xbf, 0xd8}, {
1999 "tomato", 0xff, 0x63, 0x47}, {
2000 "turquoise", 0x40, 0xe0, 0xd0}, {
2001 "violet", 0xee, 0x82, 0xee}, {
2002 "violetred", 0xd0, 0x20, 0x90}, {
2003 "wheat", 0xf5, 0xde, 0xb3}, {
2004 "white", 0xff, 0xff, 0xff}, {
2005 "whitesmoke", 0xf5, 0xf5, 0xf5}, {
2006 "yellow", 0xff, 0xff, 0x00}, {
2007 "yellowgreen", 0x9a, 0xcd, 0x32}, {
2008 0}
2009 };
2010 const struct reserved *c, *best_c;
2011 int best_val;
2012 int r1, g1, b1;
2013 const char *t;
2014
2015 if (!strncmp(s, "rgb(", 4)) {
2016 t = s + 4;
2017 if (!isdigit(*t))
2018 goto fail;
2019 r1 = strtol(t, (char **)&t, 10);
2020 if (*t == ',')
2021 ++t;
2022 while (*t == ' ')
2023 ++t;
2024 if (!isdigit(*t))
2025 goto fail;
2026 g1 = strtol(t, (char **)&t, 10);
2027 if (*t == ',')
2028 ++t;
2029 while (*t == ' ')
2030 ++t;
2031 if (!isdigit(*t))
2032 goto fail;
2033 b1 = strtol(t, (char **)&t, 10);
2034 if (*t == ',')
2035 ++t;
2036 while (*t == ' ')
2037 ++t;
2038 if (*t != ')')
2039 goto fail;
2040 } else if (*s == '#' && isxdigit(s[1])) {
2041 if (!isxdigit(s[2]) || !isxdigit(s[3]))
2042 goto fail;
2043 if (isxdigit(s[4]) && isxdigit(s[5]) && isxdigit(s[6])) {
2044 r1 = fromHex(s[1], s[2]);
2045 g1 = fromHex(s[3], s[4]);
2046 b1 = fromHex(s[5], s[6]);
2047 } else {
2048 // #xyz is short for #xxyyzz
2049 r1 = fromHex(s[1], s[1]);
2050 g1 = fromHex(s[2], s[2]);
2051 b1 = fromHex(s[3], s[3]);
2052 }
2053 } else {
2054 // not an rgb format we recognize; should be just a word.
2055 for (t = s; *t; ++t)
2056 if (!isalpha(*t))
2057 goto fail;
2058 return (char *)s;
2059 }
2060
2061 if (r1 < 0 || g1 < 0 || b1 < 0)
2062 goto fail;
2063 if (r1 > 255 || g1 > 255 || b1 > 255)
2064 goto fail;
2065
2066 // closest by rms; just check them all; kind of inefficient.
2067 best_val = 255 * 255 * 3 + 1;
2068 for (c = colorlist; c->name; ++c) {
2069 int rms = (r1 - (int)c->r) * (r1 - (int)c->r) +
2070 (g1 - (int)c->g) * (g1 - (int)c->g) +
2071 (b1 - (int)c->b) * (b1 - (int)c->b);
2072 if (rms < best_val)
2073 best_val = rms, best_c = c;
2074 }
2075 return cloneString(best_c->name);
2076
2077 fail:
2078 return 0;
2079 }
2080