1 /* format.c
2  * Format text, establish line breaks, manage whitespace.
3  * This file is part of the edbrowse project, released under GPL.
4  */
5 
6 #include "eb.h"
7 
8 /*********************************************************************
9 Prepare html for text processing.
10 Change nulls to spaces.
11 Make sure it doesn't already contain my magic code,
12 The one I use to indicate a tag.
13 If it does, well, change them to something else.
14 I can only hope this doesn't screw up some embedded javascript.
15 *********************************************************************/
16 
prepareForBrowse(char * h,int h_len)17 void prepareForBrowse(char *h, int h_len)
18 {
19 	int i, j;
20 
21 	for (i = j = 0; i < h_len; ++i) {
22 		if (h[i] == 0)
23 			h[i] = ' ';
24 		if (h[i] == '\b') {
25 			if (i && !strchr("\n\b<>'\"&", h[i - 1]))
26 				--j;
27 			continue;
28 		}
29 		if (h[i] == InternalCodeChar)
30 			h[i] = InternalCodeCharAlternate;
31 		h[j++] = h[i];
32 	}
33 	h[j] = 0;		/* now it's a string */
34 
35 /* undos the file */
36 	for (i = j = 0; h[i]; ++i) {
37 		if (h[i] == '\r' && h[i + 1] == '\n')
38 			continue;
39 		h[j++] = h[i];
40 	}
41 	h[j] = 0;
42 }				/* prepareForBrowse */
43 
44 /* An input field cannot contain newline, null, or the InternalCodeChar */
45 // Revised June 2018, maybe newline is ok. We need it for textarea.
prepareForField(char * h)46 void prepareForField(char *h)
47 {
48 	while (*h) {
49 		if (*h == 0)
50 			*h = ' ';
51 		if (*h == InternalCodeChar)
52 			*h = InternalCodeCharAlternate;
53 		++h;
54 	}
55 }				/* prepareForField */
56 
57 /*********************************************************************
58 The primary goal of this routine is to turn
59 Hey,{ click here } for more information
60 into
61 Hey, {click here}  for more information
62 But of course we won't do that if the section is preformatted.
63 Nor can we muck with the whitespace that might be present in an input field <>.
64 Also swap 32* whitespace, pushing invisible anchors forward.
65 If a change is made, the procedure is run again,
66 kinda like bubble sort.
67 It has the potential to be terribly inefficient,
68 but that doesn't seem to happen in practice.
69 Use cnt to count the iterations, just for debugging.
70 | is considered a whitespace character. Why is that?
71 Html tables are mostly used for visual layout, but sometimes not.
72 I use | to separate the cells of a table, but if there's nothing in them,
73 or at least no text, then I get rid of the pipes.
74 But every cell is going to have an invisible anchor from <td>, so that js can,
75 perhaps, set innerHTML inside this cell.
76 So there's something there, but nothing there.
77 I push these tags past pipes, so I can clear it all away.
78 One web page in ten thousand will actually set html inside a cell,
79 after the fact, and when that happens the text won't be in the right place,
80 it won't have the pipes around it that it should.
81 I'm willing to accept that for now.
82 *********************************************************************/
83 
cellDelimiters(char * buf)84 static void cellDelimiters(char *buf)
85 {
86 	char *lastcell = 0;
87 	int cellcount = 0;
88 	char *s;
89 
90 	for (s = buf; *s; ++s) {
91 		if (*s == TableCellChar) {
92 			*s = '|';
93 			lastcell = s;
94 			++cellcount;
95 			continue;
96 		}
97 		if (!strchr("\f\r\n", *s))
98 			continue;
99 /* newline here, if just one cell delimiter then blank it out */
100 		if (cellcount == 1)
101 			*lastcell = ' ';
102 		cellcount = 0;
103 	}
104 }				/* cellDelimiters */
105 
/*
 * Clean up the rendered text in buf, in place.  The buffer never grows.
 * Tags are encoded as InternalCodeChar, a decimal number, and one of the
 * characters { } < > *.
 * The passes, in order:
 *   1. transliterate a few special characters (see from[] and becomes[]);
 *   2. swap anchors with neighboring whitespace, repeated until stable;
 *   3. remove framing characters such as [] around an anchor;
 *   4. reduce the \r characters in each run of whitespace;
 *   5. trim the anchors of the form number* that trail the buffer;
 *   6. pull multiline hyperlinks and buttons down to one line.
 */
static void anchorSwap(char *buf)
{
	char c, d, *s, *ss, *w, *a;
	bool pretag;		// <pre>
	bool premode;		// inside <pre> </pre>
	bool inputmode;		// inside an input field
	bool slash;		// closing tag
	bool change;		// made a swap somewhere
	bool strong;		// strong whitespace, newline or paragraph
	int n, cnt;
	char tag[20];		// copy of the most recent tag, n bytes long

/* from[k] is replaced by becomes[k]; both hold the same number of characters */
	static const char from[] =
	    "\x1b\x95\x99\x9c\x9d\x91\x92\x93\x94\xa0\xad\x96\x97\x85";
	static const char becomes[] = "_*'`'`'`' ----";
/* I used to convert a6 and c2 to hyphen space, not sure why */

/* Transliterate a few characters.  One of them is 0xa0 to space,
 * so we need to do this now, before the anchors swap with whitespace.
 * Watch out for utf8 - don't translate the a0 in c3a0.  That is a grave.
 * But a0 by itself is breakspace; turn it into space.
 * And c2a0 is a0 is breakspace.
 * Don't do any of these transliterations in an input field. */

/* s reads and w writes; w never gets ahead of s */
	inputmode = false;
	for (s = w = buf; (c = *s); ++s) {
		d = s[1];
		if (c == InternalCodeChar && isdigitByte(d)) {
/* A tag; copy it through untouched. < opens an input field and > closes it. */
			strtol(s + 1, &ss, 10);
			if (*ss == '<')
				inputmode = true;
			if (*ss == '>')
				inputmode = false;
			++ss;
			n = ss - s;
			memmove(w, s, n);
			w += n;
			s = ss - 1;
			continue;
		}

		if (inputmode)
			goto put1;

/* utf8 test */
		if ((c & 0xc0) == 0xc0 && (d & 0xc0) == 0x80) {
			unsigned int uni = 0;
			if ((c & 0x3c) == 0) {
/* fits in 8 bits */
/* only the low 8 bits of uni survive the cast to char below */
				uni = ((uchar) c << 6) | (d & 0x3f);
				ss = strchr(from, (char)uni);
				if (ss) {
					c = becomes[ss - from];
					++s;
					goto put1;
				}
			}
/* copy the utf8 sequence as is */
			*w++ = c;
			++s;
			c <<= 1;
/* c is not shifted again, so this copies every continuation byte that follows */
			while ((c & 0x80) && ((d = *s) & 0xc0) == 0x80) {
				*w++ = d;
				++s;
			}
			--s;
			continue;
		}

/* Now assuming iso8859-1, which is practically deprecated */
		ss = strchr(from, c);
		if (ss)
			c = becomes[ss - from];

#if 0
// Should we modify empty anchors in any way?
		if (c != InternalCodeChar)
			goto put1;
		if (!isdigitByte(s[1]))
			goto put1;
		for (a = s + 2; isdigitByte(*a); ++a) ;
		if (*a != '{')
			goto put1;
		for (++a; *a == ' '; ++a) ;
		if (a[0] != InternalCodeChar || a[1] != '0' || a[2] != '}')
			goto put1;
// do something with empty {} here.
// Following code just skips it, but we likely shouldn't do that.
		s = a + 2;
		continue;
#endif

put1:
		*w++ = c;
	}
	*w = 0;

/* anchor whitespace swap preserves the length of the string */
	cnt = 0;
	change = true;
	while (change) {
		change = false;
		++cnt;
		premode = false;
/* w represents the state of whitespace */
/* it is the start of the current run of whitespace, or null */
		w = NULL;
/* a points to the prior anchor, which is swappable with following whitespace */
		a = NULL;

		for (s = buf; (c = *s); ++s) {
			if (isspaceByte(c) || c == '|') {
/* outside of <pre>, tab becomes space */
				if (c == '\t' && !premode)
					*s = ' ';
				if (!w)
					w = s;
				continue;
			}

/* end of white space, should we swap it with prior tag? */
/* The whitespace slides back to where the tag was, and the tag, still
 * held in tag[] with length n, is written after it. */
			if (w && a) {
				memmove(a, w, s - w);
				memmove(a + (s - w), tag, n);
				change = true;
				w = NULL;
			}

/* prior anchor has no significance */
			a = NULL;

			if (c != InternalCodeChar)
				goto normalChar;
/* some conditions that should never happen */
			if (!isdigitByte(s[1]))
				goto normalChar;
			n = strtol(s + 1, &ss, 10);
			preFormatCheck(n, &pretag, &slash);
			d = *ss;
			if (!strchr("{}<>*", d))
				goto normalChar;
/* n changes meaning here, from the tag number to the length of the tag */
			n = ss + 1 - s;
/* NOTE(review): tag holds 20 bytes, so this relies on tag numbers being short - confirm */
			memcpy(tag, s, n);
			tag[n] = 0;

			if (pretag) {
/* entering or leaving a preformatted section; nothing is swapped here */
				w = 0;
				premode = !slash;
				s = ss;
				continue;
			}

/* We have a tag, should we swap it with prior whitespace? */
/* Only a closing brace moves back, in front of the whitespace before it. */
			if (w && !premode && d == '}') {
				memmove(w + n, w, s - w);
				memcpy(w, tag, n);
				change = true;
				w += n;
				s = ss;
				continue;
			}

/* an opening brace or an invisible anchor can move forward past whitespace */
			if ((d == '*' || d == '{') && !premode)
				a = s;
			s = ss;

normalChar:
			w = 0;	/* no more whitespace */
/* end of loop over the chars in the buffer */
		}
/* end of loop making changes */
	}
	debugPrint(4, "anchorSwap %d", cnt);

/* Framing characters like [] around an anchor are unnecessary here,
 * because we already frame it in braces.
 * Get rid of these characters, even in premode. */
	for (s = w = buf; (c = *s); ++s) {
		char open, close, linkchar;
		if (!strchr("{[(<", c))
			goto putc;
		if (s[1] != InternalCodeChar)
			goto putc;
		if (!isdigitByte(s[2]))
			goto putc;
		for (a = s + 3; isdigitByte(*a); ++a) ;
/* linkchar is the tag character that will close this link or input field */
		linkchar = 0;
		if (*a == '{')
			linkchar = '}';
		if (*a == '<')
			linkchar = '>';
		if (!linkchar)
			goto putc;
		open = c;
		close = 0;
		if (open == '{')
			close = '}';
		if (open == '[')
			close = ']';
		if (open == '(')
			close = ')';
		if (open == '<')
			close = '>';
/* Look for the next tag that ends in one of { } < >,
 * giving up after about 120 characters. */
		n = 1;
		while (n < 120) {
			d = a[n++];
			if (!d)
				break;
			if (d != InternalCodeChar)
				continue;
			while (isdigitByte(a[n]))
				++n;
			d = a[n++];
			if (!d)
				break;	/* should never happen */
			if (strchr("{}<>", d))
				break;
		}
		if (n >= 120)
			goto putc;
		if (d != linkchar)
			goto putc;
		a += n;
/* the matching frame character has to follow the closing tag directly */
		if (*a != close)
			goto putc;
/* copy what lies between the two framing characters, dropping both */
		++s;
		memmove(w, s, a - s);
		w += a - s;
		s = a;
		continue;
putc:
		*w++ = c;
	}			/* loop over buffer */
	*w = 0;
	debugPrint(4, "anchors unframed");

/* Now compress the implied linebreaks into one. */
/* Within each run of whitespace outside of <pre>: if the run holds \n or \f,
 * every \r becomes a space; otherwise only the first \r is kept. */
	premode = false;
	ss = 0;
	for (s = buf; (c = *s); ++s) {
		if (c == InternalCodeChar && isdigitByte(s[1])) {
			n = strtol(s + 1, &s, 10);
			if (*s == '*') {
				preFormatCheck(n, &pretag, &slash);
				if (pretag)
					premode = !slash;
			}
		}
		if (!isspaceByte(c))
			continue;
		strong = false;
/* a is the first \r in this run, if there is one */
		a = 0;
		for (w = s; isspaceByte(*w); ++w) {
			if (*w == '\n' || *w == '\f')
				strong = true;
			if (*w == '\r' && !a)
				a = w;
		}
/* the run spans ss through s inclusive */
		ss = s, s = w - 1;
		if (!a)
			continue;
		if (premode)
			continue;
		if (strong) {
			for (w = ss; w <= s; ++w)
				if (*w == '\r')
					*w = ' ';
			continue;
		}
		for (w = ss; w <= s; ++w)
			if (*w == '\r' && w != a)
				*w = ' ';
	}			/* loop over buffer */
	debugPrint(4, "whitespace combined");

/* Due to the anchor swap, the buffer could end in whitespace
 * followed by several anchors. Trim these off. */
/* Each pass of the loop removes one trailing tag of the form number* */
	s = buf + strlen(buf);
	while (s > buf + 1 && s[-1] == '*' && isdigitByte(s[-2])) {
		for (w = s - 3; w >= buf && isdigitByte(*w); --w) ;
		if (w < buf || *w != InternalCodeChar)
			break;
		s = w;
	}
	*s = 0;

/*********************************************************************
Some hyperlinks are multiline, due to some html inside, and our interpretation
of said html. This is just annoying, so pull it back down to one line.
Same goes for <button>, but other input fields must remain as they are.
Even submit, as shown by jsrt, if you submit the form it says b1=Send%20Message
hence it would send a newline if there was one.
*********************************************************************/

	for (s = buf; (c = *s); ++s) {
		if (c != InternalCodeChar)
			continue;
		n = strtol(s + 1, &s, 10);
		if (*s == '<') {
			if (!stringEqual(tagList[n]->info->name, "button"))
				continue;
		} else if (*s != '{')
			continue;
/* \n and \f become spaces, up to the next InternalCodeChar followed by 0 */
		for (a = s + 1; (c = *a); ++a) {
			if (c == InternalCodeChar && a[1] == '0')
				break;
			if (c == '\n' || c == '\f')
				*a = ' ';
		}
		s = a;
	}
}				/* anchorSwap */
416 
417 /*********************************************************************
418 Format text, and break lines at sentence/phrase boundaries.
419 The prefix bl means breakline.
420 *********************************************************************/
421 
/* The output buffer: its start, the next byte to write, and the limit
 * that appendOneChar() will not write past. */
static char *bl_start, *bl_cursor, *bl_end;
/* Set by appendOneChar() when a character is dropped for lack of room. */
static bool bl_overflow;
/* This is a virtual column number, extra spaces for tab,
 * one space for emoji, and skipping over invisible anchors. */
static int colno;
int formatLineLength = 80;	// for html formatting or the bl command
/* When set, appendSpaceChunk() may tack a short fragment back onto the previous line. */
bool formatOverflow;
static const int cutLineAfter = 36;	/* cut sentence after this column */
static const int paraLine = 120;	/* paragraph in a line */
/* longcut is the offset in the output of the last line cut that was not
 * made at a period, 0 if none. pre_cr seeds the newline count in
 * appendSpaceChunk(). */
static int longcut, pre_cr;
/* Values set in this file: 0 after printable text, 1 after a space,
 * 2 after a newline, 3 after a paragraph break, and 4 is the continuation
 * code that breakLine() leaves for its next call. */
static int lspace;		/* last space value, 3 = paragraph */
/* Location of period comma rightparen or any word.
 * Question mark is equivalent to period etc.
 * Other things being equal, we break at period, rather than comma, etc.
 * First the column numbers, then the index into the string. */
static int lperiod, lcomma, lright, lany;
static int idxperiod, idxcomma, idxright, idxany;
439 
debugChunk(const char * chunk,int len)440 static void debugChunk(const char *chunk, int len)
441 {
442 	int i;
443 	FILE *f;
444 	if (debugLevel < 7)
445 		return;
446 	f = debugFile ? debugFile : stdout;
447 	fprintf(f, "chunk<");
448 	for (i = 0; i < len; ++i) {
449 		char c = chunk[i];
450 		if (c == '\t') {
451 			fprintf(f, "\\t");
452 			continue;
453 		}
454 		if (c == '\n') {
455 			fprintf(f, "\\n");
456 			continue;
457 		}
458 		if (c == '\f') {
459 			fprintf(f, "\\f");
460 			continue;
461 		}
462 		if (c == '\r') {
463 			fprintf(f, "\\r");
464 			continue;
465 		}
466 		if (c == '\0') {
467 			fprintf(f, "\\0");
468 			continue;
469 		}
470 		fprintf(f, "%c", c);
471 	}
472 	fprintf(f, ">%d.%d\n", colno, lspace);
473 }				/* debugChunk */
474 
appendOneChar(char c)475 static void appendOneChar(char c)
476 {
477 	if (bl_cursor == bl_end)
478 		bl_overflow = true;
479 	else
480 		*bl_cursor++ = c;
481 }				/* appendOneChar */
482 
spaceNotInInput(void)483 static bool spaceNotInInput(void)
484 {
485 	char *t = bl_cursor;
486 	char c;
487 	for (--t; t >= bl_start; --t) {
488 		c = *t;
489 		if (c == '\n' || c == '\r')
490 			return true;
491 		if (c == '>' && t >= bl_start + 2 &&
492 		    t[-1] == '0' && t[-2] == InternalCodeChar)
493 			return true;
494 		if (c != '<')
495 			continue;
496 		while (t > bl_start && isdigitByte(t[-1]))
497 			--t;
498 		if (*t == '<')
499 			continue;
500 		if (t > bl_start && t[-1] == InternalCodeChar)
501 			return false;
502 	}
503 	return true;
504 }				/* spaceNotInInput */
505 
/*
 * Append a chunk of whitespace, len bytes long, to the output buffer.
 * Outside of premode the chunk collapses to at most one newline, one blank
 * line, or one space, and the position is recorded as a possible place to
 * break the line later (lperiod, lcomma, lright, lany and their indexes).
 * In premode the spaces and tabs after the last line break of the chunk
 * are copied as they are.
 * Reads and updates the file-level state: colno, lspace, longcut, pre_cr
 * and the break candidates.
 */
static void appendSpaceChunk(const char *chunk, int len, bool premode)
{
	int nlc = pre_cr;	/* newline count */
	int spc = 0;		/* space count */
	int i, j;
	char c, d, e;

	if (!len)
		return;
/* Count the line breaks, a formfeed is worth two,
 * and the spaces that follow the last line break. */
	for (i = 0; i < len; ++i) {
		c = chunk[i];
		if (c == '\n' || c == '\r') {
			++nlc, spc = 0;
			continue;
		}
		if (c == '\f') {
			nlc += 2, spc = 0;
			continue;
		}
		++spc;
	}

	if (!premode && spaceNotInInput()) {
/* l is the offset of this whitespace in the output */
		int l = bl_cursor - bl_start;
/* d is the last character written and c is the one before it,
 * each defaults to space if the output is too short */
		c = d = ' ';
		if (l)
			d = bl_cursor[-1];
		if (l > 1)
			c = bl_cursor[-2];
/* look through one closing paren, quote, pipe or brace */
		e = d;
		if (strchr(")\"|}", d))
			e = c;
		if (strchr(".?!:", e)) {
			bool ok = true;
/* Check for Mr. Mrs. and others. */
			if (e == '.' && bl_cursor - bl_start > 10) {
				static const char *const prefix[] =
				    { "mr.", "mrs.", "sis.", "ms.", 0 };
				char trailing[12];
/* lower case copy of the last 6 characters written */
				for (i = 0; i < 6; ++i) {
					c = bl_cursor[i - 6];
					if (isupperByte(c))
						c = tolower(c);
					trailing[i] = c;
				}
				trailing[i] = 0;
				for (i = 0; prefix[i]; ++i)
					if (strstr(trailing, prefix[i]))
						ok = false;
/* Check for John C. Calhoon */
				if (isupperByte(bl_cursor[-2])
				    && isspaceByte(bl_cursor[-3]))
					ok = false;
			}
			if (ok)
				lperiod = colno, idxperiod = l;
		}
/* same idea for comma, but the closing brace is not looked through here */
		e = d;
		if (strchr(")\"|", d))
			e = c;
		if (strchr("-,;", e))
			lcomma = colno, idxcomma = l;
		if (strchr(")\"|", d))
			lright = colno, idxright = l;
		lany = colno, idxany = l;
		if (formatOverflow) {
/* tack a short fragment onto the previous line. */
/* longcut is where the previous line was cut; put the space back */
			if (longcut && colno <= 15 && (nlc || lperiod == colno)) {
				bl_start[longcut] = ' ';
				if (!nlc)
					len = spc = 0, nlc = 1;
			}	/* pasting small fragment onto previous line */
		}
	}			/* allowing line breaks */
/* already at a paragraph break, no more newlines */
	if (lspace == 3)
		nlc = 0;
	if (nlc) {
/* one newline is already out, so only one more is written */
		if (lspace == 2)
			nlc = 1;
		appendOneChar('\n');
		if (nlc > 1)
			appendOneChar('\n');
		colno = 1;
		longcut = lperiod = lcomma = lright = lany = 0;
/* a second newline in a row, or two at once, makes a paragraph */
		if (lspace >= 2 || nlc > 1)
			lspace = 3;
		if (lspace < 2)
			lspace = 2;
		if (!premode)
			return;
	}
	if (!spc)
		return;
	if (!premode) {
/* if the first char of the text to be reformatted is space,
 * then we will wind up here, with lspace = 3. */
		if (lspace == 3)
			return;
		appendOneChar(' ');
		++colno;
		lspace = 1;
		return;
	}
/* premode: find the last line break in the chunk, then copy what follows */
	j = -1;
	for (i = 0; i < len; ++i) {
		c = chunk[i];
		if (c == '\n' || c == '\r' || c == '\f')
			j = i;
	}
	i = j + 1;
	if (i)
		colno = 1;
	for (; i < len; ++i) {
		c = chunk[i];
		if (c == 0)
			c = ' ';
		appendOneChar(c);
		if (c == ' ')
			++colno;
/* a tab counts as four columns */
		if (c == '\t')
			colno += 4;
	}
	lspace = 1;
}				/* appendSpaceChunk */
630 
appendPrintableChunk(const char * chunk,int len,bool premode)631 static void appendPrintableChunk(const char *chunk, int len, bool premode)
632 {
633 	int i, j;
634 	bool visible = true;
635 
636 	for (i = 0; i < len; ++i) {
637 		char c = chunk[i];
638 		appendOneChar(c);
639 		if (c == InternalCodeChar) {
640 			visible = false;
641 			continue;
642 		}
643 		if (visible) {
644 // each foreign char or emoji counts as one.
645 // Ignore all but the first byte of a utf8.
646 			if ((char)c >= 0	// ascii
647 			    || (c & 0x40) == 0x40)
648 				++colno;
649 			continue;
650 		}
651 		if (isdigitByte(c))
652 			continue;
653 /* end of the tag */
654 		visible = true;
655 		if (c != '*')
656 			++colno;
657 	}
658 
659 	lspace = 0;
660 	if (premode)
661 		return;
662 	if (colno <= formatLineLength)
663 		return;
664 /* Oops, line is getting long.  Let's see where we can cut it. */
665 	i = j = 0;
666 	if (lperiod > cutLineAfter)
667 		i = lperiod, j = idxperiod;
668 	else if (lcomma > cutLineAfter)
669 		i = lcomma, j = idxcomma;
670 	else if (lright > cutLineAfter)
671 		i = lright, j = idxright;
672 	else if (lany > cutLineAfter)
673 		i = lany, j = idxany;
674 	if (!j)
675 		return;		/* nothing we can do about it */
676 	longcut = 0;
677 	if (i != lperiod)
678 		longcut = j;
679 	bl_start[j] = '\n';
680 	colno -= i;
681 	lperiod -= i;
682 	lcomma -= i;
683 	lright -= i;
684 	lany -= i;
685 }				/* appendPrintableChunk */
686 
687 /* Break up a line using the above routines.
688  * The new lines are put in a fixed array.
689  * Return false (fail) if we ran out of room.
690  * This function is called from buffers.c, implementing the bl command,
691  * and is only in this file because it shares the above routines and variables
692  * with the html reformatting, which really has to be here. */
693 
/* The text produced by the most recent call to breakLine(),
 * which frees the previous result and allocates a new one each time. */
char *breakLineResult;
/* Spare bytes added to the size of the output buffer
 * by breakLine() and htmlReformat(). */
#define REFORMAT_EXTRA 400
696 
697 /* Count the formfeeds in a string. Each of these expands to \n\n,
698  * making the string longer. */
/* Return the number of formfeed characters in the first len bytes of buf.
 * Null bytes do not stop the count. */
static int formfeedCount(const char *buf, int len)
{
	int total = 0;
	int pos = len;

	while (pos-- > 0) {
		if (buf[pos] == '\f')
			++total;
	}
	return total;
}				/* formfeedCount */
707 
breakLine(const char * line,int len,int * newlen)708 bool breakLine(const char *line, int len, int *newlen)
709 {
710 	char c, state, newstate;
711 	int i, last, extra;
712 
713 	pre_cr = 0;
714 	if (len && line[len - 1] == '\r')
715 		--len;
716 	if (lspace == 4) {
717 /* special continuation code from the previous invokation */
718 		lspace = 2;
719 		if (line[0])
720 			++pre_cr;
721 	}
722 	if (len > paraLine)
723 		++pre_cr;
724 	if (lspace < 2)
725 		lspace = 2;	/* should never happen */
726 	if (!len + pre_cr)
727 		lspace = 3;
728 
729 	nzFree(breakLineResult);
730 	extra = REFORMAT_EXTRA + formfeedCount(line, len);
731 	breakLineResult = allocMem(len + extra);
732 	bl_start = bl_cursor = breakLineResult;
733 	bl_end = breakLineResult + len + extra - 8;
734 	bl_overflow = false;
735 
736 	colno = 1;
737 	longcut = lperiod = lcomma = lright = lany = 0;
738 	last = 0;
739 	state = 0;
740 	if (pre_cr)
741 		state = 1;
742 
743 	for (i = 0; i < len; ++i) {
744 		c = line[i];
745 		newstate = 2;
746 		if (!c || strchr(" \t\n\r\f", c))
747 			newstate = 1;
748 		if (state == newstate)
749 			continue;
750 		if (!state) {
751 			state = newstate;
752 			continue;
753 		}
754 
755 /* state change here */
756 		debugChunk(line + last, i - last);
757 		if (state == 1)
758 			appendSpaceChunk(line + last, i - last, false);
759 		else
760 			appendPrintableChunk(line + last, i - last, false);
761 		last = i;
762 		state = newstate;
763 		pre_cr = 0;
764 	}
765 
766 	if (state) {		/* last token */
767 		debugChunk(line + last, len - last);
768 		if (state == 1)
769 			appendSpaceChunk(line + last, len - last, false);
770 		else
771 			appendPrintableChunk(line + last, len - last, false);
772 	}
773 
774 	if (lspace < 2) {	/* line didn't have a \r at the end */
775 		appendSpaceChunk("\n", 1, false);
776 	}
777 	if (bl_cursor - bl_start > paraLine)
778 		lspace = 4;
779 	debugPrint(7, "chunk<EOL>%d.%d", colno, lspace);
780 	*newlen = bl_cursor - bl_start;
781 	return !bl_overflow;
782 }				/* breakLine */
783 
breakLineSetup(void)784 void breakLineSetup(void)
785 {
786 	lspace = 3;
787 }
788 
htmlReformat(char * buf)789 char *htmlReformat(char *buf)
790 {
791 	const char *h, *nh, *s;
792 	char c;
793 	bool premode = false;
794 	bool pretag, slash;
795 	char *new;
796 	int l, tagno, extra;
797 	char *fmark;		/* mark the start of a frame */
798 
799 	cellDelimiters(buf);
800 
801 	anchorSwap(buf);
802 
803 	longcut = lperiod = lcomma = lright = lany = 0;
804 	colno = 1;
805 	pre_cr = 0;
806 	lspace = 3;
807 
808 	l = strlen(buf);
809 /* Only a pathological web page gets longer after reformatting.
810  * Those with paragraphs and nothing else to compress or remove.
811  * Thus I allocate for the formfeeds, which correspond to paragraphs,
812  * and are replaced with \n\n.
813  * Plus some extra bytes for slop.
814  * If you still overflow, even beyond the EXTRA,
815  * it won't seg fault, you'll just lose some text. */
816 	extra = REFORMAT_EXTRA + formfeedCount(buf, l);
817 	new = allocMem(l + extra);
818 	bl_start = bl_cursor = new;
819 	bl_end = new + l + extra - 20;
820 	bl_overflow = false;
821 
822 	for (h = buf; (c = *h); h = nh) {
823 		if (isspaceByte(c)) {
824 			for (s = h + 1; isspaceByte(*s); ++s) ;
825 			nh = s;
826 			appendSpaceChunk(h, nh - h, premode);
827 			if (lspace == 3) {
828 				longcut = lperiod = lcomma = lright = lany = 0;
829 				colno = 1;
830 			}
831 			continue;
832 		}
833 
834 		if (c != InternalCodeChar) {
835 			for (s = h + 1; *s; ++s)
836 				if (isspaceByte(*s) || *s == InternalCodeChar)
837 					break;
838 			nh = s;
839 			appendPrintableChunk(h, nh - h, premode);
840 			continue;
841 		}
842 
843 		/* It's a tag */
844 		tagno = strtol(h + 1, (char **)&nh, 10);
845 		c = *nh++;
846 		if (!c || !strchr("{}<>*", c)) {
847 // this should never happen!
848 			i_printf(MSG_BadTagCode, tagno, c);
849 			appendOneChar('@');
850 			nh = h + 1;
851 			continue;
852 		}
853 		appendPrintableChunk(h, nh - h, premode);
854 		preFormatCheck(tagno, &pretag, &slash);
855 		if (pretag) {
856 			premode = !slash;
857 			if (!premode) {
858 /* This forces a new paragraph, so it last char was nl, erase it. */
859 				char *w = bl_cursor - 1;
860 				while (*w != InternalCodeChar)
861 					--w;
862 				if (w > bl_start && w[-1] == '\n') {
863 					memmove(w - 1, w, bl_cursor - w);
864 					--bl_cursor;
865 				}
866 			}
867 		}
868 	}			/* loop over text */
869 
870 /* close off the last line */
871 	if (lspace < 2)
872 		appendSpaceChunk("\n", 1, true);
873 	*bl_cursor = 0;
874 	l = bl_cursor - bl_start;
875 /* Get rid of last space. */
876 	if (l >= 2 && new[l - 1] == '\n' && new[l - 2] == ' ')
877 		new[l - 2] = '\n', new[--l] = 0;
878 /* Don't need empty lines at the end. */
879 	while (l > 1 && new[l - 1] == '\n' && new[l - 2] == '\n')
880 		--l;
881 	new[l] = 0;
882 /* Don't allow an empty buffer */
883 	if (!l)
884 		new[0] = '\n', new[1] = 0, l = 1;
885 
886 	if (bl_overflow) {
887 /* we should print a more helpful error message here */
888 		strcpy(new + l, "\n???");
889 		l += 4;
890 	}
891 
892 /* It's a little thing really, but the blank line at the top of each frame annoys me */
893 	fmark = new;
894 	while ((fmark = strstr(fmark + 1, "*`--\n\n"))) {
895 		if (isdigit(fmark[-1]))
896 			strmove(fmark + 5, fmark + 6);
897 	}
898 
899 	return new;
900 }				/* htmlReformat */
901 
902 /*********************************************************************
903 Crunch a to-list or a copy-to-list down to its email addresses.
904 Delimit them with newlines.
905 "Smith, John" <jsmith@whatever.com>
906 becomes
907 jsmith@whatever.com
908 *********************************************************************/
909 
/*
 * Reduce a list of recipients, in place, to the bare email addresses,
 * each one followed by a comma.
 * Text in double quotes is discarded.  When an address appears in angle
 * brackets, whatever came before it in that entry is discarded, and any
 * space inside the brackets becomes an underscore.
 * NOTE(review): the final strcat adds one comma, so the result can be one
 * byte longer than the input - confirm that callers leave room.
 */
void extractEmailAddresses(char *line)
{
	char *in, *out;
	char *entry;		/* where the current entry starts in the output */
	char state = 0;		/* 0, or the " or < that is still open */

	in = out = entry = line;
	for (; *in; ++in) {
		char ch = *in;

		switch (ch) {
		case ',':
			if (!state) {
				/* entries are separated by space for now */
				entry = out + 1;
				*out++ = ' ';
				continue;
			}
			break;
		case '"':
			/* don't think you can quote in an email address */
			if (!state)
				state = '"';
			else if (state == '"')
				state = 0;
			continue;
		case '<':
			if (!state) {
				state = '<';
				out = entry;
			}
			continue;
		case '>':
			if (state == '<')
				state = 0;
			continue;
		default:
			break;
		}

		if (state == '"')
			continue;

		if (ch < ' ')
			ch = ' ';
		if (ch == ' ' && state == '<')
			ch = '_';
		*out++ = ch;
	}
	*out = '\0';

	spaceCrunch(line, true, false);
	for (in = strchr(line, ' '); in != NULL; in = strchr(in + 1, ' '))
		*in = ',';
	if (line[0] != '\0')
		strcat(line, ",");
}				/* extractEmailAddresses */
966 
/*
 * Remove from line, in place, every entry that matches the first duplen
 * bytes of dup, as judged by memEqualCI().
 * Each entry in line is followed by a comma; scanning stops at any text
 * that is not.
 */
static void cutDuplicateEmail(char *line, const char *dup, int duplen)
{
	char *comma;

	for (; *line;) {
		comma = strchr(line, ',');
		if (comma == NULL)
			return;	/* should never happen */
		if (comma - line == duplen && memEqualCI(line, dup, duplen))
			strmove(line, comma + 1);	/* look at this spot again */
		else
			line = comma + 1;
	}
}				/* cutDuplicateEmail */
982 
cutDuplicateEmails(char * tolist,char * cclist,const char * reply)983 void cutDuplicateEmails(char *tolist, char *cclist, const char *reply)
984 {
985 	int len;
986 	char *s, *t;
987 
988 	len = strlen(reply);
989 	if (len) {
990 		cutDuplicateEmail(tolist, reply, len);
991 		cutDuplicateEmail(cclist, reply, len);
992 	}
993 
994 	s = tolist;
995 	while (*s) {
996 		t = strchr(s, ',');
997 		if (!t)
998 			break;	/* should never happen */
999 		len = t - s;
1000 		++t;
1001 		cutDuplicateEmail(t, s, len);
1002 		cutDuplicateEmail(cclist, s, len);
1003 		s = t;
1004 	}
1005 
1006 	s = cclist;
1007 	while (*s) {
1008 		t = strchr(s, ',');
1009 		if (!t)
1010 			break;	/* should never happen */
1011 		len = t - s;
1012 		++t;
1013 		cutDuplicateEmail(t, s, len);
1014 		s = t;
1015 	}
1016 
1017 /* If your email address is on the to or cc list, drop it.
1018  * But retain it if it is the reply, in case you sent mail to yourself. */
1019 	if (reply[0]) {
1020 		struct MACCOUNT *m = accounts;
1021 		int i;
1022 		for (i = 0; i < maxAccount; ++i, ++m) {
1023 			const char *r = m->reply;
1024 			if (!r)
1025 				continue;
1026 			len = strlen(r);
1027 			cutDuplicateEmail(tolist, r, len);
1028 			cutDuplicateEmail(cclist, r, len);
1029 		}
1030 	}
1031 }				/* cutDuplicateEmails */
1032 
/*
 * Return true if s looks like an email address.
 *   - s is not null or empty, and holds only ascii characters;
 *   - before the first @, no spaces or control characters;
 *   - after the first @, only letters, digits, dot and hyphen, with at
 *     least one dot; a dot may not come first, come last, or follow a dot.
 * Each byte is examined as an unsigned char, so the result does not
 * depend on whether plain char is signed.
 */
bool isEmailAddress(const char *s)
{
	bool atfound = false, dotfound = false;

	if (!s || !*s)
		return false;
	for (; *s; ++s) {
		unsigned char c = (unsigned char)*s;
		if (c >= 0x80)	// nonascii
			return false;
		if (atfound) {
			if (!isalnum(c) && c != '.' && c != '-')
				return false;
			if (c == '.') {
/* s[-1] is safe, the @ came before this */
				if (s[1] == '.' || s[1] == 0 || s[-1] == '.'
				    || s[-1] == '@')
					return false;
				dotfound = true;
			}
			continue;
		}
// I think anything is ok before the @, except space.
		if (c <= ' ')
			return false;
		if (c == '@')
			atfound = true;
	}
	return atfound && dotfound;
}
1061 
1062 /* return 1 for utf16, 2 for utf32, ored with 4 for big endian */
/*
 * Look for a byte order mark at the start of buf.
 * Returns 0 if there is none, else 1 for utf16 or 2 for utf32,
 * plus 4 if the mark is big endian:
 *   fe ff        5    ff fe 00 00  2
 *   ff fe        1    00 00 fe ff  6
 */
int byteOrderMark(const uchar * buf, int buflen)
{
	int result = 0;

	if (buflen >= 2) {
		if (buf[0] == 0xfe) {
			if (buf[1] == 0xff)
				result = 5;
		} else if (buf[0] == 0xff) {
			if (buf[1] == 0xfe)
				result = (buflen >= 4 && buf[2] == 0
					  && buf[3] == 0) ? 2 : 1;
		} else if (buflen >= 4 && buf[0] == 0 && buf[1] == 0
			   && buf[2] == 0xfe && buf[3] == 0xff) {
			result = 6;
		}
	}
	return result;
}				/* byteOrderMark */
1078 
1079 /*********************************************************************
1080 We got some data from a file or from the internet.
1081 Count the binary characters and decide if this is, on the whole,
1082 binary or text.  I allow some nonascii chars,
1083 like you might see in Spanish or German, and still call it text,
1084 but if there's too many such chars, I call it binary.
1085 It's not an exact science.
1086 utf8 sequences are considered text characters.
1087 If there is a leading byte order mark as per the previous routine, it's text.
1088 *********************************************************************/
1089 
looksBinary(const uchar * buf,int buflen)1090 bool looksBinary(const uchar * buf, int buflen)
1091 {
1092 	int i, j, bincount = 0, charcount = 0, nullcount = 0;
1093 	uchar c;
1094 	uchar seed;
1095 
1096 	if (byteOrderMark(buf, buflen))
1097 		return false;
1098 
1099 	for (i = 0; i < buflen; ++i, ++charcount) {
1100 		c = buf[i];
1101 // 0 is ascii, but not really text, and very common in binary files.
1102 		if (c == 0) {
1103 			if (++nullcount >= 10)
1104 				return true;
1105 		}
1106 		if (c < 0x80)
1107 			continue;
1108 // could represent a utf8 character
1109 		seed = c;
1110 		if ((seed & 0xfe) == 0xfe || (seed & 0xc0) == 0x80) {
1111 binchar:
1112 			++bincount;
1113 			continue;
1114 		}
1115 		seed <<= 1;
1116 		j = 1;
1117 		while (seed & 0x80 && i + j < buflen
1118 		       && (buf[i + j] & 0xc0) == 0x80)
1119 			seed <<= 1, ++j;
1120 		if (seed & 0x80)
1121 			goto binchar;
1122 // this is valid utf8 char, don't treat it as binary.
1123 		i += j - 1;
1124 	}
1125 
1126 	return (bincount * 8 - 16 >= charcount);
1127 }				/* looksBinary */
1128 
looks_8859_utf8(const uchar * buf,int buflen,bool * iso_p,bool * utf8_p)1129 void looks_8859_utf8(const uchar * buf, int buflen, bool * iso_p, bool * utf8_p)
1130 {
1131 	int utfcount = 0, isocount = 0;
1132 	int i, j, bothcount;
1133 
1134 	for (i = 0; i < buflen; ++i) {
1135 		uchar c = buf[i];
1136 		if (c < 0x80)
1137 			continue;
1138 /* This is the start of the nonascii sequence. */
1139 /* No second bit, it has to be iso. */
1140 		if (!(c & 0x40)) {
1141 isogo:
1142 			++isocount;
1143 			continue;
1144 		}
1145 /* Next byte has to start with 10 to be utf8, else it's iso */
1146 		if ((buf[i + 1] & 0xc0) != 0x80)
1147 			goto isogo;
1148 		c <<= 2;
1149 		for (j = i + 2; c < 0; ++j, c <<= 1)
1150 			if ((buf[j] & 0xc0) != 0x80)
1151 				goto isogo;
1152 		++utfcount;
1153 		i = j - 1;
1154 	}
1155 
1156 	*iso_p = *utf8_p = false;
1157 
1158 	bothcount = isocount + utfcount;
1159 	if (!bothcount)
1160 		return;		/* ascii */
1161 	bothcount *= 6;
1162 	if (utfcount * 7 >= bothcount)
1163 		*utf8_p = true;
1164 	if (isocount * 7 >= bothcount)
1165 		*iso_p = true;
1166 }				/* looks_8859_utf8 */
1167 
1168 /*********************************************************************
1169 Convert a string from iso 8859 to utf8, or vice versa.
1170 In each case a new string is allocated.
1171 Don't forget to free it when you're done.
1172 *********************************************************************/
1173 
/*
 * Unicode values for the upper half, 0x80 through 0xff, of each iso8859
 * code page that we support.  Row 0 is iso8859-1, row 1 is iso8859-2;
 * only 8859-1 and 8859-2 so far.
 * Each row is indexed by the byte with its high bit removed.
 */
static const int iso_unicodes[2][128] = {
	{
/*********************************************************************
The first 32 nonascii chars in iso8859-1 are control characters,
and almost never used.
Much more common are the cp1252 characters, introduced by Microsoft.
I'm gonna go with those, and hope I'm right more often than wrong.
*********************************************************************/
#define CP1252 1
#if CP1252
	 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
	 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
	 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178,
#else
	 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
#endif
	 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
	 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
	 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
	 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
	 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
	 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
	 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
	 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
	 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
	 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
	 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff},
	{
/* iso8859-2: the first 32 are left as the control characters they are */
	 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
	 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
	 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
	 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
	 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
	 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
	 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
	 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
	 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
	 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
	 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
	 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
	 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9},
};
1224 
/*
 * Convert inbuflen bytes of iso8859 text at inbuf into utf8.
 * The global type8859 selects the row of iso_unicodes to translate with.
 * The result is allocated with allocMem and null terminated; its address
 * and its length, not counting the null, come back through
 * outbuf_p and outbuflen_p.
 * Empty input yields emptyString and a length of 0.
 */
void iso2utf(const uchar * inbuf, int inbuflen, uchar ** outbuf_p,
	     int *outbuflen_p)
{
	const int *page = iso_unicodes[type8859 - 1];
	const uchar *src;
	const uchar *src_end = inbuf + inbuflen;
	const char *seq;
	uchar *dest, *w;
	int extra = 0;

	if (!inbuflen) {
		*outbuf_p = (uchar *) emptyString;
		*outbuflen_p = 0;
		return;
	}

	/* first pass: how many bytes beyond one does each nonascii char need */
	for (src = inbuf; src < src_end; ++src) {
		if (*src < 0x80)
			continue;
		seq = uni2utf8(page[*src & 0x7f]);
		extra += strlen(seq) - 1;
	}

	dest = allocMem(inbuflen + extra + 1);

	/* second pass: copy ascii through, expand everything else */
	w = dest;
	for (src = inbuf; src < src_end; ++src) {
		if (*src < 0x80) {
			*w++ = *src;
			continue;
		}
		for (seq = uni2utf8(page[*src & 0x7f]); *seq; ++seq)
			*w++ = *seq;
	}
	*w = 0;

	*outbuf_p = dest;
	*outbuflen_p = w - dest;
}				/* iso2utf */
1270 
/*
 * Look for a unicode value in one row of iso_unicodes.
 * Return the iso byte, 0x80 through 0xff, that stands for it,
 * or -1 if the code page has no such character.
 */
static int utf2iso_find(const int *isoarray, int ucode)
{
	int k;

	for (k = 0; k < 128; ++k)
		if (isoarray[k] == ucode)
			return k | 0x80;
	return -1;
}

/*
 * Convert inbuflen bytes of utf8 text at inbuf into iso8859.
 * The global type8859 selects the row of iso_unicodes to translate with.
 * Ascii, and stray bytes that cannot begin a utf8 character, pass through.
 * A two or three byte utf8 character found in the code page
 * becomes the corresponding single byte.
 * Any other utf8 character, including one cut short by a bad byte or by
 * the end of the buffer, becomes a star.
 * The result is allocated with allocMem and null terminated; its address
 * and its length, not counting the null, come back through
 * outbuf_p and outbuflen_p.
 * Empty input yields emptyString and a length of 0.
 */
void utf2iso(const uchar * inbuf, int inbuflen, uchar ** outbuf_p,
	     int *outbuflen_p)
{
	int i, j, k;
	uchar c;
	uchar *outbuf;
	const int *isoarray = iso_unicodes[type8859 - 1];
	int ucode;

	if (!inbuflen) {
		*outbuf_p = (uchar *) emptyString;
		*outbuflen_p = 0;
		return;
	}

/* The output is never longer than the input. */
	outbuf = allocMem(inbuflen + 1);
	for (i = j = 0; i < inbuflen; ++i) {
		c = inbuf[i];

/* regular chars and nonascii chars that aren't utf8 pass through. */
/* There shouldn't be any of the latter */
		if ((c & 0xc0) != 0xc0) {
			outbuf[j++] = c;
			continue;
		}

/* Convertable into 11 bit */
		if ((c & 0xe0) == 0xc0 && i + 1 < inbuflen &&
		    (inbuf[i + 1] & 0xc0) == 0x80) {
			ucode = c & 0x1f;
			ucode <<= 6;
			ucode |= (inbuf[i + 1] & 0x3f);
			k = utf2iso_find(isoarray, ucode);
			if (k >= 0) {
				outbuf[j++] = k;
				++i;
				continue;
			}
		}

/* Convertable into 16 bit */
		if ((c & 0xf0) == 0xe0 && i + 2 < inbuflen &&
		    (inbuf[i + 1] & 0xc0) == 0x80 &&
		    (inbuf[i + 2] & 0xc0) == 0x80) {
			ucode = c & 0xf;
			ucode <<= 6;
			ucode |= (inbuf[i + 1] & 0x3f);
			ucode <<= 6;
			ucode |= (inbuf[i + 2] & 0x3f);
			k = utf2iso_find(isoarray, ucode);
			if (k >= 0) {
				outbuf[j++] = k;
				i += 2;
				continue;
			}
		}

/* unicodes not found in our iso class are converted into stars */
/* Swallow the lead byte and the continuation bytes that it calls for, */
/* one per 1 bit below its top bit, but stop at the first byte of the */
/* input that is not a continuation byte, and never run past the end. */
/* c is unsigned, so test its top bit explicitly. */
		c <<= 1;
		for (k = 1; (c & 0x80) && i + k < inbuflen
		     && (inbuf[i + k] & 0xc0) == 0x80; ++k)
			c <<= 1;
		outbuf[j++] = '*';
		i += k - 1;
	}
	outbuf[j] = 0;

	*outbuf_p = outbuf;
	*outbuflen_p = j;
}				/* utf2iso */
1346 
1347 /*********************************************************************
1348 Convert the current line in buffer, which is either iso8859-1 or utf8,
1349 into utf16 or utf32, big or little endian.
1350 The returned string is allocated, though not really a string,
1351 since it will contain nulls, plenty of them in the case of utf32.
1352 *********************************************************************/
1353 
utfHigh(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p,bool inutf8,bool out32,bool outbig)1354 void utfHigh(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p,
1355 	     bool inutf8, bool out32, bool outbig)
1356 {
1357 	uchar *outbuf;
1358 	unsigned int unicode;
1359 	uchar c;
1360 	int i, j;
1361 
1362 	if (!inbuflen) {
1363 		*outbuf_p = emptyString;
1364 		*outbuflen_p = 0;
1365 		return;
1366 	}
1367 
1368 	outbuf = allocMem(inbuflen * 4);	// worst case
1369 
1370 	i = j = 0;
1371 	while (i < inbuflen) {
1372 		c = (uchar) inbuf[i];
1373 		if (!inutf8 || ((c & 0xc0) != 0xc0 && (c & 0xfe) != 0xfe)) {
1374 			unicode = c;	// that was easy
1375 			++i;
1376 		} else {
1377 			uchar mask = 0x20;
1378 			int k = 1;
1379 			++i;
1380 			while (c & mask)
1381 				++k, mask >>= 1;
1382 			c &= (mask - 1);
1383 			unicode = ((unsigned int)c) << (6 * k);
1384 			while (i < inbuflen && k) {
1385 				c = (uchar) inbuf[i];
1386 				if ((c & 0xc0) != 0x80)
1387 					break;
1388 				++i, --k;
1389 				c &= 0x3f;
1390 				unicode |= (((unsigned int)c) << (6 * k));
1391 			}
1392 		}
1393 
1394 		if (out32) {
1395 			if (outbig) {
1396 				outbuf[j++] = ((unicode >> 24) & 0xff);
1397 				outbuf[j++] = ((unicode >> 16) & 0xff);
1398 				outbuf[j++] = ((unicode >> 8) & 0xff);
1399 				outbuf[j++] = (unicode & 0xff);
1400 			} else {
1401 				outbuf[j++] = (unicode & 0xff);
1402 				outbuf[j++] = ((unicode >> 8) & 0xff);
1403 				outbuf[j++] = ((unicode >> 16) & 0xff);
1404 				outbuf[j++] = ((unicode >> 24) & 0xff);
1405 			}
1406 			continue;
1407 		}
1408 // utf16, a bit trickier but not too bad.
1409 		if (unicode <= 0xd7ff
1410 		    || (unicode >= 0xe000 && unicode <= 0xffff)) {
1411 			if (outbig) {
1412 				outbuf[j++] = ((unicode >> 8) & 0xff);
1413 				outbuf[j++] = (unicode & 0xff);
1414 			} else {
1415 				outbuf[j++] = (unicode & 0xff);
1416 				outbuf[j++] = ((unicode >> 8) & 0xff);
1417 			}
1418 			continue;
1419 		}
1420 
1421 		if (unicode >= 0x10000 && unicode <= 0x10ffff) {
1422 // surrogate pairs
1423 			unsigned int pair1, pair2;
1424 			unicode -= 0x10000;
1425 			pair1 = 0xd800 + ((unicode >> 10) & 0x3ff);
1426 			pair2 = 0xdc00 + (unicode & 0x3ff);
1427 			if (outbig) {
1428 				outbuf[j++] = ((pair1 >> 8) & 0xff);
1429 				outbuf[j++] = (pair1 & 0xff);
1430 				outbuf[j++] = ((pair2 >> 8) & 0xff);
1431 				outbuf[j++] = (pair2 & 0xff);
1432 			} else {
1433 				outbuf[j++] = (pair1 & 0xff);
1434 				outbuf[j++] = ((pair1 >> 8) & 0xff);
1435 				outbuf[j++] = (pair2 & 0xff);
1436 				outbuf[j++] = ((pair2 >> 8) & 0xff);
1437 			}
1438 			continue;
1439 		}
1440 
1441 	}
1442 
1443 	*outbuf_p = (char *)outbuf;
1444 	*outbuflen_p = j;
1445 }				/* utfHigh */
1446 
1447 /* convert a 32 bit unicode character into utf8 */
uni2utf8(unsigned int unichar)1448 char *uni2utf8(unsigned int unichar)
1449 {
1450 	static uchar outbuf[12];
1451 	int n = 0;
1452 
1453 	if (unichar <= 0x7f) {
1454 		outbuf[n++] = unichar;
1455 	} else if (unichar <= 0x7ff) {
1456 		outbuf[n++] = 0xc0 | ((unichar >> 6) & 0x1f);
1457 		outbuf[n++] = 0x80 | (unichar & 0x3f);
1458 	} else if (unichar <= 0xffff) {
1459 		outbuf[n++] = 0xe0 | ((unichar >> 12) & 0xf);
1460 		outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1461 		outbuf[n++] = 0x80 | (unichar & 0x3f);
1462 	} else if (unichar <= 0x1fffff) {
1463 		outbuf[n++] = 0xf0 | ((unichar >> 18) & 7);
1464 		outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1465 		outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1466 		outbuf[n++] = 0x80 | (unichar & 0x3f);
1467 	} else if (unichar <= 0x3ffffff) {
1468 		outbuf[n++] = 0xf8 | ((unichar >> 24) & 3);
1469 		outbuf[n++] = 0x80 | ((unichar >> 18) & 0x3f);
1470 		outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1471 		outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1472 		outbuf[n++] = 0x80 | (unichar & 0x3f);
1473 	} else if (unichar <= 0x7fffffff) {
1474 		outbuf[n++] = 0xfc | ((unichar >> 30) & 1);
1475 		outbuf[n++] = 0x80 | ((unichar >> 24) & 0x3f);
1476 		outbuf[n++] = 0x80 | ((unichar >> 18) & 0x3f);
1477 		outbuf[n++] = 0x80 | ((unichar >> 12) & 0x3f);
1478 		outbuf[n++] = 0x80 | ((unichar >> 6) & 0x3f);
1479 		outbuf[n++] = 0x80 | (unichar & 0x3f);
1480 	}
1481 
1482 	outbuf[n] = 0;
1483 	return (char *)outbuf;
1484 }				/* uni2utf8 */
1485 
utfLow(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p,int bom)1486 void utfLow(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p,
1487 	    int bom)
1488 {
1489 	char *obuf;
1490 	int obuf_l;
1491 	unsigned int unicode;
1492 	int isbig;
1493 	int k, l;
1494 	const int *isoarray = iso_unicodes[type8859 - 1];
1495 
1496 	if (!inbuflen) {
1497 		*outbuf_p = emptyString;
1498 		*outbuflen_p = 0;
1499 		return;
1500 	}
1501 
1502 	obuf = initString(&obuf_l);
1503 	isbig = (bom & 4);
1504 	bom &= 3;
1505 	l = bom * 2;		// skip past byte order mark
1506 
1507 	while (l < inbuflen) {
1508 		if (bom == 2) {
1509 			if (l + 4 > inbuflen) {
1510 				unicode = '?';
1511 				l = inbuflen;
1512 			} else if (isbig) {
1513 				unicode = (uchar) inbuf[l];
1514 				unicode <<= 8;
1515 				unicode |= (uchar) inbuf[l + 1];
1516 				unicode <<= 8;
1517 				unicode |= (uchar) inbuf[l + 2];
1518 				unicode <<= 8;
1519 				unicode |= (uchar) inbuf[l + 3];
1520 				l += 4;
1521 			} else {
1522 				unicode = (uchar) inbuf[l + 3];
1523 				unicode <<= 8;
1524 				unicode |= (uchar) inbuf[l + 2];
1525 				unicode <<= 8;
1526 				unicode |= (uchar) inbuf[l + 1];
1527 				unicode <<= 8;
1528 				unicode |= (uchar) inbuf[l];
1529 				l += 4;
1530 			}
1531 		} else {
1532 			if (l + 2 > inbuflen) {
1533 				unicode = '?';
1534 				l = inbuflen;
1535 			} else if (isbig) {
1536 				unicode = (uchar) inbuf[l];
1537 				unicode <<= 8;
1538 				unicode |= (uchar) inbuf[l + 1];
1539 				l += 2;
1540 			} else {
1541 				unicode = (uchar) inbuf[l + 1];
1542 				unicode <<= 8;
1543 				unicode |= (uchar) inbuf[l];
1544 				l += 2;
1545 			}
1546 			if (unicode >= 0xd800 && unicode <= 0xdbff
1547 			    && l + 2 <= inbuflen) {
1548 				unsigned int pair1, pair2;
1549 				pair1 = unicode - 0xd800;
1550 				if (isbig) {
1551 					pair2 = (uchar) inbuf[l];
1552 					pair2 <<= 8;
1553 					pair2 |= (uchar) inbuf[l + 1];
1554 				} else {
1555 					pair2 = (uchar) inbuf[l + 1];
1556 					pair2 <<= 8;
1557 					pair2 |= (uchar) inbuf[l];
1558 				}
1559 				if (pair2 >= 0xdc00 && pair2 <= 0xdfff) {
1560 					pair2 -= 0xdc00;
1561 					l += 2;
1562 					unicode = pair1;
1563 					unicode <<= 10;
1564 					unicode |= pair2;
1565 				}
1566 			}
1567 		}
1568 
1569 // ok we got the unicode.
1570 // It now becomes utf8 or iso8859-x
1571 		if (cons_utf8) {
1572 			stringAndString(&obuf, &obuf_l, uni2utf8(unicode));
1573 			continue;
1574 		}
1575 // iso8859-x here, practically deprecated
1576 		if (unicode <= 127) {	// ascii
1577 			stringAndChar(&obuf, &obuf_l, (char)unicode);
1578 			continue;
1579 		}
1580 
1581 		for (k = 0; k < 128; ++k)
1582 			if (isoarray[k] == unicode)
1583 				break;
1584 		if (k < 128)
1585 			unicode = k | 0x80;
1586 		else
1587 			unicode = '?';
1588 		stringAndChar(&obuf, &obuf_l, (char)unicode);
1589 	}
1590 
1591 // The input string is a file or url and has 2 extra bytes after it.
1592 // After reformatting it should still have two extra bytes after it.
1593 	stringAndString(&obuf, &obuf_l, "  ");
1594 
1595 	*outbuf_p = obuf;
1596 	*outbuflen_p = obuf_l - 2;
1597 }				/* utfLow */
1598 
1599 // Convert from whatever it is to utf8, for javascript and css.
1600 // Result parameter is the new string, or null if no conversion.
1601 // But, if the original string is utf8, I remove the bom.
1602 // Also turn \0 into spaces.
force_utf8(char * buf,int buflen)1603 char *force_utf8(char *buf, int buflen)
1604 {
1605 	char *tbuf, *s;
1606 	int bom = byteOrderMark((const uchar *)buf, buflen);
1607 	if (bom) {
1608 		debugPrint(3, "text type is %s%s",
1609 			   ((bom & 4) ? "big " : ""),
1610 			   ((bom & 2) ? "utf32" : "utf16"));
1611 		if (debugLevel >= 3)
1612 			i_puts(MSG_ConvUtf8);
1613 		utfLow(buf, buflen, &tbuf, &buflen, bom);
1614 // get rid of \0
1615 		for (s = tbuf; s < tbuf + buflen; ++s)
1616 			if (!*s)
1617 				*s = ' ';
1618 		*s = 0;
1619 		return tbuf;
1620 	}
1621 // Strip off the leading bom, if any, and no we're not going to put it back.
1622 	if (buflen >= 3 && !memcmp(buf, "\xef\xbb\xbf", 3)) {
1623 		buflen -= 3;
1624 		memmove(buf, buf + 3, buflen);
1625 		buf[buflen] = 0;
1626 	}
1627 	for (s = buf; s < buf + buflen; ++s)
1628 		if (!*s)
1629 			*s = ' ';
1630 	return NULL;
1631 }
1632 
1633 static const char base64_chars[] =
1634     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1635 
1636 /*
1637  * Encode some data in base64.
1638  * inbuf points to the data
1639  * inlen is the length of the data
1640  * lines is a boolean, indicating whether to add newlines to the output.
1641  * If true, newlines will be added after each group of 72 output bytes.
1642  * Returns: A freshly-allocated NUL-terminated string, containing the
1643  * base64 representation of the data. */
base64Encode(const char * inbuf,int inlen,bool lines)1644 char *base64Encode(const char *inbuf, int inlen, bool lines)
1645 {
1646 	char *out, *outstr;
1647 	uchar *in = (uchar *) inbuf;
1648 	int colno;
1649 	int outlen = ((inlen / 3) + 1) * 4;
1650 	++outlen;		/* zero on the end */
1651 	if (lines)
1652 		outlen += (inlen / 54) + 1;
1653 	outstr = out = allocMem(outlen);
1654 	colno = 0;
1655 	while (inlen >= 3) {
1656 		*out++ = base64_chars[(int)(*in >> 2)];
1657 		*out++ = base64_chars[(int)((*in << 4 | *(in + 1) >> 4) & 63)];
1658 		*out++ =
1659 		    base64_chars[(int)((*(in + 1) << 2 | *(in + 2) >> 6) & 63)];
1660 		*out++ = base64_chars[(int)(*(in + 2) & 63)];
1661 		inlen -= 3;
1662 		in += 3;
1663 		if (!lines)
1664 			continue;
1665 		colno += 4;
1666 		if (colno < 72)
1667 			continue;
1668 		*out++ = '\n';
1669 		colno = 0;
1670 	}
1671 	if (inlen == 1) {
1672 		*out++ = base64_chars[(int)(*in >> 2)];
1673 		*out++ = base64_chars[(int)(*in << 4 & 63)];
1674 		*out++ = '=';
1675 		*out++ = '=';
1676 		colno += 4;
1677 	}
1678 	if (inlen == 2) {
1679 		*out++ = base64_chars[(int)(*in >> 2)];
1680 		*out++ = base64_chars[(int)((*in << 4 | *(in + 1) >> 4) & 63)];
1681 		*out++ = base64_chars[(int)((*(in + 1) << 2) & 63)];
1682 		*out++ = '=';
1683 		colno += 4;
1684 	}
1685 /* finish the last line */
1686 	if (lines && colno)
1687 		*out++ = '\n';
1688 	*out = 0;
1689 	return outstr;
1690 }				/* base64Encode */
1691 
base64Bits(char c)1692 uchar base64Bits(char c)
1693 {
1694 	if (isupperByte(c))
1695 		return c - 'A';
1696 	if (islowerByte(c))
1697 		return c - ('a' - 26);
1698 	if (isdigitByte(c))
1699 		return c - ('0' - 52);
1700 	if (c == '+')
1701 		return 62;
1702 	if (c == '/')
1703 		return 63;
1704 	return 64;		/* error */
1705 }				/* base64Bits */
1706 
1707 /*********************************************************************
1708 Decode some data in base64.
1709 This function operates on the data in-line.  It does not allocate a fresh
1710 string to hold the decoded data.  Since the data will be smaller than
1711 the base64 encoded representation, this cannot overflow.
1712 If you need to preserve the input, copy it first.
1713 start points to the start of the input
1714 *end initially points to the byte just after the end of the input
1715 Returns: GOOD_BASE64_DECODE on success, BAD_BASE64_DECODE or
1716 EXTRA_CHARS_BASE64_DECODE on error.
1717 When the function returns success, *end points to the end of the decoded
1718 data.  On failure, end points to the byte just past the end of
1719 what was successfully decoded.
1720 *********************************************************************/
1721 
base64Decode(char * start,char ** end)1722 int base64Decode(char *start, char **end)
1723 {
1724 	char *b64_end = *end;
1725 	uchar val, leftover, mod;
1726 	bool equals;
1727 	int ret = GOOD_BASE64_DECODE;
1728 	char c, *q, *r;
1729 	mod = 0;
1730 	equals = false;
1731 	for (q = r = start; q < b64_end; ++q) {
1732 		c = *q;
1733 		if (isspaceByte(c))
1734 			continue;
1735 		if (equals) {
1736 			if (c == '=')
1737 				continue;
1738 			ret = EXTRA_CHARS_BASE64_DECODE;
1739 			break;
1740 		}
1741 		if (c == '=') {
1742 			equals = true;
1743 			continue;
1744 		}
1745 		val = base64Bits(c);
1746 		if (val & 64) {
1747 			ret = BAD_BASE64_DECODE;
1748 			break;
1749 		}
1750 		if (mod == 0) {
1751 			leftover = val << 2;
1752 		} else if (mod == 1) {
1753 			*r++ = (leftover | (val >> 4));
1754 			leftover = val << 4;
1755 		} else if (mod == 2) {
1756 			*r++ = (leftover | (val >> 2));
1757 			leftover = val << 6;
1758 		} else {
1759 			*r++ = (leftover | val);
1760 		}
1761 		++mod;
1762 		mod &= 3;
1763 	}
1764 	*end = r;
1765 	return ret;
1766 }				/* base64Decode */
1767 
1768 void
iuReformat(const char * inbuf,int inbuflen,char ** outbuf_p,int * outbuflen_p)1769 iuReformat(const char *inbuf, int inbuflen, char **outbuf_p, int *outbuflen_p)
1770 {
1771 	bool is8859, isutf8;
1772 
1773 	*outbuf_p = 0;
1774 	*outbuflen_p = 0;
1775 	if (!iuConvert)
1776 		return;
1777 
1778 	looks_8859_utf8((uchar *) inbuf, inbuflen, &is8859, &isutf8);
1779 	if (cons_utf8 && is8859) {
1780 		debugPrint(3, "converting to utf8");
1781 		iso2utf((uchar *) inbuf, inbuflen, (uchar **) outbuf_p,
1782 			outbuflen_p);
1783 	}
1784 	if (!cons_utf8 && isutf8) {
1785 		debugPrint(3, "converting to iso8859");
1786 		utf2iso((uchar *) inbuf, inbuflen, (uchar **) outbuf_p,
1787 			outbuflen_p);
1788 	}
1789 }				/* iuReformat */
1790 
parseDataURI(const char * uri,char ** mediatype,char ** data,int * data_l)1791 bool parseDataURI(const char *uri, char **mediatype, char **data, int *data_l)
1792 {
1793 	bool base64 = false;
1794 	const char *mediatype_start;
1795 	const char *data_sep;
1796 	const char *cp;
1797 	size_t encoded_len;
1798 
1799 	*data = *mediatype = emptyString;
1800 	*data_l = 0;
1801 
1802 	if (!isDataURI(uri))
1803 		return false;
1804 
1805 	mediatype_start = uri + 5;
1806 	data_sep = strchr(mediatype_start, ',');
1807 
1808 	if (!data_sep)
1809 		return false;
1810 
1811 	for (cp = data_sep - 1; (cp >= mediatype_start && *cp != ';'); cp--) ;
1812 
1813 	if (cp >= mediatype_start && memEqualCI(cp, ";base64,", 8)) {
1814 		base64 = true;
1815 		*mediatype = pullString1(mediatype_start, cp);
1816 	} else {
1817 		*mediatype = pullString1(mediatype_start, data_sep);
1818 	}
1819 
1820 	encoded_len = strlen(data_sep + 1);
1821 	*data = pullString(data_sep + 1, encoded_len);
1822 	unpercentString(*data);
1823 
1824 	if (!base64) {
1825 		*data_l = strlen(*data);
1826 	} else {
1827 		char *data_end = *data + strlen(*data);
1828 		int unpack_ret = base64Decode(*data, &data_end);
1829 		if (unpack_ret != GOOD_BASE64_DECODE) {
1830 			nzFree(*mediatype);
1831 			*mediatype = emptyString;
1832 			nzFree(*data);
1833 			*data = emptyString;
1834 			return false;
1835 		}
1836 		*data_end = '\0';
1837 		*data_l = data_end - *data;
1838 	}
1839 
1840 	return true;
1841 }				/* parseDataURI */
1842 
fromHex(char d,char e)1843 uchar fromHex(char d, char e)
1844 {
1845 	d |= 0x20, e |= 0x20;
1846 	if (d >= 'a')
1847 		d -= ('a' - '9' - 1);
1848 	if (e >= 'a')
1849 		e -= ('a' - '9' - 1);
1850 	d -= '0', e -= '0';
1851 	return ((((uchar) d) << 4) | (uchar) e);
1852 }				/* fromHex */
1853 
1854 // find the color closest to the rgb value.
1855 // Input string is allocated; return is either the einput string
1856 // or another allocated string.
closeColor(const char * s)1857 char *closeColor(const char *s)
1858 {
1859 // indent formats an array of structures really weird; not like I would.
1860 	const struct reserved {
1861 		const char *name;
1862 		uchar r, g, b;
1863 	} colorlist[] = {
1864 		{
1865 		"aliceblue", 0xf0, 0xf8, 0xff}, {
1866 		"antiquewhite", 0xfa, 0xeb, 0xd7}, {
1867 		"aqua", 0x00, 0xff, 0xff}, {
1868 		"aquamarine", 0x7f, 0xff, 0xd4}, {
1869 		"azure", 0xf0, 0xff, 0xff}, {
1870 		"beige", 0xf5, 0xf5, 0xdc}, {
1871 		"bisque", 0xff, 0xe4, 0xc4}, {
1872 		"black", 0x00, 0x00, 0x00}, {
1873 		"blanchedalmond", 0xff, 0xeb, 0xcd}, {
1874 		"blue", 0x00, 0x00, 0xff}, {
1875 		"blueviolet", 0x8a, 0x2b, 0xe2}, {
1876 		"brown", 0xa5, 0x2a, 0x2a}, {
1877 		"burlywood", 0xde, 0xb8, 0x87}, {
1878 		"cadetblue", 0x5f, 0x9e, 0xa0}, {
1879 		"chartreuse", 0x7f, 0xff, 0x00}, {
1880 		"chocolate", 0xd2, 0x69, 0x1e}, {
1881 		"coral", 0xff, 0x7f, 0x50}, {
1882 		"cornflowerblue", 0x64, 0x95, 0xed}, {
1883 		"cornsilk", 0xff, 0xf8, 0xdc}, {
1884 		"crimson", 0xdc, 0x14, 0x3c}, {
1885 		"cyan", 0x00, 0xff, 0xff}, {
1886 		"darkblue", 0x00, 0x00, 0x8b}, {
1887 		"darkcyan", 0x00, 0x8b, 0x8b}, {
1888 		"darkgoldenrod", 0xb8, 0x86, 0x0b}, {
1889 		"darkgray", 0xa9, 0xa9, 0xa9}, {
1890 		"darkgreen", 0x00, 0x64, 0x00}, {
1891 		"darkkhaki", 0xbd, 0xb7, 0x6b}, {
1892 		"darkmagenta", 0x8b, 0x00, 0x8b}, {
1893 		"darkolivegreen", 0x55, 0x6b, 0x2f}, {
1894 		"darkorange", 0xff, 0x8c, 0x00}, {
1895 		"darkorchid", 0x99, 0x32, 0xcc}, {
1896 		"darkred", 0x8b, 0x00, 0x00}, {
1897 		"darksalmon", 0xe9, 0x96, 0x7a}, {
1898 		"darkseagreen", 0x8f, 0xbc, 0x8f}, {
1899 		"darkslateblue", 0x48, 0x3d, 0x8b}, {
1900 		"darkslategray", 0x2f, 0x4f, 0x4f}, {
1901 		"darkturquoise", 0x00, 0xce, 0xd1}, {
1902 		"darkviolet", 0x94, 0x00, 0xd3}, {
1903 		"deeppink", 0xff, 0x14, 0x93}, {
1904 		"deepskyblue", 0x00, 0xbf, 0xff}, {
1905 		"dimgray", 0x69, 0x69, 0x69}, {
1906 		"dodgerblue", 0x1e, 0x90, 0xff}, {
1907 		"feldspar", 0xd1, 0x92, 0x75}, {
1908 		"firebrick", 0xb2, 0x22, 0x22}, {
1909 		"floralwhite", 0xff, 0xfa, 0xf0}, {
1910 		"forestgreen", 0x22, 0x8b, 0x22}, {
1911 		"fuchsia", 0xff, 0x00, 0xff}, {
1912 		"gainsboro", 0xdc, 0xdc, 0xdc}, {
1913 		"ghostwhite", 0xf8, 0xf8, 0xff}, {
1914 		"gold", 0xff, 0xd7, 0x00}, {
1915 		"goldenrod", 0xda, 0xa5, 0x20}, {
1916 		"gray", 0x80, 0x80, 0x80}, {
1917 		"green", 0x00, 0x80, 0x00}, {
1918 		"greenyellow", 0xad, 0xff, 0x2f}, {
1919 		"honeydew", 0xf0, 0xff, 0xf0}, {
1920 		"hotpink", 0xff, 0x69, 0xb4}, {
1921 		"indianred", 0xcd, 0x5c, 0x5c}, {
1922 		"indigo", 0x4b, 0x00, 0x82}, {
1923 		"ivory", 0xff, 0xff, 0xf0}, {
1924 		"khaki", 0xf0, 0xe6, 0x8c}, {
1925 		"lavender", 0xe6, 0xe6, 0xfa}, {
1926 		"lavenderblush", 0xff, 0xf0, 0xf5}, {
1927 		"lawngreen", 0x7c, 0xfc, 0x00}, {
1928 		"lemonchiffon", 0xff, 0xfa, 0xcd}, {
1929 		"lightblue", 0xad, 0xd8, 0xe6}, {
1930 		"lightcoral", 0xf0, 0x80, 0x80}, {
1931 		"lightcyan", 0xe0, 0xff, 0xff}, {
1932 		"lightgoldenrodyellow", 0xfa, 0xfa, 0xd2}, {
1933 		"lightgrey", 0xd3, 0xd3, 0xd3}, {
1934 		"lightgreen", 0x90, 0xee, 0x90}, {
1935 		"lightpink", 0xff, 0xb6, 0xc1}, {
1936 		"lightsalmon", 0xff, 0xa0, 0x7a}, {
1937 		"lightseagreen", 0x20, 0xb2, 0xaa}, {
1938 		"lightskyblue", 0x87, 0xce, 0xfa}, {
1939 		"lightslateblue", 0x84, 0x70, 0xff}, {
1940 		"lightslategray", 0x77, 0x88, 0x99}, {
1941 		"lightsteelblue", 0xb0, 0xc4, 0xde}, {
1942 		"lightyellow", 0xff, 0xff, 0xe0}, {
1943 		"lime", 0x00, 0xff, 0x00}, {
1944 		"limegreen", 0x32, 0xcd, 0x32}, {
1945 		"linen", 0xfa, 0xf0, 0xe6}, {
1946 		"magenta", 0xff, 0x00, 0xff}, {
1947 		"maroon", 0x80, 0x00, 0x00}, {
1948 		"mediumaquamarine", 0x66, 0xcd, 0xaa}, {
1949 		"mediumblue", 0x00, 0x00, 0xcd}, {
1950 		"mediumorchid", 0xba, 0x55, 0xd3}, {
1951 		"mediumpurple", 0x93, 0x70, 0xd8}, {
1952 		"mediumseagreen", 0x3c, 0xb3, 0x71}, {
1953 		"mediumslateblue", 0x7b, 0x68, 0xee}, {
1954 		"mediumspringgreen", 0x00, 0xfa, 0x9a}, {
1955 		"mediumturquoise", 0x48, 0xd1, 0xcc}, {
1956 		"mediumvioletred", 0xc7, 0x15, 0x85}, {
1957 		"midnightblue", 0x19, 0x19, 0x70}, {
1958 		"mintcream", 0xf5, 0xff, 0xfa}, {
1959 		"mistyrose", 0xff, 0xe4, 0xe1}, {
1960 		"moccasin", 0xff, 0xe4, 0xb5}, {
1961 		"navajowhite", 0xff, 0xde, 0xad}, {
1962 		"navy", 0x00, 0x00, 0x80}, {
1963 		"oldlace", 0xfd, 0xf5, 0xe6}, {
1964 		"olive", 0x80, 0x80, 0x00}, {
1965 		"olivedrab", 0x6b, 0x8e, 0x23}, {
1966 		"orange", 0xff, 0xa5, 0x00}, {
1967 		"orangered", 0xff, 0x45, 0x00}, {
1968 		"orchid", 0xda, 0x70, 0xd6}, {
1969 		"palegoldenrod", 0xee, 0xe8, 0xaa}, {
1970 		"palegreen", 0x98, 0xfb, 0x98}, {
1971 		"paleturquoise", 0xaf, 0xee, 0xee}, {
1972 		"palevioletred", 0xd8, 0x70, 0x93}, {
1973 		"papayawhip", 0xff, 0xef, 0xd5}, {
1974 		"peachpuff", 0xff, 0xda, 0xb9}, {
1975 		"peru", 0xcd, 0x85, 0x3f}, {
1976 		"pink", 0xff, 0xc0, 0xcb}, {
1977 		"plum", 0xdd, 0xa0, 0xdd}, {
1978 		"powderblue", 0xb0, 0xe0, 0xe6}, {
1979 		"purple", 0x80, 0x00, 0x80}, {
1980 		"red", 0xff, 0x00, 0x00}, {
1981 		"rosybrown", 0xbc, 0x8f, 0x8f}, {
1982 		"royalblue", 0x41, 0x69, 0xe1}, {
1983 		"saddlebrown", 0x8b, 0x45, 0x13}, {
1984 		"salmon", 0xfa, 0x80, 0x72}, {
1985 		"sandybrown", 0xf4, 0xa4, 0x60}, {
1986 		"seagreen", 0x2e, 0x8b, 0x57}, {
1987 		"seashell", 0xff, 0xf5, 0xee}, {
1988 		"sienna", 0xa0, 0x52, 0x2d}, {
1989 		"silver", 0xc0, 0xc0, 0xc0}, {
1990 		"skyblue", 0x87, 0xce, 0xeb}, {
1991 		"slateblue", 0x6a, 0x5a, 0xcd}, {
1992 		"slategray", 0x70, 0x80, 0x90}, {
1993 		"snow", 0xff, 0xfa, 0xfa}, {
1994 		"springgreen", 0x00, 0xff, 0x7f}, {
1995 		"steelblue", 0x46, 0x82, 0xb4}, {
1996 		"tan", 0xd2, 0xb4, 0x8c}, {
1997 		"teal", 0x00, 0x80, 0x80}, {
1998 		"thistle", 0xd8, 0xbf, 0xd8}, {
1999 		"tomato", 0xff, 0x63, 0x47}, {
2000 		"turquoise", 0x40, 0xe0, 0xd0}, {
2001 		"violet", 0xee, 0x82, 0xee}, {
2002 		"violetred", 0xd0, 0x20, 0x90}, {
2003 		"wheat", 0xf5, 0xde, 0xb3}, {
2004 		"white", 0xff, 0xff, 0xff}, {
2005 		"whitesmoke", 0xf5, 0xf5, 0xf5}, {
2006 		"yellow", 0xff, 0xff, 0x00}, {
2007 		"yellowgreen", 0x9a, 0xcd, 0x32}, {
2008 		0}
2009 	};
2010 	const struct reserved *c, *best_c;
2011 	int best_val;
2012 	int r1, g1, b1;
2013 	const char *t;
2014 
2015 	if (!strncmp(s, "rgb(", 4)) {
2016 		t = s + 4;
2017 		if (!isdigit(*t))
2018 			goto fail;
2019 		r1 = strtol(t, (char **)&t, 10);
2020 		if (*t == ',')
2021 			++t;
2022 		while (*t == ' ')
2023 			++t;
2024 		if (!isdigit(*t))
2025 			goto fail;
2026 		g1 = strtol(t, (char **)&t, 10);
2027 		if (*t == ',')
2028 			++t;
2029 		while (*t == ' ')
2030 			++t;
2031 		if (!isdigit(*t))
2032 			goto fail;
2033 		b1 = strtol(t, (char **)&t, 10);
2034 		if (*t == ',')
2035 			++t;
2036 		while (*t == ' ')
2037 			++t;
2038 		if (*t != ')')
2039 			goto fail;
2040 	} else if (*s == '#' && isxdigit(s[1])) {
2041 		if (!isxdigit(s[2]) || !isxdigit(s[3]))
2042 			goto fail;
2043 		if (isxdigit(s[4]) && isxdigit(s[5]) && isxdigit(s[6])) {
2044 			r1 = fromHex(s[1], s[2]);
2045 			g1 = fromHex(s[3], s[4]);
2046 			b1 = fromHex(s[5], s[6]);
2047 		} else {
2048 // #xyz is short for #xxyyzz
2049 			r1 = fromHex(s[1], s[1]);
2050 			g1 = fromHex(s[2], s[2]);
2051 			b1 = fromHex(s[3], s[3]);
2052 		}
2053 	} else {
2054 // not an rgb format we recognize; should be just a word.
2055 		for (t = s; *t; ++t)
2056 			if (!isalpha(*t))
2057 				goto fail;
2058 		return (char *)s;
2059 	}
2060 
2061 	if (r1 < 0 || g1 < 0 || b1 < 0)
2062 		goto fail;
2063 	if (r1 > 255 || g1 > 255 || b1 > 255)
2064 		goto fail;
2065 
2066 // closest by rms; just check them all; kind of inefficient.
2067 	best_val = 255 * 255 * 3 + 1;
2068 	for (c = colorlist; c->name; ++c) {
2069 		int rms = (r1 - (int)c->r) * (r1 - (int)c->r) +
2070 		    (g1 - (int)c->g) * (g1 - (int)c->g) +
2071 		    (b1 - (int)c->b) * (b1 - (int)c->b);
2072 		if (rms < best_val)
2073 			best_val = rms, best_c = c;
2074 	}
2075 	return cloneString(best_c->name);
2076 
2077 fail:
2078 	return 0;
2079 }
2080