1 /*
2  * $LynxId: LYCharUtils.c,v 1.131 2018/03/05 22:32:14 tom Exp $
3  *
4  *  Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
5  *  ==========================================================================
6  */
7 #include <HTUtils.h>
8 #include <SGML.h>
9 
10 #define Lynx_HTML_Handler
11 #include <HTChunk.h>
12 #include <HText.h>
13 #include <HTStyle.h>
14 #include <HTMIME.h>
15 #include <HTML.h>
16 
17 #include <HTCJK.h>
18 #include <HTAtom.h>
19 #include <HTMLGen.h>
20 #include <HTParse.h>
21 #include <UCMap.h>
22 #include <UCDefs.h>
23 #include <UCAux.h>
24 
25 #include <LYGlobalDefs.h>
26 #include <LYCharUtils.h>
27 #include <LYCharSets.h>
28 
29 #include <HTAlert.h>
30 #include <HTForms.h>
31 #include <HTNestedList.h>
32 #include <GridText.h>
33 #include <LYStrings.h>
34 #include <LYUtils.h>
35 #include <LYMap.h>
36 #include <LYBookmark.h>
37 #include <LYCurses.h>
38 #include <LYCookie.h>
39 
40 #include <LYexit.h>
41 #include <LYLeaks.h>
42 
/*
 * Sentinel values used for nested (ordered) lists.  - FM
 *
 * OL_VOID is the value LYZero_OL_Counter() stores in every OL_Counter slot,
 * i.e., it marks a list level for which no count has been set.
 * OL_CONTINUE is not referenced in this part of the file; per its name it
 * marks a list that carries the CONTINUE attribute (confirm against the
 * callers in HTML.c).
 */
int OL_CONTINUE = -29999;	/* flag for whether CONTINUE is set */
int OL_VOID = -29998;		/* flag for whether a count is set */
48 
/*
 * Count how many times the character 'ch' occurs in the NUL-terminated
 * string 'value'.
 *
 * The terminating NUL is never counted and never stepped over, so a request
 * for ch == '\0' simply yields 0 (a strchr()-based scan would match the
 * terminator and then read one byte beyond the end of the string).
 */
static size_t count_char(const char *value, int ch)
{
    const char wanted = (char) ch;	/* compare the same way strchr() does */
    size_t result = 0;

    for (; *value != '\0'; ++value) {
	if (*value == wanted)
	    ++result;
    }
    return result;
}
60 
61 /*
62  * This function converts any ampersands in a pre-allocated string to "&amp;".
63  * If brackets is TRUE, it also converts any angle-brackets to "&lt;" or "&gt;".
64  */
LYEntify(char ** in_out,int brackets)65 void LYEntify(char **in_out,
66 	      int brackets)
67 {
68     char *source = *in_out;
69     char *target;
70     char *result = NULL;
71     size_t count_AMPs = 0;
72     size_t count_LTs = 0;
73     size_t count_GTs = 0;
74 
75 #ifdef CJK_EX
76     enum _state {
77 	S_text,
78 	S_esc,
79 	S_dollar,
80 	S_paren,
81 	S_nonascii_text,
82 	S_dollar_paren
83     } state = S_text;
84     int in_sjis = 0;
85 #endif
86 
87     if (non_empty(source)) {
88 	count_AMPs = count_char(*in_out, '&');
89 	if (brackets) {
90 	    count_LTs = count_char(*in_out, '<');
91 	    count_GTs = count_char(*in_out, '>');
92 	}
93 
94 	if (count_AMPs != 0 || count_LTs != 0 || count_GTs != 0) {
95 
96 	    target = typecallocn(char,
97 				   (strlen(*in_out)
98 				    + (4 * count_AMPs)
99 				    + (3 * count_LTs)
100 				    + (3 * count_GTs) + 1));
101 
102 	    if ((result = target) == NULL)
103 		outofmem(__FILE__, "LYEntify");
104 
105 	    for (source = *in_out; *source; source++) {
106 #ifdef CJK_EX
107 		if (IS_CJK_TTY) {
108 		    switch (state) {
109 		    case S_text:
110 			if (*source == '\033') {
111 			    state = S_esc;
112 			    *target++ = *source;
113 			    continue;
114 			}
115 			break;
116 
117 		    case S_esc:
118 			if (*source == '$') {
119 			    state = S_dollar;
120 			} else if (*source == '(') {
121 			    state = S_paren;
122 			} else {
123 			    state = S_text;
124 			}
125 			*target++ = *source;
126 			continue;
127 
128 		    case S_dollar:
129 			if (*source == '@' || *source == 'B' || *source == 'A') {
130 			    state = S_nonascii_text;
131 			} else if (*source == '(') {
132 			    state = S_dollar_paren;
133 			} else {
134 			    state = S_text;
135 			}
136 			*target++ = *source;
137 			continue;
138 
139 		    case S_dollar_paren:
140 			if (*source == 'C') {
141 			    state = S_nonascii_text;
142 			} else {
143 			    state = S_text;
144 			}
145 			*target++ = *source;
146 			continue;
147 
148 		    case S_paren:
149 			if (*source == 'B' || *source == 'J' || *source == 'T') {
150 			    state = S_text;
151 			} else if (*source == 'I') {
152 			    state = S_nonascii_text;
153 			} else if (*source == '\033') {
154 			    state = S_esc;
155 			}
156 			*target++ = *source;
157 			continue;
158 
159 		    case S_nonascii_text:
160 			if (*source == '\033')
161 			    state = S_esc;
162 			*target++ = *source;
163 			continue;
164 
165 		    default:
166 			break;
167 		    }
168 		    if (*(source + 1) != '\0' &&
169 			(IS_EUC(UCH(*source), UCH(*(source + 1))) ||
170 			 IS_SJIS(UCH(*source), UCH(*(source + 1)), in_sjis) ||
171 			 IS_BIG5(UCH(*source), UCH(*(source + 1))))) {
172 			*target++ = *source++;
173 			*target++ = *source;
174 			continue;
175 		    }
176 		}
177 #endif
178 		switch (*source) {
179 		case '&':
180 		    *target++ = '&';
181 		    *target++ = 'a';
182 		    *target++ = 'm';
183 		    *target++ = 'p';
184 		    *target++ = ';';
185 		    break;
186 		case '<':
187 		    if (brackets) {
188 			*target++ = '&';
189 			*target++ = 'l';
190 			*target++ = 't';
191 			*target++ = ';';
192 			break;
193 		    }
194 		    /* FALLTHRU */
195 		case '>':
196 		    if (brackets) {
197 			*target++ = '&';
198 			*target++ = 'g';
199 			*target++ = 't';
200 			*target++ = ';';
201 			break;
202 		    }
203 		    /* FALLTHRU */
204 		default:
205 		    *target++ = *source;
206 		    break;
207 		}
208 	    }
209 	    *target = '\0';
210 	    FREE(*in_out);
211 	    *in_out = result;
212 	}
213     }
214 }
215 
216 /*
217  * Callers to LYEntifyTitle/LYEntifyValue do not look at the 'target' param.
218  * Optimize things a little by avoiding the memory allocation if not needed,
219  * as is usually the case.
220  */
MustEntify(const char * source)221 static BOOL MustEntify(const char *source)
222 {
223     BOOL result;
224 
225 #ifdef CJK_EX
226     if (IS_CJK_TTY && StrChr(source, '\033') != 0) {
227 	result = TRUE;
228     } else
229 #endif
230     {
231 	size_t length = strlen(source);
232 	size_t reject = strcspn(source, "<&>");
233 
234 	result = (BOOL) (length != reject);
235     }
236 
237     return result;
238 }
239 
240 /*
241  * Wrappers for LYEntify() which do not assume that the source was allocated,
242  * e.g., output from gettext().
243  */
LYEntifyTitle(char ** target,const char * source)244 const char *LYEntifyTitle(char **target, const char *source)
245 {
246     const char *result = 0;
247 
248     if (MustEntify(source)) {
249 	StrAllocCopy(*target, source);
250 	LYEntify(target, TRUE);
251 	result = *target;
252     } else {
253 	result = source;
254     }
255     return result;
256 }
257 
LYEntifyValue(char ** target,const char * source)258 const char *LYEntifyValue(char **target, const char *source)
259 {
260     const char *result = 0;
261 
262     if (MustEntify(source)) {
263 	StrAllocCopy(*target, source);
264 	LYEntify(target, FALSE);
265 	result = *target;
266     } else {
267 	result = source;
268     }
269     return result;
270 }
271 
272 /*
273  *  This function trims characters <= that of a space (32),
274  *  including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2),
275  *  but not ESC, from the heads of strings. - FM
276  */
LYTrimHead(char * str)277 void LYTrimHead(char *str)
278 {
279     const char *s = str;
280 
281     if (isEmpty(s))
282 	return;
283 
284     while (*s && WHITE(*s) && UCH(*s) != UCH(CH_ESC))	/* S/390 -- gil -- 1669 */
285 	s++;
286     if (s > str) {
287 	char *ns = str;
288 
289 	while (*s) {
290 	    *ns++ = *s++;
291 	}
292 	*ns = '\0';
293     }
294 }
295 
/*
 *  Strip trailing "white" characters from a string, in place.
 *
 *  Every trailing character accepted by WHITE() (per the original note:
 *  codes <= that of a space, including HT_NON_BREAK_SPACE, HT_EN_SPACE and
 *  ESC) is overwritten with NUL.  NULL and empty strings are ignored. - FM
 *
 *  The scan walks a pointer back from the end instead of using an int index,
 *  so the string length is never narrowed from size_t to int.
 */
void LYTrimTail(char *str)
{
    char *end;

    if (isEmpty(str))
	return;

    end = str + strlen(str);	/* points at the terminating NUL */
    while (end > str && WHITE(end[-1]))
	*--end = '\0';
}
317 
318 /*
319  * This function should receive a pointer to the start
320  * of a comment.  It returns a pointer to the end ('>')
321  * character of comment, or it's best guess if the comment
322  * is invalid. - FM
323  */
LYFindEndOfComment(char * str)324 char *LYFindEndOfComment(char *str)
325 {
326     char *cp, *cp1;
327     enum comment_state {
328 	start1,
329 	start2,
330 	end1,
331 	end2
332     } state;
333 
334     if (str == NULL)
335 	/*
336 	 * We got NULL, so return NULL.  - FM
337 	 */
338 	return NULL;
339 
340     if (StrNCmp(str, "<!--", 4))
341 	/*
342 	 * We don't have the start of a comment, so return the beginning of the
343 	 * string.  - FM
344 	 */
345 	return str;
346 
347     cp = (str + 4);
348     if (*cp == '>')
349 	/*
350 	 * It's an invalid comment, so
351 	 * return this end character. - FM
352 	 */
353 	return cp;
354 
355     if ((cp1 = StrChr(cp, '>')) == NULL)
356 	/*
357 	 * We don't have an end character, so return the beginning of the
358 	 * string.  - FM
359 	 */
360 	return str;
361 
362     if (*cp == '-')
363 	/*
364 	 * Ugh, it's a "decorative" series of dashes, so return the next end
365 	 * character.  - FM
366 	 */
367 	return cp1;
368 
369     /*
370      * OK, we're ready to start parsing.  - FM
371      */
372     state = start2;
373     while (*cp != '\0') {
374 	switch (state) {
375 	case start1:
376 	    if (*cp == '-')
377 		state = start2;
378 	    else
379 		/*
380 		 * Invalid comment, so return the first '>' from the start of
381 		 * the string.  - FM
382 		 */
383 		return cp1;
384 	    break;
385 
386 	case start2:
387 	    if (*cp == '-')
388 		state = end1;
389 	    break;
390 
391 	case end1:
392 	    if (*cp == '-')
393 		state = end2;
394 	    else
395 		/*
396 		 * Invalid comment, so return the first '>' from the start of
397 		 * the string.  - FM
398 		 */
399 		return cp1;
400 	    break;
401 
402 	case end2:
403 	    if (*cp == '>')
404 		/*
405 		 * Valid comment, so return the end character.  - FM
406 		 */
407 		return cp;
408 	    if (*cp == '-') {
409 		state = start1;
410 	    } else if (!(WHITE(*cp) && UCH(*cp) != UCH(CH_ESC))) {	/* S/390 -- gil -- 1686 */
411 		/*
412 		 * Invalid comment, so return the first '>' from the start of
413 		 * the string.  - FM
414 		 */
415 		return cp1;
416 	    }
417 	    break;
418 
419 	default:
420 	    break;
421 	}
422 	cp++;
423     }
424 
425     /*
426      * Invalid comment, so return the first '>' from the start of the string.
427      * - FM
428      */
429     return cp1;
430 }
431 
432 /*
433  *  If an HREF, itself or if resolved against a base,
434  *  represents a file URL, and the host is defaulted,
435  *  force in "//localhost".  We need this until
436  *  all the other Lynx code which performs security
437  *  checks based on the "localhost" string is changed
438  *  to assume "//localhost" when a host field is not
439  *  present in file URLs - FM
440  */
LYFillLocalFileURL(char ** href,const char * base)441 void LYFillLocalFileURL(char **href,
442 			const char *base)
443 {
444     char *temp = NULL;
445 
446     if (isEmpty(*href))
447 	return;
448 
449     if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
450 	if (base != NULL && isFILE_URL(base)) {
451 	    StrAllocCopy(temp, STR_FILE_URL);
452 	    StrAllocCat(temp, *href);
453 	    StrAllocCopy(*href, temp);
454 	}
455     }
456     if (isFILE_URL(*href)) {
457 	if (*(*href + 5) == '\0') {
458 	    StrAllocCat(*href, "//localhost");
459 	} else if (!strcmp(*href, "file://")) {
460 	    StrAllocCat(*href, "localhost");
461 	} else if (!StrNCmp(*href, "file:///", 8)) {
462 	    StrAllocCopy(temp, (*href + 7));
463 	    LYLocalFileToURL(href, temp);
464 	} else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
465 	    StrAllocCopy(temp, (*href + 5));
466 	    LYLocalFileToURL(href, temp);
467 	}
468     }
469 #if defined(USE_DOS_DRIVES)
470     if (LYIsDosDrive(*href)) {
471 	/*
472 	 * If it's a local DOS path beginning with drive letter,
473 	 * add file://localhost/ prefix and go ahead.
474 	 */
475 	StrAllocCopy(temp, *href);
476 	LYLocalFileToURL(href, temp);
477     }
478 
479     /* use below: strlen("file://localhost/") = 17 */
480     if (!StrNCmp(*href, "file://localhost/", 17)
481 	&& (strlen(*href) == 19)
482 	&& LYIsDosDrive(*href + 17)) {
483 	/*
484 	 * Terminate DOS drive letter with a slash to surf root successfully.
485 	 * Here seems a proper place to do so.
486 	 */
487 	LYAddPathSep(href);
488     }
489 #endif /* USE_DOS_DRIVES */
490 
491     /*
492      * No path in a file://localhost URL means a
493      * directory listing for the current default. - FM
494      */
495     if (!strcmp(*href, "file://localhost")) {
496 	const char *temp2;
497 
498 #ifdef VMS
499 	temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
500 #else
501 	char curdir[LY_MAXPATH];
502 
503 	temp2 = wwwName(Current_Dir(curdir));
504 #endif /* VMS */
505 	if (!LYIsHtmlSep(*temp2))
506 	    LYAddHtmlSep(href);
507 	/*
508 	 * Check for pathological cases - current dir has chars which MUST BE
509 	 * URL-escaped - kw
510 	 */
511 	if (StrChr(temp2, '%') != NULL || StrChr(temp2, '#') != NULL) {
512 	    FREE(temp);
513 	    temp = HTEscape(temp2, URL_PATH);
514 	    StrAllocCat(*href, temp);
515 	} else {
516 	    StrAllocCat(*href, temp2);
517 	}
518     }
519 #ifdef VMS
520     /*
521      * On VMS, a file://localhost/ URL means
522      * a listing for the login directory. - FM
523      */
524     if (!strcmp(*href, "file://localhost/"))
525 	StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
526 #endif /* VMS */
527 
528     FREE(temp);
529     return;
530 }
531 
/*
 *  Write a <META http-equiv="content-type" ...> line naming the MIME charset
 *  of character set handle 'disp_chndl' to the stream 'target', using the
 *  stream's put_string method.
 *
 *  A disp_chndl of -1 selects current_char_set.  Nothing is written if
 *  'target' is null or the resulting handle is negative.
 */
void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
{
    char *line = NULL;
    const int chndl = (disp_chndl == -1) ? current_char_set : disp_chndl;

    if (target == 0 || chndl < 0)
	return;

    HTSprintf0(&line, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
	       "http-equiv=\"content-type\"",
	       LYCharSet_UC[chndl].MIMEname);
    (*target->isa->put_string) (target, line);
    FREE(line);
}
550 
551 /*
552  *  This function writes a line with a META tag to an open file,
553  *  which will specify a charset parameter to use when the file is
554  *  read back in.  It is meant for temporary HTML files used by the
555  *  various special pages which may show titles of documents.  When those
556  *  files are created, the title strings normally have been translated and
557  *  expanded to the display character set, so we have to make sure they
558  *  don't get translated again.
559  *  If the user has changed the display character set during the lifetime
560  *  of the Lynx session (or, more exactly, during the time the title
561  *  strings to be written were generated), they may now have different
562  *  character encodings and there is currently no way to get it all right.
563  *  To change this, we would have to add a variable for each string which
564  *  keeps track of its character encoding.
565  *  But at least we can try to ensure that reading the file after future
566  *  display character set changes will give reasonable output.
567  *
568  *  The META tag is not written if the display character set (passed as
569  *  disp_chndl) already corresponds to the charset assumption that
570  *  would be made when the file is read. - KW
571  *
572  *  Currently this function is used for temporary files like "Lynx Info Page"
573  *  and for one permanent - bookmarks (so it may be a problem if you change
574  *  the display charset later: new bookmark entries may be mistranslated).
575  *								 - LP
576  */
LYAddMETAcharsetToFD(FILE * fd,int disp_chndl)577 void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
578 {
579     if (disp_chndl == -1)
580 	/*
581 	 * -1 means use current_char_set.
582 	 */
583 	disp_chndl = current_char_set;
584 
585     if (fd == NULL || disp_chndl < 0)
586 	/*
587 	 * Should not happen.
588 	 */
589 	return;
590 
591     if (UCLYhndl_HTFile_for_unspec == disp_chndl)
592 	/*
593 	 * Not need to do, so we don't.
594 	 */
595 	return;
596 
597     if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT)
598 	/*
599 	 * There shouldn't be any 8-bit characters in this case.
600 	 */
601 	return;
602 
603     /*
604      * In other cases we don't know because UCLYhndl_for_unspec may change
605      * during the lifetime of the file (by toggling raw mode or changing the
606      * display character set), so proceed.
607      */
608     fprintf(fd, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
609 	    "http-equiv=\"content-type\"",
610 	    LYCharSet_UC[disp_chndl].MIMEname);
611 }
612 
/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 *
 * One-letter labels are padded with a leading blank.  Sequence numbers
 * below 1 give " A.", numbers above 18278 give "ZZZ.".  The result lives in
 * a static buffer which the next call overwrites.
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    const int base = 64;	/* letter number 1..26 + base = character code */
    char *out = OLstring;

    if (seqnum <= 1)
	return strcpy(OLstring, " A.");
    if (seqnum >= 18279)
	return strcpy(OLstring, "ZZZ.");

    if (seqnum < 27) {
	/* one letter, blank-padded */
	*out++ = ' ';
    } else if (seqnum < 703) {
	/* two letters */
	*out++ = (char) ((seqnum - 1) / 26 + base);
    } else {
	/* three letters */
	const int leading = (seqnum - 27) / 676;

	*out++ = (char) (leading + base);
	*out++ = (char) ((seqnum - leading * 676 - 1) / 26 + base);
    }

    /* the last letter follows the same rule for every length */
    *out++ = (char) (seqnum - ((seqnum - 1) / 26) * 26 + base);
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
643 
/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 *
 * One-letter labels are padded with a leading blank.  Sequence numbers
 * below 1 give " a.", numbers above 18278 give "zzz.".  The result lives in
 * a static buffer which the next call overwrites.
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    const int base = 96;	/* letter number 1..26 + base = character code */
    char *out = OLstring;

    if (seqnum <= 1)
	return strcpy(OLstring, " a.");
    if (seqnum >= 18279)
	return strcpy(OLstring, "zzz.");

    if (seqnum < 27) {
	/* one letter, blank-padded */
	*out++ = ' ';
    } else if (seqnum < 703) {
	/* two letters */
	*out++ = (char) ((seqnum - 1) / 26 + base);
    } else {
	/* three letters */
	const int leading = (seqnum - 27) / 676;

	*out++ = (char) (leading + base);
	*out++ = (char) ((seqnum - leading * 676 - 1) / 26 + base);
    }

    /* the last letter follows the same rule for every length */
    *out++ = (char) (seqnum - ((seqnum - 1) / 26) * 26 + base);
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
674 
/*
 * This function returns OL TYPE="I" strings in the
 * range of " I." (1) to "MMM." (3000).- FM
 * Maximum length: 16 -TD
 *
 * A number which is written with a single Roman symbol (1, 5, 10, 50, 100,
 * 500, 1000) is padded with a leading blank.  Numbers of 3000 and above
 * give "MMM."; numbers below 1 give just ".".  The result lives in a static
 * buffer which the next call overwrites.
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    /* symbols in descending order of value, subtractive pairs included */
    static const struct {
	int value;
	const char *digits;
    } numerals[] = {
	{ 1000, "M" },
	{ 900, "CM" },
	{ 500, "D" },
	{ 400, "CD" },
	{ 100, "C" },
	{ 90, "XC" },
	{ 50, "L" },
	{ 40, "XL" },
	{ 10, "X" },
	{ 9, "IX" },
	{ 5, "V" },
	{ 4, "IV" },
	{ 1, "I" }
    };
    static char OLstring[20];
    const size_t count = sizeof(numerals) / sizeof(numerals[0]);
    int remaining = seqnum;
    size_t k;

    if (seqnum >= 3000)
	return strcpy(OLstring, "MMM.");

    /* exactly one single-letter symbol:  pad it with a blank */
    for (k = 0; k < count; k++) {
	if (seqnum == numerals[k].value && numerals[k].digits[1] == '\0') {
	    OLstring[0] = ' ';
	    OLstring[1] = numerals[k].digits[0];
	    OLstring[2] = '.';
	    OLstring[3] = '\0';
	    return OLstring;
	}
    }

    /* greedy conversion:  use up the largest symbols first */
    OLstring[0] = '\0';
    for (k = 0; k < count; k++) {
	while (remaining >= numerals[k].value) {
	    strcat(OLstring, numerals[k].digits);
	    remaining -= numerals[k].value;
	}
    }
    strcat(OLstring, ".");

    return OLstring;
}
800 
/*
 * This function returns OL TYPE="i" strings in
 * range of " i." (1) to "mmm." (3000).- FM
 * Maximum length: 16 -TD
 *
 * A number which is written with a single Roman symbol (1, 5, 10, 50, 100,
 * 500, 1000) is padded with a leading blank.  Numbers of 3000 and above
 * give "mmm."; numbers below 1 give just ".".  The result lives in a static
 * buffer which the next call overwrites.
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    /* symbols in descending order of value, subtractive pairs included */
    static const struct {
	int value;
	const char *digits;
    } numerals[] = {
	{ 1000, "m" },
	{ 900, "cm" },
	{ 500, "d" },
	{ 400, "cd" },
	{ 100, "c" },
	{ 90, "xc" },
	{ 50, "l" },
	{ 40, "xl" },
	{ 10, "x" },
	{ 9, "ix" },
	{ 5, "v" },
	{ 4, "iv" },
	{ 1, "i" }
    };
    static char OLstring[20];
    const size_t count = sizeof(numerals) / sizeof(numerals[0]);
    int remaining = seqnum;
    size_t k;

    if (seqnum >= 3000)
	return strcpy(OLstring, "mmm.");

    /* exactly one single-letter symbol:  pad it with a blank */
    for (k = 0; k < count; k++) {
	if (seqnum == numerals[k].value && numerals[k].digits[1] == '\0') {
	    OLstring[0] = ' ';
	    OLstring[1] = numerals[k].digits[0];
	    OLstring[2] = '.';
	    OLstring[3] = '\0';
	    return OLstring;
	}
    }

    /* greedy conversion:  use up the largest symbols first */
    OLstring[0] = '\0';
    for (k = 0; k < count; k++) {
	while (remaining >= numerals[k].value) {
	    strcat(OLstring, numerals[k].digits);
	    remaining -= numerals[k].value;
	}
    }
    strcat(OLstring, ".");

    return OLstring;
}
926 
927 /*
928  *  This function initializes the Ordered List counter. - FM
929  */
LYZero_OL_Counter(HTStructured * me)930 void LYZero_OL_Counter(HTStructured * me)
931 {
932     int i;
933 
934     if (!me)
935 	return;
936 
937     for (i = 0; i < 12; i++) {
938 	me->OL_Counter[i] = OL_VOID;
939 	me->OL_Type[i] = '1';
940     }
941 
942     me->Last_OL_Count = 0;
943     me->Last_OL_Type = '1';
944 
945     return;
946 }
947 
948 /*
949  *  This function is used by the HTML Structured object. - KW
950  */
LYGetChartransInfo(HTStructured * me)951 void LYGetChartransInfo(HTStructured * me)
952 {
953     me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
954 					UCT_STAGE_STRUCTURED);
955     if (me->UCLYhndl < 0) {
956 	int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT);
957 
958 	if (chndl < 0) {
959 	    chndl = current_char_set;
960 	    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
961 				    UCT_STAGE_HTEXT,
962 				    UCT_SETBY_STRUCTURED);
963 	}
964 	HTAnchor_setUCInfoStage(me->node_anchor, chndl,
965 				UCT_STAGE_STRUCTURED,
966 				UCT_SETBY_STRUCTURED);
967 	me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
968 					    UCT_STAGE_STRUCTURED);
969     }
970     me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
971 				      UCT_STAGE_STRUCTURED);
972 }
973 
	/*
	 * Upper-case hexadecimal digits; hex[n] is the digit character for
	 * the value n (0..15).
	 * As in HTParse.c, saves some calls - kw
	 * (The users of this table are not visible in this part of the
	 * file - presumably the URL-escaping code further down.)
	 */
static const char *hex = "0123456789ABCDEF";
976 
977 /*
978  *	  Any raw 8-bit or multibyte characters already have been
979  *	  handled in relation to the display character set
980  *	  in SGML_character(), including named and numeric entities.
981  *
982  *  This function used for translations HTML special fields inside tags
983  *  (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
984  *  It also unescapes non-ASCII characters from URL (#fragments !)
985  *  if st_URL is active.
986  *
987  *  If `do_ent' is YES, it converts named entities
988  *  and numeric character references (NCRs) to their `cs_to' replacements.
989  *
990  *  Named entities converted to unicodes.  NCRs (unicodes) converted
991  *  by UCdomap.c chartrans functions.
992  *  ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
993  *  to their HTML entity names (via old-style entities) and then translated
994  *  according to the LYCharSets.c array for `cs_out'???.
995  *
996  *  Some characters (see descriptions in `put_special_unicodes' from SGML.c)
997  *  translated in relation with the state of boolean variables
998  *  `use_lynx_specials', `plain_space' and `hidden'.  It is not clear yet:
999  *
1000  *  If plain_space is TRUE, nbsp (160) will be treated as an ASCII
1001  *  space (32).  If hidden is TRUE, entities will be translated
1002  *  (if `do_ent' is YES) but escape sequences will be passed unaltered.
1003  *  If `hidden' is FALSE, some characters are converted to Lynx special
1004  *  codes (see `put_special_unicodes') or ASCII space if `plain_space'
1005  *  applies).  @@ is `use_lynx_specials' needed, does it have any effect? @@
1006  *  If `use_lynx_specials' is YES, translate byte values 160 and 173
1007  *  meaning U+00A0 and U+00AD given as or converted from raw char input
1008  *  are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
1009  *  (unless input and output charset are both iso-8859-1, for compatibility
1010  *  with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
1011  *
1012  *  If `Back' is YES, the reverse is done instead i.e., Lynx special codes
1013  *  in the input are translated back to character values.
1014  *
1015  *  If `Back' is YES, an attempt is made to use UCReverseTransChar() for
1016  *  back translation which may be more efficient. (?)
1017  *
1018  *  If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
1019  *  The sequence of bytes being URL-encoded is the raw input character if
1020  *  we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
1021  *  UTF-8 representation if either `cs_to' requires this or if the
1022  *  character's Unicode value is > 255, otherwise it should be the iso-8859-1
1023  *  representation.
1024  *  No general URL-encoding occurs for displayable ASCII characters and
1025  *  spaces and some C0 controls valid in HTML (LF, TAB), it is expected
1026  *  that other functions will take care of that as appropriate.
1027  *
1028  *  Escape characters (0x1B, '\033') are
1029  *  - URL-encoded	if `stype'  is st_URL,	 otherwise
1030  *  - dropped		if `stype'  is st_other, otherwise (i.e., st_HTML)
1031  *  - passed		if `hidden' is TRUE or HTCJK is set, otherwise
1032  *  - dropped.
1033  *
1034  *  (If `stype' is st_URL or st_other most of the parameters really predefined:
1035  *  cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
1036  *
1037  *
1038  *  Returns pointer to the char** passed in
1039  *		 if string translated or translation unnecessary,
1040  *	    NULL otherwise
1041  *		 (in which case something probably went wrong.)
1042  *
1043  *
1044  *  In general, this somehow ugly function (KW)
1045  *  cover three functions from v.2.7.2 (FM):
1046  *		    extern void LYExpandString (
1047  *		       HTStructured *	       me,
1048  *		       char **		       str);
1049  *		    extern void LYUnEscapeEntities (
1050  *		       HTStructured *	       me,
1051  *		       char **		       str);
1052  *		    extern void LYUnEscapeToLatinOne (
1053  *		       HTStructured *	       me,
1054  *		       char **		       str,
1055  *		       BOOLEAN		       isURL);
1056  */
1057 
LYUCFullyTranslateString(char ** str,int cs_from,int cs_to,int do_ent,int use_lynx_specials,int plain_space,int hidden,int Back,CharUtil_st stype)1058 char **LYUCFullyTranslateString(char **str,
1059 				int cs_from,
1060 				int cs_to,
1061 				int do_ent,
1062 				int use_lynx_specials,
1063 				int plain_space,
1064 				int hidden,
1065 				int Back,
1066 				CharUtil_st stype)
1067 {
1068     char *p;
1069     char *q, *qs;
1070     HTChunk *chunk = NULL;
1071     char *cp = 0;
1072     char cpe = 0;
1073     char *esc = NULL;
1074     char replace_buf[64];
1075     int uck;
1076     int lowest_8;
1077     UCode_t code = 0;
1078     BOOL output_utf8 = 0, repl_translated_C0 = 0;
1079     size_t len;
1080     const char *name = NULL;
1081     BOOLEAN no_bytetrans;
1082     UCTransParams T;
1083     BOOL from_is_utf8 = FALSE;
1084     char *puni = 0;
1085     enum _state {
1086 	S_text,
1087 	S_esc,
1088 	S_dollar,
1089 	S_paren,
1090 	S_nonascii_text,
1091 	S_dollar_paren,
1092 	S_trans_byte,
1093 	S_check_ent,
1094 	S_ncr,
1095 	S_check_uni,
1096 	S_named,
1097 	S_check_name,
1098 	S_recover,
1099 	S_got_oututf8,
1100 	S_got_outstring,
1101 	S_put_urlstring,
1102 	S_got_outchar,
1103 	S_put_urlchar,
1104 	S_next_char,
1105 	S_done
1106     } state = S_text;
1107     enum _parsing_what {
1108 	P_text,
1109 	P_utf8,
1110 	P_hex,
1111 	P_decimal,
1112 	P_named
1113     } what = P_text;
1114 
1115 #ifdef KANJI_CODE_OVERRIDE
1116     static unsigned char sjis_1st = '\0';
1117 
1118     unsigned char sjis_str[3];
1119 #endif
1120 
1121     /*
1122      * Make sure we have a non-empty string.  - FM
1123      */
1124     if (isEmpty(*str))
1125 	return str;
1126 
1127     /*
1128      * FIXME: something's wrong with the limit checks here (clearing the
1129      * buffer helps).
1130      */
1131     memset(replace_buf, 0, sizeof(replace_buf));
1132 
1133     /*
1134      * Don't do byte translation if original AND target character sets are both
1135      * iso-8859-1 (and we are not called to back-translate), or if we are in
1136      * CJK mode.
1137      */
1138     if (IS_CJK_TTY
1139 #ifdef EXP_JAPANESEUTF8_SUPPORT
1140 	&& (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
1141 	&& (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
1142 #endif
1143 	) {
1144 	no_bytetrans = TRUE;
1145     } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
1146 	no_bytetrans = TRUE;
1147     } else {
1148 	/* No need to translate or examine the string any further */
1149 	no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
1150 			       UCNeedNotTranslate(cs_from, cs_to));
1151     }
1152     /*
1153      * Save malloc/calloc overhead in simple case - kw
1154      */
1155     if (do_ent && hidden && (stype != st_URL) && (StrChr(*str, '&') == NULL))
1156 	do_ent = FALSE;
1157 
1158     /* Can't do, caller should figure out what to do... */
1159     if (!UCCanTranslateFromTo(cs_from, cs_to)) {
1160 	if (cs_to < 0)
1161 	    return NULL;
1162 	if (!do_ent && no_bytetrans)
1163 	    return NULL;
1164 	no_bytetrans = TRUE;
1165     } else if (cs_to < 0) {
1166 	do_ent = FALSE;
1167     }
1168 
1169     if (!do_ent && no_bytetrans)
1170 	return str;
1171     p = *str;
1172 
1173     if (!no_bytetrans) {
1174 	UCTransParams_clear(&T);
1175 	UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
1176 			 cs_to, &LYCharSet_UC[cs_to]);
1177 	from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
1178 	output_utf8 = T.output_utf8;
1179 	repl_translated_C0 = T.repl_translated_C0;
1180 	puni = p;
1181     } else if (do_ent) {
1182 	output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
1183 			      HText_hasUTF8OutputSet(HTMainText));
1184 	repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
1185     }
1186 
1187     lowest_8 = LYlowest_eightbit[cs_to];
1188 
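    /*
     * No oversized output buffer is allocated up front:  the write pointer q
     * starts at the read pointer p, so output initially overwrites the input
     * string itself.  len (the original length plus 16) is only used as a
     * size argument to HTChunkCreate2() if the CHUNK macro below ever has to
     * create an HTChunk for replacement text.
     */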
1193     len = strlen(p) + 16;
1194     q = p;
1195 
1196     qs = q;
1197 
1198 /*  Create the HTChunk only if we need it */
1199 #define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
1200 
1201 #define REPLACE_STRING(s) \
1202 		if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1203 		HTChunkPuts(CHUNK, s); \
1204 		qs = q = *str
1205 
1206 #define REPLACE_CHAR(c) if (q > p) { \
1207 		HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1208 		qs = q = *str; \
1209 		*q++ = c; \
1210 	    } else \
1211 		*q++ = c
1212 
1213     /*
1214      * Loop through string, making conversions as needed.
1215      *
1216      * The while() checks for a non-'\0' char only for the normal text states
1217      * since other states may temporarily modify p or *p (which should be
1218      * restored before S_done!) - kw
1219      */
1220     while (*p || (state != S_text && state != S_nonascii_text)) {
1221 	switch (state) {
1222 	case S_text:
1223 	    code = UCH(*p);
1224 #ifdef KANJI_CODE_OVERRIDE
1225 	    if (HTCJK == JAPANESE && last_kcode == SJIS) {
1226 		if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
1227 		    sjis_1st = UCH(code);
1228 		} else if (sjis_1st && IS_SJIS_LO(code)) {
1229 		    sjis_1st = '\0';
1230 		} else {
1231 		    if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
1232 			sjis_str[2] = '\0';
1233 			JISx0201TO0208_SJIS(UCH(code),
1234 					    sjis_str, sjis_str + 1);
1235 			REPLACE_STRING(sjis_str);
1236 			p++;
1237 			continue;
1238 		    }
1239 		}
1240 	    }
1241 #endif
1242 	    if (*p == '\033') {
1243 		if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1244 		    state = S_esc;
1245 		    if (stype == st_URL) {
1246 			REPLACE_STRING("%1B");
1247 			p++;
1248 			continue;
1249 		    } else if (stype != st_HTML) {
1250 			p++;
1251 			continue;
1252 		    } else {
1253 			*q++ = *p++;
1254 			continue;
1255 		    }
1256 		} else if (!hidden) {
1257 		    /*
1258 		     * CJK handling not on, and not a hidden INPUT, so block
1259 		     * escape.  - FM
1260 		     */
1261 		    state = S_next_char;
1262 		} else {
1263 		    state = S_trans_byte;
1264 		}
1265 	    } else {
1266 		state = (do_ent ? S_check_ent : S_trans_byte);
1267 	    }
1268 	    break;
1269 
1270 	case S_esc:
1271 	    if (*p == '$') {
1272 		state = S_dollar;
1273 		*q++ = *p++;
1274 		continue;
1275 	    } else if (*p == '(') {
1276 		state = S_paren;
1277 		*q++ = *p++;
1278 		continue;
1279 	    } else {
1280 		state = S_text;
1281 	    }
1282 	    break;
1283 
1284 	case S_dollar:
1285 	    if (*p == '@' || *p == 'B' || *p == 'A') {
1286 		state = S_nonascii_text;
1287 		*q++ = *p++;
1288 		continue;
1289 	    } else if (*p == '(') {
1290 		state = S_dollar_paren;
1291 		*q++ = *p++;
1292 		continue;
1293 	    } else {
1294 		state = S_text;
1295 	    }
1296 	    break;
1297 
1298 	case S_dollar_paren:
1299 	    if (*p == 'C') {
1300 		state = S_nonascii_text;
1301 		*q++ = *p++;
1302 		continue;
1303 	    } else {
1304 		state = S_text;
1305 	    }
1306 	    break;
1307 
1308 	case S_paren:
1309 	    if (*p == 'B' || *p == 'J' || *p == 'T') {
1310 		state = S_text;
1311 		*q++ = *p++;
1312 		continue;
1313 	    } else if (*p == 'I') {
1314 		state = S_nonascii_text;
1315 		*q++ = *p++;
1316 		continue;
1317 	    } else {
1318 		state = S_text;
1319 	    }
1320 	    break;
1321 
1322 	case S_nonascii_text:
1323 	    if (*p == '\033') {
1324 		if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1325 		    state = S_esc;
1326 		    if (stype == st_URL) {
1327 			REPLACE_STRING("%1B");
1328 			p++;
1329 			continue;
1330 		    } else if (stype != st_HTML) {
1331 			p++;
1332 			continue;
1333 		    }
1334 		}
1335 	    }
1336 	    *q++ = *p++;
1337 	    continue;
1338 
1339 	case S_trans_byte:
1340 	    /* character translation goes here */
1341 	    /*
1342 	     * Don't do anything if we have no string, or if original AND
1343 	     * target character sets are both iso-8859-1, or if we are in CJK
1344 	     * mode.
1345 	     */
1346 	    if (*p == '\0' || no_bytetrans) {
1347 		state = S_got_outchar;
1348 		break;
1349 	    }
1350 
1351 	    if (Back) {
1352 		int rev_c;
1353 
1354 		if ((*p) == HT_NON_BREAK_SPACE ||
1355 		    (*p) == HT_EN_SPACE) {
1356 		    if (plain_space) {
1357 			code = *p = ' ';
1358 			state = S_got_outchar;
1359 			break;
1360 		    } else {
1361 			code = 160;
1362 			if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1363 			    (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1364 			    state = S_got_outchar;
1365 			    break;
1366 			} else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1367 				     || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1368 			    state = S_check_uni;
1369 			    break;
1370 			} else {
1371 			    *(unsigned char *) p = UCH(160);
1372 			}
1373 		    }
1374 		} else if ((*p) == LY_SOFT_HYPHEN) {
1375 		    code = 173;
1376 		    if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1377 			(LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1378 			state = S_got_outchar;
1379 			break;
1380 		    } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1381 				 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1382 			state = S_check_uni;
1383 			break;
1384 		    } else {
1385 			*(unsigned char *) p = UCH(173);
1386 		    }
1387 #ifdef EXP_JAPANESEUTF8_SUPPORT
1388 		} else if (output_utf8) {
1389 		    if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
1390 			 (IS_EUC((unsigned char) (*p),
1391 				 (unsigned char) (*(p + 1))))) ||
1392 			(!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
1393 			 (IS_SJIS_2BYTE((unsigned char) (*p),
1394 					(unsigned char) (*(p + 1)))))) {
1395 			code = UCTransJPToUni(p, 2, cs_from);
1396 			p++;
1397 			state = S_check_uni;
1398 			break;
1399 		    }
1400 #endif
1401 		} else if (code < 127 || T.transp) {
1402 		    state = S_got_outchar;
1403 		    break;
1404 		}
1405 		rev_c = UCReverseTransChar(*p, cs_to, cs_from);
1406 		if (rev_c > 127) {
1407 		    *p = (char) rev_c;
1408 		    code = rev_c;
1409 		    state = S_got_outchar;
1410 		    break;
1411 		}
1412 	    } else if (code < 127) {
1413 		state = S_got_outchar;
1414 		break;
1415 	    }
1416 
1417 	    if (from_is_utf8) {
1418 		if (((*p) & 0xc0) == 0xc0) {
1419 		    const char *pq = p;
1420 
1421 		    puni = p;
1422 		    code = UCGetUniFromUtf8String(&pq);
1423 		    if (code <= 0) {
1424 			code = UCH(*p);
1425 		    } else {
1426 			what = P_utf8;
1427 			puni += (pq - (const char *) p);
1428 		    }
1429 		}
1430 	    } else if (use_lynx_specials && !Back &&
1431 		       (code == 160 || code == 173) &&
1432 		       (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
1433 			(LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1434 		if (code == 160)
1435 		    code = *p = HT_NON_BREAK_SPACE;
1436 		else if (code == 173)
1437 		    code = *p = LY_SOFT_HYPHEN;
1438 		state = S_got_outchar;
1439 		break;
1440 	    } else if (T.trans_to_uni) {
1441 		code = UCTransToUni(*p, cs_from);
1442 		if (code <= 0) {
1443 		    /* What else can we do? */
1444 		    code = UCH(*p);
1445 		}
1446 	    } else if (!T.trans_from_uni) {
1447 		state = S_got_outchar;
1448 		break;
1449 	    }
1450 	    /*
1451 	     * Substitute Lynx special character for 160 (nbsp) if
1452 	     * use_lynx_specials is set.
1453 	     */
1454 	    if (use_lynx_specials && !Back &&
1455 		(code == 160 || code == 173)) {
1456 		code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
1457 		state = S_got_outchar;
1458 		break;
1459 	    }
1460 
1461 	    state = S_check_uni;
1462 	    break;
1463 
1464 	case S_check_ent:
1465 	    if (*p == '&') {
1466 		char *pp = p + 1;
1467 
1468 		len = strlen(pp);
1469 		/*
1470 		 * Check for a numeric entity.  - FM
1471 		 */
1472 		if (*pp == '#' && len > 2 &&
1473 		    (*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
1474 		    UCH(*(pp + 2)) < 127 &&
1475 		    isxdigit(UCH(*(pp + 2)))) {
1476 		    what = P_hex;
1477 		    state = S_ncr;
1478 		} else if (*pp == '#' && len > 2 &&
1479 			   UCH(*(pp + 1)) < 127 &&
1480 			   isdigit(UCH(*(pp + 1)))) {
1481 		    what = P_decimal;
1482 		    state = S_ncr;
1483 		} else if (UCH(*pp) < 127 &&
1484 			   isalpha(UCH(*pp))) {
1485 		    what = P_named;
1486 		    state = S_named;
1487 		} else {
1488 		    state = S_trans_byte;
1489 		}
1490 	    } else {
1491 		state = S_trans_byte;
1492 	    }
1493 	    break;
1494 
1495 	case S_ncr:
1496 	    if (what == P_hex) {
1497 		p += 3;
1498 	    } else {		/* P_decimal */
1499 		p += 2;
1500 	    }
1501 	    cp = p;
1502 	    while (*p && UCH(*p) < 127 &&
1503 		   (what == P_hex ? isxdigit(UCH(*p)) :
1504 		    isdigit(UCH(*p)))) {
1505 		p++;
1506 	    }
1507 	    /*
1508 	     * Save the terminator and isolate the digit(s).  - FM
1509 	     */
1510 	    cpe = *p;
1511 	    if (*p)
1512 		*p++ = '\0';
1513 	    /*
1514 	     * Show the numeric entity if the value:
1515 	     * (1) Is greater than 255 and unhandled Unicode.
1516 	     * (2) Is less than 32, and not valid and we don't have HTCJK set.
1517 	     * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1518 	     * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1519 	     */
1520 	    if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
1521 		code = LYcp1252ToUnicode(code);
1522 		state = S_check_uni;
1523 	    } else {
1524 		state = S_recover;
1525 		break;
1526 	    }
1527 	    break;
1528 
1529 	case S_check_uni:
1530 	    /*
1531 	     * Show the numeric entity if the value:
1532 	     * (2) Is less than 32, and not valid and we don't have HTCJK set.
1533 	     * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1534 	     * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1535 	     */
1536 	    if ((code < 32 &&
1537 		 code != 9 && code != 10 && code != 13 &&
1538 		 !IS_CJK_TTY) ||
1539 		(code == 127 &&
1540 		 !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
1541 		(code > 127 && code < 160 &&
1542 		 !HTPassHighCtrlNum)) {
1543 		state = S_recover;
1544 		break;
1545 	    }
1546 	    /*
1547 	     * Convert the value as an unsigned char, hex escaped if isURL is
1548 	     * set and it's 8-bit, and then recycle the terminator if it is not
1549 	     * a semicolon.  - FM
1550 	     */
1551 	    if (code > 159 && stype == st_URL) {
1552 		state = S_got_oututf8;
1553 		break;
1554 	    }
1555 	    /*
1556 	     * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
1557 	     * use an ASCII space (32) if plain_space is TRUE, otherwise use
1558 	     * the Lynx special character.  - FM
1559 	     */
1560 	    if (code == 160) {
1561 		if (plain_space) {
1562 		    code = ' ';
1563 		    state = S_got_outchar;
1564 		    break;
1565 		} else if (use_lynx_specials) {
1566 		    code = HT_NON_BREAK_SPACE;
1567 		    state = S_got_outchar;
1568 		    break;
1569 		} else if ((hidden && !Back)
1570 			   || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
1571 			   || LYCharSet_UC[cs_to].enc == UCT_ENC_8859
1572 			   || (LYCharSet_UC[cs_to].like8859 &
1573 			       UCT_R_8859SPECL)) {
1574 		    state = S_got_outchar;
1575 		    break;
1576 		} else if (
1577 			      (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
1578 		    ;		/* nothing, may be translated later */
1579 		} else {
1580 		    code = ' ';
1581 		    state = S_got_outchar;
1582 		    break;
1583 		}
1584 	    }
1585 	    /*
1586 	     * For 173 (shy), use that value if it's a hidden INPUT, otherwise
1587 	     * ignore it if plain_space is TRUE, otherwise use the Lynx special
1588 	     * character.  - FM
1589 	     */
1590 	    if (code == 173) {
1591 		if (plain_space) {
1592 		    replace_buf[0] = '\0';
1593 		    state = S_got_outstring;
1594 		    break;
1595 		} else if (Back &&
1596 			   !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1597 			     (LYCharSet_UC[cs_to].like8859 &
1598 			      UCT_R_8859SPECL))) {
1599 		    ;		/* nothing, may be translated later */
1600 		} else if (hidden || Back) {
1601 		    state = S_got_outchar;
1602 		    break;
1603 		} else if (use_lynx_specials) {
1604 		    code = LY_SOFT_HYPHEN;
1605 		    state = S_got_outchar;
1606 		    break;
1607 		}
1608 	    }
1609 	    /*
1610 	     * Seek a translation from the chartrans tables.
1611 	     */
1612 	    if ((uck = UCTransUniChar(code,
1613 				      cs_to)) >= 32 &&
1614 		uck < 256 &&
1615 		(uck < 127 || uck >= lowest_8)) {
1616 		code = uck;
1617 		state = S_got_outchar;
1618 		break;
1619 	    } else if ((uck == -4 ||
1620 			(repl_translated_C0 &&
1621 			 uck > 0 && uck < 32)) &&
1622 		/*
1623 		 * Not found; look for replacement string.
1624 		 */
1625 		       UCTransUniCharStr(replace_buf,
1626 					 60, code,
1627 					 cs_to,
1628 					 0) >= 0) {
1629 		state = S_got_outstring;
1630 		break;
1631 	    }
1632 	    if (output_utf8 &&
1633 		code > 127 && code < 0x7fffffffL) {
1634 		state = S_got_oututf8;
1635 		break;
1636 	    }
1637 	    /*
1638 	     * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
1639 	     * character reference if it's a hidden INPUT, otherwise use an
1640 	     * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
1641 	     * special character.  - FM
1642 	     */
1643 	    if (code == 8194 || code == 8195 || code == 8201) {
1644 		if (hidden) {
1645 		    state = S_recover;
1646 		} else if (plain_space) {
1647 		    code = ' ';
1648 		    state = S_got_outchar;
1649 		} else {
1650 		    code = HT_EN_SPACE;
1651 		    state = S_got_outchar;
1652 		}
1653 		break;
1654 		/*
1655 		 * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
1656 		 * for now, if we got this far without finding a representation
1657 		 * for them.
1658 		 */
1659 	    } else if (code == 8204 || code == 8205 ||
1660 		       code == 8206 || code == 8207) {
1661 		CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
1662 			PRI_UCode_t "'.\n", code));
1663 		replace_buf[0] = '\0';
1664 		state = S_got_outstring;
1665 		break;
1666 		/*
1667 		 * Show the numeric entity if the value:  (1) Is greater than
1668 		 * 255 and unhandled Unicode.
1669 		 */
1670 	    } else if (code > 255) {
1671 		/*
1672 		 * Illegal or not yet handled value.  Return "&#" verbatim and
1673 		 * continue from there.  - FM
1674 		 */
1675 		state = S_recover;
1676 		break;
1677 		/*
1678 		 * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
1679 		 * the character set is "ISO Latin 1", use it's value.  - FM
1680 		 */
1681 	    } else if (code < 161 ||
1682 		       (code < 256 &&
1683 			(HTPassEightBitNum || cs_to == LATIN1))) {
1684 		/*
1685 		 * No conversion needed.
1686 		 */
1687 		state = S_got_outchar;
1688 		break;
1689 
1690 		/* The following disabled section doesn't make sense any more.
1691 		 * It used to make sense in the past, when S_check_named would
1692 		 * look in "old style" tables in addition to what it does now.
1693 		 * Disabling of going to S_check_name here prevents endless
1694 		 * looping between S_check_uni and S_check_names states, which
1695 		 * could occur here for Latin 1 codes for some cs_to if they
1696 		 * had no translation in that cs_to.  Normally all cs_to
1697 		 * *should* now have valid translations via UCTransUniChar or
1698 		 * UCTransUniCharStr for all Latin 1 codes, so that we would
1699 		 * not get here anyway, and no loop could occur.  Still, if we
1700 		 * *do* get here, FALL THROUGH to case S_recover now.  - kw
1701 		 */
1702 #if 0
1703 		/*
1704 		 * If we get to here, convert and handle the character as a
1705 		 * named entity.  - FM
1706 		 */
1707 	    } else {
1708 		name = HTMLGetEntityName(code - 160);
1709 		state = S_check_name;
1710 		break;
1711 #endif
1712 	    }
1713 	    /* FALLTHRU */
1714 
1715 	case S_recover:
1716 	    if (what == P_decimal || what == P_hex) {
1717 		/*
1718 		 * Illegal or not yet handled value.  Return "&#" verbatim and
1719 		 * continue from there.  - FM
1720 		 */
1721 		*q++ = '&';
1722 		*q++ = '#';
1723 		if (what == P_hex)
1724 		    *q++ = 'x';
1725 		if (cpe != '\0')
1726 		    *(p - 1) = cpe;
1727 		p = cp;
1728 		state = S_done;
1729 	    } else if (what == P_named) {
1730 		*cp = cpe;
1731 		*q++ = '&';
1732 		state = S_done;
1733 	    } else if (!T.output_utf8 && stype == st_HTML && !hidden &&
1734 		       !(HTPassEightBitRaw &&
1735 			 UCH(*p) >= lowest_8)) {
1736 		sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
1737 
1738 		state = S_got_outstring;
1739 	    } else {
1740 		puni = p;
1741 		code = UCH(*p);
1742 		state = S_got_outchar;
1743 	    }
1744 	    break;
1745 
1746 	case S_named:
1747 	    cp = ++p;
1748 	    while (*cp && UCH(*cp) < 127 &&
1749 		   isalnum(UCH(*cp)))
1750 		cp++;
1751 	    cpe = *cp;
1752 	    *cp = '\0';
1753 	    name = p;
1754 	    state = S_check_name;
1755 	    break;
1756 
1757 	case S_check_name:
1758 	    /*
1759 	     * Seek the Unicode value for the named entity.
1760 	     *
1761 	     * !!!!  We manually recover the case of '=' terminator which is
1762 	     * commonly found on query to CGI-scripts enclosed as href= URLs
1763 	     * like "somepath/?x=1&yz=2" Without this dirty fix, submission of
1764 	     * such URLs was broken if &yz string happened to be a recognized
1765 	     * entity name.  - LP
1766 	     */
1767 	    if (((code = HTMLGetEntityUCValue(name)) > 0) &&
1768 		!((cpe == '=') && (stype == st_URL))) {
1769 		state = S_check_uni;
1770 		break;
1771 	    }
1772 	    /*
1773 	     * Didn't find the entity.  Return verbatim.
1774 	     */
1775 	    state = S_recover;
1776 	    break;
1777 
1778 	    /* * * O U T P U T   S T A T E S * * */
1779 
1780 	case S_got_oututf8:
1781 	    if (code > 255 ||
1782 		(code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
1783 		UCConvertUniToUtf8(code, replace_buf);
1784 		state = S_got_outstring;
1785 	    } else {
1786 		state = S_got_outchar;
1787 	    }
1788 	    break;
1789 	case S_got_outstring:
1790 	    if (what == P_decimal || what == P_hex) {
1791 		if (cpe != ';' && cpe != '\0')
1792 		    *(--p) = cpe;
1793 		p--;
1794 	    } else if (what == P_named) {
1795 		*cp = cpe;
1796 		p = (*cp != ';') ? (cp - 1) : cp;
1797 	    } else if (what == P_utf8) {
1798 		p = puni;
1799 	    }
1800 	    if (replace_buf[0] == '\0') {
1801 		state = S_next_char;
1802 		break;
1803 	    }
1804 	    if (stype == st_URL) {
1805 		code = replace_buf[0];	/* assume string OK if first char is */
1806 		if (code >= 127 ||
1807 		    (code < 32 && (code != 9 && code != 10 && code != 0))) {
1808 		    state = S_put_urlstring;
1809 		    break;
1810 		}
1811 	    }
1812 	    REPLACE_STRING(replace_buf);
1813 	    state = S_next_char;
1814 	    break;
1815 	case S_put_urlstring:
1816 	    esc = HTEscape(replace_buf, URL_XALPHAS);
1817 	    REPLACE_STRING(esc);
1818 	    FREE(esc);
1819 	    state = S_next_char;
1820 	    break;
1821 	case S_got_outchar:
1822 	    if (what == P_decimal || what == P_hex) {
1823 		if (cpe != ';' && cpe != '\0')
1824 		    *(--p) = cpe;
1825 		p--;
1826 	    } else if (what == P_named) {
1827 		*cp = cpe;
1828 		p = (*cp != ';') ? (cp - 1) : cp;
1829 	    } else if (what == P_utf8) {
1830 		p = puni;
1831 	    }
1832 	    if (stype == st_URL &&
1833 	    /*  Not a full HTEscape, only for 8bit and ctrl chars */
1834 		(TOASCII(code) >= 127 ||	/* S/390 -- gil -- 1925 */
1835 		 (code < ' ' && (code != '\t' && code != '\n')))) {
1836 		state = S_put_urlchar;
1837 		break;
1838 	    } else if (!hidden && code == 10 && *p == 10
1839 		       && q != qs && *(q - 1) == 13) {
1840 		/*
1841 		 * If this is not a hidden string, and the current char is the
1842 		 * LF ('\n') of a CRLF pair, drop the CR ('\r').  - KW
1843 		 */
1844 		*(q - 1) = *p++;
1845 		state = S_done;
1846 		break;
1847 	    }
1848 	    *q++ = (char) code;
1849 	    state = S_next_char;
1850 	    break;
1851 	case S_put_urlchar:
1852 	    *q++ = '%';
1853 	    REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]);	/* S/390 -- gil -- 1944 */
1854 	    REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
1855 	    /* fall through */
1856 	case S_next_char:
1857 	    p++;		/* fall through */
1858 	case S_done:
1859 	    state = S_text;
1860 	    what = P_text;
1861 	    /* for next round */
1862 	}
1863     }
1864 
1865     *q = '\0';
1866     if (chunk) {
1867 	HTChunkPutb(CHUNK, qs, (int) (q - qs + 1));	/* also terminates */
1868 	if (stype == st_URL || stype == st_other) {
1869 	    LYTrimHead(chunk->data);
1870 	    LYTrimTail(chunk->data);
1871 	}
1872 	StrAllocCopy(*str, chunk->data);
1873 	HTChunkFree(chunk);
1874     } else {
1875 	if (stype == st_URL || stype == st_other) {
1876 	    LYTrimHead(qs);
1877 	    LYTrimTail(qs);
1878 	}
1879     }
1880     return str;
1881 }
1882 
1883 #undef REPLACE_CHAR
1884 #undef REPLACE_STRING
1885 
LYUCTranslateHTMLString(char ** str,int cs_from,int cs_to,int use_lynx_specials,int plain_space,int hidden,CharUtil_st stype)1886 BOOL LYUCTranslateHTMLString(char **str,
1887 			     int cs_from,
1888 			     int cs_to,
1889 			     int use_lynx_specials,
1890 			     int plain_space,
1891 			     int hidden,
1892 			     CharUtil_st stype)
1893 {
1894     BOOL ret = YES;
1895 
1896     /* May reallocate *str even if cs_to == 0 */
1897     if (!LYUCFullyTranslateString(str, cs_from, cs_to, TRUE,
1898 				  use_lynx_specials, plain_space, hidden,
1899 				  NO, stype)) {
1900 	ret = NO;
1901     }
1902     return ret;
1903 }
1904 
LYUCTranslateBackFormData(char ** str,int cs_from,int cs_to,int plain_space)1905 BOOL LYUCTranslateBackFormData(char **str,
1906 			       int cs_from,
1907 			       int cs_to,
1908 			       int plain_space)
1909 {
1910     char **ret;
1911 
1912     /* May reallocate *str */
1913     ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
1914 				    NO, plain_space, YES,
1915 				    YES, st_HTML));
1916     return (BOOL) (ret != NULL);
1917 }
1918 
1919 /*
1920  * Parse a parameter from an HTML META tag, i.e., the CONTENT.
1921  */
char *LYParseTagParam(char *from,
		      const char *name)
{
    /*
     * Look in "from" for a parameter that follows a ';' and whose text begins
     * with "name" (compared with strncasecomp() over strlen(name) chars), and
     * return a copy of its value.
     *
     * Details:
     *  - Only text after a ';' is examined, so whatever precedes the first
     *    ';' can never match.
     *  - After the name, any mix of whitespace and '=' is skipped.
     *  - The value is the following run of printable, non-whitespace chars.
     *  - If that run is longer than two chars and both starts and ends with
     *    a single quote, the quotes are removed.
     *
     * Returns NULL if "from" or "name" is NULL or if no matching parameter is
     * found.  Otherwise returns a string obtained via StrAllocCopy()
     * (presumably heap-allocated and owned by the caller - confirm), which
     * may be empty.
     */
    size_t name_len;
    size_t value_len = 0;
    char *result = NULL;
    char *scan = from;

    /* Nothing to search, or nothing to search for. */
    if (from == NULL || name == NULL)
	return NULL;

    name_len = strlen(name);

    do {
	if ((scan = StrChr(scan, ';')) == NULL)
	    return NULL;
	/* neither test is true for '\0', so this stops at end of string */
	while (*scan == ';' || isspace(UCH(*scan))) {
	    scan++;
	}
	/* too little text left for this or any later parameter to match */
	if (strlen(scan) < name_len)
	    return NULL;
    } while (strncasecomp(scan, name, (int) name_len) != 0);

    scan += name_len;
    while (isspace(UCH(*scan)) || *scan == '=') {
	scan++;
    }

    /*
     * Copy the rest of the string, then cut the copy off where the value
     * ends.
     */
    StrAllocCopy(result, scan);
    while (isprint(UCH(scan[value_len])) && !isspace(UCH(scan[value_len]))) {
	value_len++;
    }
    result[value_len] = '\0';

    /*
     * Strip single quotes, just in case.
     */
    if (value_len > 2
	&& result[0] == '\''
	&& result[value_len - 1] == '\'') {
	result[value_len - 1] = '\0';
	/* shift left by one, including the terminator just written */
	memmove(result, result + 1, value_len - 1);
    }
    return result;
}
1959 
1960 /*
1961  * Given a refresh-URL content string, parses the delay time and the URL
1962  * string.  Ignore the remainder of the content.
1963  */
LYParseRefreshURL(char * content,char ** p_seconds,char ** p_address)1964 void LYParseRefreshURL(char *content,
1965 		       char **p_seconds,
1966 		       char **p_address)
1967 {
1968     char *cp;
1969     char *cp1 = NULL;
1970     char *Seconds = NULL;
1971 
1972     /*
1973      * Look for the Seconds field.  - FM
1974      */
1975     cp = LYSkipBlanks(content);
1976     if (*cp && isdigit(UCH(*cp))) {
1977 	cp1 = cp;
1978 	while (*cp1 && isdigit(UCH(*cp1)))
1979 	    cp1++;
1980 	StrnAllocCopy(Seconds, cp, (size_t) (cp1 - cp));
1981     }
1982     *p_seconds = Seconds;
1983     *p_address = LYParseTagParam(content, "URL");
1984 
1985     CTRACE((tfp,
1986 	    "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
1987 	    content, NonNull(*p_seconds), NonNull(*p_address)));
1988 }
1989 
1990 /*
1991  *  This function processes META tags in HTML streams. - FM
1992  */
LYHandleMETA(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED)1993 void LYHandleMETA(HTStructured * me, const BOOL *present,
1994 		  STRING2PTR value,
1995 		  char **include GCC_UNUSED)
1996 {
1997     char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
1998     char *href = NULL, *id_string = NULL, *temp = NULL;
1999     char *cp, *cp0, *cp1 = NULL;
2000     int url_type = 0;
2001 
2002     if (!me || !present)
2003 	return;
2004 
2005     /*
2006      * Load the attributes for possible use by Lynx.  - FM
2007      */
2008     if (present[HTML_META_HTTP_EQUIV] &&
2009 	non_empty(value[HTML_META_HTTP_EQUIV])) {
2010 	StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
2011 	convert_to_spaces(http_equiv, TRUE);
2012 	LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
2013 				NO, NO, YES, st_other);
2014 	if (*http_equiv == '\0') {
2015 	    FREE(http_equiv);
2016 	}
2017     }
2018     if (present[HTML_META_NAME] &&
2019 	non_empty(value[HTML_META_NAME])) {
2020 	StrAllocCopy(name, value[HTML_META_NAME]);
2021 	convert_to_spaces(name, TRUE);
2022 	LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
2023 				NO, NO, YES, st_other);
2024 	if (*name == '\0') {
2025 	    FREE(name);
2026 	}
2027     }
2028     if (present[HTML_META_CONTENT] &&
2029 	non_empty(value[HTML_META_CONTENT])) {
2030 	/*
2031 	 * Technically, we should be creating a comma-separated list, but META
2032 	 * tags come one at a time, and we'll handle (or ignore) them as each
2033 	 * is received.  Also, at this point, we only trim leading and trailing
2034 	 * blanks from the CONTENT value, without translating any named
2035 	 * entities or numeric character references, because how we should do
2036 	 * that depends on what type of information it contains, and whether or
2037 	 * not any of it might be sent to the screen.  - FM
2038 	 */
2039 	StrAllocCopy(content, value[HTML_META_CONTENT]);
2040 	convert_to_spaces(content, FALSE);
2041 	LYTrimHead(content);
2042 	LYTrimTail(content);
2043 	if (*content == '\0') {
2044 	    FREE(content);
2045 	}
2046     }
2047     if (present[HTML_META_CHARSET] &&
2048 	non_empty(value[HTML_META_CHARSET])) {
2049 	StrAllocCopy(charset, value[HTML_META_CHARSET]);
2050 	convert_to_spaces(charset, TRUE);
2051 	LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
2052 				NO, NO, YES, st_other);
2053 	if (*charset == '\0') {
2054 	    FREE(charset);
2055 	}
2056     }
2057     CTRACE((tfp,
2058 	    "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
2059 	    NONNULL(http_equiv),
2060 	    NONNULL(name),
2061 	    NONNULL(content),
2062 	    NONNULL(charset)));
2063 
2064     /*
2065      * Check for a text/html Content-Type with a charset directive, if we
2066      * didn't already set the charset via a server's header.  - AAC & FM
2067      */
2068     if (isEmpty(me->node_anchor->charset) &&
2069 	(charset ||
2070 	 (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
2071 	LYUCcharset *p_in = NULL;
2072 	LYUCcharset *p_out = NULL;
2073 
2074 	if (charset) {
2075 	    LYLowerCase(charset);
2076 	} else {
2077 	    LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2078 				    NO, NO, YES, st_other);
2079 	    LYLowerCase(content);
2080 	}
2081 
2082 	if ((cp1 = charset) != NULL ||
2083 	    (cp1 = strstr(content, "charset")) != NULL) {
2084 	    BOOL chartrans_ok = NO;
2085 	    char *cp3 = NULL, *cp4;
2086 	    int chndl;
2087 
2088 	    if (!charset)
2089 		cp1 += 7;
2090 	    while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
2091 		cp1++;
2092 
2093 	    StrAllocCopy(cp3, cp1);	/* copy to mutilate more */
2094 	    for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
2095 			     *cp4 != ';' && *cp4 != ':' &&
2096 			     !WHITE(*cp4)); cp4++) {
2097 		;		/* do nothing */
2098 	    }
2099 	    *cp4 = '\0';
2100 	    cp4 = cp3;
2101 	    chndl = UCGetLYhndl_byMIME(cp3);
2102 
2103 #ifdef CAN_SWITCH_DISPLAY_CHARSET
2104 	    /* Allow a switch to a more suitable display charset */
2105 	    if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
2106 		/* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
2107 		   should have the same setting for UCInfoStage. */
2108 		HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);
2109 
2110 		me->outUCLYhndl = current_char_set;
2111 		HTAnchor_setUCInfoStage(me->node_anchor,
2112 					current_char_set,
2113 					UCT_STAGE_HTEXT,
2114 					UCT_SETBY_MIME);	/* highest priorty! */
2115 		HTAnchor_setUCInfoStage(me->node_anchor,
2116 					current_char_set,
2117 					UCT_STAGE_STRUCTURED,
2118 					UCT_SETBY_MIME);	/* highest priorty! */
2119 		me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2120 						     UCT_STAGE_HTEXT);
2121 		/* The SGML stage will be reset in change_chartrans_handling */
2122 	    }
2123 #endif
2124 
2125 	    if (UCCanTranslateFromTo(chndl, current_char_set)) {
2126 		chartrans_ok = YES;
2127 		StrAllocCopy(me->node_anchor->charset, cp4);
2128 		HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2129 					UCT_STAGE_PARSER,
2130 					UCT_SETBY_STRUCTURED);
2131 	    } else if (chndl < 0) {
2132 		/*
2133 		 * Got something but we don't recognize it.
2134 		 */
2135 		chndl = UCLYhndl_for_unrec;
2136 		if (chndl < 0)	/* UCLYhndl_for_unrec not defined :-( */
2137 		    chndl = UCLYhndl_for_unspec;	/* always >= 0 */
2138 		if (UCCanTranslateFromTo(chndl, current_char_set)) {
2139 		    chartrans_ok = YES;
2140 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2141 					    UCT_STAGE_PARSER,
2142 					    UCT_SETBY_STRUCTURED);
2143 		}
2144 	    }
2145 	    if (chartrans_ok) {
2146 		p_in = HTAnchor_getUCInfoStage(me->node_anchor,
2147 					       UCT_STAGE_PARSER);
2148 		p_out = HTAnchor_setUCInfoStage(me->node_anchor,
2149 						current_char_set,
2150 						UCT_STAGE_HTEXT,
2151 						UCT_SETBY_DEFAULT);
2152 		if (!p_out) {
2153 		    /*
2154 		     * Try again.
2155 		     */
2156 		    p_out = HTAnchor_getUCInfoStage(me->node_anchor,
2157 						    UCT_STAGE_HTEXT);
2158 		}
2159 		if (!strcmp(p_in->MIMEname, "x-transparent")) {
2160 		    HTPassEightBitRaw = TRUE;
2161 		    HTAnchor_setUCInfoStage(me->node_anchor,
2162 					    HTAnchor_getUCLYhndl(me->node_anchor,
2163 								 UCT_STAGE_HTEXT),
2164 					    UCT_STAGE_PARSER,
2165 					    UCT_SETBY_DEFAULT);
2166 		}
2167 		if (!strcmp(p_out->MIMEname, "x-transparent")) {
2168 		    HTPassEightBitRaw = TRUE;
2169 		    HTAnchor_setUCInfoStage(me->node_anchor,
2170 					    HTAnchor_getUCLYhndl(me->node_anchor,
2171 								 UCT_STAGE_PARSER),
2172 					    UCT_STAGE_HTEXT,
2173 					    UCT_SETBY_DEFAULT);
2174 		}
2175 		if ((p_in->enc != UCT_ENC_CJK)
2176 #ifdef EXP_JAPANESEUTF8_SUPPORT
2177 		    && (p_in->enc != UCT_ENC_UTF8)
2178 #endif
2179 		    ) {
2180 		    HTCJK = NOCJK;
2181 		    if (!(p_in->codepoints &
2182 			  UCT_CP_SUBSETOF_LAT1) &&
2183 			chndl == current_char_set) {
2184 			HTPassEightBitRaw = TRUE;
2185 		    }
2186 		} else if (p_out->enc == UCT_ENC_CJK) {
2187 		    Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
2188 		}
2189 		LYGetChartransInfo(me);
2190 		/*
2191 		 * Update the chartrans info homologously to a Content-Type
2192 		 * MIME header with a charset parameter.  - FM
2193 		 */
2194 		if (me->UCLYhndl != chndl) {
2195 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2196 					    UCT_STAGE_MIME,
2197 					    UCT_SETBY_STRUCTURED);
2198 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2199 					    UCT_STAGE_PARSER,
2200 					    UCT_SETBY_STRUCTURED);
2201 		    me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
2202 							  UCT_STAGE_PARSER);
2203 		    me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2204 							UCT_STAGE_PARSER);
2205 		}
2206 		UCSetTransParams(&me->T,
2207 				 me->inUCLYhndl, me->inUCI,
2208 				 me->outUCLYhndl, me->outUCI);
2209 	    } else {
2210 		/*
2211 		 * Cannot translate.  If according to some heuristic the given
2212 		 * charset and the current display character both are likely to
2213 		 * be like ISO-8859 in structure, pretend we have some kind of
2214 		 * match.
2215 		 */
2216 		BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
2217 					     isdigit(UCH(cp4[9])));
2218 		BOOL given_is_8859like = (BOOL) (given_is_8859
2219 						 || !StrNCmp(cp4, "windows-", 8)
2220 						 || !StrNCmp(cp4, "cp12", 4)
2221 						 || !StrNCmp(cp4, "cp-12", 5));
2222 		BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
2223 							  (strstr(LYchar_set_names[current_char_set],
2224 								  "ISO-8859") ||
2225 							   strstr(LYchar_set_names[current_char_set],
2226 								  "windows-")));
2227 
2228 		if (given_is_8859) {
2229 		    cp1 = &cp4[10];
2230 		    while (*cp1 &&
2231 			   isdigit(UCH((*cp1))))
2232 			cp1++;
2233 		    *cp1 = '\0';
2234 		}
2235 		if (given_and_display_8859like) {
2236 		    StrAllocCopy(me->node_anchor->charset, cp4);
2237 		    HTPassEightBitRaw = TRUE;
2238 		}
2239 		HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
2240 
2241 	    }
2242 	    FREE(cp3);
2243 
2244 	    if (me->node_anchor->charset) {
2245 		CTRACE((tfp,
2246 			"LYHandleMETA: New charset: %s\n",
2247 			me->node_anchor->charset));
2248 	    }
2249 	}
2250 	/*
2251 	 * Set the kcode element based on the charset.  - FM
2252 	 */
2253 	HText_setKcode(me->text, me->node_anchor->charset, p_in);
2254     }
2255 
2256     /*
2257      * Make sure we have META name/value pairs to handle.  - FM
2258      */
2259     if (!(http_equiv || name) || !content)
2260 	goto free_META_copies;
2261 
2262     /*
2263      * Check for a no-cache Pragma
2264      * or Cache-Control directive. - FM
2265      */
2266     if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
2267 	!strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2268 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2269 				NO, NO, YES, st_other);
2270 	if (!strcasecomp(content, "no-cache")) {
2271 	    me->node_anchor->no_cache = TRUE;
2272 	    HText_setNoCache(me->text);
2273 	}
2274 
2275 	/*
2276 	 * If we didn't get a Cache-Control MIME header, and the META has one,
2277 	 * convert to lowercase, store it in the anchor element, and if we
2278 	 * haven't yet set no_cache, check whether we should.  - FM
2279 	 */
2280 	if ((!me->node_anchor->cache_control) &&
2281 	    !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2282 	    LYLowerCase(content);
2283 	    StrAllocCopy(me->node_anchor->cache_control, content);
2284 	    if (me->node_anchor->no_cache == FALSE) {
2285 		cp0 = content;
2286 		while ((cp = strstr(cp0, "no-cache")) != NULL) {
2287 		    cp += 8;
2288 		    while (*cp != '\0' && WHITE(*cp))
2289 			cp++;
2290 		    if (*cp == '\0' || *cp == ';') {
2291 			me->node_anchor->no_cache = TRUE;
2292 			HText_setNoCache(me->text);
2293 			break;
2294 		    }
2295 		    cp0 = cp;
2296 		}
2297 		if (me->node_anchor->no_cache == TRUE)
2298 		    goto free_META_copies;
2299 		cp0 = content;
2300 		while ((cp = strstr(cp0, "max-age")) != NULL) {
2301 		    cp += 7;
2302 		    while (*cp != '\0' && WHITE(*cp))
2303 			cp++;
2304 		    if (*cp == '=') {
2305 			cp++;
2306 			while (*cp != '\0' && WHITE(*cp))
2307 			    cp++;
2308 			if (isdigit(UCH(*cp))) {
2309 			    cp0 = cp;
2310 			    while (isdigit(UCH(*cp)))
2311 				cp++;
2312 			    if (*cp0 == '0' && cp == (cp0 + 1)) {
2313 				me->node_anchor->no_cache = TRUE;
2314 				HText_setNoCache(me->text);
2315 				break;
2316 			    }
2317 			}
2318 		    }
2319 		    cp0 = cp;
2320 		}
2321 	    }
2322 	}
2323 
2324 	/*
2325 	 * Check for an Expires directive. - FM
2326 	 */
2327     } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
2328 	/*
2329 	 * If we didn't get an Expires MIME header, store it in the anchor
2330 	 * element, and if we haven't yet set no_cache, check whether we
2331 	 * should.  Note that we don't accept a Date header via META tags,
2332 	 * because it's likely to be untrustworthy, but do check for a Date
2333 	 * header from a server when making the comparison.  - FM
2334 	 */
2335 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2336 				NO, NO, YES, st_other);
2337 	StrAllocCopy(me->node_anchor->expires, content);
2338 	if (me->node_anchor->no_cache == FALSE) {
2339 	    if (!strcmp(content, "0")) {
2340 		/*
2341 		 * The value is zero, which we treat as an absolute no-cache
2342 		 * directive.  - FM
2343 		 */
2344 		me->node_anchor->no_cache = TRUE;
2345 		HText_setNoCache(me->text);
2346 	    } else if (me->node_anchor->date != NULL) {
2347 		/*
2348 		 * We have a Date header, so check if the value is less than or
2349 		 * equal to that.  - FM
2350 		 */
2351 		if (LYmktime(content, TRUE) <=
2352 		    LYmktime(me->node_anchor->date, TRUE)) {
2353 		    me->node_anchor->no_cache = TRUE;
2354 		    HText_setNoCache(me->text);
2355 		}
2356 	    } else if (LYmktime(content, FALSE) == 0) {
2357 		/*
2358 		 * We don't have a Date header, and the value is in past for
2359 		 * us.  - FM
2360 		 */
2361 		me->node_anchor->no_cache = TRUE;
2362 		HText_setNoCache(me->text);
2363 	    }
2364 	}
2365 
2366 	/*
2367 	 * Check for a Refresh directive.  - FM
2368 	 */
2369     } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
2370 	char *Seconds = NULL;
2371 
2372 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2373 				NO, NO, YES, st_other);
2374 	LYParseRefreshURL(content, &Seconds, &href);
2375 
2376 	if (Seconds) {
2377 	    if (href) {
2378 		/*
2379 		 * We found a URL field, so check it out.  - FM
2380 		 */
2381 		if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
2382 		    /*
2383 		     * The specs require a complete URL, but this is a
2384 		     * Netscapism, so don't expect the author to know that.  -
2385 		     * FM
2386 		     */
2387 		    HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
2388 		    /*
2389 		     * Use the document's address as the base.  - FM
2390 		     */
2391 		    if (*href != '\0') {
2392 			temp = HTParse(href,
2393 				       me->node_anchor->address, PARSE_ALL);
2394 			StrAllocCopy(href, temp);
2395 			FREE(temp);
2396 		    } else {
2397 			StrAllocCopy(href, me->node_anchor->address);
2398 			HText_setNoCache(me->text);
2399 		    }
2400 
2401 		} else {
2402 		    /*
2403 		     * Check whether to fill in localhost.  - FM
2404 		     */
2405 		    LYFillLocalFileURL(&href,
2406 				       (me->inBASE ?
2407 					me->base_href : me->node_anchor->address));
2408 		}
2409 
2410 		/*
2411 		 * Set the no_cache flag if the Refresh URL is the same as the
2412 		 * document's address.  - FM
2413 		 */
2414 		if (!strcmp(href, me->node_anchor->address)) {
2415 		    HText_setNoCache(me->text);
2416 		}
2417 	    } else {
2418 		/*
2419 		 * We didn't find a URL field, so use the document's own
2420 		 * address and set the no_cache flag.  - FM
2421 		 */
2422 		StrAllocCopy(href, me->node_anchor->address);
2423 		HText_setNoCache(me->text);
2424 	    }
2425 	    /*
2426 	     * Check for an anchor in http or https URLs.  - FM
2427 	     */
2428 	    cp = NULL;
2429 	    /* id_string seems to be used wrong below if given.
2430 	       not that it matters much.  avoid setting it here. - kw */
2431 	    if (track_internal_links &&
2432 		(StrNCmp(href, "http", 4) == 0) &&
2433 		(cp = StrChr(href, '#')) != NULL) {
2434 		StrAllocCopy(id_string, cp);
2435 		*cp = '\0';
2436 	    }
2437 	    if (me->inA) {
2438 		/*
2439 		 * Ugh!  The META tag, which is a HEAD element, is in an
2440 		 * Anchor, which is BODY element.  All we can do is close the
2441 		 * Anchor and cross our fingers.  - FM
2442 		 */
2443 		if (me->inBoldA == TRUE && me->inBoldH == FALSE)
2444 		    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2445 		me->inBoldA = FALSE;
2446 		HText_endAnchor(me->text, me->CurrentANum);
2447 		me->inA = FALSE;
2448 		me->CurrentANum = 0;
2449 	    }
2450 	    me->CurrentA = HTAnchor_findChildAndLink
2451 		(
2452 		    me->node_anchor,	/* Parent */
2453 		    id_string,	/* Tag */
2454 		    href,	/* Addresss */
2455 		    (HTLinkType *) 0);	/* Type */
2456 	    if (id_string)
2457 		*cp = '#';
2458 	    FREE(id_string);
2459 	    LYEnsureSingleSpace(me);
2460 	    if (me->inUnderline == FALSE)
2461 		HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2462 	    HTML_put_string(me, "REFRESH(");
2463 	    HTML_put_string(me, Seconds);
2464 	    HTML_put_string(me, " sec):");
2465 	    FREE(Seconds);
2466 	    if (me->inUnderline == FALSE)
2467 		HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2468 	    HTML_put_character(me, ' ');
2469 	    me->in_word = NO;
2470 	    HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
2471 	    if (me->inBoldH == FALSE)
2472 		HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2473 	    HTML_put_string(me, href);
2474 	    FREE(href);
2475 	    if (me->inBoldH == FALSE)
2476 		HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2477 	    HText_endAnchor(me->text, 0);
2478 	    LYEnsureSingleSpace(me);
2479 	}
2480 
2481 	/*
2482 	 * Check for a suggested filename via a Content-Disposition with a
2483 	 * filename=name.suffix in it, if we don't already have it via a server
2484 	 * header.  - FM
2485 	 */
2486     } else if (isEmpty(me->node_anchor->SugFname) &&
2487 	       !strcasecomp((http_equiv ?
2488 			     http_equiv : ""), "Content-Disposition")) {
2489 	cp = content;
2490 	while (*cp != '\0' && strncasecomp(cp, "filename", 8))
2491 	    cp++;
2492 	if (*cp != '\0') {
2493 	    cp = LYSkipBlanks(cp + 8);
2494 	    if (*cp == '=')
2495 		cp++;
2496 	    cp = LYSkipBlanks(cp);
2497 	    if (*cp != '\0') {
2498 		StrAllocCopy(me->node_anchor->SugFname, cp);
2499 		if (*me->node_anchor->SugFname == '"') {
2500 		    if ((cp = StrChr((me->node_anchor->SugFname + 1),
2501 				     '"')) != NULL) {
2502 			*(cp + 1) = '\0';
2503 			HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
2504 			if (isEmpty(me->node_anchor->SugFname)) {
2505 			    FREE(me->node_anchor->SugFname);
2506 			}
2507 		    } else {
2508 			FREE(me->node_anchor->SugFname);
2509 		    }
2510 		}
2511 #if defined(UNIX) && !defined(DOSPATH)
2512 		/*
2513 		 * If blanks are not legal for local filenames, replace them
2514 		 * with underscores.
2515 		 */
2516 		if ((cp = me->node_anchor->SugFname) != NULL) {
2517 		    while (*cp != '\0') {
2518 			if (isspace(UCH(*cp)))
2519 			    *cp = '_';
2520 			++cp;
2521 		    }
2522 		}
2523 #endif
2524 	    }
2525 	}
2526 	/*
2527 	 * Check for a Set-Cookie directive.  - AK
2528 	 */
2529     } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
2530 	/*
2531 	 * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
2532 	 * finalized.  For now, we'll still assume "historical" cookies in META
2533 	 * directives.  - FM
2534 	 */
2535 	url_type = is_url(me->inBASE ?
2536 			  me->base_href : me->node_anchor->address);
2537 	if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
2538 	    LYSetCookie(content,
2539 			NULL,
2540 			(me->inBASE ?
2541 			 me->base_href : me->node_anchor->address));
2542 	}
2543     }
2544 
2545     /*
2546      * Free the copies.  - FM
2547      */
2548   free_META_copies:
2549     FREE(http_equiv);
2550     FREE(name);
2551     FREE(content);
2552     FREE(charset);
2553 }
2554 
2555 /*
2556  *  This function handles P elements in HTML streams.
2557  *  If start is TRUE it handles a start tag, and if
2558  *  FALSE, an end tag.	We presently handle start
2559  *  and end tags identically, but this can lead to
2560  *  a different number of blank lines between the
2561  *  current paragraph and subsequent text when a P
2562  *  end tag is present or not in the markup. - FM
2563  */
/*
 * Parameters:
 *   me        - parser state for the document being rendered.
 *   present   - per-attribute presence flags for the tag (may be NULL).
 *   value     - per-attribute value strings, indexed like present.
 *   include   - unused.
 *   align_idx - index of the ALIGN attribute within present/value; a
 *               negative index disables ALIGN handling.
 *   start     - non-zero for a start tag, zero for an end tag.
 */
void LYHandlePlike(HTStructured * me, const BOOL *present,
		   STRING2PTR value,
		   char **include GCC_UNUSED,
		   int align_idx,
		   int start)
{
    BOOL in_list;

    /*
     * FIG content should be a true block, which like P inherits the current
     * style.  APPLET is like character elements or an ALT attribute, unless
     * its content contains a block element.  A P start tag inside either one
     * flags the content as a block - FM
     */
    if (start && me->inFIG)
	me->inFIGwithP = TRUE;
    if (start && me->inAPPLET)
	me->inAPPLETwithP = TRUE;

    UPDATE_STYLE;
    in_list = (BOOL) (me->List_Nesting_Level >= 0);

    if (in_list) {
	/*
	 * Inside a list, P only asks for a blank line (if not already
	 * present), and only once the current paragraph has text.  Attribute
	 * handling follows below, with the "second line" margins - FM
	 */
	if (me->inP) {
	    BOOL want_blank_line = (BOOL) (me->inFIG || me->inAPPLET ||
					   me->inCAPTION || me->inCREDIT ||
					   me->sp->style->spaceAfter > 0 ||
					   (start &&
					    me->sp->style->spaceBefore > 0));

	    if (want_blank_line)
		LYEnsureDoubleSpace(me);
	    else
		LYEnsureSingleSpace(me);
	}
    } else if (me->sp[0].tag_number == HTML_ADDRESS) {
	/*
	 * Inside an ADDRESS, P just starts a new line when the current one
	 * is not empty - FM
	 */
	if (!HText_LastLineEmpty(me->text, FALSE)) {
	    HText_setLastChar(me->text, ' ');	/* absorb white space */
	    HText_appendCharacter(me->text, '\r');
	}
    } else {
	if (!start) {
	    if (me->sp->style->spaceAfter > 0)
		LYEnsureDoubleSpace(me);
	    else
		LYEnsureSingleSpace(me);
	} else if (!me->inLABEL || me->inP) {
	    /* no paragraph break for a P directly after a still-empty LABEL */
	    HText_appendParagraph(me->text);
	}
	me->inLABEL = FALSE;
    }
    me->in_word = NO;

    /*
     * Pick the default alignment for the new paragraph.
     */
    if (LYoverride_default_alignment(me)) {
	me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
    } else {
	BOOL div_style_in_list = (BOOL) (in_list &&
					 (me->sp->style->id == ST_DivCenter ||
					  me->sp->style->id == ST_DivLeft ||
					  me->sp->style->id == ST_DivRight));
	BOOL plain_outside_div = (BOOL) ((me->Division_Level < 0) &&
					 (me->sp->style->id == ST_Normal ||
					  me->sp->style->id == ST_Preformatted));

	if (div_style_in_list || plain_outside_div)
	    me->sp->style->alignment = HT_LEFT;
	else
	    me->sp->style->alignment = (short) me->current_default_alignment;
    }

    /*
     * An explicit ALIGN attribute on a start tag overrides the default.
     * CENTER and RIGHT are not honored for a list item that has no text yet.
     */
    if (start && align_idx >= 0 &&
	present && present[align_idx] && value[align_idx]) {
	const char *align = value[align_idx];
	BOOL left_only = (BOOL) (in_list && !me->inP);

	if (!strcasecomp(align, "center")) {
	    if (!left_only)
		me->sp->style->alignment = HT_CENTER;
	} else if (!strcasecomp(align, "right")) {
	    if (!left_only)
		me->sp->style->alignment = HT_RIGHT;
	} else if (!strcasecomp(align, "left") ||
		   !strcasecomp(align, "justify")) {
	    me->sp->style->alignment = HT_LEFT;
	}
    }

    /*
     * Mark that we are starting a new paragraph and don't have any of its
     * text yet - FM
     */
    me->inP = FALSE;
}
2661 
2662 /*
2663  *  This function handles SELECT elements in HTML streams.
2664  *  If start is TRUE it handles a start tag, and if FALSE,
2665  *  an end tag. - FM
2666  */
/*
 * Parameters:
 *   me      - parser state for the document being rendered.
 *   present - per-attribute presence flags for the SELECT tag (may be NULL).
 *   value   - per-attribute value strings, indexed like present.
 *   include - unused.
 *   start   - TRUE for a start tag; any other value is handled as an end tag.
 */
void LYHandleSELECT(HTStructured * me, const BOOL *present,
		    STRING2PTR value,
		    char **include GCC_UNUSED,
		    int start)
{
    char *ptr;

    if (start == TRUE) {
	char *field_name = NULL;
	char *size = NULL;	/* stays NULL: the SIZE attribute is ignored */
	BOOLEAN multiple;
	BOOLEAN drop_emphasis;

	/*
	 * Start out enabled; a DISABLED attribute flips this below.
	 */
	me->select_disabled = FALSE;

	/*
	 * Complain about an unclosed TEXTAREA.
	 */
	if (me->inTEXTAREA && LYBadHTML(me)) {
	    LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
	}

	/*
	 * Remember that we are inside a SELECT.
	 */
	me->inSELECT = TRUE;

	/*
	 * Field name: empty if NAME is missing or empty, otherwise a copy of
	 * the attribute, unescaped only when it contains an ampersand.
	 */
	if (present && present[HTML_SELECT_NAME] &&
	    non_empty(value[HTML_SELECT_NAME])) {
	    StrAllocCopy(field_name, value[HTML_SELECT_NAME]);
	    if (StrChr(value[HTML_SELECT_NAME], '&') != NULL) {
		UNESCAPE_FIELDNAME_TO_STD(&field_name);
	    }
	} else {
	    StrAllocCopy(field_name, "");
	}

	multiple = (BOOLEAN) (present && present[HTML_SELECT_MULTIPLE]);
	if (present && present[HTML_SELECT_DISABLED])
	    me->select_disabled = TRUE;
	if (present && present[HTML_SELECT_SIZE] &&
	    non_empty(value[HTML_SELECT_SIZE])) {
	    /*
	     * Let the size be determined by the number of OPTIONs.  - FM
	     */
	    CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
		    value[HTML_SELECT_SIZE]));
	}

	/*
	 * Suspend bold/underline unless this is a MULTIPLE select while
	 * popups are enabled.  Bold is flagged for restoration by the end
	 * tag.
	 */
	drop_emphasis = (BOOLEAN) (multiple == NO || LYSelectPopups == FALSE);
	if (drop_emphasis && me->inBoldH == TRUE) {
	    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
	    me->inBoldH = FALSE;
	    me->needBoldH = TRUE;
	}
	if (drop_emphasis && me->inUnderline == TRUE) {
	    HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
	    me->inUnderline = FALSE;
	}

	if (multiple == NO && LYSelectPopups == TRUE) {
	    /*
	     * Force a newline when we're using a popup in a PRE block and are
	     * within 7 columns from the right margin.  This will allow for
	     * the '[' popup designator and help avoid a wrap in the
	     * underscore placeholder for the retracted popup entry in the
	     * HText structure.  - FM
	     */
	    if ((me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
		 !me->sp->style->freeFormat) &&
		HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
		HTML_put_character(me, '\n');
		me->in_word = NO;
	    }
	}

	LYCheckForID(me, present, value, (int) HTML_SELECT_ID);

	HText_beginSelect(field_name, ATTR_CS_IN, multiple, size);
	FREE(field_name);
	FREE(size);

	me->first_option = TRUE;
	return;
    }

    /*
     * End tag: it must match a SELECT start tag.
     */
    if (!me->inSELECT) {
	if (LYBadHTML(me)) {
	    LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
	}
	return;
    }

    /*
     * We are no longer inside a SELECT, and nothing is disabled.
     */
    me->inSELECT = FALSE;
    me->select_disabled = FALSE;

    /*
     * Terminate the collected option text and hand it over as the value of
     * the last option.
     */
    HTChunkTerminate(&me->option);
    ptr = HText_setLastOptionValue(me->text,
				   me->option.data,
				   me->LastOptionValue,
				   LAST_ORDER,
				   me->LastOptionChecked,
				   me->UCLYhndl,
				   ATTR_CS_IN);
    FREE(me->LastOptionValue);

    me->LastOptionChecked = FALSE;

    if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
	LYSelectPopups == FALSE) {
	/*
	 * Start a newline after the last checkbox/button option.
	 */
	LYEnsureSingleSpace(me);
    } else {
	/*
	 * Output the popup's default option text, with blanks turned into
	 * non-breaking spaces.  One pass suffices for PRE content as well:
	 * every character of the returned string is emitted, in order.
	 */
	for (; non_empty(ptr); ptr++) {
	    HText_appendCharacter(me->text,
				  (*ptr == ' ') ? HT_NON_BREAK_SPACE : *ptr);
	}
	/*
	 * Add end option character.
	 */
	if (!me->first_option) {
	    HText_appendCharacter(me->text, ']');
	    HText_setLastChar(me->text, ']');
	    me->in_word = YES;
	}
    }
    HTChunkClear(&me->option);

    /*
     * Resume underline and bold if they are still wanted.
     */
    if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
	HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
	me->inUnderline = TRUE;
    }
    if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
	HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
	me->inBoldH = TRUE;
	me->needBoldH = FALSE;
    }
}
2852 
2853 /*
2854  *  This function strips white characters and
2855  *  generally fixes up attribute values that
2856  *  were received from the SGML parser and
2857  *  are to be treated as partial or absolute
2858  *  URLs. - FM
2859  */
/*
 * Parameters:
 *   me          - parser state; supplies the base URL (base_href when inBASE
 *                 is set, otherwise the document address).
 *   href        - in/out: address of the allocated string to clean up.  It
 *                 is edited in place and may be reallocated.
 *   force_slash - if non-zero, a bare "." or ".." gets a trailing slash
 *                 appended, unless the base is a file URL.
 *   strip_dots  - if non-zero (and LYStripDotDotURLs is set), leading "/.."
 *                 segments are stripped from partial references that
 *                 resolve against an http/https base.
 *
 * Returns the is_url() type of the cleaned-up value, or 0 when me or href is
 * NULL, the value is empty, or it becomes empty after trimming.
 */
int LYLegitimizeHREF(HTStructured * me, char **href,
		     int force_slash,
		     int strip_dots)
{
    int url_type = 0;
    char *p = NULL;
    char *pound = NULL;
    const char *Base = NULL;

    if (!me || !href || isEmpty(*href))
	return (url_type);

    if (!LYTrimStartfile(*href)) {
	/*
	 * Collapse spaces in the actual URL, but just protect against tabs or
	 * newlines in the fragment, if present.  This seeks to cope with
	 * atrocities inflicted on the Web by authoring tools such as
	 * Frontpage.  - FM
	 */

	/*  Before working on spaces check if we have any, usually none. */
	p = LYSkipNonBlanks(*href);

	if (*p) {		/* p == first space character */
	    /* no reallocs below, all converted in place */

	    pound = findPoundSelector(*href);

	    if (pound != NULL && pound < p) {
		/* the first blank is already inside the fragment */
		convert_to_spaces(p, FALSE);	/* done */

	    } else {
		if (pound != NULL)
		    *pound = '\0';	/* mark */

		/*
		 * No blanks really belong in the HREF,
		 * but if it refers to an actual file,
		 * it may actually have blanks in the name.
		 * Try to accommodate. See also HTParse().
		 */
		if (LYRemoveNewlines(p) || StrChr(p, '\t') != 0) {
		    LYRemoveBlanks(p);	/* a compromise... */
		}

		if (pound != NULL) {
		    p = StrChr(p, '\0');
		    *pound = '#';	/* restore */
		    convert_to_spaces(pound, FALSE);
		    /*
		     * Close the gap left by the removed blanks.  Source and
		     * destination lie in the same buffer and may overlap, so
		     * this has to be memmove(), not strcpy().
		     */
		    if (p < pound)
			memmove(p, pound, strlen(pound) + 1);
		}
	    }
	}
    }
    if (**href == '\0')
	return (url_type);

    TRANSLATE_AND_UNESCAPE_TO_STD(href);

    Base = me->inBASE ?
	me->base_href : me->node_anchor->address;

    url_type = is_url(*href);
    if (!url_type && force_slash && **href == '.' &&
	(!strcmp(*href, ".") || !strcmp(*href, "..")) &&
	!isFILE_URL(Base)) {
	/*
	 * The Fielding RFC/ID for resolving partial HREFs says that a slash
	 * should be on the end of the preceding symbolic element for "." and
	 * "..", but all tested browsers only do that for an explicit "./" or
	 * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
	 * it's not a file URL.  - FM
	 */
	StrAllocCat(*href, "/");
    }
    if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
	!strncasecomp(Base, "http", 4)) {
	/*
	 * We will be resolving a partial reference versus an http or https
	 * URL, and it has lead dots, which may be retained when resolving via
	 * HTParse(), but the request would fail if the first element of the
	 * resultant path is two dots, because no http or https server accepts
	 * such paths, and the current URL draft, likely to become an RFC, says
	 * that it's optional for the UA to strip them as a form of error
	 * recovery.  So we will, recursively, for http/https URLs, like the
	 * "major market browsers" which made this problem so common on the
	 * Web, but we'll also issue a message about it, such that the bad
	 * partial reference might get corrected by the document provider.  -
	 * FM
	 */
	char *temp = NULL, *path = NULL, *cp;
	const char *str = "";

	temp = HTParse(*href, Base, PARSE_ALL);
	path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
	if (!StrNCmp(path, "/..", 3)) {
	    cp = (path + 3);
	    if (LYIsHtmlSep(*cp) || *cp == '\0') {
		if (Base[4] == 's') {
		    str = "s";
		}
		CTRACE((tfp,
			"LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
			*href, str));
		CTRACE((tfp, "                  Stripping lead dots.\n"));
		/* warn the user only once until inBadHREF is reset */
		if (!me->inBadHREF) {
		    HTUserMsg(BAD_PARTIAL_REFERENCE);
		    me->inBadHREF = TRUE;
		}
	    }
	    if (*cp == '\0') {
		/* the whole path was "/.." */
		StrAllocCopy(*href, "/");
	    } else if (LYIsHtmlSep(*cp)) {
		/*
		 * Skip every further "/.." that is followed by a slash; a
		 * final "/.." at the very end is cut down to "/".
		 */
		while (!StrNCmp(cp, "/..", 3)) {
		    if (*(cp + 3) == '/') {
			cp += 3;
			continue;
		    } else if (*(cp + 3) == '\0') {
			*(cp + 1) = '\0';
			*(cp + 2) = '\0';
		    }
		    break;
		}
		StrAllocCopy(*href, cp);
	    }
	}
	FREE(temp);
	FREE(path);
    }
    return (url_type);
}
2992 
2993 /*
2994  *  This function checks for a Content-Base header,
2995  *  and if not present, a Content-Location header
2996  *  which is an absolute URL, and sets the BASE
2997  *  accordingly.  If set, it will be replaced by
2998  *  any BASE tag in the HTML stream, itself. - FM
2999  */
/*
 * Parameters:
 *   me - parser state; nothing is done if it or its node_anchor is NULL.
 *
 * The chosen header value is copied, stripped of blanks, and handed to the
 * stream's start_element handler as the HREF of a synthetic BASE tag.
 */
void LYCheckForContentBase(HTStructured * me)
{
    const char *header;
    BOOL require_url;
    char *cp = NULL;
    BOOL present[HTML_BASE_ATTRIBUTES];
    const char *value[HTML_BASE_ATTRIBUTES];
    int i;

    if (!(me && me->node_anchor))
	return;

    if (me->node_anchor->content_base != NULL) {
	/*
	 * We have a Content-Base value.  Use it if it's non-zero length.  - FM
	 */
	header = me->node_anchor->content_base;
	require_url = FALSE;
    } else if (me->node_anchor->content_location != NULL) {
	/*
	 * We didn't have a Content-Base value, but do have a Content-Location
	 * value.  Use it if it's an absolute URL.  - FM
	 */
	header = me->node_anchor->content_location;
	require_url = TRUE;
    } else {
	/*
	 * We had neither a Content-Base nor Content-Location value.  - FM
	 */
	return;
    }

    if (*header == '\0')
	return;

    StrAllocCopy(cp, header);
    LYRemoveBlanks(cp);

    /*
     * Ignore a Content-Location that is not a URL, and any value that
     * collapsed to zero length.  - FM
     */
    if ((require_url && !is_url(cp)) || *cp == '\0') {
	FREE(cp);
	return;
    }

    /*
     * Pass the value to HTML_start_element as the HREF of a BASE tag.  Every
     * other attribute is marked absent and given a NULL value, so that the
     * handler never sees an uninitialized pointer.  - FM
     */
    for (i = 0; i < HTML_BASE_ATTRIBUTES; i++) {
	present[i] = NO;
	value[i] = NULL;
    }
    present[HTML_BASE_HREF] = YES;
    value[HTML_BASE_HREF] = (const char *) cp;
    (*me->isa->start_element) (me, HTML_BASE, present, value,
			       0, 0);
    FREE(cp);
}
3057 
3058 /*
3059  *  This function creates NAMEd Anchors if a non-zero-length NAME
3060  *  or ID attribute was present in the tag. - FM
3061  */
/*
 * Parameters:
 *   me        - parser state; nothing is done if it or its text is NULL.
 *   present   - per-attribute presence flags for the tag (may be NULL).
 *   value     - per-attribute value strings, indexed like present.
 *   attribute - index of the NAME/ID attribute within present/value.
 */
void LYCheckForID(HTStructured * me, const BOOL *present,
		  STRING2PTR value,
		  int attribute)
{
    HTChildAnchor *anchor;
    char *id_copy = NULL;

    if (me == NULL || me->text == NULL)
	return;

    if (present == NULL || !present[attribute] || isEmpty(value[attribute]))
	return;

    /*
     * Work on a copy, with named or numeric character references
     * translated.  - FM
     */
    StrAllocCopy(id_copy, value[attribute]);
    LYUCTranslateHTMLString(&id_copy, me->tag_charset, me->tag_charset,
			    NO, NO, YES, st_URL);

    /*
     * Create the link if we still have a non-zero-length string.  - FM
     */
    if (id_copy[0] != '\0') {
	anchor = HTAnchor_findChildAndLink(me->node_anchor,	/* Parent */
					   id_copy,	/* Tag */
					   NULL,	/* Address */
					   (HTLinkType *) 0);	/* Type */
	if (anchor != NULL) {
	    HText_beginAnchor(me->text, me->inUnderline, anchor);
	    HText_endAnchor(me->text, 0);
	}
    }
    FREE(id_copy);
}
3097 
3098 /*
3099  *  This function creates a NAMEd Anchor for the ID string
3100  *  passed to it directly as an argument.  It assumes the
3101  *  string does not need checking for character references. - FM
3102  */
/*
 * Parameters:
 *   me - parser state; nothing is done if it or its text is NULL.
 *   id - the ID string to anchor; nothing is done if it is NULL or empty.
 */
void LYHandleID(HTStructured * me, const char *id)
{
    HTChildAnchor *anchor;

    if (me == NULL || me->text == NULL)
	return;
    if (isEmpty(id))
	return;

    /*
     * Look up (or create) the child anchor for this ID, and if that worked
     * drop an empty anchor at the current text position.  - FM
     */
    anchor = HTAnchor_findChildAndLink(me->node_anchor,	/* Parent */
				       id,	/* Tag */
				       NULL,	/* Address */
				       (HTLinkType *) 0);	/* Type */
    if (anchor == NULL)
	return;

    HText_beginAnchor(me->text, me->inUnderline, anchor);
    HText_endAnchor(me->text, 0);
}
3124 
3125 /*
3126  *  This function checks whether we want to override
3127  *  the current default alignment for paragraphs and
3128  *  instead use that specified in the element's style
3129  *  sheet. - FM
3130  */
LYoverride_default_alignment(HTStructured * me)3131 BOOLEAN LYoverride_default_alignment(HTStructured * me)
3132 {
3133     if (!me)
3134 	return NO;
3135 
3136     switch (me->sp[0].tag_number) {
3137     case HTML_BLOCKQUOTE:
3138     case HTML_BQ:
3139     case HTML_NOTE:
3140     case HTML_FN:
3141     case HTML_ADDRESS:
3142 	me->sp->style->alignment = HT_LEFT;
3143 	return YES;
3144 
3145     default:
3146 	break;
3147     }
3148     return NO;
3149 }
3150 
3151 /*
3152  *  This function inserts newlines if needed to create double spacing,
3153  *  and sets the left margin for subsequent text to the second line
3154  *  indentation of the current style. - FM
3155  */
LYEnsureDoubleSpace(HTStructured * me)3156 void LYEnsureDoubleSpace(HTStructured * me)
3157 {
3158     if (!me || !me->text)
3159 	return;
3160 
3161     if (!HText_LastLineEmpty(me->text, FALSE)) {
3162 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3163 	HText_appendCharacter(me->text, '\r');
3164 	HText_appendCharacter(me->text, '\r');
3165     } else if (!HText_PreviousLineEmpty(me->text, FALSE)) {
3166 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3167 	HText_appendCharacter(me->text, '\r');
3168     } else if (me->List_Nesting_Level >= 0) {
3169 	HText_NegateLineOne(me->text);
3170     }
3171     me->in_word = NO;
3172     return;
3173 }
3174 
3175 /*
3176  *  This function inserts a newline if needed to create single spacing,
3177  *  and sets the left margin for subsequent text to the second line
3178  *  indentation of the current style. - FM
3179  */
LYEnsureSingleSpace(HTStructured * me)3180 void LYEnsureSingleSpace(HTStructured * me)
3181 {
3182     if (!me || !me->text)
3183 	return;
3184 
3185     if (!HText_LastLineEmpty(me->text, FALSE)) {
3186 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3187 	HText_appendCharacter(me->text, '\r');
3188     } else if (me->List_Nesting_Level >= 0) {
3189 	HText_NegateLineOne(me->text);
3190     }
3191     me->in_word = NO;
3192     return;
3193 }
3194 
3195 /*
3196  *  This function resets paragraph alignments for block
3197  *  elements which do not have a defined style sheet. - FM
3198  */
LYResetParagraphAlignment(HTStructured * me)3199 void LYResetParagraphAlignment(HTStructured * me)
3200 {
3201     if (!me)
3202 	return;
3203 
3204     if (me->List_Nesting_Level >= 0 ||
3205 	((me->Division_Level < 0) &&
3206 	 (me->sp->style->id == ST_Normal ||
3207 	  me->sp->style->id == ST_Preformatted))) {
3208 	me->sp->style->alignment = HT_LEFT;
3209     } else {
3210 	me->sp->style->alignment = (short) me->current_default_alignment;
3211     }
3212     return;
3213 }
3214 
3215 /*
3216  *  This example function checks whether the given anchor has
3217  *  an address with a file scheme, and if so, loads it into the
3218  *  the SGML parser's context->url element, which was passed as
3219  *  the second argument.  The handle_comment() calling function in
3220  *  SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup
3221  *  into the corresponding stream, homologously to an SSI by an
3222  *  HTTP server. - FM
3223  *
3224  *  For functions similar to this but which depend on details of
3225  *  the HTML handler's internal data, the calling interface should
3226  *  be changed, and functions in SGML.c would have to make sure not
3227  *  to call such functions inappropriately (e.g., calling a function
3228  *  specific to the Lynx_HTML_Handler when SGML.c output goes to
3229  *  some other HTStructured object like in HTMLGen.c), or the new
3230  *  functions could be added to the SGML.h interface.
3231  */
LYCheckForCSI(HTParentAnchor * anchor,char ** url)3232 BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
3233 		      char **url)
3234 {
3235     if (!(anchor && anchor->address))
3236 	return FALSE;
3237 
3238     if (!isFILE_URL(anchor->address))
3239 	return FALSE;
3240 
3241     if (!LYisLocalHost(anchor->address))
3242 	return FALSE;
3243 
3244     StrAllocCopy(*url, anchor->address);
3245     return TRUE;
3246 }
3247 
3248 /*
3249  *  This function is called from the SGML parser to look at comments
3250  *  and see whether we should collect some info from them.  Currently
3251  *  it only looks for comments with Message-Id and Subject info, in the
3252  *  exact form generated by MHonArc for archived mailing list.  If found,
3253  *  the info is stored in the document's HTParentAnchor.  It can later be
3254  *  used for generating a mail response.
3255  *
3256  *  We are extra picky here because there isn't any official definition
3257  *  for these kinds of comments - we might (and still can) misinterpret
3258  *  arbitrary comments as something they aren't.
3259  *
3260  *  If something doesn't look right, for example invalid characters, the
3261  *  strings are not stored.  Mail responses will use something else as
3262  *  the subject, probably the document URL, and will not have an
3263  *  In-Reply-To header.
3264  *
3265  *  All this is a hack - to do this the right way, mailing list archivers
3266  *  would have to agree on some better mechanism to make this kind of info
3267  *  from original mail headers available, for example using LINK.  - kw
3268  */
LYCommentHacks(HTParentAnchor * anchor,const char * comment)3269 BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
3270 		       const char *comment)
3271 {
3272     const char *cp;
3273     size_t len;
3274 
3275     if (comment == NULL)
3276 	return FALSE;
3277 
3278     if (!(anchor && anchor->address))
3279 	return FALSE;
3280 
3281     if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
3282 	char *messageid = NULL;
3283 	char *p;
3284 
3285 	for (cp = comment + 17; *cp; cp++) {
3286 	    if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
3287 		break;
3288 	    }
3289 	}
3290 	if (strcmp(cp, " --")) {
3291 	    return FALSE;
3292 	}
3293 	cp = comment + 17;
3294 	StrAllocCopy(messageid, cp);
3295 	/* This should be ok - message-id should only contain 7-bit ASCII */
3296 	if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL))
3297 	    return FALSE;
3298 	for (p = messageid; *p; p++) {
3299 	    if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
3300 		break;
3301 	    }
3302 	}
3303 	if (strcmp(p, " --")) {
3304 	    FREE(messageid);
3305 	    return FALSE;
3306 	}
3307 	if ((p = StrChr(messageid, '@')) == NULL || p[1] == '\0') {
3308 	    FREE(messageid);
3309 	    return FALSE;
3310 	}
3311 	p = messageid;
3312 	if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
3313 	    p[len - 3] = '\0';
3314 	} else {
3315 	    FREE(messageid);
3316 	    return FALSE;
3317 	}
3318 	if (HTAnchor_setMessageID(anchor, messageid)) {
3319 	    FREE(messageid);
3320 	    return TRUE;
3321 	} else {
3322 	    FREE(messageid);
3323 	    return FALSE;
3324 	}
3325     }
3326     if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
3327 	char *subject = NULL;
3328 	char *p;
3329 
3330 	for (cp = comment + 14; *cp; cp++) {
3331 	    if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
3332 		return FALSE;
3333 	    }
3334 	}
3335 	cp = comment + 14;
3336 	StrAllocCopy(subject, cp);
3337 	/* @@@
3338 	 * This may not be the right thing for the subject - but mail
3339 	 * subjects shouldn't contain 8-bit characters in raw form anyway.
3340 	 * We have to unescape character entities, since that's what MHonArc
3341 	 * seems to generate.  But if after that there are 8-bit characters
3342 	 * the string is rejected.  We would probably not know correctly
3343 	 * what charset to assume anyway - the mail sender's can differ from
3344 	 * the archive's.  And the code for sending mail cannot deal well
3345 	 * with 8-bit characters - we should not put them in the Subject
3346 	 * header in raw form, but don't have MIME encoding implemented.
3347 	 * Someone may want to do more about this...  - kw
3348 	 */
3349 	if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML))
3350 	    return FALSE;
3351 	for (p = subject; *p; p++) {
3352 	    if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
3353 		FREE(subject);
3354 		return FALSE;
3355 	    }
3356 	}
3357 	p = subject;
3358 	if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
3359 	    p[len - 3] = '\0';
3360 	} else {
3361 	    FREE(subject);
3362 	    return FALSE;
3363 	}
3364 	if (HTAnchor_setSubject(anchor, subject)) {
3365 	    FREE(subject);
3366 	    return TRUE;
3367 	} else {
3368 	    FREE(subject);
3369 	    return FALSE;
3370 	}
3371     }
3372 
3373     return FALSE;
3374 }
3375 
3376     /*
3377      * Create the Title with any left-angle-brackets converted to &lt; entities
3378      * and any ampersands converted to &amp; entities.  - FM
3379      *
     * Convert 8-bit letters to &#xUUUU to avoid dependencies on the display
     * character set, which may need changing.  Do NOT convert any 8-bit chars
     * if we have CJK display.  - LP
3383      */
LYformTitle(char ** dst,const char * src)3384 void LYformTitle(char **dst,
3385 		 const char *src)
3386 {
3387     if (HTCJK == JAPANESE) {
3388 	char *tmp_buffer = NULL;
3389 
3390 	if ((tmp_buffer = (char *) malloc(strlen(src) + 1)) == 0)
3391 	    outofmem(__FILE__, "LYformTitle");
3392 
3393 	switch (kanji_code) {	/* 1997/11/22 (Sat) 09:28:00 */
3394 	case EUC:
3395 	    TO_EUC((const unsigned char *) src, (unsigned char *) tmp_buffer);
3396 	    break;
3397 	case SJIS:
3398 	    TO_SJIS((const unsigned char *) src, (unsigned char *) tmp_buffer);
3399 	    break;
3400 	default:
3401 	    CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
3402 	    strcpy(tmp_buffer, src);
3403 	    break;
3404 	}
3405 	StrAllocCopy(*dst, tmp_buffer);
3406 	FREE(tmp_buffer);
3407     } else {
3408 	StrAllocCopy(*dst, src);
3409     }
3410 }
3411