1 /*
2 * $LynxId: LYCharUtils.c,v 1.131 2018/03/05 22:32:14 tom Exp $
3 *
4 * Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
5 * ==========================================================================
6 */
7 #include <HTUtils.h>
8 #include <SGML.h>
9
10 #define Lynx_HTML_Handler
11 #include <HTChunk.h>
12 #include <HText.h>
13 #include <HTStyle.h>
14 #include <HTMIME.h>
15 #include <HTML.h>
16
17 #include <HTCJK.h>
18 #include <HTAtom.h>
19 #include <HTMLGen.h>
20 #include <HTParse.h>
21 #include <UCMap.h>
22 #include <UCDefs.h>
23 #include <UCAux.h>
24
25 #include <LYGlobalDefs.h>
26 #include <LYCharUtils.h>
27 #include <LYCharSets.h>
28
29 #include <HTAlert.h>
30 #include <HTForms.h>
31 #include <HTNestedList.h>
32 #include <GridText.h>
33 #include <LYStrings.h>
34 #include <LYUtils.h>
35 #include <LYMap.h>
36 #include <LYBookmark.h>
37 #include <LYCurses.h>
38 #include <LYCookie.h>
39
40 #include <LYexit.h>
41 #include <LYLeaks.h>
42
/*
 * Sentinel values used for nested (ordered) lists. - FM
 *
 * The two values are distinct negative numbers; LYZero_OL_Counter() stores
 * OL_VOID into every counter slot when it resets the list state.
 */
int OL_CONTINUE = -29999;       /* flag for whether CONTINUE is set */
int OL_VOID = -29998;           /* flag for whether a count is set */
48
/*
 * Count how many times the character 'ch' occurs in the NUL-terminated
 * string 'value'.
 *
 * value    string to scan; must not be NULL.
 * ch       character to look for (compared after conversion to char).
 *
 * Returns the number of occurrences.  The terminating NUL is never counted,
 * so asking for ch == '\0' yields 0 instead of stepping past the end of the
 * string.
 */
static size_t count_char(const char *value, int ch)
{
    const char wanted = (char) ch;
    size_t result = 0;

    for (; *value != '\0'; ++value) {
        if (*value == wanted)
            ++result;
    }
    return result;
}
60
61 /*
62 * This function converts any ampersands in a pre-allocated string to "&".
63 * If brackets is TRUE, it also converts any angle-brackets to "<" or ">".
64 */
LYEntify(char ** in_out,int brackets)65 void LYEntify(char **in_out,
66 int brackets)
67 {
68 char *source = *in_out;
69 char *target;
70 char *result = NULL;
71 size_t count_AMPs = 0;
72 size_t count_LTs = 0;
73 size_t count_GTs = 0;
74
75 #ifdef CJK_EX
76 enum _state {
77 S_text,
78 S_esc,
79 S_dollar,
80 S_paren,
81 S_nonascii_text,
82 S_dollar_paren
83 } state = S_text;
84 int in_sjis = 0;
85 #endif
86
87 if (non_empty(source)) {
88 count_AMPs = count_char(*in_out, '&');
89 if (brackets) {
90 count_LTs = count_char(*in_out, '<');
91 count_GTs = count_char(*in_out, '>');
92 }
93
94 if (count_AMPs != 0 || count_LTs != 0 || count_GTs != 0) {
95
96 target = typecallocn(char,
97 (strlen(*in_out)
98 + (4 * count_AMPs)
99 + (3 * count_LTs)
100 + (3 * count_GTs) + 1));
101
102 if ((result = target) == NULL)
103 outofmem(__FILE__, "LYEntify");
104
105 for (source = *in_out; *source; source++) {
106 #ifdef CJK_EX
107 if (IS_CJK_TTY) {
108 switch (state) {
109 case S_text:
110 if (*source == '\033') {
111 state = S_esc;
112 *target++ = *source;
113 continue;
114 }
115 break;
116
117 case S_esc:
118 if (*source == '$') {
119 state = S_dollar;
120 } else if (*source == '(') {
121 state = S_paren;
122 } else {
123 state = S_text;
124 }
125 *target++ = *source;
126 continue;
127
128 case S_dollar:
129 if (*source == '@' || *source == 'B' || *source == 'A') {
130 state = S_nonascii_text;
131 } else if (*source == '(') {
132 state = S_dollar_paren;
133 } else {
134 state = S_text;
135 }
136 *target++ = *source;
137 continue;
138
139 case S_dollar_paren:
140 if (*source == 'C') {
141 state = S_nonascii_text;
142 } else {
143 state = S_text;
144 }
145 *target++ = *source;
146 continue;
147
148 case S_paren:
149 if (*source == 'B' || *source == 'J' || *source == 'T') {
150 state = S_text;
151 } else if (*source == 'I') {
152 state = S_nonascii_text;
153 } else if (*source == '\033') {
154 state = S_esc;
155 }
156 *target++ = *source;
157 continue;
158
159 case S_nonascii_text:
160 if (*source == '\033')
161 state = S_esc;
162 *target++ = *source;
163 continue;
164
165 default:
166 break;
167 }
168 if (*(source + 1) != '\0' &&
169 (IS_EUC(UCH(*source), UCH(*(source + 1))) ||
170 IS_SJIS(UCH(*source), UCH(*(source + 1)), in_sjis) ||
171 IS_BIG5(UCH(*source), UCH(*(source + 1))))) {
172 *target++ = *source++;
173 *target++ = *source;
174 continue;
175 }
176 }
177 #endif
178 switch (*source) {
179 case '&':
180 *target++ = '&';
181 *target++ = 'a';
182 *target++ = 'm';
183 *target++ = 'p';
184 *target++ = ';';
185 break;
186 case '<':
187 if (brackets) {
188 *target++ = '&';
189 *target++ = 'l';
190 *target++ = 't';
191 *target++ = ';';
192 break;
193 }
194 /* FALLTHRU */
195 case '>':
196 if (brackets) {
197 *target++ = '&';
198 *target++ = 'g';
199 *target++ = 't';
200 *target++ = ';';
201 break;
202 }
203 /* FALLTHRU */
204 default:
205 *target++ = *source;
206 break;
207 }
208 }
209 *target = '\0';
210 FREE(*in_out);
211 *in_out = result;
212 }
213 }
214 }
215
216 /*
217 * Callers to LYEntifyTitle/LYEntifyValue do not look at the 'target' param.
218 * Optimize things a little by avoiding the memory allocation if not needed,
219 * as is usually the case.
220 */
MustEntify(const char * source)221 static BOOL MustEntify(const char *source)
222 {
223 BOOL result;
224
225 #ifdef CJK_EX
226 if (IS_CJK_TTY && StrChr(source, '\033') != 0) {
227 result = TRUE;
228 } else
229 #endif
230 {
231 size_t length = strlen(source);
232 size_t reject = strcspn(source, "<&>");
233
234 result = (BOOL) (length != reject);
235 }
236
237 return result;
238 }
239
240 /*
241 * Wrappers for LYEntify() which do not assume that the source was allocated,
242 * e.g., output from gettext().
243 */
LYEntifyTitle(char ** target,const char * source)244 const char *LYEntifyTitle(char **target, const char *source)
245 {
246 const char *result = 0;
247
248 if (MustEntify(source)) {
249 StrAllocCopy(*target, source);
250 LYEntify(target, TRUE);
251 result = *target;
252 } else {
253 result = source;
254 }
255 return result;
256 }
257
LYEntifyValue(char ** target,const char * source)258 const char *LYEntifyValue(char **target, const char *source)
259 {
260 const char *result = 0;
261
262 if (MustEntify(source)) {
263 StrAllocCopy(*target, source);
264 LYEntify(target, FALSE);
265 result = *target;
266 } else {
267 result = source;
268 }
269 return result;
270 }
271
272 /*
273 * This function trims characters <= that of a space (32),
274 * including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2),
275 * but not ESC, from the heads of strings. - FM
276 */
LYTrimHead(char * str)277 void LYTrimHead(char *str)
278 {
279 const char *s = str;
280
281 if (isEmpty(s))
282 return;
283
284 while (*s && WHITE(*s) && UCH(*s) != UCH(CH_ESC)) /* S/390 -- gil -- 1669 */
285 s++;
286 if (s > str) {
287 char *ns = str;
288
289 while (*s) {
290 *ns++ = *s++;
291 }
292 *ns = '\0';
293 }
294 }
295
/*
 * Strip trailing characters with values <= that of a space (32), which
 * includes HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2), from a string.
 * Unlike LYTrimHead(), no exception is made for ESC. - FM
 *
 * The string is shortened in place by overwriting each trimmed character
 * with a NUL; a NULL or empty string is left alone.
 *
 * The length is kept as a size_t so that very long strings are not
 * truncated by a conversion to int.
 */
void LYTrimTail(char *str)
{
    size_t len;

    if (isEmpty(str))
        return;

    len = strlen(str);
    while (len > 0 && WHITE(str[len - 1]))
        str[--len] = '\0';
}
317
318 /*
319 * This function should receive a pointer to the start
320 * of a comment. It returns a pointer to the end ('>')
321 * character of comment, or it's best guess if the comment
322 * is invalid. - FM
323 */
LYFindEndOfComment(char * str)324 char *LYFindEndOfComment(char *str)
325 {
326 char *cp, *cp1;
327 enum comment_state {
328 start1,
329 start2,
330 end1,
331 end2
332 } state;
333
334 if (str == NULL)
335 /*
336 * We got NULL, so return NULL. - FM
337 */
338 return NULL;
339
340 if (StrNCmp(str, "<!--", 4))
341 /*
342 * We don't have the start of a comment, so return the beginning of the
343 * string. - FM
344 */
345 return str;
346
347 cp = (str + 4);
348 if (*cp == '>')
349 /*
350 * It's an invalid comment, so
351 * return this end character. - FM
352 */
353 return cp;
354
355 if ((cp1 = StrChr(cp, '>')) == NULL)
356 /*
357 * We don't have an end character, so return the beginning of the
358 * string. - FM
359 */
360 return str;
361
362 if (*cp == '-')
363 /*
364 * Ugh, it's a "decorative" series of dashes, so return the next end
365 * character. - FM
366 */
367 return cp1;
368
369 /*
370 * OK, we're ready to start parsing. - FM
371 */
372 state = start2;
373 while (*cp != '\0') {
374 switch (state) {
375 case start1:
376 if (*cp == '-')
377 state = start2;
378 else
379 /*
380 * Invalid comment, so return the first '>' from the start of
381 * the string. - FM
382 */
383 return cp1;
384 break;
385
386 case start2:
387 if (*cp == '-')
388 state = end1;
389 break;
390
391 case end1:
392 if (*cp == '-')
393 state = end2;
394 else
395 /*
396 * Invalid comment, so return the first '>' from the start of
397 * the string. - FM
398 */
399 return cp1;
400 break;
401
402 case end2:
403 if (*cp == '>')
404 /*
405 * Valid comment, so return the end character. - FM
406 */
407 return cp;
408 if (*cp == '-') {
409 state = start1;
410 } else if (!(WHITE(*cp) && UCH(*cp) != UCH(CH_ESC))) { /* S/390 -- gil -- 1686 */
411 /*
412 * Invalid comment, so return the first '>' from the start of
413 * the string. - FM
414 */
415 return cp1;
416 }
417 break;
418
419 default:
420 break;
421 }
422 cp++;
423 }
424
425 /*
426 * Invalid comment, so return the first '>' from the start of the string.
427 * - FM
428 */
429 return cp1;
430 }
431
432 /*
433 * If an HREF, itself or if resolved against a base,
434 * represents a file URL, and the host is defaulted,
435 * force in "//localhost". We need this until
436 * all the other Lynx code which performs security
437 * checks based on the "localhost" string is changed
438 * to assume "//localhost" when a host field is not
439 * present in file URLs - FM
440 */
LYFillLocalFileURL(char ** href,const char * base)441 void LYFillLocalFileURL(char **href,
442 const char *base)
443 {
444 char *temp = NULL;
445
446 if (isEmpty(*href))
447 return;
448
449 if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
450 if (base != NULL && isFILE_URL(base)) {
451 StrAllocCopy(temp, STR_FILE_URL);
452 StrAllocCat(temp, *href);
453 StrAllocCopy(*href, temp);
454 }
455 }
456 if (isFILE_URL(*href)) {
457 if (*(*href + 5) == '\0') {
458 StrAllocCat(*href, "//localhost");
459 } else if (!strcmp(*href, "file://")) {
460 StrAllocCat(*href, "localhost");
461 } else if (!StrNCmp(*href, "file:///", 8)) {
462 StrAllocCopy(temp, (*href + 7));
463 LYLocalFileToURL(href, temp);
464 } else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
465 StrAllocCopy(temp, (*href + 5));
466 LYLocalFileToURL(href, temp);
467 }
468 }
469 #if defined(USE_DOS_DRIVES)
470 if (LYIsDosDrive(*href)) {
471 /*
472 * If it's a local DOS path beginning with drive letter,
473 * add file://localhost/ prefix and go ahead.
474 */
475 StrAllocCopy(temp, *href);
476 LYLocalFileToURL(href, temp);
477 }
478
479 /* use below: strlen("file://localhost/") = 17 */
480 if (!StrNCmp(*href, "file://localhost/", 17)
481 && (strlen(*href) == 19)
482 && LYIsDosDrive(*href + 17)) {
483 /*
484 * Terminate DOS drive letter with a slash to surf root successfully.
485 * Here seems a proper place to do so.
486 */
487 LYAddPathSep(href);
488 }
489 #endif /* USE_DOS_DRIVES */
490
491 /*
492 * No path in a file://localhost URL means a
493 * directory listing for the current default. - FM
494 */
495 if (!strcmp(*href, "file://localhost")) {
496 const char *temp2;
497
498 #ifdef VMS
499 temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
500 #else
501 char curdir[LY_MAXPATH];
502
503 temp2 = wwwName(Current_Dir(curdir));
504 #endif /* VMS */
505 if (!LYIsHtmlSep(*temp2))
506 LYAddHtmlSep(href);
507 /*
508 * Check for pathological cases - current dir has chars which MUST BE
509 * URL-escaped - kw
510 */
511 if (StrChr(temp2, '%') != NULL || StrChr(temp2, '#') != NULL) {
512 FREE(temp);
513 temp = HTEscape(temp2, URL_PATH);
514 StrAllocCat(*href, temp);
515 } else {
516 StrAllocCat(*href, temp2);
517 }
518 }
519 #ifdef VMS
520 /*
521 * On VMS, a file://localhost/ URL means
522 * a listing for the login directory. - FM
523 */
524 if (!strcmp(*href, "file://localhost/"))
525 StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
526 #endif /* VMS */
527
528 FREE(temp);
529 return;
530 }
531
/*
 * Write a META tag naming a charset to a stream, using the stream's
 * put_string method.
 *
 * target       stream to write to; nothing is done if it is NULL.
 * disp_chndl   index into LYCharSet_UC[] of the charset to announce;
 *              -1 means use current_char_set.  Nothing is done if the
 *              resulting value is negative.
 */
void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
{
    char *line = NULL;

    if (disp_chndl == -1)
        disp_chndl = current_char_set;

    if (target == NULL || disp_chndl < 0)
        return;

    HTSprintf0(&line, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
               "http-equiv=\"content-type\"",
               LYCharSet_UC[disp_chndl].MIMEname);
    (*target->isa->put_string) (target, line);
    FREE(line);
}
550
551 /*
552 * This function writes a line with a META tag to an open file,
553 * which will specify a charset parameter to use when the file is
554 * read back in. It is meant for temporary HTML files used by the
555 * various special pages which may show titles of documents. When those
556 * files are created, the title strings normally have been translated and
557 * expanded to the display character set, so we have to make sure they
558 * don't get translated again.
559 * If the user has changed the display character set during the lifetime
560 * of the Lynx session (or, more exactly, during the time the title
561 * strings to be written were generated), they may now have different
562 * character encodings and there is currently no way to get it all right.
563 * To change this, we would have to add a variable for each string which
564 * keeps track of its character encoding.
565 * But at least we can try to ensure that reading the file after future
566 * display character set changes will give reasonable output.
567 *
568 * The META tag is not written if the display character set (passed as
569 * disp_chndl) already corresponds to the charset assumption that
570 * would be made when the file is read. - KW
571 *
572 * Currently this function is used for temporary files like "Lynx Info Page"
573 * and for one permanent - bookmarks (so it may be a problem if you change
574 * the display charset later: new bookmark entries may be mistranslated).
575 * - LP
576 */
LYAddMETAcharsetToFD(FILE * fd,int disp_chndl)577 void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
578 {
579 if (disp_chndl == -1)
580 /*
581 * -1 means use current_char_set.
582 */
583 disp_chndl = current_char_set;
584
585 if (fd == NULL || disp_chndl < 0)
586 /*
587 * Should not happen.
588 */
589 return;
590
591 if (UCLYhndl_HTFile_for_unspec == disp_chndl)
592 /*
593 * Not need to do, so we don't.
594 */
595 return;
596
597 if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT)
598 /*
599 * There shouldn't be any 8-bit characters in this case.
600 */
601 return;
602
603 /*
604 * In other cases we don't know because UCLYhndl_for_unspec may change
605 * during the lifetime of the file (by toggling raw mode or changing the
606 * display character set), so proceed.
607 */
608 fprintf(fd, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
609 "http-equiv=\"content-type\"",
610 LYCharSet_UC[disp_chndl].MIMEname);
611 }
612
/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 *
 * Sequence numbers below 1 are treated as 1 and numbers above 18278 as
 * 18278.  A single letter is preceded by one blank.
 *
 * The result lives in a static buffer which is overwritten by the next call.
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    char letters[3];            /* least significant letter first */
    int count = 0;
    int value = seqnum;
    char *out = OLstring;

    if (value < 1)
        value = 1;
    else if (value > 18278)
        value = 18278;

    /* base 26 without a zero digit: 64 + 1 is 'A', 64 + 26 is 'Z' */
    while (value > 0) {
        letters[count++] = (char) (64 + 1 + (value - 1) % 26);
        value = (value - 1) / 26;
    }

    if (count == 1)
        *out++ = ' ';
    while (count > 0)
        *out++ = letters[--count];
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
643
/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 *
 * Sequence numbers below 1 are treated as 1 and numbers above 18278 as
 * 18278.  A single letter is preceded by one blank.
 *
 * The result lives in a static buffer which is overwritten by the next call.
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    char letters[3];            /* least significant letter first */
    int count = 0;
    int value = seqnum;
    char *out = OLstring;

    if (value < 1)
        value = 1;
    else if (value > 18278)
        value = 18278;

    /* base 26 without a zero digit: 96 + 1 is 'a', 96 + 26 is 'z' */
    while (value > 0) {
        letters[count++] = (char) (96 + 1 + (value - 1) % 26);
        value = (value - 1) / 26;
    }

    if (count == 1)
        *out++ = ' ';
    while (count > 0)
        *out++ = letters[--count];
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
674
/*
 * This function returns OL TYPE="I" strings in the
 * range of " I." (1) to "MMM." (3000). - FM
 * Maximum length: 16 -TD
 *
 * Any sequence number of 3000 or more yields "MMM."; zero and negative
 * numbers yield just ".".  A numeral that consists of a single letter
 * (1, 5, 10, 50, 100, 500, 1000) is preceded by one blank.
 *
 * The result lives in a static buffer which is overwritten by the next call.
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    static const struct {
        int value;
        const char *numeral;
    } table[] = {
        { 1000, "M" },
        { 900, "CM" },
        { 500, "D" },
        { 400, "CD" },
        { 100, "C" },
        { 90, "XC" },
        { 50, "L" },
        { 40, "XL" },
        { 10, "X" },
        { 9, "IX" },
        { 5, "V" },
        { 4, "IV" },
        { 1, "I" }
    };
    static char OLstring[20];
    int Arabic = seqnum;
    size_t i;

    if (Arabic >= 3000) {
        strcpy(OLstring, "MMM.");
        return OLstring;
    }

    /* single-letter numerals are padded with a leading blank */
    switch (Arabic) {
    case 1:
    case 5:
    case 10:
    case 50:
    case 100:
    case 500:
    case 1000:
        strcpy(OLstring, " ");
        break;
    default:
        OLstring[0] = '\0';
        break;
    }

    /* peel off the largest numeral that still fits, until nothing is left */
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        while (Arabic >= table[i].value) {
            strcat(OLstring, table[i].numeral);
            Arabic -= table[i].value;
        }
    }
    strcat(OLstring, ".");

    return OLstring;
}
800
/*
 * This function returns OL TYPE="i" strings in the
 * range of " i." (1) to "mmm." (3000). - FM
 * Maximum length: 16 -TD
 *
 * Any sequence number of 3000 or more yields "mmm."; zero and negative
 * numbers yield just ".".  A numeral that consists of a single letter
 * (1, 5, 10, 50, 100, 500, 1000) is preceded by one blank.
 *
 * The result lives in a static buffer which is overwritten by the next call.
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    static const struct {
        int value;
        const char *numeral;
    } table[] = {
        { 1000, "m" },
        { 900, "cm" },
        { 500, "d" },
        { 400, "cd" },
        { 100, "c" },
        { 90, "xc" },
        { 50, "l" },
        { 40, "xl" },
        { 10, "x" },
        { 9, "ix" },
        { 5, "v" },
        { 4, "iv" },
        { 1, "i" }
    };
    static char OLstring[20];
    int Arabic = seqnum;
    size_t i;

    if (Arabic >= 3000) {
        strcpy(OLstring, "mmm.");
        return OLstring;
    }

    /* single-letter numerals are padded with a leading blank */
    switch (Arabic) {
    case 1:
    case 5:
    case 10:
    case 50:
    case 100:
    case 500:
    case 1000:
        strcpy(OLstring, " ");
        break;
    default:
        OLstring[0] = '\0';
        break;
    }

    /* peel off the largest numeral that still fits, until nothing is left */
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        while (Arabic >= table[i].value) {
            strcat(OLstring, table[i].numeral);
            Arabic -= table[i].value;
        }
    }
    strcat(OLstring, ".");

    return OLstring;
}
926
927 /*
928 * This function initializes the Ordered List counter. - FM
929 */
LYZero_OL_Counter(HTStructured * me)930 void LYZero_OL_Counter(HTStructured * me)
931 {
932 int i;
933
934 if (!me)
935 return;
936
937 for (i = 0; i < 12; i++) {
938 me->OL_Counter[i] = OL_VOID;
939 me->OL_Type[i] = '1';
940 }
941
942 me->Last_OL_Count = 0;
943 me->Last_OL_Type = '1';
944
945 return;
946 }
947
948 /*
949 * This function is used by the HTML Structured object. - KW
950 */
LYGetChartransInfo(HTStructured * me)951 void LYGetChartransInfo(HTStructured * me)
952 {
953 me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
954 UCT_STAGE_STRUCTURED);
955 if (me->UCLYhndl < 0) {
956 int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT);
957
958 if (chndl < 0) {
959 chndl = current_char_set;
960 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
961 UCT_STAGE_HTEXT,
962 UCT_SETBY_STRUCTURED);
963 }
964 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
965 UCT_STAGE_STRUCTURED,
966 UCT_SETBY_STRUCTURED);
967 me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
968 UCT_STAGE_STRUCTURED);
969 }
970 me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
971 UCT_STAGE_STRUCTURED);
972 }
973
/*
 * The sixteen uppercase hexadecimal digits, in order.
 * As in HTParse.c, saves some calls - kw
 */
static const char *hex = "0123456789ABCDEF";
976
977 /*
978 * Any raw 8-bit or multibyte characters already have been
979 * handled in relation to the display character set
980 * in SGML_character(), including named and numeric entities.
981 *
982 * This function is used to translate HTML special fields inside tags
983 * (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
984 * It also unescapes non-ASCII characters from URLs (#fragments!)
985 * if `stype' is st_URL.
986 *
987 * If `do_ent' is YES, it converts named entities
988 * and numeric character references (NCRs) to their `cs_to' replacements.
989 *
990 * Named entities are converted to Unicode values.  NCRs (Unicode values)
991 * are converted by the UCdomap.c chartrans functions.
992 * ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
993 * to their HTML entity names (via old-style entities) and then translated
994 * according to the LYCharSets.c array for `cs_out'???.
995 *
996 * Some characters (see descriptions in `put_special_unicodes' from SGML.c)
997 * are translated according to the state of the boolean variables
998 * `use_lynx_specials', `plain_space' and `hidden'.  It is not clear yet:
999 *
1000 * If plain_space is TRUE, nbsp (160) will be treated as an ASCII
1001 * space (32). If hidden is TRUE, entities will be translated
1002 * (if `do_ent' is YES) but escape sequences will be passed unaltered.
1003 * If `hidden' is FALSE, some characters are converted to Lynx special
1004 * codes (see `put_special_unicodes') or ASCII space if `plain_space'
1005 * applies). @@ is `use_lynx_specials' needed, does it have any effect? @@
1006 * If `use_lynx_specials' is YES, byte values 160 and 173 (meaning
1007 * U+00A0 and U+00AD), whether given as or converted from raw char input,
1008 * are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
1009 * (unless input and output charset are both iso-8859-1, for compatibility
1010 * with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
1011 *
1012 * If `Back' is YES, the reverse is done instead i.e., Lynx special codes
1013 * in the input are translated back to character values.
1014 *
1015 * If `Back' is YES, an attempt is made to use UCReverseTransChar() for
1016 * back translation which may be more efficient. (?)
1017 *
1018 * If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
1019 * The sequence of bytes being URL-encoded is the raw input character if
1020 * we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
1021 * UTF-8 representation if either `cs_to' requires this or if the
1022 * character's Unicode value is > 255, otherwise it should be the iso-8859-1
1023 * representation.
1024 * No general URL-encoding occurs for displayable ASCII characters and
1025 * spaces and some C0 controls valid in HTML (LF, TAB), it is expected
1026 * that other functions will take care of that as appropriate.
1027 *
1028 * Escape characters (0x1B, '\033') are
1029 * - URL-encoded if `stype' is st_URL, otherwise
1030 * - dropped if `stype' is st_other, otherwise (i.e., st_HTML)
1031 * - passed if `hidden' is TRUE or HTCJK is set, otherwise
1032 * - dropped.
1033 *
1034 * (If `stype' is st_URL or st_other, most of the parameters are effectively
1035 * predetermined: cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
1036 *
1037 *
1038 * Returns pointer to the char** passed in
1039 * if string translated or translation unnecessary,
1040 * NULL otherwise
1041 * (in which case something probably went wrong.)
1042 *
1043 *
1044 * In general, this somewhat ugly function (KW)
1045 * covers three functions from v.2.7.2 (FM):
1046 * extern void LYExpandString (
1047 * HTStructured * me,
1048 * char ** str);
1049 * extern void LYUnEscapeEntities (
1050 * HTStructured * me,
1051 * char ** str);
1052 * extern void LYUnEscapeToLatinOne (
1053 * HTStructured * me,
1054 * char ** str,
1055 * BOOLEAN isURL);
1056 */
1057
LYUCFullyTranslateString(char ** str,int cs_from,int cs_to,int do_ent,int use_lynx_specials,int plain_space,int hidden,int Back,CharUtil_st stype)1058 char **LYUCFullyTranslateString(char **str,
1059 int cs_from,
1060 int cs_to,
1061 int do_ent,
1062 int use_lynx_specials,
1063 int plain_space,
1064 int hidden,
1065 int Back,
1066 CharUtil_st stype)
1067 {
1068 char *p;
1069 char *q, *qs;
1070 HTChunk *chunk = NULL;
1071 char *cp = 0;
1072 char cpe = 0;
1073 char *esc = NULL;
1074 char replace_buf[64];
1075 int uck;
1076 int lowest_8;
1077 UCode_t code = 0;
1078 BOOL output_utf8 = 0, repl_translated_C0 = 0;
1079 size_t len;
1080 const char *name = NULL;
1081 BOOLEAN no_bytetrans;
1082 UCTransParams T;
1083 BOOL from_is_utf8 = FALSE;
1084 char *puni = 0;
1085 enum _state {
1086 S_text,
1087 S_esc,
1088 S_dollar,
1089 S_paren,
1090 S_nonascii_text,
1091 S_dollar_paren,
1092 S_trans_byte,
1093 S_check_ent,
1094 S_ncr,
1095 S_check_uni,
1096 S_named,
1097 S_check_name,
1098 S_recover,
1099 S_got_oututf8,
1100 S_got_outstring,
1101 S_put_urlstring,
1102 S_got_outchar,
1103 S_put_urlchar,
1104 S_next_char,
1105 S_done
1106 } state = S_text;
1107 enum _parsing_what {
1108 P_text,
1109 P_utf8,
1110 P_hex,
1111 P_decimal,
1112 P_named
1113 } what = P_text;
1114
1115 #ifdef KANJI_CODE_OVERRIDE
1116 static unsigned char sjis_1st = '\0';
1117
1118 unsigned char sjis_str[3];
1119 #endif
1120
1121 /*
1122 * Make sure we have a non-empty string. - FM
1123 */
1124 if (isEmpty(*str))
1125 return str;
1126
1127 /*
1128 * FIXME: something's wrong with the limit checks here (clearing the
1129 * buffer helps).
1130 */
1131 memset(replace_buf, 0, sizeof(replace_buf));
1132
1133 /*
1134 * Don't do byte translation if original AND target character sets are both
1135 * iso-8859-1 (and we are not called to back-translate), or if we are in
1136 * CJK mode.
1137 */
1138 if (IS_CJK_TTY
1139 #ifdef EXP_JAPANESEUTF8_SUPPORT
1140 && (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
1141 && (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
1142 #endif
1143 ) {
1144 no_bytetrans = TRUE;
1145 } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
1146 no_bytetrans = TRUE;
1147 } else {
1148 /* No need to translate or examine the string any further */
1149 no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
1150 UCNeedNotTranslate(cs_from, cs_to));
1151 }
1152 /*
1153 * Save malloc/calloc overhead in simple case - kw
1154 */
1155 if (do_ent && hidden && (stype != st_URL) && (StrChr(*str, '&') == NULL))
1156 do_ent = FALSE;
1157
1158 /* Can't do, caller should figure out what to do... */
1159 if (!UCCanTranslateFromTo(cs_from, cs_to)) {
1160 if (cs_to < 0)
1161 return NULL;
1162 if (!do_ent && no_bytetrans)
1163 return NULL;
1164 no_bytetrans = TRUE;
1165 } else if (cs_to < 0) {
1166 do_ent = FALSE;
1167 }
1168
1169 if (!do_ent && no_bytetrans)
1170 return str;
1171 p = *str;
1172
1173 if (!no_bytetrans) {
1174 UCTransParams_clear(&T);
1175 UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
1176 cs_to, &LYCharSet_UC[cs_to]);
1177 from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
1178 output_utf8 = T.output_utf8;
1179 repl_translated_C0 = T.repl_translated_C0;
1180 puni = p;
1181 } else if (do_ent) {
1182 output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
1183 HText_hasUTF8OutputSet(HTMainText));
1184 repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
1185 }
1186
1187 lowest_8 = LYlowest_eightbit[cs_to];
1188
1189 /*
1190 * Create a buffer string seven times the length of the original, so we
1191 * have plenty of room for expansions. - FM
1192 */
1193 len = strlen(p) + 16;
1194 q = p;
1195
1196 qs = q;
1197
1198 /* Create the HTChunk only if we need it */
1199 #define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
1200
1201 #define REPLACE_STRING(s) \
1202 if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1203 HTChunkPuts(CHUNK, s); \
1204 qs = q = *str
1205
1206 #define REPLACE_CHAR(c) if (q > p) { \
1207 HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1208 qs = q = *str; \
1209 *q++ = c; \
1210 } else \
1211 *q++ = c
1212
1213 /*
1214 * Loop through string, making conversions as needed.
1215 *
1216 * The while() checks for a non-'\0' char only for the normal text states
1217 * since other states may temporarily modify p or *p (which should be
1218 * restored before S_done!) - kw
1219 */
1220 while (*p || (state != S_text && state != S_nonascii_text)) {
1221 switch (state) {
1222 case S_text:
1223 code = UCH(*p);
1224 #ifdef KANJI_CODE_OVERRIDE
1225 if (HTCJK == JAPANESE && last_kcode == SJIS) {
1226 if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
1227 sjis_1st = UCH(code);
1228 } else if (sjis_1st && IS_SJIS_LO(code)) {
1229 sjis_1st = '\0';
1230 } else {
1231 if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
1232 sjis_str[2] = '\0';
1233 JISx0201TO0208_SJIS(UCH(code),
1234 sjis_str, sjis_str + 1);
1235 REPLACE_STRING(sjis_str);
1236 p++;
1237 continue;
1238 }
1239 }
1240 }
1241 #endif
1242 if (*p == '\033') {
1243 if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1244 state = S_esc;
1245 if (stype == st_URL) {
1246 REPLACE_STRING("%1B");
1247 p++;
1248 continue;
1249 } else if (stype != st_HTML) {
1250 p++;
1251 continue;
1252 } else {
1253 *q++ = *p++;
1254 continue;
1255 }
1256 } else if (!hidden) {
1257 /*
1258 * CJK handling not on, and not a hidden INPUT, so block
1259 * escape. - FM
1260 */
1261 state = S_next_char;
1262 } else {
1263 state = S_trans_byte;
1264 }
1265 } else {
1266 state = (do_ent ? S_check_ent : S_trans_byte);
1267 }
1268 break;
1269
1270 case S_esc:
1271 if (*p == '$') {
1272 state = S_dollar;
1273 *q++ = *p++;
1274 continue;
1275 } else if (*p == '(') {
1276 state = S_paren;
1277 *q++ = *p++;
1278 continue;
1279 } else {
1280 state = S_text;
1281 }
1282 break;
1283
1284 case S_dollar:
1285 if (*p == '@' || *p == 'B' || *p == 'A') {
1286 state = S_nonascii_text;
1287 *q++ = *p++;
1288 continue;
1289 } else if (*p == '(') {
1290 state = S_dollar_paren;
1291 *q++ = *p++;
1292 continue;
1293 } else {
1294 state = S_text;
1295 }
1296 break;
1297
1298 case S_dollar_paren:
1299 if (*p == 'C') {
1300 state = S_nonascii_text;
1301 *q++ = *p++;
1302 continue;
1303 } else {
1304 state = S_text;
1305 }
1306 break;
1307
1308 case S_paren:
1309 if (*p == 'B' || *p == 'J' || *p == 'T') {
1310 state = S_text;
1311 *q++ = *p++;
1312 continue;
1313 } else if (*p == 'I') {
1314 state = S_nonascii_text;
1315 *q++ = *p++;
1316 continue;
1317 } else {
1318 state = S_text;
1319 }
1320 break;
1321
1322 case S_nonascii_text:
1323 if (*p == '\033') {
1324 if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1325 state = S_esc;
1326 if (stype == st_URL) {
1327 REPLACE_STRING("%1B");
1328 p++;
1329 continue;
1330 } else if (stype != st_HTML) {
1331 p++;
1332 continue;
1333 }
1334 }
1335 }
1336 *q++ = *p++;
1337 continue;
1338
1339 case S_trans_byte:
1340 /* character translation goes here */
1341 /*
1342 * Don't do anything if we have no string, or if original AND
1343 * target character sets are both iso-8859-1, or if we are in CJK
1344 * mode.
1345 */
1346 if (*p == '\0' || no_bytetrans) {
1347 state = S_got_outchar;
1348 break;
1349 }
1350
1351 if (Back) {
1352 int rev_c;
1353
1354 if ((*p) == HT_NON_BREAK_SPACE ||
1355 (*p) == HT_EN_SPACE) {
1356 if (plain_space) {
1357 code = *p = ' ';
1358 state = S_got_outchar;
1359 break;
1360 } else {
1361 code = 160;
1362 if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1363 (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1364 state = S_got_outchar;
1365 break;
1366 } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1367 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1368 state = S_check_uni;
1369 break;
1370 } else {
1371 *(unsigned char *) p = UCH(160);
1372 }
1373 }
1374 } else if ((*p) == LY_SOFT_HYPHEN) {
1375 code = 173;
1376 if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1377 (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1378 state = S_got_outchar;
1379 break;
1380 } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1381 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1382 state = S_check_uni;
1383 break;
1384 } else {
1385 *(unsigned char *) p = UCH(173);
1386 }
1387 #ifdef EXP_JAPANESEUTF8_SUPPORT
1388 } else if (output_utf8) {
1389 if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
1390 (IS_EUC((unsigned char) (*p),
1391 (unsigned char) (*(p + 1))))) ||
1392 (!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
1393 (IS_SJIS_2BYTE((unsigned char) (*p),
1394 (unsigned char) (*(p + 1)))))) {
1395 code = UCTransJPToUni(p, 2, cs_from);
1396 p++;
1397 state = S_check_uni;
1398 break;
1399 }
1400 #endif
1401 } else if (code < 127 || T.transp) {
1402 state = S_got_outchar;
1403 break;
1404 }
1405 rev_c = UCReverseTransChar(*p, cs_to, cs_from);
1406 if (rev_c > 127) {
1407 *p = (char) rev_c;
1408 code = rev_c;
1409 state = S_got_outchar;
1410 break;
1411 }
1412 } else if (code < 127) {
1413 state = S_got_outchar;
1414 break;
1415 }
1416
1417 if (from_is_utf8) {
1418 if (((*p) & 0xc0) == 0xc0) {
1419 const char *pq = p;
1420
1421 puni = p;
1422 code = UCGetUniFromUtf8String(&pq);
1423 if (code <= 0) {
1424 code = UCH(*p);
1425 } else {
1426 what = P_utf8;
1427 puni += (pq - (const char *) p);
1428 }
1429 }
1430 } else if (use_lynx_specials && !Back &&
1431 (code == 160 || code == 173) &&
1432 (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
1433 (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1434 if (code == 160)
1435 code = *p = HT_NON_BREAK_SPACE;
1436 else if (code == 173)
1437 code = *p = LY_SOFT_HYPHEN;
1438 state = S_got_outchar;
1439 break;
1440 } else if (T.trans_to_uni) {
1441 code = UCTransToUni(*p, cs_from);
1442 if (code <= 0) {
1443 /* What else can we do? */
1444 code = UCH(*p);
1445 }
1446 } else if (!T.trans_from_uni) {
1447 state = S_got_outchar;
1448 break;
1449 }
1450 /*
1451 * Substitute Lynx special character for 160 (nbsp) if
1452 * use_lynx_specials is set.
1453 */
1454 if (use_lynx_specials && !Back &&
1455 (code == 160 || code == 173)) {
1456 code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
1457 state = S_got_outchar;
1458 break;
1459 }
1460
1461 state = S_check_uni;
1462 break;
1463
1464 case S_check_ent:
1465 if (*p == '&') {
1466 char *pp = p + 1;
1467
1468 len = strlen(pp);
1469 /*
1470 * Check for a numeric entity. - FM
1471 */
1472 if (*pp == '#' && len > 2 &&
1473 (*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
1474 UCH(*(pp + 2)) < 127 &&
1475 isxdigit(UCH(*(pp + 2)))) {
1476 what = P_hex;
1477 state = S_ncr;
1478 } else if (*pp == '#' && len > 2 &&
1479 UCH(*(pp + 1)) < 127 &&
1480 isdigit(UCH(*(pp + 1)))) {
1481 what = P_decimal;
1482 state = S_ncr;
1483 } else if (UCH(*pp) < 127 &&
1484 isalpha(UCH(*pp))) {
1485 what = P_named;
1486 state = S_named;
1487 } else {
1488 state = S_trans_byte;
1489 }
1490 } else {
1491 state = S_trans_byte;
1492 }
1493 break;
1494
1495 case S_ncr:
1496 if (what == P_hex) {
1497 p += 3;
1498 } else { /* P_decimal */
1499 p += 2;
1500 }
1501 cp = p;
1502 while (*p && UCH(*p) < 127 &&
1503 (what == P_hex ? isxdigit(UCH(*p)) :
1504 isdigit(UCH(*p)))) {
1505 p++;
1506 }
1507 /*
1508 * Save the terminator and isolate the digit(s). - FM
1509 */
1510 cpe = *p;
1511 if (*p)
1512 *p++ = '\0';
1513 /*
1514 * Show the numeric entity if the value:
1515 * (1) Is greater than 255 and unhandled Unicode.
1516 * (2) Is less than 32, and not valid and we don't have HTCJK set.
1517 * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1518 * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1519 */
1520 if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
1521 code = LYcp1252ToUnicode(code);
1522 state = S_check_uni;
1523 } else {
1524 state = S_recover;
1525 break;
1526 }
1527 break;
1528
1529 case S_check_uni:
1530 /*
1531 * Show the numeric entity if the value:
1532 * (2) Is less than 32, and not valid and we don't have HTCJK set.
1533 * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1534 * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1535 */
1536 if ((code < 32 &&
1537 code != 9 && code != 10 && code != 13 &&
1538 !IS_CJK_TTY) ||
1539 (code == 127 &&
1540 !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
1541 (code > 127 && code < 160 &&
1542 !HTPassHighCtrlNum)) {
1543 state = S_recover;
1544 break;
1545 }
1546 /*
1547 * Convert the value as an unsigned char, hex escaped if isURL is
1548 * set and it's 8-bit, and then recycle the terminator if it is not
1549 * a semicolon. - FM
1550 */
1551 if (code > 159 && stype == st_URL) {
1552 state = S_got_oututf8;
1553 break;
1554 }
1555 /*
1556 * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
1557 * use an ASCII space (32) if plain_space is TRUE, otherwise use
1558 * the Lynx special character. - FM
1559 */
1560 if (code == 160) {
1561 if (plain_space) {
1562 code = ' ';
1563 state = S_got_outchar;
1564 break;
1565 } else if (use_lynx_specials) {
1566 code = HT_NON_BREAK_SPACE;
1567 state = S_got_outchar;
1568 break;
1569 } else if ((hidden && !Back)
1570 || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
1571 || LYCharSet_UC[cs_to].enc == UCT_ENC_8859
1572 || (LYCharSet_UC[cs_to].like8859 &
1573 UCT_R_8859SPECL)) {
1574 state = S_got_outchar;
1575 break;
1576 } else if (
1577 (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
1578 ; /* nothing, may be translated later */
1579 } else {
1580 code = ' ';
1581 state = S_got_outchar;
1582 break;
1583 }
1584 }
1585 /*
1586 * For 173 (shy), use that value if it's a hidden INPUT, otherwise
1587 * ignore it if plain_space is TRUE, otherwise use the Lynx special
1588 * character. - FM
1589 */
1590 if (code == 173) {
1591 if (plain_space) {
1592 replace_buf[0] = '\0';
1593 state = S_got_outstring;
1594 break;
1595 } else if (Back &&
1596 !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1597 (LYCharSet_UC[cs_to].like8859 &
1598 UCT_R_8859SPECL))) {
1599 ; /* nothing, may be translated later */
1600 } else if (hidden || Back) {
1601 state = S_got_outchar;
1602 break;
1603 } else if (use_lynx_specials) {
1604 code = LY_SOFT_HYPHEN;
1605 state = S_got_outchar;
1606 break;
1607 }
1608 }
1609 /*
1610 * Seek a translation from the chartrans tables.
1611 */
1612 if ((uck = UCTransUniChar(code,
1613 cs_to)) >= 32 &&
1614 uck < 256 &&
1615 (uck < 127 || uck >= lowest_8)) {
1616 code = uck;
1617 state = S_got_outchar;
1618 break;
1619 } else if ((uck == -4 ||
1620 (repl_translated_C0 &&
1621 uck > 0 && uck < 32)) &&
1622 /*
1623 * Not found; look for replacement string.
1624 */
1625 UCTransUniCharStr(replace_buf,
1626 60, code,
1627 cs_to,
1628 0) >= 0) {
1629 state = S_got_outstring;
1630 break;
1631 }
1632 if (output_utf8 &&
1633 code > 127 && code < 0x7fffffffL) {
1634 state = S_got_oututf8;
1635 break;
1636 }
1637 /*
1638 * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
1639 * character reference if it's a hidden INPUT, otherwise use an
1640 * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
1641 * special character. - FM
1642 */
1643 if (code == 8194 || code == 8195 || code == 8201) {
1644 if (hidden) {
1645 state = S_recover;
1646 } else if (plain_space) {
1647 code = ' ';
1648 state = S_got_outchar;
1649 } else {
1650 code = HT_EN_SPACE;
1651 state = S_got_outchar;
1652 }
1653 break;
1654 /*
1655 * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
1656 * for now, if we got this far without finding a representation
1657 * for them.
1658 */
1659 } else if (code == 8204 || code == 8205 ||
1660 code == 8206 || code == 8207) {
1661 CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
1662 PRI_UCode_t "'.\n", code));
1663 replace_buf[0] = '\0';
1664 state = S_got_outstring;
1665 break;
1666 /*
1667 * Show the numeric entity if the value: (1) Is greater than
1668 * 255 and unhandled Unicode.
1669 */
1670 } else if (code > 255) {
1671 /*
1672 * Illegal or not yet handled value. Return "&#" verbatim and
1673 * continue from there. - FM
1674 */
1675 state = S_recover;
1676 break;
1677 /*
1678 * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
1679 * the character set is "ISO Latin 1", use its value. - FM
1680 */
1681 } else if (code < 161 ||
1682 (code < 256 &&
1683 (HTPassEightBitNum || cs_to == LATIN1))) {
1684 /*
1685 * No conversion needed.
1686 */
1687 state = S_got_outchar;
1688 break;
1689
1690 /* The following disabled section doesn't make sense any more.
1691 * It used to make sense in the past, when S_check_named would
1692 * look in "old style" tables in addition to what it does now.
1693 * Disabling of going to S_check_name here prevents endless
1694 * looping between S_check_uni and S_check_names states, which
1695 * could occur here for Latin 1 codes for some cs_to if they
1696 * had no translation in that cs_to. Normally all cs_to
1697 * *should* now have valid translations via UCTransUniChar or
1698 * UCTransUniCharStr for all Latin 1 codes, so that we would
1699 * not get here anyway, and no loop could occur. Still, if we
1700 * *do* get here, FALL THROUGH to case S_recover now. - kw
1701 */
1702 #if 0
1703 /*
1704 * If we get to here, convert and handle the character as a
1705 * named entity. - FM
1706 */
1707 } else {
1708 name = HTMLGetEntityName(code - 160);
1709 state = S_check_name;
1710 break;
1711 #endif
1712 }
1713 /* FALLTHRU */
1714
1715 case S_recover:
1716 if (what == P_decimal || what == P_hex) {
1717 /*
1718 * Illegal or not yet handled value. Return "&#" verbatim and
1719 * continue from there. - FM
1720 */
1721 *q++ = '&';
1722 *q++ = '#';
1723 if (what == P_hex)
1724 *q++ = 'x';
1725 if (cpe != '\0')
1726 *(p - 1) = cpe;
1727 p = cp;
1728 state = S_done;
1729 } else if (what == P_named) {
1730 *cp = cpe;
1731 *q++ = '&';
1732 state = S_done;
1733 } else if (!T.output_utf8 && stype == st_HTML && !hidden &&
1734 !(HTPassEightBitRaw &&
1735 UCH(*p) >= lowest_8)) {
1736 sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
1737
1738 state = S_got_outstring;
1739 } else {
1740 puni = p;
1741 code = UCH(*p);
1742 state = S_got_outchar;
1743 }
1744 break;
1745
1746 case S_named:
1747 cp = ++p;
1748 while (*cp && UCH(*cp) < 127 &&
1749 isalnum(UCH(*cp)))
1750 cp++;
1751 cpe = *cp;
1752 *cp = '\0';
1753 name = p;
1754 state = S_check_name;
1755 break;
1756
1757 case S_check_name:
1758 /*
1759 * Seek the Unicode value for the named entity.
1760 *
1761 * !!!! We manually recover the case of '=' terminator which is
1762 * commonly found on query to CGI-scripts enclosed as href= URLs
1763 * like "somepath/?x=1&yz=2" Without this dirty fix, submission of
1764 * such URLs was broken if &yz string happened to be a recognized
1765 * entity name. - LP
1766 */
1767 if (((code = HTMLGetEntityUCValue(name)) > 0) &&
1768 !((cpe == '=') && (stype == st_URL))) {
1769 state = S_check_uni;
1770 break;
1771 }
1772 /*
1773 * Didn't find the entity. Return verbatim.
1774 */
1775 state = S_recover;
1776 break;
1777
1778 /* * * O U T P U T S T A T E S * * */
1779
1780 case S_got_oututf8:
1781 if (code > 255 ||
1782 (code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
1783 UCConvertUniToUtf8(code, replace_buf);
1784 state = S_got_outstring;
1785 } else {
1786 state = S_got_outchar;
1787 }
1788 break;
1789 case S_got_outstring:
1790 if (what == P_decimal || what == P_hex) {
1791 if (cpe != ';' && cpe != '\0')
1792 *(--p) = cpe;
1793 p--;
1794 } else if (what == P_named) {
1795 *cp = cpe;
1796 p = (*cp != ';') ? (cp - 1) : cp;
1797 } else if (what == P_utf8) {
1798 p = puni;
1799 }
1800 if (replace_buf[0] == '\0') {
1801 state = S_next_char;
1802 break;
1803 }
1804 if (stype == st_URL) {
1805 code = replace_buf[0]; /* assume string OK if first char is */
1806 if (code >= 127 ||
1807 (code < 32 && (code != 9 && code != 10 && code != 0))) {
1808 state = S_put_urlstring;
1809 break;
1810 }
1811 }
1812 REPLACE_STRING(replace_buf);
1813 state = S_next_char;
1814 break;
1815 case S_put_urlstring:
1816 esc = HTEscape(replace_buf, URL_XALPHAS);
1817 REPLACE_STRING(esc);
1818 FREE(esc);
1819 state = S_next_char;
1820 break;
1821 case S_got_outchar:
1822 if (what == P_decimal || what == P_hex) {
1823 if (cpe != ';' && cpe != '\0')
1824 *(--p) = cpe;
1825 p--;
1826 } else if (what == P_named) {
1827 *cp = cpe;
1828 p = (*cp != ';') ? (cp - 1) : cp;
1829 } else if (what == P_utf8) {
1830 p = puni;
1831 }
1832 if (stype == st_URL &&
1833 /* Not a full HTEscape, only for 8bit and ctrl chars */
1834 (TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */
1835 (code < ' ' && (code != '\t' && code != '\n')))) {
1836 state = S_put_urlchar;
1837 break;
1838 } else if (!hidden && code == 10 && *p == 10
1839 && q != qs && *(q - 1) == 13) {
1840 /*
1841 * If this is not a hidden string, and the current char is the
1842 * LF ('\n') of a CRLF pair, drop the CR ('\r'). - KW
1843 */
1844 *(q - 1) = *p++;
1845 state = S_done;
1846 break;
1847 }
1848 *q++ = (char) code;
1849 state = S_next_char;
1850 break;
1851 case S_put_urlchar:
1852 *q++ = '%';
1853 REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */
1854 REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
1855 /* fall through */
1856 case S_next_char:
1857 p++; /* fall through */
1858 case S_done:
1859 state = S_text;
1860 what = P_text;
1861 /* for next round */
1862 }
1863 }
1864
1865 *q = '\0';
1866 if (chunk) {
1867 HTChunkPutb(CHUNK, qs, (int) (q - qs + 1)); /* also terminates */
1868 if (stype == st_URL || stype == st_other) {
1869 LYTrimHead(chunk->data);
1870 LYTrimTail(chunk->data);
1871 }
1872 StrAllocCopy(*str, chunk->data);
1873 HTChunkFree(chunk);
1874 } else {
1875 if (stype == st_URL || stype == st_other) {
1876 LYTrimHead(qs);
1877 LYTrimTail(qs);
1878 }
1879 }
1880 return str;
1881 }
1882
1883 #undef REPLACE_CHAR
1884 #undef REPLACE_STRING
1885
LYUCTranslateHTMLString(char ** str,int cs_from,int cs_to,int use_lynx_specials,int plain_space,int hidden,CharUtil_st stype)1886 BOOL LYUCTranslateHTMLString(char **str,
1887 int cs_from,
1888 int cs_to,
1889 int use_lynx_specials,
1890 int plain_space,
1891 int hidden,
1892 CharUtil_st stype)
1893 {
1894 BOOL ret = YES;
1895
1896 /* May reallocate *str even if cs_to == 0 */
1897 if (!LYUCFullyTranslateString(str, cs_from, cs_to, TRUE,
1898 use_lynx_specials, plain_space, hidden,
1899 NO, stype)) {
1900 ret = NO;
1901 }
1902 return ret;
1903 }
1904
LYUCTranslateBackFormData(char ** str,int cs_from,int cs_to,int plain_space)1905 BOOL LYUCTranslateBackFormData(char **str,
1906 int cs_from,
1907 int cs_to,
1908 int plain_space)
1909 {
1910 char **ret;
1911
1912 /* May reallocate *str */
1913 ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
1914 NO, plain_space, YES,
1915 YES, st_HTML));
1916 return (BOOL) (ret != NULL);
1917 }
1918
/*
 * Extract the value of a named parameter from attribute text such as the
 * CONTENT of an HTML META tag.
 *
 * Only text which follows a ';' in 'from' is examined.  After each ';', any
 * run of ';' and whitespace is skipped, and the text at that point is
 * compared with 'name', ignoring case, over the length of 'name'.  The first
 * position which matches is used.  Whitespace and '=' after the name are
 * skipped, and the value is the run of printable, non-blank characters that
 * comes next.
 *
 * A value longer than two characters which both starts and ends with a
 * single quote has that pair of quotes removed.
 *
 * Returns a newly allocated string holding the value (it may be empty), or
 * NULL if no matching parameter is found.  'from' itself is not modified.
 */
char *LYParseTagParam(char *from,
                      const char *name)
{
    size_t name_len = strlen(name);
    size_t value_len;
    char *value = NULL;
    char *scan = from;

    /*
     * Step from one ';' to the next until the text after it starts with the
     * wanted name, or give up when there is no further candidate.
     */
    for (;;) {
        scan = StrChr(scan, ';');
        if (scan == NULL)
            return NULL;
        while (*scan == ';' || isspace(UCH(*scan)))
            scan++;
        if (strlen(scan) < name_len)
            return NULL;
        if (strncasecomp(scan, name, (int) name_len) == 0)
            break;
    }

    /*
     * Move past the name and whatever separates it from its value.
     */
    scan += name_len;
    while (*scan == '=' || isspace(UCH(*scan)))
        scan++;

    /*
     * Copy the remainder, then cut the copy off at the first character which
     * is blank or not printable.
     */
    StrAllocCopy(value, scan);
    for (value_len = 0;
         isprint(UCH(scan[value_len])) && !isspace(UCH(scan[value_len]));
         value_len++) {
        ;                       /* just counting */
    }
    value[value_len] = '\0';

    /*
     * Strip single quotes, just in case: drop the closing quote, then shift
     * the rest left by one to overwrite the opening quote.
     */
    if (value_len > 2 && value[0] == '\'' && value[value_len - 1] == '\'') {
        char *dst = value;

        value[value_len - 1] = '\0';
        do {
            dst[0] = dst[1];
        } while (*dst++ != '\0');
    }
    return value;
}
1959
1960 /*
1961 * Given a refresh-URL content string, parses the delay time and the URL
1962 * string. Ignore the remainder of the content.
1963 */
LYParseRefreshURL(char * content,char ** p_seconds,char ** p_address)1964 void LYParseRefreshURL(char *content,
1965 char **p_seconds,
1966 char **p_address)
1967 {
1968 char *cp;
1969 char *cp1 = NULL;
1970 char *Seconds = NULL;
1971
1972 /*
1973 * Look for the Seconds field. - FM
1974 */
1975 cp = LYSkipBlanks(content);
1976 if (*cp && isdigit(UCH(*cp))) {
1977 cp1 = cp;
1978 while (*cp1 && isdigit(UCH(*cp1)))
1979 cp1++;
1980 StrnAllocCopy(Seconds, cp, (size_t) (cp1 - cp));
1981 }
1982 *p_seconds = Seconds;
1983 *p_address = LYParseTagParam(content, "URL");
1984
1985 CTRACE((tfp,
1986 "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
1987 content, NonNull(*p_seconds), NonNull(*p_address)));
1988 }
1989
1990 /*
1991 * This function processes META tags in HTML streams. - FM
1992 */
LYHandleMETA(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED)1993 void LYHandleMETA(HTStructured * me, const BOOL *present,
1994 STRING2PTR value,
1995 char **include GCC_UNUSED)
1996 {
1997 char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
1998 char *href = NULL, *id_string = NULL, *temp = NULL;
1999 char *cp, *cp0, *cp1 = NULL;
2000 int url_type = 0;
2001
2002 if (!me || !present)
2003 return;
2004
2005 /*
2006 * Load the attributes for possible use by Lynx. - FM
2007 */
2008 if (present[HTML_META_HTTP_EQUIV] &&
2009 non_empty(value[HTML_META_HTTP_EQUIV])) {
2010 StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
2011 convert_to_spaces(http_equiv, TRUE);
2012 LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
2013 NO, NO, YES, st_other);
2014 if (*http_equiv == '\0') {
2015 FREE(http_equiv);
2016 }
2017 }
2018 if (present[HTML_META_NAME] &&
2019 non_empty(value[HTML_META_NAME])) {
2020 StrAllocCopy(name, value[HTML_META_NAME]);
2021 convert_to_spaces(name, TRUE);
2022 LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
2023 NO, NO, YES, st_other);
2024 if (*name == '\0') {
2025 FREE(name);
2026 }
2027 }
2028 if (present[HTML_META_CONTENT] &&
2029 non_empty(value[HTML_META_CONTENT])) {
2030 /*
2031 * Technically, we should be creating a comma-separated list, but META
2032 * tags come one at a time, and we'll handle (or ignore) them as each
2033 * is received. Also, at this point, we only trim leading and trailing
2034 * blanks from the CONTENT value, without translating any named
2035 * entities or numeric character references, because how we should do
2036 * that depends on what type of information it contains, and whether or
2037 * not any of it might be sent to the screen. - FM
2038 */
2039 StrAllocCopy(content, value[HTML_META_CONTENT]);
2040 convert_to_spaces(content, FALSE);
2041 LYTrimHead(content);
2042 LYTrimTail(content);
2043 if (*content == '\0') {
2044 FREE(content);
2045 }
2046 }
2047 if (present[HTML_META_CHARSET] &&
2048 non_empty(value[HTML_META_CHARSET])) {
2049 StrAllocCopy(charset, value[HTML_META_CHARSET]);
2050 convert_to_spaces(charset, TRUE);
2051 LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
2052 NO, NO, YES, st_other);
2053 if (*charset == '\0') {
2054 FREE(charset);
2055 }
2056 }
2057 CTRACE((tfp,
2058 "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
2059 NONNULL(http_equiv),
2060 NONNULL(name),
2061 NONNULL(content),
2062 NONNULL(charset)));
2063
2064 /*
2065 * Check for a text/html Content-Type with a charset directive, if we
2066 * didn't already set the charset via a server's header. - AAC & FM
2067 */
2068 if (isEmpty(me->node_anchor->charset) &&
2069 (charset ||
2070 (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
2071 LYUCcharset *p_in = NULL;
2072 LYUCcharset *p_out = NULL;
2073
2074 if (charset) {
2075 LYLowerCase(charset);
2076 } else {
2077 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2078 NO, NO, YES, st_other);
2079 LYLowerCase(content);
2080 }
2081
2082 if ((cp1 = charset) != NULL ||
2083 (cp1 = strstr(content, "charset")) != NULL) {
2084 BOOL chartrans_ok = NO;
2085 char *cp3 = NULL, *cp4;
2086 int chndl;
2087
2088 if (!charset)
2089 cp1 += 7;
2090 while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
2091 cp1++;
2092
2093 StrAllocCopy(cp3, cp1); /* copy to mutilate more */
2094 for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
2095 *cp4 != ';' && *cp4 != ':' &&
2096 !WHITE(*cp4)); cp4++) {
2097 ; /* do nothing */
2098 }
2099 *cp4 = '\0';
2100 cp4 = cp3;
2101 chndl = UCGetLYhndl_byMIME(cp3);
2102
2103 #ifdef CAN_SWITCH_DISPLAY_CHARSET
2104 /* Allow a switch to a more suitable display charset */
2105 if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
2106 /* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
2107 should have the same setting for UCInfoStage. */
2108 HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);
2109
2110 me->outUCLYhndl = current_char_set;
2111 HTAnchor_setUCInfoStage(me->node_anchor,
2112 current_char_set,
2113 UCT_STAGE_HTEXT,
2114 UCT_SETBY_MIME); /* highest priorty! */
2115 HTAnchor_setUCInfoStage(me->node_anchor,
2116 current_char_set,
2117 UCT_STAGE_STRUCTURED,
2118 UCT_SETBY_MIME); /* highest priorty! */
2119 me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2120 UCT_STAGE_HTEXT);
2121 /* The SGML stage will be reset in change_chartrans_handling */
2122 }
2123 #endif
2124
2125 if (UCCanTranslateFromTo(chndl, current_char_set)) {
2126 chartrans_ok = YES;
2127 StrAllocCopy(me->node_anchor->charset, cp4);
2128 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2129 UCT_STAGE_PARSER,
2130 UCT_SETBY_STRUCTURED);
2131 } else if (chndl < 0) {
2132 /*
2133 * Got something but we don't recognize it.
2134 */
2135 chndl = UCLYhndl_for_unrec;
2136 if (chndl < 0) /* UCLYhndl_for_unrec not defined :-( */
2137 chndl = UCLYhndl_for_unspec; /* always >= 0 */
2138 if (UCCanTranslateFromTo(chndl, current_char_set)) {
2139 chartrans_ok = YES;
2140 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2141 UCT_STAGE_PARSER,
2142 UCT_SETBY_STRUCTURED);
2143 }
2144 }
2145 if (chartrans_ok) {
2146 p_in = HTAnchor_getUCInfoStage(me->node_anchor,
2147 UCT_STAGE_PARSER);
2148 p_out = HTAnchor_setUCInfoStage(me->node_anchor,
2149 current_char_set,
2150 UCT_STAGE_HTEXT,
2151 UCT_SETBY_DEFAULT);
2152 if (!p_out) {
2153 /*
2154 * Try again.
2155 */
2156 p_out = HTAnchor_getUCInfoStage(me->node_anchor,
2157 UCT_STAGE_HTEXT);
2158 }
2159 if (!strcmp(p_in->MIMEname, "x-transparent")) {
2160 HTPassEightBitRaw = TRUE;
2161 HTAnchor_setUCInfoStage(me->node_anchor,
2162 HTAnchor_getUCLYhndl(me->node_anchor,
2163 UCT_STAGE_HTEXT),
2164 UCT_STAGE_PARSER,
2165 UCT_SETBY_DEFAULT);
2166 }
2167 if (!strcmp(p_out->MIMEname, "x-transparent")) {
2168 HTPassEightBitRaw = TRUE;
2169 HTAnchor_setUCInfoStage(me->node_anchor,
2170 HTAnchor_getUCLYhndl(me->node_anchor,
2171 UCT_STAGE_PARSER),
2172 UCT_STAGE_HTEXT,
2173 UCT_SETBY_DEFAULT);
2174 }
2175 if ((p_in->enc != UCT_ENC_CJK)
2176 #ifdef EXP_JAPANESEUTF8_SUPPORT
2177 && (p_in->enc != UCT_ENC_UTF8)
2178 #endif
2179 ) {
2180 HTCJK = NOCJK;
2181 if (!(p_in->codepoints &
2182 UCT_CP_SUBSETOF_LAT1) &&
2183 chndl == current_char_set) {
2184 HTPassEightBitRaw = TRUE;
2185 }
2186 } else if (p_out->enc == UCT_ENC_CJK) {
2187 Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
2188 }
2189 LYGetChartransInfo(me);
2190 /*
2191 * Update the chartrans info homologously to a Content-Type
2192 * MIME header with a charset parameter. - FM
2193 */
2194 if (me->UCLYhndl != chndl) {
2195 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2196 UCT_STAGE_MIME,
2197 UCT_SETBY_STRUCTURED);
2198 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2199 UCT_STAGE_PARSER,
2200 UCT_SETBY_STRUCTURED);
2201 me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
2202 UCT_STAGE_PARSER);
2203 me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2204 UCT_STAGE_PARSER);
2205 }
2206 UCSetTransParams(&me->T,
2207 me->inUCLYhndl, me->inUCI,
2208 me->outUCLYhndl, me->outUCI);
2209 } else {
2210 /*
2211 * Cannot translate. If according to some heuristic the given
2212 * charset and the current display character both are likely to
2213 * be like ISO-8859 in structure, pretend we have some kind of
2214 * match.
2215 */
2216 BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
2217 isdigit(UCH(cp4[9])));
2218 BOOL given_is_8859like = (BOOL) (given_is_8859
2219 || !StrNCmp(cp4, "windows-", 8)
2220 || !StrNCmp(cp4, "cp12", 4)
2221 || !StrNCmp(cp4, "cp-12", 5));
2222 BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
2223 (strstr(LYchar_set_names[current_char_set],
2224 "ISO-8859") ||
2225 strstr(LYchar_set_names[current_char_set],
2226 "windows-")));
2227
2228 if (given_is_8859) {
2229 cp1 = &cp4[10];
2230 while (*cp1 &&
2231 isdigit(UCH((*cp1))))
2232 cp1++;
2233 *cp1 = '\0';
2234 }
2235 if (given_and_display_8859like) {
2236 StrAllocCopy(me->node_anchor->charset, cp4);
2237 HTPassEightBitRaw = TRUE;
2238 }
2239 HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
2240
2241 }
2242 FREE(cp3);
2243
2244 if (me->node_anchor->charset) {
2245 CTRACE((tfp,
2246 "LYHandleMETA: New charset: %s\n",
2247 me->node_anchor->charset));
2248 }
2249 }
2250 /*
2251 * Set the kcode element based on the charset. - FM
2252 */
2253 HText_setKcode(me->text, me->node_anchor->charset, p_in);
2254 }
2255
2256 /*
2257 * Make sure we have META name/value pairs to handle. - FM
2258 */
2259 if (!(http_equiv || name) || !content)
2260 goto free_META_copies;
2261
2262 /*
2263 * Check for a no-cache Pragma
2264 * or Cache-Control directive. - FM
2265 */
2266 if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
2267 !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2268 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2269 NO, NO, YES, st_other);
2270 if (!strcasecomp(content, "no-cache")) {
2271 me->node_anchor->no_cache = TRUE;
2272 HText_setNoCache(me->text);
2273 }
2274
2275 /*
2276 * If we didn't get a Cache-Control MIME header, and the META has one,
2277 * convert to lowercase, store it in the anchor element, and if we
2278 * haven't yet set no_cache, check whether we should. - FM
2279 */
2280 if ((!me->node_anchor->cache_control) &&
2281 !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2282 LYLowerCase(content);
2283 StrAllocCopy(me->node_anchor->cache_control, content);
2284 if (me->node_anchor->no_cache == FALSE) {
2285 cp0 = content;
2286 while ((cp = strstr(cp0, "no-cache")) != NULL) {
2287 cp += 8;
2288 while (*cp != '\0' && WHITE(*cp))
2289 cp++;
2290 if (*cp == '\0' || *cp == ';') {
2291 me->node_anchor->no_cache = TRUE;
2292 HText_setNoCache(me->text);
2293 break;
2294 }
2295 cp0 = cp;
2296 }
2297 if (me->node_anchor->no_cache == TRUE)
2298 goto free_META_copies;
2299 cp0 = content;
2300 while ((cp = strstr(cp0, "max-age")) != NULL) {
2301 cp += 7;
2302 while (*cp != '\0' && WHITE(*cp))
2303 cp++;
2304 if (*cp == '=') {
2305 cp++;
2306 while (*cp != '\0' && WHITE(*cp))
2307 cp++;
2308 if (isdigit(UCH(*cp))) {
2309 cp0 = cp;
2310 while (isdigit(UCH(*cp)))
2311 cp++;
2312 if (*cp0 == '0' && cp == (cp0 + 1)) {
2313 me->node_anchor->no_cache = TRUE;
2314 HText_setNoCache(me->text);
2315 break;
2316 }
2317 }
2318 }
2319 cp0 = cp;
2320 }
2321 }
2322 }
2323
2324 /*
2325 * Check for an Expires directive. - FM
2326 */
2327 } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
2328 /*
2329 * If we didn't get an Expires MIME header, store it in the anchor
2330 * element, and if we haven't yet set no_cache, check whether we
2331 * should. Note that we don't accept a Date header via META tags,
2332 * because it's likely to be untrustworthy, but do check for a Date
2333 * header from a server when making the comparison. - FM
2334 */
2335 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2336 NO, NO, YES, st_other);
2337 StrAllocCopy(me->node_anchor->expires, content);
2338 if (me->node_anchor->no_cache == FALSE) {
2339 if (!strcmp(content, "0")) {
2340 /*
2341 * The value is zero, which we treat as an absolute no-cache
2342 * directive. - FM
2343 */
2344 me->node_anchor->no_cache = TRUE;
2345 HText_setNoCache(me->text);
2346 } else if (me->node_anchor->date != NULL) {
2347 /*
2348 * We have a Date header, so check if the value is less than or
2349 * equal to that. - FM
2350 */
2351 if (LYmktime(content, TRUE) <=
2352 LYmktime(me->node_anchor->date, TRUE)) {
2353 me->node_anchor->no_cache = TRUE;
2354 HText_setNoCache(me->text);
2355 }
2356 } else if (LYmktime(content, FALSE) == 0) {
2357 /*
2358 * We don't have a Date header, and the value is in past for
2359 * us. - FM
2360 */
2361 me->node_anchor->no_cache = TRUE;
2362 HText_setNoCache(me->text);
2363 }
2364 }
2365
2366 /*
2367 * Check for a Refresh directive. - FM
2368 */
2369 } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
2370 char *Seconds = NULL;
2371
2372 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2373 NO, NO, YES, st_other);
2374 LYParseRefreshURL(content, &Seconds, &href);
2375
2376 if (Seconds) {
2377 if (href) {
2378 /*
2379 * We found a URL field, so check it out. - FM
2380 */
2381 if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
2382 /*
2383 * The specs require a complete URL, but this is a
2384 * Netscapism, so don't expect the author to know that. -
2385 * FM
2386 */
2387 HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
2388 /*
2389 * Use the document's address as the base. - FM
2390 */
2391 if (*href != '\0') {
2392 temp = HTParse(href,
2393 me->node_anchor->address, PARSE_ALL);
2394 StrAllocCopy(href, temp);
2395 FREE(temp);
2396 } else {
2397 StrAllocCopy(href, me->node_anchor->address);
2398 HText_setNoCache(me->text);
2399 }
2400
2401 } else {
2402 /*
2403 * Check whether to fill in localhost. - FM
2404 */
2405 LYFillLocalFileURL(&href,
2406 (me->inBASE ?
2407 me->base_href : me->node_anchor->address));
2408 }
2409
2410 /*
2411 * Set the no_cache flag if the Refresh URL is the same as the
2412 * document's address. - FM
2413 */
2414 if (!strcmp(href, me->node_anchor->address)) {
2415 HText_setNoCache(me->text);
2416 }
2417 } else {
2418 /*
2419 * We didn't find a URL field, so use the document's own
2420 * address and set the no_cache flag. - FM
2421 */
2422 StrAllocCopy(href, me->node_anchor->address);
2423 HText_setNoCache(me->text);
2424 }
2425 /*
2426 * Check for an anchor in http or https URLs. - FM
2427 */
2428 cp = NULL;
2429 /* id_string seems to be used wrong below if given.
2430 not that it matters much. avoid setting it here. - kw */
2431 if (track_internal_links &&
2432 (StrNCmp(href, "http", 4) == 0) &&
2433 (cp = StrChr(href, '#')) != NULL) {
2434 StrAllocCopy(id_string, cp);
2435 *cp = '\0';
2436 }
2437 if (me->inA) {
2438 /*
2439 * Ugh! The META tag, which is a HEAD element, is in an
2440 * Anchor, which is BODY element. All we can do is close the
2441 * Anchor and cross our fingers. - FM
2442 */
2443 if (me->inBoldA == TRUE && me->inBoldH == FALSE)
2444 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2445 me->inBoldA = FALSE;
2446 HText_endAnchor(me->text, me->CurrentANum);
2447 me->inA = FALSE;
2448 me->CurrentANum = 0;
2449 }
2450 me->CurrentA = HTAnchor_findChildAndLink
2451 (
2452 me->node_anchor, /* Parent */
2453 id_string, /* Tag */
2454 href, /* Addresss */
2455 (HTLinkType *) 0); /* Type */
2456 if (id_string)
2457 *cp = '#';
2458 FREE(id_string);
2459 LYEnsureSingleSpace(me);
2460 if (me->inUnderline == FALSE)
2461 HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2462 HTML_put_string(me, "REFRESH(");
2463 HTML_put_string(me, Seconds);
2464 HTML_put_string(me, " sec):");
2465 FREE(Seconds);
2466 if (me->inUnderline == FALSE)
2467 HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2468 HTML_put_character(me, ' ');
2469 me->in_word = NO;
2470 HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
2471 if (me->inBoldH == FALSE)
2472 HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2473 HTML_put_string(me, href);
2474 FREE(href);
2475 if (me->inBoldH == FALSE)
2476 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2477 HText_endAnchor(me->text, 0);
2478 LYEnsureSingleSpace(me);
2479 }
2480
2481 /*
2482 * Check for a suggested filename via a Content-Disposition with a
2483 * filename=name.suffix in it, if we don't already have it via a server
2484 * header. - FM
2485 */
2486 } else if (isEmpty(me->node_anchor->SugFname) &&
2487 !strcasecomp((http_equiv ?
2488 http_equiv : ""), "Content-Disposition")) {
2489 cp = content;
2490 while (*cp != '\0' && strncasecomp(cp, "filename", 8))
2491 cp++;
2492 if (*cp != '\0') {
2493 cp = LYSkipBlanks(cp + 8);
2494 if (*cp == '=')
2495 cp++;
2496 cp = LYSkipBlanks(cp);
2497 if (*cp != '\0') {
2498 StrAllocCopy(me->node_anchor->SugFname, cp);
2499 if (*me->node_anchor->SugFname == '"') {
2500 if ((cp = StrChr((me->node_anchor->SugFname + 1),
2501 '"')) != NULL) {
2502 *(cp + 1) = '\0';
2503 HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
2504 if (isEmpty(me->node_anchor->SugFname)) {
2505 FREE(me->node_anchor->SugFname);
2506 }
2507 } else {
2508 FREE(me->node_anchor->SugFname);
2509 }
2510 }
2511 #if defined(UNIX) && !defined(DOSPATH)
2512 /*
2513 * If blanks are not legal for local filenames, replace them
2514 * with underscores.
2515 */
2516 if ((cp = me->node_anchor->SugFname) != NULL) {
2517 while (*cp != '\0') {
2518 if (isspace(UCH(*cp)))
2519 *cp = '_';
2520 ++cp;
2521 }
2522 }
2523 #endif
2524 }
2525 }
2526 /*
2527 * Check for a Set-Cookie directive. - AK
2528 */
2529 } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
2530 /*
2531 * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
2532 * finalized. For now, we'll still assume "historical" cookies in META
2533 * directives. - FM
2534 */
2535 url_type = is_url(me->inBASE ?
2536 me->base_href : me->node_anchor->address);
2537 if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
2538 LYSetCookie(content,
2539 NULL,
2540 (me->inBASE ?
2541 me->base_href : me->node_anchor->address));
2542 }
2543 }
2544
2545 /*
2546 * Free the copies. - FM
2547 */
2548 free_META_copies:
2549 FREE(http_equiv);
2550 FREE(name);
2551 FREE(content);
2552 FREE(charset);
2553 }
2554
2555 /*
2556 * This function handles P elements in HTML streams.
2557 * If start is TRUE it handles a start tag, and if
2558 * FALSE, an end tag. We presently handle start
2559 * and end tags identically, but this can lead to
2560 * a different number of blank lines between the
2561 * current paragraph and subsequent text when a P
2562 * end tag is present or not in the markup. - FM
2563 */
LYHandlePlike(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED,int align_idx,int start)2564 void LYHandlePlike(HTStructured * me, const BOOL *present,
2565 STRING2PTR value,
2566 char **include GCC_UNUSED,
2567 int align_idx,
2568 int start)
2569 {
2570 /*
2571 * FIG content should be a true block, which like P inherits the current
2572 * style. APPLET is like character elements or an ALT attribute, unless
2573 * its content contains a block element. If we encounter a P in either's
2574 * content, we set flags to treat the content as a block - FM
2575 */
2576 if (start) {
2577 if (me->inFIG)
2578 me->inFIGwithP = TRUE;
2579
2580 if (me->inAPPLET)
2581 me->inAPPLETwithP = TRUE;
2582 }
2583
2584 UPDATE_STYLE;
2585 if (me->List_Nesting_Level >= 0) {
2586 /*
2587 * We're in a list. Treat P as an instruction to create one blank
2588 * line, if not already present, then fall through to handle
2589 * attributes, with the "second line" margins - FM
2590 */
2591 if (me->inP) {
2592 if (me->inFIG || me->inAPPLET ||
2593 me->inCAPTION || me->inCREDIT ||
2594 me->sp->style->spaceAfter > 0 ||
2595 (start && me->sp->style->spaceBefore > 0)) {
2596 LYEnsureDoubleSpace(me);
2597 } else {
2598 LYEnsureSingleSpace(me);
2599 }
2600 }
2601 } else if (me->sp[0].tag_number == HTML_ADDRESS) {
2602 /*
2603 * We're in an ADDRESS. Treat P as an instruction to start a newline,
2604 * if needed, then fall through to handle attributes - FM
2605 */
2606 if (!HText_LastLineEmpty(me->text, FALSE)) {
2607 HText_setLastChar(me->text, ' '); /* absorb white space */
2608 HText_appendCharacter(me->text, '\r');
2609 }
2610 } else {
2611 if (start) {
2612 if (!(me->inLABEL && !me->inP)) {
2613 HText_appendParagraph(me->text);
2614 }
2615 } else if (me->sp->style->spaceAfter > 0) {
2616 LYEnsureDoubleSpace(me);
2617 } else {
2618 LYEnsureSingleSpace(me);
2619 }
2620 me->inLABEL = FALSE;
2621 }
2622 me->in_word = NO;
2623
2624 if (LYoverride_default_alignment(me)) {
2625 me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
2626 } else if ((me->List_Nesting_Level >= 0 &&
2627 (me->sp->style->id == ST_DivCenter ||
2628 me->sp->style->id == ST_DivLeft ||
2629 me->sp->style->id == ST_DivRight)) ||
2630 ((me->Division_Level < 0) &&
2631 (me->sp->style->id == ST_Normal ||
2632 me->sp->style->id == ST_Preformatted))) {
2633 me->sp->style->alignment = HT_LEFT;
2634 } else {
2635 me->sp->style->alignment = (short) me->current_default_alignment;
2636 }
2637
2638 if (start && align_idx >= 0) {
2639 if (present && present[align_idx] && value[align_idx]) {
2640 if (!strcasecomp(value[align_idx], "center") &&
2641 !(me->List_Nesting_Level >= 0 && !me->inP))
2642 me->sp->style->alignment = HT_CENTER;
2643 else if (!strcasecomp(value[align_idx], "right") &&
2644 !(me->List_Nesting_Level >= 0 && !me->inP))
2645 me->sp->style->alignment = HT_RIGHT;
2646 else if (!strcasecomp(value[align_idx], "left") ||
2647 !strcasecomp(value[align_idx], "justify"))
2648 me->sp->style->alignment = HT_LEFT;
2649 }
2650
2651 }
2652
2653 /*
2654 * Mark that we are starting a new paragraph and don't have any of its
2655 * text yet - FM
2656 */
2657 me->inP = FALSE;
2658
2659 return;
2660 }
2661
2662 /*
2663 * This function handles SELECT elements in HTML streams.
2664 * If start is TRUE it handles a start tag, and if FALSE,
2665 * an end tag. - FM
2666 */
LYHandleSELECT(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED,int start)2667 void LYHandleSELECT(HTStructured * me, const BOOL *present,
2668 STRING2PTR value,
2669 char **include GCC_UNUSED,
2670 int start)
2671 {
2672 int i;
2673
2674 if (start == TRUE) {
2675 char *name = NULL;
2676 BOOLEAN multiple = NO;
2677 char *size = NULL;
2678
2679 /*
2680 * Initialize the disable attribute.
2681 */
2682 me->select_disabled = FALSE;
2683
2684 /*
2685 * Check for unclosed TEXTAREA.
2686 */
2687 if (me->inTEXTAREA) {
2688 if (LYBadHTML(me)) {
2689 LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
2690 }
2691 }
2692
2693 /*
2694 * Set to know we are in a select tag.
2695 */
2696 me->inSELECT = TRUE;
2697
2698 if (!(present && present[HTML_SELECT_NAME] &&
2699 non_empty(value[HTML_SELECT_NAME]))) {
2700 StrAllocCopy(name, "");
2701 } else if (StrChr(value[HTML_SELECT_NAME], '&') == NULL) {
2702 StrAllocCopy(name, value[HTML_SELECT_NAME]);
2703 } else {
2704 StrAllocCopy(name, value[HTML_SELECT_NAME]);
2705 UNESCAPE_FIELDNAME_TO_STD(&name);
2706 }
2707 if (present && present[HTML_SELECT_MULTIPLE])
2708 multiple = YES;
2709 if (present && present[HTML_SELECT_DISABLED])
2710 me->select_disabled = TRUE;
2711 if (present && present[HTML_SELECT_SIZE] &&
2712 non_empty(value[HTML_SELECT_SIZE])) {
2713 /*
2714 * Let the size be determined by the number of OPTIONs. - FM
2715 */
2716 CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
2717 value[HTML_SELECT_SIZE]));
2718 }
2719
2720 if (me->inBoldH == TRUE &&
2721 (multiple == NO || LYSelectPopups == FALSE)) {
2722 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2723 me->inBoldH = FALSE;
2724 me->needBoldH = TRUE;
2725 }
2726 if (me->inUnderline == TRUE &&
2727 (multiple == NO || LYSelectPopups == FALSE)) {
2728 HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2729 me->inUnderline = FALSE;
2730 }
2731
2732 if ((multiple == NO && LYSelectPopups == TRUE) &&
2733 (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
2734 !me->sp->style->freeFormat) &&
2735 HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
2736 /*
2737 * Force a newline when we're using a popup in a PRE block and are
2738 * within 7 columns from the right margin. This will allow for the
2739 * '[' popup designator and help avoid a wrap in the underscore
2740 * placeholder for the retracted popup entry in the HText
2741 * structure. - FM
2742 */
2743 HTML_put_character(me, '\n');
2744 me->in_word = NO;
2745 }
2746
2747 LYCheckForID(me, present, value, (int) HTML_SELECT_ID);
2748
2749 HText_beginSelect(name, ATTR_CS_IN, multiple, size);
2750 FREE(name);
2751 FREE(size);
2752
2753 me->first_option = TRUE;
2754 } else {
2755 /*
2756 * Handle end tag.
2757 */
2758 char *ptr;
2759
2760 /*
2761 * Make sure we had a select start tag.
2762 */
2763 if (!me->inSELECT) {
2764 if (LYBadHTML(me)) {
2765 LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
2766 }
2767 return;
2768 }
2769
2770 /*
2771 * Set to know that we are no longer in a select tag.
2772 */
2773 me->inSELECT = FALSE;
2774
2775 /*
2776 * Clear the disable attribute.
2777 */
2778 me->select_disabled = FALSE;
2779
2780 /*
2781 * Finish the data off.
2782 */
2783 HTChunkTerminate(&me->option);
2784 /*
2785 * Finish the previous option.
2786 */
2787 ptr = HText_setLastOptionValue(me->text,
2788 me->option.data,
2789 me->LastOptionValue,
2790 LAST_ORDER,
2791 me->LastOptionChecked,
2792 me->UCLYhndl,
2793 ATTR_CS_IN);
2794 FREE(me->LastOptionValue);
2795
2796 me->LastOptionChecked = FALSE;
2797
2798 if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
2799 LYSelectPopups == FALSE) {
2800 /*
2801 * Start a newline after the last checkbox/button option.
2802 */
2803 LYEnsureSingleSpace(me);
2804 } else {
2805 /*
2806 * Output popup box with the default option to screen, but use
2807 * non-breaking spaces for output.
2808 */
2809 if (ptr &&
2810 me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) {
2811 /*
2812 * The code inadequately handles OPTION fields in PRE tags.
2813 * We'll put up a minimum of 6 characters, and if any more
2814 * would exceed the wrap column, we'll ignore them.
2815 */
2816 for (i = 0; i < 6; i++) {
2817 if (*ptr == ' ')
2818 HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
2819 else
2820 HText_appendCharacter(me->text, *ptr);
2821 ptr++;
2822 }
2823 }
2824 for (; non_empty(ptr); ptr++) {
2825 if (*ptr == ' ')
2826 HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
2827 else
2828 HText_appendCharacter(me->text, *ptr);
2829 }
2830 /*
2831 * Add end option character.
2832 */
2833 if (!me->first_option) {
2834 HText_appendCharacter(me->text, ']');
2835 HText_setLastChar(me->text, ']');
2836 me->in_word = YES;
2837 }
2838 }
2839 HTChunkClear(&me->option);
2840
2841 if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
2842 HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2843 me->inUnderline = TRUE;
2844 }
2845 if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
2846 HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2847 me->inBoldH = TRUE;
2848 me->needBoldH = FALSE;
2849 }
2850 }
2851 }
2852
2853 /*
2854 * This function strips white characters and
2855 * generally fixes up attribute values that
2856 * were received from the SGML parser and
2857 * are to be treated as partial or absolute
2858 * URLs. - FM
2859 */
LYLegitimizeHREF(HTStructured * me,char ** href,int force_slash,int strip_dots)2860 int LYLegitimizeHREF(HTStructured * me, char **href,
2861 int force_slash,
2862 int strip_dots)
2863 {
2864 int url_type = 0;
2865 char *p = NULL;
2866 char *pound = NULL;
2867 const char *Base = NULL;
2868
2869 if (!me || !href || isEmpty(*href))
2870 return (url_type);
2871
2872 if (!LYTrimStartfile(*href)) {
2873 /*
2874 * Collapse spaces in the actual URL, but just protect against tabs or
2875 * newlines in the fragment, if present. This seeks to cope with
2876 * atrocities inflicted on the Web by authoring tools such as
2877 * Frontpage. - FM
2878 */
2879
2880 /* Before working on spaces check if we have any, usually none. */
2881 p = LYSkipNonBlanks(*href);
2882
2883 if (*p) { /* p == first space character */
2884 /* no reallocs below, all converted in place */
2885
2886 pound = findPoundSelector(*href);
2887
2888 if (pound != NULL && pound < p) {
2889 convert_to_spaces(p, FALSE); /* done */
2890
2891 } else {
2892 if (pound != NULL)
2893 *pound = '\0'; /* mark */
2894
2895 /*
2896 * No blanks really belong in the HREF,
2897 * but if it refers to an actual file,
2898 * it may actually have blanks in the name.
2899 * Try to accommodate. See also HTParse().
2900 */
2901 if (LYRemoveNewlines(p) || StrChr(p, '\t') != 0) {
2902 LYRemoveBlanks(p); /* a compromise... */
2903 }
2904
2905 if (pound != NULL) {
2906 p = StrChr(p, '\0');
2907 *pound = '#'; /* restore */
2908 convert_to_spaces(pound, FALSE);
2909 if (p < pound)
2910 strcpy(p, pound);
2911 }
2912 }
2913 }
2914 }
2915 if (**href == '\0')
2916 return (url_type);
2917
2918 TRANSLATE_AND_UNESCAPE_TO_STD(href);
2919
2920 Base = me->inBASE ?
2921 me->base_href : me->node_anchor->address;
2922
2923 url_type = is_url(*href);
2924 if (!url_type && force_slash && **href == '.' &&
2925 (!strcmp(*href, ".") || !strcmp(*href, "..")) &&
2926 !isFILE_URL(Base)) {
2927 /*
2928 * The Fielding RFC/ID for resolving partial HREFs says that a slash
2929 * should be on the end of the preceding symbolic element for "." and
2930 * "..", but all tested browsers only do that for an explicit "./" or
2931 * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
2932 * it's not a file URL. - FM
2933 */
2934 StrAllocCat(*href, "/");
2935 }
2936 if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
2937 !strncasecomp(Base, "http", 4)) {
2938 /*
2939 * We will be resolving a partial reference versus an http or https
2940 * URL, and it has lead dots, which may be retained when resolving via
2941 * HTParse(), but the request would fail if the first element of the
2942 * resultant path is two dots, because no http or https server accepts
2943 * such paths, and the current URL draft, likely to become an RFC, says
2944 * that it's optional for the UA to strip them as a form of error
2945 * recovery. So we will, recursively, for http/https URLs, like the
2946 * "major market browsers" which made this problem so common on the
2947 * Web, but we'll also issue a message about it, such that the bad
2948 * partial reference might get corrected by the document provider. -
2949 * FM
2950 */
2951 char *temp = NULL, *path = NULL, *cp;
2952 const char *str = "";
2953
2954 temp = HTParse(*href, Base, PARSE_ALL);
2955 path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
2956 if (!StrNCmp(path, "/..", 3)) {
2957 cp = (path + 3);
2958 if (LYIsHtmlSep(*cp) || *cp == '\0') {
2959 if (Base[4] == 's') {
2960 str = "s";
2961 }
2962 CTRACE((tfp,
2963 "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
2964 *href, str));
2965 CTRACE((tfp, " Stripping lead dots.\n"));
2966 if (!me->inBadHREF) {
2967 HTUserMsg(BAD_PARTIAL_REFERENCE);
2968 me->inBadHREF = TRUE;
2969 }
2970 }
2971 if (*cp == '\0') {
2972 StrAllocCopy(*href, "/");
2973 } else if (LYIsHtmlSep(*cp)) {
2974 while (!StrNCmp(cp, "/..", 3)) {
2975 if (*(cp + 3) == '/') {
2976 cp += 3;
2977 continue;
2978 } else if (*(cp + 3) == '\0') {
2979 *(cp + 1) = '\0';
2980 *(cp + 2) = '\0';
2981 }
2982 break;
2983 }
2984 StrAllocCopy(*href, cp);
2985 }
2986 }
2987 FREE(temp);
2988 FREE(path);
2989 }
2990 return (url_type);
2991 }
2992
2993 /*
2994 * This function checks for a Content-Base header,
2995 * and if not present, a Content-Location header
2996 * which is an absolute URL, and sets the BASE
2997 * accordingly. If set, it will be replaced by
2998 * any BASE tag in the HTML stream, itself. - FM
2999 */
LYCheckForContentBase(HTStructured * me)3000 void LYCheckForContentBase(HTStructured * me)
3001 {
3002 char *cp = NULL;
3003 BOOL present[HTML_BASE_ATTRIBUTES];
3004 const char *value[HTML_BASE_ATTRIBUTES];
3005 int i;
3006
3007 if (!(me && me->node_anchor))
3008 return;
3009
3010 if (me->node_anchor->content_base != NULL) {
3011 /*
3012 * We have a Content-Base value. Use it if it's non-zero length. - FM
3013 */
3014 if (*me->node_anchor->content_base == '\0')
3015 return;
3016 StrAllocCopy(cp, me->node_anchor->content_base);
3017 LYRemoveBlanks(cp);
3018 } else if (me->node_anchor->content_location != NULL) {
3019 /*
3020 * We didn't have a Content-Base value, but do have a Content-Location
3021 * value. Use it if it's an absolute URL. - FM
3022 */
3023 if (*me->node_anchor->content_location == '\0')
3024 return;
3025 StrAllocCopy(cp, me->node_anchor->content_location);
3026 LYRemoveBlanks(cp);
3027 if (!is_url(cp)) {
3028 FREE(cp);
3029 return;
3030 }
3031 } else {
3032 /*
3033 * We had neither a Content-Base nor Content-Location value. - FM
3034 */
3035 return;
3036 }
3037
3038 /*
3039 * If we collapsed to a zero-length value, ignore it. - FM
3040 */
3041 if (*cp == '\0') {
3042 FREE(cp);
3043 return;
3044 }
3045
3046 /*
3047 * Pass the value to HTML_start_element as the HREF of a BASE tag. - FM
3048 */
3049 for (i = 0; i < HTML_BASE_ATTRIBUTES; i++)
3050 present[i] = NO;
3051 present[HTML_BASE_HREF] = YES;
3052 value[HTML_BASE_HREF] = (const char *) cp;
3053 (*me->isa->start_element) (me, HTML_BASE, present, value,
3054 0, 0);
3055 FREE(cp);
3056 }
3057
3058 /*
3059 * This function creates NAMEd Anchors if a non-zero-length NAME
3060 * or ID attribute was present in the tag. - FM
3061 */
LYCheckForID(HTStructured * me,const BOOL * present,STRING2PTR value,int attribute)3062 void LYCheckForID(HTStructured * me, const BOOL *present,
3063 STRING2PTR value,
3064 int attribute)
3065 {
3066 HTChildAnchor *ID_A = NULL;
3067 char *temp = NULL;
3068
3069 if (!(me && me->text))
3070 return;
3071
3072 if (present && present[attribute]
3073 && non_empty(value[attribute])) {
3074 /*
3075 * Translate any named or numeric character references. - FM
3076 */
3077 StrAllocCopy(temp, value[attribute]);
3078 LYUCTranslateHTMLString(&temp, me->tag_charset, me->tag_charset,
3079 NO, NO, YES, st_URL);
3080
3081 /*
3082 * Create the link if we still have a non-zero-length string. - FM
3083 */
3084 if ((temp[0] != '\0') &&
3085 (ID_A = HTAnchor_findChildAndLink
3086 (
3087 me->node_anchor, /* Parent */
3088 temp, /* Tag */
3089 NULL, /* Addresss */
3090 (HTLinkType *) 0))) { /* Type */
3091 HText_beginAnchor(me->text, me->inUnderline, ID_A);
3092 HText_endAnchor(me->text, 0);
3093 }
3094 FREE(temp);
3095 }
3096 }
3097
3098 /*
3099 * This function creates a NAMEd Anchor for the ID string
3100 * passed to it directly as an argument. It assumes the
3101 * does not need checking for character references. - FM
3102 */
LYHandleID(HTStructured * me,const char * id)3103 void LYHandleID(HTStructured * me, const char *id)
3104 {
3105 HTChildAnchor *ID_A = NULL;
3106
3107 if (!(me && me->text) ||
3108 isEmpty(id))
3109 return;
3110
3111 /*
3112 * Create the link if we still have a non-zero-length string. - FM
3113 */
3114 if ((ID_A = HTAnchor_findChildAndLink
3115 (
3116 me->node_anchor, /* Parent */
3117 id, /* Tag */
3118 NULL, /* Addresss */
3119 (HTLinkType *) 0)) != NULL) { /* Type */
3120 HText_beginAnchor(me->text, me->inUnderline, ID_A);
3121 HText_endAnchor(me->text, 0);
3122 }
3123 }
3124
3125 /*
3126 * This function checks whether we want to override
3127 * the current default alignment for paragraphs and
3128 * instead use that specified in the element's style
3129 * sheet. - FM
3130 */
LYoverride_default_alignment(HTStructured * me)3131 BOOLEAN LYoverride_default_alignment(HTStructured * me)
3132 {
3133 if (!me)
3134 return NO;
3135
3136 switch (me->sp[0].tag_number) {
3137 case HTML_BLOCKQUOTE:
3138 case HTML_BQ:
3139 case HTML_NOTE:
3140 case HTML_FN:
3141 case HTML_ADDRESS:
3142 me->sp->style->alignment = HT_LEFT;
3143 return YES;
3144
3145 default:
3146 break;
3147 }
3148 return NO;
3149 }
3150
3151 /*
3152 * This function inserts newlines if needed to create double spacing,
3153 * and sets the left margin for subsequent text to the second line
3154 * indentation of the current style. - FM
3155 */
LYEnsureDoubleSpace(HTStructured * me)3156 void LYEnsureDoubleSpace(HTStructured * me)
3157 {
3158 if (!me || !me->text)
3159 return;
3160
3161 if (!HText_LastLineEmpty(me->text, FALSE)) {
3162 HText_setLastChar(me->text, ' '); /* absorb white space */
3163 HText_appendCharacter(me->text, '\r');
3164 HText_appendCharacter(me->text, '\r');
3165 } else if (!HText_PreviousLineEmpty(me->text, FALSE)) {
3166 HText_setLastChar(me->text, ' '); /* absorb white space */
3167 HText_appendCharacter(me->text, '\r');
3168 } else if (me->List_Nesting_Level >= 0) {
3169 HText_NegateLineOne(me->text);
3170 }
3171 me->in_word = NO;
3172 return;
3173 }
3174
3175 /*
3176 * This function inserts a newline if needed to create single spacing,
3177 * and sets the left margin for subsequent text to the second line
3178 * indentation of the current style. - FM
3179 */
LYEnsureSingleSpace(HTStructured * me)3180 void LYEnsureSingleSpace(HTStructured * me)
3181 {
3182 if (!me || !me->text)
3183 return;
3184
3185 if (!HText_LastLineEmpty(me->text, FALSE)) {
3186 HText_setLastChar(me->text, ' '); /* absorb white space */
3187 HText_appendCharacter(me->text, '\r');
3188 } else if (me->List_Nesting_Level >= 0) {
3189 HText_NegateLineOne(me->text);
3190 }
3191 me->in_word = NO;
3192 return;
3193 }
3194
3195 /*
3196 * This function resets paragraph alignments for block
3197 * elements which do not have a defined style sheet. - FM
3198 */
LYResetParagraphAlignment(HTStructured * me)3199 void LYResetParagraphAlignment(HTStructured * me)
3200 {
3201 if (!me)
3202 return;
3203
3204 if (me->List_Nesting_Level >= 0 ||
3205 ((me->Division_Level < 0) &&
3206 (me->sp->style->id == ST_Normal ||
3207 me->sp->style->id == ST_Preformatted))) {
3208 me->sp->style->alignment = HT_LEFT;
3209 } else {
3210 me->sp->style->alignment = (short) me->current_default_alignment;
3211 }
3212 return;
3213 }
3214
3215 /*
3216 * This example function checks whether the given anchor has
3217 * an address with a file scheme, and if so, loads it into the
3218 * the SGML parser's context->url element, which was passed as
3219 * the second argument. The handle_comment() calling function in
3220 * SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup
3221 * into the corresponding stream, homologously to an SSI by an
3222 * HTTP server. - FM
3223 *
3224 * For functions similar to this but which depend on details of
3225 * the HTML handler's internal data, the calling interface should
3226 * be changed, and functions in SGML.c would have to make sure not
3227 * to call such functions inappropriately (e.g., calling a function
3228 * specific to the Lynx_HTML_Handler when SGML.c output goes to
3229 * some other HTStructured object like in HTMLGen.c), or the new
3230 * functions could be added to the SGML.h interface.
3231 */
LYCheckForCSI(HTParentAnchor * anchor,char ** url)3232 BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
3233 char **url)
3234 {
3235 if (!(anchor && anchor->address))
3236 return FALSE;
3237
3238 if (!isFILE_URL(anchor->address))
3239 return FALSE;
3240
3241 if (!LYisLocalHost(anchor->address))
3242 return FALSE;
3243
3244 StrAllocCopy(*url, anchor->address);
3245 return TRUE;
3246 }
3247
3248 /*
3249 * This function is called from the SGML parser to look at comments
3250 * and see whether we should collect some info from them. Currently
3251 * it only looks for comments with Message-Id and Subject info, in the
3252 * exact form generated by MHonArc for archived mailing list. If found,
3253 * the info is stored in the document's HTParentAnchor. It can later be
3254 * used for generating a mail response.
3255 *
3256 * We are extra picky here because there isn't any official definition
3257 * for these kinds of comments - we might (and still can) misinterpret
3258 * arbitrary comments as something they aren't.
3259 *
3260 * If something doesn't look right, for example invalid characters, the
3261 * strings are not stored. Mail responses will use something else as
3262 * the subject, probably the document URL, and will not have an
3263 * In-Reply-To header.
3264 *
3265 * All this is a hack - to do this the right way, mailing list archivers
3266 * would have to agree on some better mechanism to make this kind of info
3267 * from original mail headers available, for example using LINK. - kw
3268 */
LYCommentHacks(HTParentAnchor * anchor,const char * comment)3269 BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
3270 const char *comment)
3271 {
3272 const char *cp;
3273 size_t len;
3274
3275 if (comment == NULL)
3276 return FALSE;
3277
3278 if (!(anchor && anchor->address))
3279 return FALSE;
3280
3281 if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
3282 char *messageid = NULL;
3283 char *p;
3284
3285 for (cp = comment + 17; *cp; cp++) {
3286 if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
3287 break;
3288 }
3289 }
3290 if (strcmp(cp, " --")) {
3291 return FALSE;
3292 }
3293 cp = comment + 17;
3294 StrAllocCopy(messageid, cp);
3295 /* This should be ok - message-id should only contain 7-bit ASCII */
3296 if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL))
3297 return FALSE;
3298 for (p = messageid; *p; p++) {
3299 if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
3300 break;
3301 }
3302 }
3303 if (strcmp(p, " --")) {
3304 FREE(messageid);
3305 return FALSE;
3306 }
3307 if ((p = StrChr(messageid, '@')) == NULL || p[1] == '\0') {
3308 FREE(messageid);
3309 return FALSE;
3310 }
3311 p = messageid;
3312 if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
3313 p[len - 3] = '\0';
3314 } else {
3315 FREE(messageid);
3316 return FALSE;
3317 }
3318 if (HTAnchor_setMessageID(anchor, messageid)) {
3319 FREE(messageid);
3320 return TRUE;
3321 } else {
3322 FREE(messageid);
3323 return FALSE;
3324 }
3325 }
3326 if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
3327 char *subject = NULL;
3328 char *p;
3329
3330 for (cp = comment + 14; *cp; cp++) {
3331 if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
3332 return FALSE;
3333 }
3334 }
3335 cp = comment + 14;
3336 StrAllocCopy(subject, cp);
3337 /* @@@
3338 * This may not be the right thing for the subject - but mail
3339 * subjects shouldn't contain 8-bit characters in raw form anyway.
3340 * We have to unescape character entities, since that's what MHonArc
3341 * seems to generate. But if after that there are 8-bit characters
3342 * the string is rejected. We would probably not know correctly
3343 * what charset to assume anyway - the mail sender's can differ from
3344 * the archive's. And the code for sending mail cannot deal well
3345 * with 8-bit characters - we should not put them in the Subject
3346 * header in raw form, but don't have MIME encoding implemented.
3347 * Someone may want to do more about this... - kw
3348 */
3349 if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML))
3350 return FALSE;
3351 for (p = subject; *p; p++) {
3352 if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
3353 FREE(subject);
3354 return FALSE;
3355 }
3356 }
3357 p = subject;
3358 if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
3359 p[len - 3] = '\0';
3360 } else {
3361 FREE(subject);
3362 return FALSE;
3363 }
3364 if (HTAnchor_setSubject(anchor, subject)) {
3365 FREE(subject);
3366 return TRUE;
3367 } else {
3368 FREE(subject);
3369 return FALSE;
3370 }
3371 }
3372
3373 return FALSE;
3374 }
3375
3376 /*
3377 * Create the Title with any left-angle-brackets converted to < entities
3378 * and any ampersands converted to & entities. - FM
3379 *
3380 * Convert 8-bit letters to &#xUUUU to avoid dependencies from display
3381 * character set which may need changing. Do NOT convert any 8-bit chars
3382 * if we have CJK display. - LP
3383 */
LYformTitle(char ** dst,const char * src)3384 void LYformTitle(char **dst,
3385 const char *src)
3386 {
3387 if (HTCJK == JAPANESE) {
3388 char *tmp_buffer = NULL;
3389
3390 if ((tmp_buffer = (char *) malloc(strlen(src) + 1)) == 0)
3391 outofmem(__FILE__, "LYformTitle");
3392
3393 switch (kanji_code) { /* 1997/11/22 (Sat) 09:28:00 */
3394 case EUC:
3395 TO_EUC((const unsigned char *) src, (unsigned char *) tmp_buffer);
3396 break;
3397 case SJIS:
3398 TO_SJIS((const unsigned char *) src, (unsigned char *) tmp_buffer);
3399 break;
3400 default:
3401 CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
3402 strcpy(tmp_buffer, src);
3403 break;
3404 }
3405 StrAllocCopy(*dst, tmp_buffer);
3406 FREE(tmp_buffer);
3407 } else {
3408 StrAllocCopy(*dst, src);
3409 }
3410 }
3411