1 /* text.c - Text manipulation functions
2  *	Copyright (c) 1995-1997 Stefan Jokisch
3  *
4  * This file is part of Frotz.
5  *
6  * Frotz is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * Frotz is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "frotz.h"
22 
/* Where an encoded string is fetched from; see decode_text() */
enum string_type {
	LOW_STRING,		/* lower 64KB, byte address */
	ABBREVIATION,		/* abbreviations table, word address */
	HIGH_STRING,		/* end of the memory map, packed address */
	EMBEDDED_STRING,	/* instruction stream, at PC */
	VOCABULARY		/* dictionary, byte address */
};
26 
27 extern zword object_name(zword);
28 extern zword get_window_font(zword);
29 
30 static zchar decoded[10];
31 static zword encoded[3];
32 
33 /*
34  * According to Matteo De Luigi <matteo.de.luigi@libero.it>,
35  * 0xab and 0xbb were in each other's proper positions.
36  *   Sat Apr 21, 2001
37  */
38 static zword zscii_to_latin1[] = {
39 	0x0e4, 0x0f6, 0x0fc, 0x0c4, 0x0d6, 0x0dc, 0x0df, 0x0bb,
40 	0x0ab, 0x0eb, 0x0ef, 0x0ff, 0x0cb, 0x0cf, 0x0e1, 0x0e9,
41 	0x0ed, 0x0f3, 0x0fa, 0x0fd, 0x0c1, 0x0c9, 0x0cd, 0x0d3,
42 	0x0da, 0x0dd, 0x0e0, 0x0e8, 0x0ec, 0x0f2, 0x0f9, 0x0c0,
43 	0x0c8, 0x0cc, 0x0d2, 0x0d9, 0x0e2, 0x0ea, 0x0ee, 0x0f4,
44 	0x0fb, 0x0c2, 0x0ca, 0x0ce, 0x0d4, 0x0db, 0x0e5, 0x0c5,
45 	0x0f8, 0x0d8, 0x0e3, 0x0f1, 0x0f5, 0x0c3, 0x0d1, 0x0d5,
46 	0x0e6, 0x0c6, 0x0e7, 0x0c7, 0x0fe, 0x0f0, 0x0de, 0x0d0,
47 	0x0a3, 0x153, 0x152, 0x0a1, 0x0bf
48 };
49 
50 
51 /*
52  * translate_from_zscii
53  *
54  * Map a ZSCII character into Unicode.
55  *
56  */
translate_from_zscii(zbyte c)57 zchar translate_from_zscii(zbyte c)
58 {
59 	if (c == 0xfc)
60 		return ZC_MENU_CLICK;
61 	if (c == 0xfd)
62 		return ZC_DOUBLE_CLICK;
63 	if (c == 0xfe)
64 		return ZC_SINGLE_CLICK;
65 
66 	if (c >= 0x9b && story_id != BEYOND_ZORK) {
67 
68 		if (z_header.x_unicode_table != 0) {	/* game has its own Unicode table */
69 			zbyte N;
70 
71 			LOW_BYTE(z_header.x_unicode_table, N)
72 			if (c - 0x9b < N) {
73 				zword addr =
74 				    z_header.x_unicode_table + 1 + 2 * (c - 0x9b);
75 				zword unicode;
76 
77 				LOW_WORD(addr, unicode)
78 #ifdef USE_UTF8
79 				if (unicode < 0x20)
80 					return '?';
81 #else
82 				if ((unicode < 0x20) || (unicode > 0xff))
83 					return '?';
84 #endif
85 				return unicode;
86 			} else
87 				return '?';
88 
89 		} else /* game uses standard set */ if (c <= 0xdf) {
90 			return zscii_to_latin1[c - 0x9b];
91 		} else
92 			return '?';
93 	}
94 	return c;
95 } /* translate_from_zscii */
96 
97 
98 /*
99  * unicode_to_zscii
100  *
101  * Convert a Unicode character to ZSCII, returning 0 on failure.
102  *
103  */
unicode_to_zscii(zchar c)104 zbyte unicode_to_zscii(zchar c)
105 {
106 	int i;
107 
108 	if (c >= ZC_LATIN1_MIN) {
109 		/* game has its own Unicode table */
110 		if (z_header.x_unicode_table != 0) {
111 			zbyte N;
112 			int i;
113 
114 			LOW_BYTE(z_header.x_unicode_table, N)
115 			for (i = 0x9b; i < 0x9b + N; i++) {
116 				zword addr =
117 					z_header.x_unicode_table + 1 + 2 * (i - 0x9b);
118 				zword unicode;
119 
120 				LOW_WORD(addr, unicode)
121 				if (c == unicode)
122 					return (zbyte) i;
123 			}
124 			return 0;
125 		} else {	/* game uses standard set */
126 			for (i = 0x9b; i <= 0xdf; i++) {
127 				if (c == zscii_to_latin1[i - 0x9b])
128 					return (zbyte) i;
129 			}
130 			return 0;
131 		}
132 	}
133 	return (zbyte) c;
134 } /* unicode_to_zscii */
135 
136 
137 /*
138  * translate_to_zscii
139  *
140  * Map a Unicode character onto the ZSCII alphabet.
141  *
142  */
143 
translate_to_zscii(zchar c)144 zbyte translate_to_zscii(zchar c)
145 {
146 	if (c == ZC_SINGLE_CLICK)
147 		return 0xfe;
148 	if (c == ZC_DOUBLE_CLICK)
149 		return 0xfd;
150 	if (c == ZC_MENU_CLICK)
151 		return 0xfc;
152 	if (c == 0)
153 		return 0;
154 
155 	c = unicode_to_zscii(c);
156 	if (c == 0)
157 		c = '?';
158 
159 	return (zbyte) c;
160 } /* translate_to_zscii */
161 
162 
163 /*
164  * alphabet
165  *
166  * Return a character from one of the three character sets.
167  *
168  */
alphabet(int set,int index)169 static zchar alphabet(int set, int index)
170 {
171 	if (z_header.alphabet != 0) {	/* game uses its own alphabet */
172 
173 		zbyte c;
174 
175 		zword addr = z_header.alphabet + 26 * set + index;
176 		LOW_BYTE(addr, c)
177 		return translate_from_zscii(c);
178 
179 	} else /* game uses default alphabet */ if (set == 0)
180 		return 'a' + index;
181 	else if (set == 1)
182 		return 'A' + index;
183 	else if (z_header.version == V1)
184 		return " 0123456789.,!?_#'\"/\\<-:()"[index];
185 	else
186 		return " ^0123456789.,!?_#'\"/\\-:()"[index];
187 } /* alphabet */
188 
189 
190 /*
191  * load_string
192  *
193  * Copy a ZSCII string from the memory to the global "decoded" string.
194  *
195  */
load_string(zword addr,zword length)196 static void load_string(zword addr, zword length)
197 {
198 	int resolution = (z_header.version <= V3) ? 2 : 3;
199 	int i = 0;
200 
201 	while (i < 3 * resolution) {
202 		if (i < length) {
203 
204 			zbyte c;
205 
206 			LOW_BYTE(addr, c)
207 			addr++;
208 
209 			decoded[i++] = translate_from_zscii(c);
210 
211 		} else
212 			decoded[i++] = 0;
213 	}
214 } /* load_string */
215 
216 
217 /*
218  * encode_text
219  *
220  * Encode the Unicode text in the global "decoded" string then write
221  * the result to the global "encoded" array. (This is used to look up
222  * words in the dictionary.) Up to V3 the vocabulary resolution is
223  * two, since V4 it is three words.
224  * Because each word contains three Z-characters, that makes six or
225  * nine Z-characters respectively. Longer words are chopped to the
226  * proper size, shorter words are are padded out with 5's. For word
227  * completion we pad with 0s and 31s, the minimum and maximum
228  * Z-characters.
229  *
230  */
encode_text(int padding)231 static void encode_text(int padding)
232 {
233 	static zchar again[] = { 'a', 'g', 'a', 'i', 'n', 0 };
234 	static zchar examine[] = { 'e', 'x', 'a', 'm', 'i', 'n', 'e', 0 };
235 	static zchar wait[] = { 'w', 'a', 'i', 't', 0 };
236 
237 	zbyte zchars[12];
238 	const zchar *ptr = decoded;
239 	zchar c;
240 	int resolution = (z_header.version <= V3) ? 2 : 3;
241 	int i = 0;
242 
243 	/* Expand abbreviations that some old Infocom games lack */
244 	if (f_setup.expand_abbreviations)
245 		if (padding == 0x05 && decoded[1] == 0)
246 			switch (decoded[0]) {
247 			case 'g':
248 				ptr = again;
249 				break;
250 			case 'x':
251 				ptr = examine;
252 				break;
253 			case 'z':
254 				ptr = wait;
255 				break;
256 			}
257 
258 	/* Translate string to a sequence of Z-characters */
259 	while (i < 3 * resolution) {
260 		if ((c = *ptr++) != 0) {
261 			int index, set;
262 			zbyte c2;
263 
264 			/* Search character in the alphabet */
265 			for (set = 0; set < 3; set++)
266 				for (index = 0; index < 26; index++)
267 					if (c == alphabet(set, index))
268 						goto letter_found;
269 
270 			/* Character not found, store its ZSCII value */
271 			c2 = translate_to_zscii(c);
272 			zchars[i++] = 5;
273 			zchars[i++] = 6;
274 			zchars[i++] = c2 >> 5;
275 			zchars[i++] = c2 & 0x1f;
276 			continue;
277 
278 letter_found:
279 
280 			/* Character found, store its index */
281 			if (set != 0)
282 				zchars[i++] = ((z_header.version <= V2) ? 1 : 3) + set;
283 
284 			zchars[i++] = index + 6;
285 		} else
286 			zchars[i++] = padding;
287 	}
288 
289 	/* Three Z-characters make a 16bit word */
290 	for (i = 0; i < resolution; i++) {
291 		encoded[i] =
292 		    (zchars[3 * i + 0] << 10) |
293 		    (zchars[3 * i + 1] << 5) | (zchars[3 * i + 2]);
294 	}
295 	encoded[resolution - 1] |= 0x8000;
296 } /* encode_text */
297 
298 
299 /*
300  * z_check_unicode
301  *
302  * test if a unicode character can be printed (bit 0) and read (bit 1).
303  *
304  * 	zargs[0] = Unicode
305  *
306  */
z_check_unicode(void)307 void z_check_unicode(void)
308 {
309 	zword c = zargs[0];
310 
311 	if (c >= 0x20 && c <= 0x7e)
312 		store(3);
313 	else if (c == 0xa0)
314 		store(1);
315 #ifndef USE_UTF8
316 	else if (c >= 0xa1 && c <= 0xff)
317 		store(3);
318 #else
319 	else if (c >= 0xa1) {
320 		/* being optimistic, we can print all unicode characters
321 		 * supported and input the ones with zscii representation
322 		 */
323 		zword mask = (unicode_to_zscii(c) != 0) ? 3 : 1;
324 		store(mask & os_check_unicode(get_window_font(cwin), c));
325 	}
326 #endif
327 	else
328 		store(0);
329 } /* z_check_unicode */
330 
331 
332 /*
333  * z_encode_text, encode a ZSCII string for use in a dictionary.
334  *
335  *	zargs[0] = address of text buffer
336  *	zargs[1] = length of ASCII string
337  *	zargs[2] = offset of ASCII string within the text buffer
338  *	zargs[3] = address to store encoded text in
339  *
340  * This is a V5+ opcode and therefore the dictionary resolution must be
341  * three 16bit words.
342  *
343  */
z_encode_text(void)344 void z_encode_text(void)
345 {
346 	int i;
347 
348 	load_string((zword) (zargs[0] + zargs[2]), zargs[1]);
349 	encode_text(0x05);
350 	for (i = 0; i < 3; i++)
351 		storew((zword) (zargs[3] + 2 * i), encoded[i]);
352 } /* z_encode_text */
353 
354 
355 /*
356  * decode_text
357  *
358  * Convert encoded text to Unicode. The encoded text consists of 16bit
359  * words. Every word holds 3 Z-characters (5 bits each) plus a spare
360  * bit to mark the last word. The Z-characters translate to ZSCII by
361  * looking at the current current character set. Some select another
362  * character set, others refer to abbreviations.
363  *
364  * There are several different string types:
365  *
366  *    LOW_STRING - from the lower 64KB (byte address)
367  *    ABBREVIATION - from the abbreviations table (word address)
368  *    HIGH_STRING - from the end of the memory map (packed address)
369  *    EMBEDDED_STRING - from the instruction stream (at PC)
370  *    VOCABULARY - from the dictionary (byte address)
371  *
372  * The last type is only used for word completion.
373  *
374  */
375 #define outchar(c)	if (st==VOCABULARY) *ptr++=c; else print_char(c)
decode_text(enum string_type st,zword addr)376 static void decode_text(enum string_type st, zword addr)
377 {
378 	zchar *ptr;
379 	long byte_addr;
380 	zchar c2;
381 	zword code;
382 	zbyte c, prev_c = 0;
383 	int shift_state = 0;
384 	int shift_lock = 0;
385 	int status = 0;
386 
387 	ptr = NULL;		/* makes compilers shut up */
388 	byte_addr = 0;
389 
390 	/* Calculate the byte address if necessary */
391 	if (st == ABBREVIATION)
392 		byte_addr = (long)addr << 1;
393 
394 	else if (st == HIGH_STRING) {
395 
396 		if (z_header.version <= V3)
397 			byte_addr = (long)addr << 1;
398 		else if (z_header.version <= V5)
399 			byte_addr = (long)addr << 2;
400 		else if (z_header.version <= V7)
401 			byte_addr =
402 			    ((long)addr << 2) + ((long)z_header.strings_offset << 3);
403 		else		/* (z_header.version == V8) */
404 			byte_addr = (long)addr << 3;
405 
406 		if (byte_addr >= story_size)
407 			runtime_error(ERR_ILL_PRINT_ADDR);
408 
409 	}
410 
411 	/* Loop until a 16bit word has the highest bit set */
412 	if (st == VOCABULARY)
413 		ptr = decoded;
414 
415 	do {
416 		int i;
417 
418 		/* Fetch the next 16bit word */
419 		if (st == LOW_STRING || st == VOCABULARY) {
420 			LOW_WORD(addr, code)
421 			addr += 2;
422 		} else if (st == HIGH_STRING || st == ABBREVIATION) {
423 			HIGH_WORD(byte_addr, code)
424 			byte_addr += 2;
425 		} else
426 			CODE_WORD(code)
427 			/* Read its three Z-characters */
428 		for (i = 10; i >= 0; i -= 5) {
429 			zword abbr_addr;
430 			zword ptr_addr;
431 
432 			c = (code >> i) & 0x1f;
433 
434 			switch (status) {
435 			case 0:	/* normal operation */
436 				if (shift_state == 2 && c == 6)
437 					status = 2;
438 				else if (z_header.version == V1 && c == 1)
439 					new_line();
440 				else if (z_header.version >= V2
441 					 && shift_state == 2 && c == 7)
442 					new_line();
443 				else if (c >= 6)
444 					outchar(alphabet
445 						(shift_state, c - 6));
446 				else if (c == 0)
447 					outchar(' ');
448 				else if (z_header.version >= V2 && c == 1)
449 					status = 1;
450 				else if (z_header.version >= V3 && c <= 3)
451 					status = 1;
452 				else {
453 					shift_state =
454 					    (shift_lock + (c & 1) +
455 					     1) % 3;
456 					if (z_header.version <= V2 && c >= 4)
457 						shift_lock =
458 						    shift_state;
459 					break;
460 				}
461 				shift_state = shift_lock;
462 				break;
463 			case 1:	/* abbreviation */
464 				ptr_addr =
465 				    z_header.abbreviations + 64 * (prev_c -
466 						    1) + 2 * c;
467 				LOW_WORD(ptr_addr, abbr_addr)
468 				    decode_text(ABBREVIATION,
469 						abbr_addr);
470 				status = 0;
471 				break;
472 			case 2:	/* ZSCII character - first part */
473 				status = 3;
474 				break;
475 			case 3:	/* ZSCII character - second part */
476 				c2 = translate_from_zscii((prev_c << 5) | c);
477 				outchar(c2);
478 				status = 0;
479 				break;
480 			}
481 			prev_c = c;
482 		}
483 	} while (!(code & 0x8000));
484 
485 	if (st == VOCABULARY)
486 		*ptr = 0;
487 } /* decode_text */
488 
489 #undef outchar
490 
491 
/*
 * z_new_line, print a new line.
 *
 * 	no zargs used
 *
 * Simply forwards to new_line().
 *
 */
void z_new_line(void)
{
	new_line();
} /* z_new_line */
502 
503 
504 /*
505  * z_print, print a string embedded in the instruction stream.
506  *
507  *	no zargs used
508  *
509  */
z_print(void)510 void z_print(void)
511 {
512 	decode_text(EMBEDDED_STRING, 0);
513 } /* z_print */
514 
515 
516 /*
517  * z_print_addr, print a string from the lower 64KB.
518  *
519  *	zargs[0] = address of string to print
520  *
521  */
z_print_addr(void)522 void z_print_addr(void)
523 {
524 	decode_text(LOW_STRING, zargs[0]);
525 } /* z_print_addr */
526 
527 
528 /*
529  * z_print_char print a single ZSCII character.
530  *
531  *	zargs[0] = ZSCII character to be printed
532  *
533  */
z_print_char(void)534 void z_print_char(void)
535 {
536 	print_char(translate_from_zscii(zargs[0]));
537 } /* z_print_char */
538 
539 
540 /*
541  * z_print_form, print a formatted table.
542  *
543  *	zargs[0] = address of formatted table to be printed
544  *
545  */
z_print_form(void)546 void z_print_form(void)
547 {
548 	zword count;
549 	zword addr = zargs[0];
550 
551 	bool first = TRUE;
552 
553 	for (;;) {
554 
555 		LOW_WORD(addr, count)
556 		    addr += 2;
557 
558 		if (count == 0)
559 			break;
560 
561 		if (!first)
562 			new_line();
563 
564 		while (count--) {
565 			zbyte c;
566 			LOW_BYTE(addr, c)
567 			    addr++;
568 
569 			print_char(translate_from_zscii(c));
570 		}
571 		first = FALSE;
572 	}
573 } /* z_print_form */
574 
575 
576 /*
577  * print_num
578  *
579  * Print a signed 16bit number.
580  *
581  */
print_num(zword value)582 void print_num(zword value)
583 {
584 	int i;
585 
586 	/* Print sign */
587 	if ((short)value < 0) {
588 		print_char('-');
589 		value = -(short)value;
590 	}
591 
592 	/* Print absolute value */ {
593 	for (i = 10000; i != 0; i /= 10)
594 		if (value >= i || i == 1)
595 			print_char('0' + (value / i) % 10);
596 	}
597 
598 } /* print_num */
599 
600 
601 /*
602  * z_print_num, print a signed number.
603  *
604  * 	zargs[0] = number to print
605  *
606  */
z_print_num(void)607 void z_print_num(void)
608 {
609 	print_num(zargs[0]);
610 
611 } /* z_print_num */
612 
613 
614 /*
615  * print_object
616  *
617  * Print an object description.
618  *
619  */
print_object(zword object)620 void print_object(zword object)
621 {
622 	zword addr = object_name(object);
623 	zword code = 0x94a5;
624 	zbyte length;
625 
626 	LOW_BYTE(addr, length)
627 	    addr++;
628 
629 	if (length != 0) {
630 		LOW_WORD(addr, code)
631 		if (code == 0x94a5) { /* encoded text 0x94a5 == empty string */
632 			print_string("object#"); /* supply a generic name */
633 			print_num(object);	/* for anonymous objects */
634 		} else
635 			decode_text(LOW_STRING, addr);
636 	}
637 } /* print_object */
638 
639 
640 /*
641  * z_print_obj, print an object description.
642  *
643  * 	zargs[0] = number of object to be printed
644  *
645  */
z_print_obj(void)646 void z_print_obj(void)
647 {
648 	print_object(zargs[0]);
649 } /* z_print_obj */
650 
651 
652 /*
653  * z_print_paddr, print the string at the given packed address.
654  *
655  * 	zargs[0] = packed address of string to be printed
656  *
657  */
z_print_paddr(void)658 void z_print_paddr(void)
659 {
660 	decode_text(HIGH_STRING, zargs[0]);
661 } /* z_print_paddr */
662 
663 
664 /*
665  * z_print_ret, print the string at PC, print newline then return true.
666  *
667  * 	no zargs used
668  *
669  */
z_print_ret(void)670 void z_print_ret(void)
671 {
672 	decode_text(EMBEDDED_STRING, 0);
673 	new_line();
674 	ret(1);
675 } /* z_print_ret */
676 
677 
/*
 * print_string
 *
 * Print a string of ASCII characters.
 *
 * A '\n' in the string is turned into a call to new_line(); every
 * other character goes to print_char().
 *
 */
void print_string(const char *s)
{
	for (; *s != 0; s++) {
		char c = *s;

		if (c == '\n')
			new_line();
		else
			print_char(c);
	}
} /* print_string */
695 
696 
697 /*
698  * z_print_unicode
699  *
700  * 	zargs[0] = Unicode
701  *
702  */
z_print_unicode(void)703 void z_print_unicode(void)
704 {
705 	if (zargs[0] < 0x20)
706 		print_char('?');
707 	else
708 		print_char(zargs[0]);
709 } /* z_print_unicode */
710 
711 
712 /*
713  * lookup_text
714  *
715  * Scan a dictionary searching for the given word. The first argument
716  * can be
717  *
718  * 0x00 - find the first word which is >= the given one
719  * 0x05 - find the word which exactly matches the given one
720  * 0x1f - find the last word which is <= the given one
721  *
722  * The return value is 0 if the search fails.
723  *
724  */
lookup_text(int padding,zword dct)725 static zword lookup_text(int padding, zword dct)
726 {
727 	zword entry_addr;
728 	zword entry_count;
729 	zword entry;
730 	zword addr;
731 	zbyte entry_len;
732 	zbyte sep_count;
733 	int resolution = (z_header.version <= V3) ? 2 : 3;
734 	int entry_number;
735 	int lower, upper;
736 	int i;
737 	bool sorted;
738 
739 	encode_text(padding);
740 
741 	LOW_BYTE(dct, sep_count)	/* skip word separators */
742 	dct += 1 + sep_count;
743 	LOW_BYTE(dct, entry_len)	/* get length of entries */
744 	dct += 1;
745 	LOW_WORD(dct, entry_count)	/* get number of entries */
746 	dct += 2;
747 
748 	if ((short)entry_count < 0) {	/* bad luck, entries aren't sorted */
749 		entry_count = -(short)entry_count;
750 		sorted = FALSE;
751 	} else
752 		sorted = TRUE;	/* entries are sorted */
753 
754 	lower = 0;
755 	upper = entry_count - 1;
756 
757 	while (lower <= upper) {
758 		if (sorted)	/* binary search */
759 			entry_number = (lower + upper) / 2;
760 		else		/* linear search */
761 			entry_number = lower;
762 
763 		entry_addr = dct + entry_number * entry_len;
764 
765 		/* Compare word to dictionary entry */
766 		addr = entry_addr;
767 
768 		for (i = 0; i < resolution; i++) {
769 			LOW_WORD(addr, entry)
770 			    if (encoded[i] != entry)
771 				goto continuing;
772 			addr += 2;
773 		}
774 		return entry_addr;	/* exact match found, return now */
775 
776 continuing:
777 		if (sorted) {	/* binary search */
778 			if (encoded[i] > entry)
779 				lower = entry_number + 1;
780 			else
781 				upper = entry_number - 1;
782 		} else
783 			lower++;	/* linear search */
784 	}
785 
786 	/* No exact match has been found */
787 	if (padding == 0x05)
788 		return 0;
789 
790 	entry_number = (padding == 0x00) ? lower : upper;
791 	if (entry_number == -1 || entry_number == entry_count)
792 		return 0;
793 	return dct + entry_number * entry_len;
794 } /* lookup_text */
795 
796 
797 /*
798  * tokenise_text
799  *
800  * Translate a single word to a token and append it to the token
801  * buffer. Every token consists of the address of the dictionary
802  * entry, the length of the word and the offset of the word from
803  * the start of the text buffer. Unknown words cause empty slots
804  * if the flag is set (such that the text can be scanned several
805  * times with different dictionaries); otherwise they are zero.
806  *
807  */
tokenise_text(zword text,zword length,zword from,zword parse,zword dct,bool flag)808 static void tokenise_text(zword text, zword length, zword from, zword parse,
809 			  zword dct, bool flag)
810 {
811 	zword addr;
812 	zbyte token_max, token_count;
813 
814 	LOW_BYTE(parse, token_max)
815 	parse++;
816 	LOW_BYTE(parse, token_count)
817 	if (token_count < token_max) {	/* sufficient space left for token? */
818 		storeb(parse++, token_count + 1);
819 		load_string((zword) (text + from), length);
820 		addr = lookup_text(0x05, dct);
821 		if (addr != 0 || !flag) {
822 			parse += 4 * token_count;
823 			storew((zword) (parse + 0), addr);
824 			storeb((zword) (parse + 2), length);
825 			storeb((zword) (parse + 3), from);
826 		}
827 	}
828 } /* tokenise_text */
829 
830 
831 /*
832  * tokenise_line
833  *
834  * Split an input line into words and translate the words to tokens.
835  *
836  */
tokenise_line(zword text,zword token,zword dct,bool flag)837 void tokenise_line(zword text, zword token, zword dct, bool flag)
838 {
839 	zword addr1;
840 	zword addr2;
841 	zbyte length;
842 	zbyte c;
843 
844 	length = 0;		/* makes compilers shut up */
845 
846 	/* Use standard dictionary if the given dictionary is zero */
847 	if (dct == 0)
848 		dct = z_header.dictionary;
849 
850 	/* Remove all tokens before inserting new ones */
851 	storeb((zword) (token + 1), 0);
852 
853 	/* Move the first pointer across the text buffer searching for the
854 	   beginning of a word. If this succeeds, store the position in a
855 	   second pointer. Move the first pointer searching for the end of
856 	   the word. When it is found, "tokenise" the word. Continue until
857 	   the end of the buffer is reached. */
858 	addr1 = text;
859 	addr2 = 0;
860 
861 	if (z_header.version >= V5) {
862 		addr1++;
863 		LOW_BYTE(addr1, length)
864 	}
865 
866 	do {
867 		zword sep_addr;
868 		zbyte sep_count;
869 		zbyte separator;
870 
871 		/* Fetch next ZSCII character */
872 
873 		addr1++;
874 
875 		if (z_header.version >= V5 && addr1 == text + 2 + length)
876 			c = 0;
877 		else
878 			LOW_BYTE(addr1, c)
879 			/* Check for separator */
880 			sep_addr = dct;
881 
882 		LOW_BYTE(sep_addr, sep_count)
883 		sep_addr++;
884 
885 		do {
886 			LOW_BYTE(sep_addr, separator)
887 			    sep_addr++;
888 
889 		} while (c != separator && --sep_count != 0);
890 
891 		/* This could be the start or the end of a word */
892 		if (sep_count == 0 && c != ' ' && c != 0) {
893 			if (addr2 == 0)
894 				addr2 = addr1;
895 		} else if (addr2 != 0) {
896 			tokenise_text(text,
897 				      (zword) (addr1 - addr2),
898 				      (zword) (addr2 - text), token, dct, flag);
899 			addr2 = 0;
900 		}
901 
902 		/* Translate separator (which is a word in its own right) */
903 		if (sep_count != 0) {
904 			tokenise_text(text, (zword) (1),
905 				(zword) (addr1 - text), token, dct, flag);
906 		}
907 	} while (c != 0);
908 
909 } /* tokenise_line */
910 
911 
912 /*
913  * z_tokenise, make a lexical analysis of a ZSCII string.
914  *
915  *	zargs[0] = address of string to analyze
916  *	zargs[1] = address of token buffer
917  *	zargs[2] = address of dictionary (optional)
918  *	zargs[3] = set when unknown words cause empty slots (optional)
919  *
920  */
z_tokenise(void)921 void z_tokenise(void)
922 {
923 	/* Supply default arguments */
924 	if (zargc < 3)
925 		zargs[2] = 0;
926 	if (zargc < 4)
927 		zargs[3] = 0;
928 
929 	/* Call tokenise_line to do the real work */
930 	tokenise_line(zargs[0], zargs[1], zargs[2], zargs[3] != 0);
931 } /* z_tokenise */
932 
933 
934 /*
935  * completion
936  *
937  * Scan the vocabulary to complete the last word on the input line
938  * (similar to "tcsh" under Unix). The return value is
939  *
940  *    2 ==> completion is impossible
941  *    1 ==> completion is ambiguous
942  *    0 ==> completion is successful
943  *
944  * The function also returns a string in its second argument. In case
945  * of 2, the string is empty; in case of 1, the string is the longest
946  * extension of the last word on the input line that is common to all
947  * possible completions (for instance, if the last word on the input
948  * is "fo" and its only possible completions are "follow" and "folly"
949  * then the string is "ll"); in case of 0, the string is an extension
950  * to the last word that results in the only possible completion.
951  *
952  */
completion(const zchar * buffer,zchar * result)953 int completion(const zchar * buffer, zchar * result)
954 {
955 	zword minaddr;
956 	zword maxaddr;
957 	zchar *ptr;
958 	zchar c;
959 	int len;
960 	int i;
961 
962 	*result = 0;
963 
964 	/* Copy last word to "decoded" string */
965 	len = 0;
966 	while ((c = *buffer++) != 0) {
967 		if (c != ' ') {
968 			if (len < 9)
969 				decoded[len++] = c;
970 		} else
971 			len = 0;
972 	}
973 
974 	decoded[len] = 0;
975 
976 	/* Search the dictionary for first and last possible extensions */
977 	minaddr = lookup_text(0x00, z_header.dictionary);
978 	maxaddr = lookup_text(0x1f, z_header.dictionary);
979 
980 	if (minaddr == 0 || maxaddr == 0 || minaddr > maxaddr)
981 		return 2;
982 
983 	/* Copy first extension to "result" string */
984 	decode_text(VOCABULARY, minaddr);
985 
986 	ptr = result;
987 	for (i = len; (c = decoded[i]) != 0; i++)
988 		*ptr++ = c;
989 	*ptr = 0;
990 
991 	/* Merge second extension with "result" string */
992 	decode_text(VOCABULARY, maxaddr);
993 
994 	for (i = len, ptr = result; (c = decoded[i]) != 0; i++, ptr++)
995 		if (*ptr != c)
996 			break;
997 	*ptr = 0;
998 
999 	/* Search was ambiguous or successful */
1000 	return (minaddr == maxaddr) ? 0 : 1;
1001 
1002 } /* completion */
1003 
1004 
1005 /*
1006  * unicode_tolower
1007  *
1008  * Convert a Unicode character to lowercase.
1009  * Taken from Zip2000 by Kevin Bracey.
1010  *
1011  */
unicode_tolower(zword c)1012 zword unicode_tolower(zword c)
1013 {
1014 	static const unsigned char tolower_basic_latin[0x100] = {
1015 		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
1016 		    0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1017 		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
1018 		    0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
1019 		0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
1020 		    0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
1021 		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1022 		    0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1023 		0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1024 		    0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1025 		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
1026 		    0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1027 		0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1028 		    0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1029 		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
1030 		    0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
1031 		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
1032 		    0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
1033 		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
1034 		    0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
1035 		0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9,
1036 		    0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
1037 		0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1038 		    0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1039 		0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
1040 		    0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1041 		0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, 0xF8, 0xF9,
1042 		    0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
1043 		0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
1044 		    0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1045 		0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
1046 		    0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
1047 	};
1048 	static const unsigned char tolower_latin_extended_a[0x80] = {
1049 		0x01, 0x01, 0x03, 0x03, 0x05, 0x05, 0x07, 0x07, 0x09, 0x09,
1050 		    0x0B, 0x0B, 0x0D, 0x0D, 0x0F, 0x0F,
1051 		0x11, 0x11, 0x13, 0x13, 0x15, 0x15, 0x17, 0x17, 0x19, 0x19,
1052 		    0x1B, 0x1B, 0x1D, 0x1D, 0x1F, 0x1F,
1053 		0x21, 0x21, 0x23, 0x23, 0x25, 0x25, 0x27, 0x27, 0x29, 0x29,
1054 		    0x2B, 0x2B, 0x2D, 0x2D, 0x2F, 0x2F,
1055 		0x00, 0x31, 0x33, 0x33, 0x35, 0x35, 0x37, 0x37, 0x38, 0x3A,
1056 		    0x3A, 0x3C, 0x3C, 0x3E, 0x3E, 0x40,
1057 		0x40, 0x42, 0x42, 0x44, 0x44, 0x46, 0x46, 0x48, 0x48, 0x49,
1058 		    0x4B, 0x4B, 0x4D, 0x4D, 0x4F, 0x4F,
1059 		0x51, 0x51, 0x53, 0x53, 0x55, 0x55, 0x57, 0x57, 0x59, 0x59,
1060 		    0x5B, 0x5B, 0x5D, 0x5D, 0x5F, 0x5F,
1061 		0x61, 0x61, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x69,
1062 		    0x6B, 0x6B, 0x6D, 0x6D, 0x6F, 0x6F,
1063 		0x71, 0x71, 0x73, 0x73, 0x75, 0x75, 0x77, 0x77, 0x00, 0x7A,
1064 		    0x7A, 0x7C, 0x7C, 0x7E, 0x7E, 0x7F
1065 	};
1066 	static const unsigned char tolower_greek[0x50] = {
1067 		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0xAC, 0x87, 0xAD, 0xAE,
1068 		    0xAF, 0x8B, 0xCC, 0x8D, 0xCD, 0xCE,
1069 		0x90, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1070 		    0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1071 		0xC0, 0xC1, 0xA2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9,
1072 		    0xCA, 0xCB, 0xAC, 0xAD, 0xAE, 0xAF,
1073 		0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1074 		    0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1075 		0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9,
1076 		    0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF
1077 	};
1078 	static const unsigned char tolower_cyrillic[0x60] = {
1079 		0x00, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
1080 		    0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1081 		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1082 		    0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1083 		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
1084 		    0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1085 		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1086 		    0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1087 		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
1088 		    0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1089 		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
1090 		    0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F
1091 	};
1092 
1093 	if (c < 0x0100)
1094 		c = tolower_basic_latin[c];
1095 	else if (c == 0x0130)
1096 		c = 0x0069; /* Capital I with dot -> lower case i */
1097 	else if (c == 0x0178)
1098 		c = 0x00FF; /* Capital Y diaeresis -> lower case y diaeresis */
1099 	else if (c < 0x0180)
1100 		c = tolower_latin_extended_a[c - 0x100] + 0x100;
1101 	else if (c >= 0x380 && c < 0x3D0)
1102 		c = tolower_greek[c - 0x380] + 0x300;
1103 	else if (c >= 0x400 && c < 0x460)
1104 		c = tolower_cyrillic[c - 0x400] + 0x400;
1105 
1106 	return c;
1107 } /* unicode_tolower */
1108