/* text.c - Text manipulation functions
 *	Copyright (c) 1995-1997 Stefan Jokisch
 *
 * This file is part of Frotz.
 *
 * Frotz is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Frotz is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
21 #include "frotz.h"
22
23 enum string_type {
24 LOW_STRING, ABBREVIATION, HIGH_STRING, EMBEDDED_STRING, VOCABULARY
25 };
26
27 extern zword object_name(zword);
28 extern zword get_window_font(zword);
29
30 static zchar decoded[10];
31 static zword encoded[3];
32
33 /*
34 * According to Matteo De Luigi <matteo.de.luigi@libero.it>,
35 * 0xab and 0xbb were in each other's proper positions.
36 * Sat Apr 21, 2001
37 */
38 static zword zscii_to_latin1[] = {
39 0x0e4, 0x0f6, 0x0fc, 0x0c4, 0x0d6, 0x0dc, 0x0df, 0x0bb,
40 0x0ab, 0x0eb, 0x0ef, 0x0ff, 0x0cb, 0x0cf, 0x0e1, 0x0e9,
41 0x0ed, 0x0f3, 0x0fa, 0x0fd, 0x0c1, 0x0c9, 0x0cd, 0x0d3,
42 0x0da, 0x0dd, 0x0e0, 0x0e8, 0x0ec, 0x0f2, 0x0f9, 0x0c0,
43 0x0c8, 0x0cc, 0x0d2, 0x0d9, 0x0e2, 0x0ea, 0x0ee, 0x0f4,
44 0x0fb, 0x0c2, 0x0ca, 0x0ce, 0x0d4, 0x0db, 0x0e5, 0x0c5,
45 0x0f8, 0x0d8, 0x0e3, 0x0f1, 0x0f5, 0x0c3, 0x0d1, 0x0d5,
46 0x0e6, 0x0c6, 0x0e7, 0x0c7, 0x0fe, 0x0f0, 0x0de, 0x0d0,
47 0x0a3, 0x153, 0x152, 0x0a1, 0x0bf
48 };
49
50
51 /*
52 * translate_from_zscii
53 *
54 * Map a ZSCII character into Unicode.
55 *
56 */
translate_from_zscii(zbyte c)57 zchar translate_from_zscii(zbyte c)
58 {
59 if (c == 0xfc)
60 return ZC_MENU_CLICK;
61 if (c == 0xfd)
62 return ZC_DOUBLE_CLICK;
63 if (c == 0xfe)
64 return ZC_SINGLE_CLICK;
65
66 if (c >= 0x9b && story_id != BEYOND_ZORK) {
67
68 if (z_header.x_unicode_table != 0) { /* game has its own Unicode table */
69 zbyte N;
70
71 LOW_BYTE(z_header.x_unicode_table, N)
72 if (c - 0x9b < N) {
73 zword addr =
74 z_header.x_unicode_table + 1 + 2 * (c - 0x9b);
75 zword unicode;
76
77 LOW_WORD(addr, unicode)
78 #ifdef USE_UTF8
79 if (unicode < 0x20)
80 return '?';
81 #else
82 if ((unicode < 0x20) || (unicode > 0xff))
83 return '?';
84 #endif
85 return unicode;
86 } else
87 return '?';
88
89 } else /* game uses standard set */ if (c <= 0xdf) {
90 return zscii_to_latin1[c - 0x9b];
91 } else
92 return '?';
93 }
94 return c;
95 } /* translate_from_zscii */
96
97
98 /*
99 * unicode_to_zscii
100 *
101 * Convert a Unicode character to ZSCII, returning 0 on failure.
102 *
103 */
unicode_to_zscii(zchar c)104 zbyte unicode_to_zscii(zchar c)
105 {
106 int i;
107
108 if (c >= ZC_LATIN1_MIN) {
109 /* game has its own Unicode table */
110 if (z_header.x_unicode_table != 0) {
111 zbyte N;
112 int i;
113
114 LOW_BYTE(z_header.x_unicode_table, N)
115 for (i = 0x9b; i < 0x9b + N; i++) {
116 zword addr =
117 z_header.x_unicode_table + 1 + 2 * (i - 0x9b);
118 zword unicode;
119
120 LOW_WORD(addr, unicode)
121 if (c == unicode)
122 return (zbyte) i;
123 }
124 return 0;
125 } else { /* game uses standard set */
126 for (i = 0x9b; i <= 0xdf; i++) {
127 if (c == zscii_to_latin1[i - 0x9b])
128 return (zbyte) i;
129 }
130 return 0;
131 }
132 }
133 return (zbyte) c;
134 } /* unicode_to_zscii */
135
136
137 /*
138 * translate_to_zscii
139 *
140 * Map a Unicode character onto the ZSCII alphabet.
141 *
142 */
143
translate_to_zscii(zchar c)144 zbyte translate_to_zscii(zchar c)
145 {
146 if (c == ZC_SINGLE_CLICK)
147 return 0xfe;
148 if (c == ZC_DOUBLE_CLICK)
149 return 0xfd;
150 if (c == ZC_MENU_CLICK)
151 return 0xfc;
152 if (c == 0)
153 return 0;
154
155 c = unicode_to_zscii(c);
156 if (c == 0)
157 c = '?';
158
159 return (zbyte) c;
160 } /* translate_to_zscii */
161
162
163 /*
164 * alphabet
165 *
166 * Return a character from one of the three character sets.
167 *
168 */
alphabet(int set,int index)169 static zchar alphabet(int set, int index)
170 {
171 if (z_header.alphabet != 0) { /* game uses its own alphabet */
172
173 zbyte c;
174
175 zword addr = z_header.alphabet + 26 * set + index;
176 LOW_BYTE(addr, c)
177 return translate_from_zscii(c);
178
179 } else /* game uses default alphabet */ if (set == 0)
180 return 'a' + index;
181 else if (set == 1)
182 return 'A' + index;
183 else if (z_header.version == V1)
184 return " 0123456789.,!?_#'\"/\\<-:()"[index];
185 else
186 return " ^0123456789.,!?_#'\"/\\-:()"[index];
187 } /* alphabet */
188
189
190 /*
191 * load_string
192 *
193 * Copy a ZSCII string from the memory to the global "decoded" string.
194 *
195 */
load_string(zword addr,zword length)196 static void load_string(zword addr, zword length)
197 {
198 int resolution = (z_header.version <= V3) ? 2 : 3;
199 int i = 0;
200
201 while (i < 3 * resolution) {
202 if (i < length) {
203
204 zbyte c;
205
206 LOW_BYTE(addr, c)
207 addr++;
208
209 decoded[i++] = translate_from_zscii(c);
210
211 } else
212 decoded[i++] = 0;
213 }
214 } /* load_string */
215
216
217 /*
218 * encode_text
219 *
220 * Encode the Unicode text in the global "decoded" string then write
221 * the result to the global "encoded" array. (This is used to look up
222 * words in the dictionary.) Up to V3 the vocabulary resolution is
223 * two, since V4 it is three words.
224 * Because each word contains three Z-characters, that makes six or
225 * nine Z-characters respectively. Longer words are chopped to the
226 * proper size, shorter words are are padded out with 5's. For word
227 * completion we pad with 0s and 31s, the minimum and maximum
228 * Z-characters.
229 *
230 */
encode_text(int padding)231 static void encode_text(int padding)
232 {
233 static zchar again[] = { 'a', 'g', 'a', 'i', 'n', 0 };
234 static zchar examine[] = { 'e', 'x', 'a', 'm', 'i', 'n', 'e', 0 };
235 static zchar wait[] = { 'w', 'a', 'i', 't', 0 };
236
237 zbyte zchars[12];
238 const zchar *ptr = decoded;
239 zchar c;
240 int resolution = (z_header.version <= V3) ? 2 : 3;
241 int i = 0;
242
243 /* Expand abbreviations that some old Infocom games lack */
244 if (f_setup.expand_abbreviations)
245 if (padding == 0x05 && decoded[1] == 0)
246 switch (decoded[0]) {
247 case 'g':
248 ptr = again;
249 break;
250 case 'x':
251 ptr = examine;
252 break;
253 case 'z':
254 ptr = wait;
255 break;
256 }
257
258 /* Translate string to a sequence of Z-characters */
259 while (i < 3 * resolution) {
260 if ((c = *ptr++) != 0) {
261 int index, set;
262 zbyte c2;
263
264 /* Search character in the alphabet */
265 for (set = 0; set < 3; set++)
266 for (index = 0; index < 26; index++)
267 if (c == alphabet(set, index))
268 goto letter_found;
269
270 /* Character not found, store its ZSCII value */
271 c2 = translate_to_zscii(c);
272 zchars[i++] = 5;
273 zchars[i++] = 6;
274 zchars[i++] = c2 >> 5;
275 zchars[i++] = c2 & 0x1f;
276 continue;
277
278 letter_found:
279
280 /* Character found, store its index */
281 if (set != 0)
282 zchars[i++] = ((z_header.version <= V2) ? 1 : 3) + set;
283
284 zchars[i++] = index + 6;
285 } else
286 zchars[i++] = padding;
287 }
288
289 /* Three Z-characters make a 16bit word */
290 for (i = 0; i < resolution; i++) {
291 encoded[i] =
292 (zchars[3 * i + 0] << 10) |
293 (zchars[3 * i + 1] << 5) | (zchars[3 * i + 2]);
294 }
295 encoded[resolution - 1] |= 0x8000;
296 } /* encode_text */
297
298
299 /*
300 * z_check_unicode
301 *
302 * test if a unicode character can be printed (bit 0) and read (bit 1).
303 *
304 * zargs[0] = Unicode
305 *
306 */
z_check_unicode(void)307 void z_check_unicode(void)
308 {
309 zword c = zargs[0];
310
311 if (c >= 0x20 && c <= 0x7e)
312 store(3);
313 else if (c == 0xa0)
314 store(1);
315 #ifndef USE_UTF8
316 else if (c >= 0xa1 && c <= 0xff)
317 store(3);
318 #else
319 else if (c >= 0xa1) {
320 /* being optimistic, we can print all unicode characters
321 * supported and input the ones with zscii representation
322 */
323 zword mask = (unicode_to_zscii(c) != 0) ? 3 : 1;
324 store(mask & os_check_unicode(get_window_font(cwin), c));
325 }
326 #endif
327 else
328 store(0);
329 } /* z_check_unicode */
330
331
332 /*
333 * z_encode_text, encode a ZSCII string for use in a dictionary.
334 *
335 * zargs[0] = address of text buffer
336 * zargs[1] = length of ASCII string
337 * zargs[2] = offset of ASCII string within the text buffer
338 * zargs[3] = address to store encoded text in
339 *
340 * This is a V5+ opcode and therefore the dictionary resolution must be
341 * three 16bit words.
342 *
343 */
z_encode_text(void)344 void z_encode_text(void)
345 {
346 int i;
347
348 load_string((zword) (zargs[0] + zargs[2]), zargs[1]);
349 encode_text(0x05);
350 for (i = 0; i < 3; i++)
351 storew((zword) (zargs[3] + 2 * i), encoded[i]);
352 } /* z_encode_text */
353
354
355 /*
356 * decode_text
357 *
358 * Convert encoded text to Unicode. The encoded text consists of 16bit
359 * words. Every word holds 3 Z-characters (5 bits each) plus a spare
360 * bit to mark the last word. The Z-characters translate to ZSCII by
361 * looking at the current current character set. Some select another
362 * character set, others refer to abbreviations.
363 *
364 * There are several different string types:
365 *
366 * LOW_STRING - from the lower 64KB (byte address)
367 * ABBREVIATION - from the abbreviations table (word address)
368 * HIGH_STRING - from the end of the memory map (packed address)
369 * EMBEDDED_STRING - from the instruction stream (at PC)
370 * VOCABULARY - from the dictionary (byte address)
371 *
372 * The last type is only used for word completion.
373 *
374 */
375 #define outchar(c) if (st==VOCABULARY) *ptr++=c; else print_char(c)
decode_text(enum string_type st,zword addr)376 static void decode_text(enum string_type st, zword addr)
377 {
378 zchar *ptr;
379 long byte_addr;
380 zchar c2;
381 zword code;
382 zbyte c, prev_c = 0;
383 int shift_state = 0;
384 int shift_lock = 0;
385 int status = 0;
386
387 ptr = NULL; /* makes compilers shut up */
388 byte_addr = 0;
389
390 /* Calculate the byte address if necessary */
391 if (st == ABBREVIATION)
392 byte_addr = (long)addr << 1;
393
394 else if (st == HIGH_STRING) {
395
396 if (z_header.version <= V3)
397 byte_addr = (long)addr << 1;
398 else if (z_header.version <= V5)
399 byte_addr = (long)addr << 2;
400 else if (z_header.version <= V7)
401 byte_addr =
402 ((long)addr << 2) + ((long)z_header.strings_offset << 3);
403 else /* (z_header.version == V8) */
404 byte_addr = (long)addr << 3;
405
406 if (byte_addr >= story_size)
407 runtime_error(ERR_ILL_PRINT_ADDR);
408
409 }
410
411 /* Loop until a 16bit word has the highest bit set */
412 if (st == VOCABULARY)
413 ptr = decoded;
414
415 do {
416 int i;
417
418 /* Fetch the next 16bit word */
419 if (st == LOW_STRING || st == VOCABULARY) {
420 LOW_WORD(addr, code)
421 addr += 2;
422 } else if (st == HIGH_STRING || st == ABBREVIATION) {
423 HIGH_WORD(byte_addr, code)
424 byte_addr += 2;
425 } else
426 CODE_WORD(code)
427 /* Read its three Z-characters */
428 for (i = 10; i >= 0; i -= 5) {
429 zword abbr_addr;
430 zword ptr_addr;
431
432 c = (code >> i) & 0x1f;
433
434 switch (status) {
435 case 0: /* normal operation */
436 if (shift_state == 2 && c == 6)
437 status = 2;
438 else if (z_header.version == V1 && c == 1)
439 new_line();
440 else if (z_header.version >= V2
441 && shift_state == 2 && c == 7)
442 new_line();
443 else if (c >= 6)
444 outchar(alphabet
445 (shift_state, c - 6));
446 else if (c == 0)
447 outchar(' ');
448 else if (z_header.version >= V2 && c == 1)
449 status = 1;
450 else if (z_header.version >= V3 && c <= 3)
451 status = 1;
452 else {
453 shift_state =
454 (shift_lock + (c & 1) +
455 1) % 3;
456 if (z_header.version <= V2 && c >= 4)
457 shift_lock =
458 shift_state;
459 break;
460 }
461 shift_state = shift_lock;
462 break;
463 case 1: /* abbreviation */
464 ptr_addr =
465 z_header.abbreviations + 64 * (prev_c -
466 1) + 2 * c;
467 LOW_WORD(ptr_addr, abbr_addr)
468 decode_text(ABBREVIATION,
469 abbr_addr);
470 status = 0;
471 break;
472 case 2: /* ZSCII character - first part */
473 status = 3;
474 break;
475 case 3: /* ZSCII character - second part */
476 c2 = translate_from_zscii((prev_c << 5) | c);
477 outchar(c2);
478 status = 0;
479 break;
480 }
481 prev_c = c;
482 }
483 } while (!(code & 0x8000));
484
485 if (st == VOCABULARY)
486 *ptr = 0;
487 } /* decode_text */
488
489 #undef outchar
490
491
/*
 * z_new_line, print a new line.
 *
 *	no zargs used
 *
 * Simply forwards to new_line().
 *
 */
void z_new_line(void)
{
	new_line();
} /* z_new_line */
502
503
504 /*
505 * z_print, print a string embedded in the instruction stream.
506 *
507 * no zargs used
508 *
509 */
z_print(void)510 void z_print(void)
511 {
512 decode_text(EMBEDDED_STRING, 0);
513 } /* z_print */
514
515
516 /*
517 * z_print_addr, print a string from the lower 64KB.
518 *
519 * zargs[0] = address of string to print
520 *
521 */
z_print_addr(void)522 void z_print_addr(void)
523 {
524 decode_text(LOW_STRING, zargs[0]);
525 } /* z_print_addr */
526
527
528 /*
529 * z_print_char print a single ZSCII character.
530 *
531 * zargs[0] = ZSCII character to be printed
532 *
533 */
z_print_char(void)534 void z_print_char(void)
535 {
536 print_char(translate_from_zscii(zargs[0]));
537 } /* z_print_char */
538
539
540 /*
541 * z_print_form, print a formatted table.
542 *
543 * zargs[0] = address of formatted table to be printed
544 *
545 */
z_print_form(void)546 void z_print_form(void)
547 {
548 zword count;
549 zword addr = zargs[0];
550
551 bool first = TRUE;
552
553 for (;;) {
554
555 LOW_WORD(addr, count)
556 addr += 2;
557
558 if (count == 0)
559 break;
560
561 if (!first)
562 new_line();
563
564 while (count--) {
565 zbyte c;
566 LOW_BYTE(addr, c)
567 addr++;
568
569 print_char(translate_from_zscii(c));
570 }
571 first = FALSE;
572 }
573 } /* z_print_form */
574
575
576 /*
577 * print_num
578 *
579 * Print a signed 16bit number.
580 *
581 */
print_num(zword value)582 void print_num(zword value)
583 {
584 int i;
585
586 /* Print sign */
587 if ((short)value < 0) {
588 print_char('-');
589 value = -(short)value;
590 }
591
592 /* Print absolute value */ {
593 for (i = 10000; i != 0; i /= 10)
594 if (value >= i || i == 1)
595 print_char('0' + (value / i) % 10);
596 }
597
598 } /* print_num */
599
600
601 /*
602 * z_print_num, print a signed number.
603 *
604 * zargs[0] = number to print
605 *
606 */
z_print_num(void)607 void z_print_num(void)
608 {
609 print_num(zargs[0]);
610
611 } /* z_print_num */
612
613
614 /*
615 * print_object
616 *
617 * Print an object description.
618 *
619 */
print_object(zword object)620 void print_object(zword object)
621 {
622 zword addr = object_name(object);
623 zword code = 0x94a5;
624 zbyte length;
625
626 LOW_BYTE(addr, length)
627 addr++;
628
629 if (length != 0) {
630 LOW_WORD(addr, code)
631 if (code == 0x94a5) { /* encoded text 0x94a5 == empty string */
632 print_string("object#"); /* supply a generic name */
633 print_num(object); /* for anonymous objects */
634 } else
635 decode_text(LOW_STRING, addr);
636 }
637 } /* print_object */
638
639
640 /*
641 * z_print_obj, print an object description.
642 *
643 * zargs[0] = number of object to be printed
644 *
645 */
z_print_obj(void)646 void z_print_obj(void)
647 {
648 print_object(zargs[0]);
649 } /* z_print_obj */
650
651
652 /*
653 * z_print_paddr, print the string at the given packed address.
654 *
655 * zargs[0] = packed address of string to be printed
656 *
657 */
z_print_paddr(void)658 void z_print_paddr(void)
659 {
660 decode_text(HIGH_STRING, zargs[0]);
661 } /* z_print_paddr */
662
663
664 /*
665 * z_print_ret, print the string at PC, print newline then return true.
666 *
667 * no zargs used
668 *
669 */
z_print_ret(void)670 void z_print_ret(void)
671 {
672 decode_text(EMBEDDED_STRING, 0);
673 new_line();
674 ret(1);
675 } /* z_print_ret */
676
677
/*
 * print_string
 *
 * Print a string of ASCII characters.
 *
 * Each '\n' in the string is turned into a call to new_line(); every
 * other character is passed to print_char().
 *
 */
void print_string(const char *s)
{
	for (; *s != 0; s++) {
		char c = *s;

		if (c == '\n')
			new_line();
		else
			print_char(c);
	}
} /* print_string */
695
696
697 /*
698 * z_print_unicode
699 *
700 * zargs[0] = Unicode
701 *
702 */
z_print_unicode(void)703 void z_print_unicode(void)
704 {
705 if (zargs[0] < 0x20)
706 print_char('?');
707 else
708 print_char(zargs[0]);
709 } /* z_print_unicode */
710
711
712 /*
713 * lookup_text
714 *
715 * Scan a dictionary searching for the given word. The first argument
716 * can be
717 *
718 * 0x00 - find the first word which is >= the given one
719 * 0x05 - find the word which exactly matches the given one
720 * 0x1f - find the last word which is <= the given one
721 *
722 * The return value is 0 if the search fails.
723 *
724 */
lookup_text(int padding,zword dct)725 static zword lookup_text(int padding, zword dct)
726 {
727 zword entry_addr;
728 zword entry_count;
729 zword entry;
730 zword addr;
731 zbyte entry_len;
732 zbyte sep_count;
733 int resolution = (z_header.version <= V3) ? 2 : 3;
734 int entry_number;
735 int lower, upper;
736 int i;
737 bool sorted;
738
739 encode_text(padding);
740
741 LOW_BYTE(dct, sep_count) /* skip word separators */
742 dct += 1 + sep_count;
743 LOW_BYTE(dct, entry_len) /* get length of entries */
744 dct += 1;
745 LOW_WORD(dct, entry_count) /* get number of entries */
746 dct += 2;
747
748 if ((short)entry_count < 0) { /* bad luck, entries aren't sorted */
749 entry_count = -(short)entry_count;
750 sorted = FALSE;
751 } else
752 sorted = TRUE; /* entries are sorted */
753
754 lower = 0;
755 upper = entry_count - 1;
756
757 while (lower <= upper) {
758 if (sorted) /* binary search */
759 entry_number = (lower + upper) / 2;
760 else /* linear search */
761 entry_number = lower;
762
763 entry_addr = dct + entry_number * entry_len;
764
765 /* Compare word to dictionary entry */
766 addr = entry_addr;
767
768 for (i = 0; i < resolution; i++) {
769 LOW_WORD(addr, entry)
770 if (encoded[i] != entry)
771 goto continuing;
772 addr += 2;
773 }
774 return entry_addr; /* exact match found, return now */
775
776 continuing:
777 if (sorted) { /* binary search */
778 if (encoded[i] > entry)
779 lower = entry_number + 1;
780 else
781 upper = entry_number - 1;
782 } else
783 lower++; /* linear search */
784 }
785
786 /* No exact match has been found */
787 if (padding == 0x05)
788 return 0;
789
790 entry_number = (padding == 0x00) ? lower : upper;
791 if (entry_number == -1 || entry_number == entry_count)
792 return 0;
793 return dct + entry_number * entry_len;
794 } /* lookup_text */
795
796
797 /*
798 * tokenise_text
799 *
800 * Translate a single word to a token and append it to the token
801 * buffer. Every token consists of the address of the dictionary
802 * entry, the length of the word and the offset of the word from
803 * the start of the text buffer. Unknown words cause empty slots
804 * if the flag is set (such that the text can be scanned several
805 * times with different dictionaries); otherwise they are zero.
806 *
807 */
tokenise_text(zword text,zword length,zword from,zword parse,zword dct,bool flag)808 static void tokenise_text(zword text, zword length, zword from, zword parse,
809 zword dct, bool flag)
810 {
811 zword addr;
812 zbyte token_max, token_count;
813
814 LOW_BYTE(parse, token_max)
815 parse++;
816 LOW_BYTE(parse, token_count)
817 if (token_count < token_max) { /* sufficient space left for token? */
818 storeb(parse++, token_count + 1);
819 load_string((zword) (text + from), length);
820 addr = lookup_text(0x05, dct);
821 if (addr != 0 || !flag) {
822 parse += 4 * token_count;
823 storew((zword) (parse + 0), addr);
824 storeb((zword) (parse + 2), length);
825 storeb((zword) (parse + 3), from);
826 }
827 }
828 } /* tokenise_text */
829
830
831 /*
832 * tokenise_line
833 *
834 * Split an input line into words and translate the words to tokens.
835 *
836 */
tokenise_line(zword text,zword token,zword dct,bool flag)837 void tokenise_line(zword text, zword token, zword dct, bool flag)
838 {
839 zword addr1;
840 zword addr2;
841 zbyte length;
842 zbyte c;
843
844 length = 0; /* makes compilers shut up */
845
846 /* Use standard dictionary if the given dictionary is zero */
847 if (dct == 0)
848 dct = z_header.dictionary;
849
850 /* Remove all tokens before inserting new ones */
851 storeb((zword) (token + 1), 0);
852
853 /* Move the first pointer across the text buffer searching for the
854 beginning of a word. If this succeeds, store the position in a
855 second pointer. Move the first pointer searching for the end of
856 the word. When it is found, "tokenise" the word. Continue until
857 the end of the buffer is reached. */
858 addr1 = text;
859 addr2 = 0;
860
861 if (z_header.version >= V5) {
862 addr1++;
863 LOW_BYTE(addr1, length)
864 }
865
866 do {
867 zword sep_addr;
868 zbyte sep_count;
869 zbyte separator;
870
871 /* Fetch next ZSCII character */
872
873 addr1++;
874
875 if (z_header.version >= V5 && addr1 == text + 2 + length)
876 c = 0;
877 else
878 LOW_BYTE(addr1, c)
879 /* Check for separator */
880 sep_addr = dct;
881
882 LOW_BYTE(sep_addr, sep_count)
883 sep_addr++;
884
885 do {
886 LOW_BYTE(sep_addr, separator)
887 sep_addr++;
888
889 } while (c != separator && --sep_count != 0);
890
891 /* This could be the start or the end of a word */
892 if (sep_count == 0 && c != ' ' && c != 0) {
893 if (addr2 == 0)
894 addr2 = addr1;
895 } else if (addr2 != 0) {
896 tokenise_text(text,
897 (zword) (addr1 - addr2),
898 (zword) (addr2 - text), token, dct, flag);
899 addr2 = 0;
900 }
901
902 /* Translate separator (which is a word in its own right) */
903 if (sep_count != 0) {
904 tokenise_text(text, (zword) (1),
905 (zword) (addr1 - text), token, dct, flag);
906 }
907 } while (c != 0);
908
909 } /* tokenise_line */
910
911
912 /*
913 * z_tokenise, make a lexical analysis of a ZSCII string.
914 *
915 * zargs[0] = address of string to analyze
916 * zargs[1] = address of token buffer
917 * zargs[2] = address of dictionary (optional)
918 * zargs[3] = set when unknown words cause empty slots (optional)
919 *
920 */
z_tokenise(void)921 void z_tokenise(void)
922 {
923 /* Supply default arguments */
924 if (zargc < 3)
925 zargs[2] = 0;
926 if (zargc < 4)
927 zargs[3] = 0;
928
929 /* Call tokenise_line to do the real work */
930 tokenise_line(zargs[0], zargs[1], zargs[2], zargs[3] != 0);
931 } /* z_tokenise */
932
933
934 /*
935 * completion
936 *
937 * Scan the vocabulary to complete the last word on the input line
938 * (similar to "tcsh" under Unix). The return value is
939 *
940 * 2 ==> completion is impossible
941 * 1 ==> completion is ambiguous
942 * 0 ==> completion is successful
943 *
944 * The function also returns a string in its second argument. In case
945 * of 2, the string is empty; in case of 1, the string is the longest
946 * extension of the last word on the input line that is common to all
947 * possible completions (for instance, if the last word on the input
948 * is "fo" and its only possible completions are "follow" and "folly"
949 * then the string is "ll"); in case of 0, the string is an extension
950 * to the last word that results in the only possible completion.
951 *
952 */
completion(const zchar * buffer,zchar * result)953 int completion(const zchar * buffer, zchar * result)
954 {
955 zword minaddr;
956 zword maxaddr;
957 zchar *ptr;
958 zchar c;
959 int len;
960 int i;
961
962 *result = 0;
963
964 /* Copy last word to "decoded" string */
965 len = 0;
966 while ((c = *buffer++) != 0) {
967 if (c != ' ') {
968 if (len < 9)
969 decoded[len++] = c;
970 } else
971 len = 0;
972 }
973
974 decoded[len] = 0;
975
976 /* Search the dictionary for first and last possible extensions */
977 minaddr = lookup_text(0x00, z_header.dictionary);
978 maxaddr = lookup_text(0x1f, z_header.dictionary);
979
980 if (minaddr == 0 || maxaddr == 0 || minaddr > maxaddr)
981 return 2;
982
983 /* Copy first extension to "result" string */
984 decode_text(VOCABULARY, minaddr);
985
986 ptr = result;
987 for (i = len; (c = decoded[i]) != 0; i++)
988 *ptr++ = c;
989 *ptr = 0;
990
991 /* Merge second extension with "result" string */
992 decode_text(VOCABULARY, maxaddr);
993
994 for (i = len, ptr = result; (c = decoded[i]) != 0; i++, ptr++)
995 if (*ptr != c)
996 break;
997 *ptr = 0;
998
999 /* Search was ambiguous or successful */
1000 return (minaddr == maxaddr) ? 0 : 1;
1001
1002 } /* completion */
1003
1004
1005 /*
1006 * unicode_tolower
1007 *
1008 * Convert a Unicode character to lowercase.
1009 * Taken from Zip2000 by Kevin Bracey.
1010 *
1011 */
unicode_tolower(zword c)1012 zword unicode_tolower(zword c)
1013 {
1014 static const unsigned char tolower_basic_latin[0x100] = {
1015 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
1016 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1017 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
1018 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
1019 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
1020 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
1021 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1022 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1023 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1024 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1025 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
1026 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1027 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1028 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1029 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
1030 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
1031 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
1032 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
1033 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
1034 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
1035 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9,
1036 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
1037 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1038 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1039 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
1040 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1041 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, 0xF8, 0xF9,
1042 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
1043 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
1044 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1045 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
1046 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
1047 };
1048 static const unsigned char tolower_latin_extended_a[0x80] = {
1049 0x01, 0x01, 0x03, 0x03, 0x05, 0x05, 0x07, 0x07, 0x09, 0x09,
1050 0x0B, 0x0B, 0x0D, 0x0D, 0x0F, 0x0F,
1051 0x11, 0x11, 0x13, 0x13, 0x15, 0x15, 0x17, 0x17, 0x19, 0x19,
1052 0x1B, 0x1B, 0x1D, 0x1D, 0x1F, 0x1F,
1053 0x21, 0x21, 0x23, 0x23, 0x25, 0x25, 0x27, 0x27, 0x29, 0x29,
1054 0x2B, 0x2B, 0x2D, 0x2D, 0x2F, 0x2F,
1055 0x00, 0x31, 0x33, 0x33, 0x35, 0x35, 0x37, 0x37, 0x38, 0x3A,
1056 0x3A, 0x3C, 0x3C, 0x3E, 0x3E, 0x40,
1057 0x40, 0x42, 0x42, 0x44, 0x44, 0x46, 0x46, 0x48, 0x48, 0x49,
1058 0x4B, 0x4B, 0x4D, 0x4D, 0x4F, 0x4F,
1059 0x51, 0x51, 0x53, 0x53, 0x55, 0x55, 0x57, 0x57, 0x59, 0x59,
1060 0x5B, 0x5B, 0x5D, 0x5D, 0x5F, 0x5F,
1061 0x61, 0x61, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x69,
1062 0x6B, 0x6B, 0x6D, 0x6D, 0x6F, 0x6F,
1063 0x71, 0x71, 0x73, 0x73, 0x75, 0x75, 0x77, 0x77, 0x00, 0x7A,
1064 0x7A, 0x7C, 0x7C, 0x7E, 0x7E, 0x7F
1065 };
1066 static const unsigned char tolower_greek[0x50] = {
1067 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0xAC, 0x87, 0xAD, 0xAE,
1068 0xAF, 0x8B, 0xCC, 0x8D, 0xCD, 0xCE,
1069 0x90, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1070 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1071 0xC0, 0xC1, 0xA2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9,
1072 0xCA, 0xCB, 0xAC, 0xAD, 0xAE, 0xAF,
1073 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
1074 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
1075 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9,
1076 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF
1077 };
1078 static const unsigned char tolower_cyrillic[0x60] = {
1079 0x00, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
1080 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1081 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1082 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1083 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
1084 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1085 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
1086 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
1087 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
1088 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1089 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
1090 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F
1091 };
1092
1093 if (c < 0x0100)
1094 c = tolower_basic_latin[c];
1095 else if (c == 0x0130)
1096 c = 0x0069; /* Capital I with dot -> lower case i */
1097 else if (c == 0x0178)
1098 c = 0x00FF; /* Capital Y diaeresis -> lower case y diaeresis */
1099 else if (c < 0x0180)
1100 c = tolower_latin_extended_a[c - 0x100] + 0x100;
1101 else if (c >= 0x380 && c < 0x3D0)
1102 c = tolower_greek[c - 0x380] + 0x300;
1103 else if (c >= 0x400 && c < 0x460)
1104 c = tolower_cyrillic[c - 0x400] + 0x400;
1105
1106 return c;
1107 } /* unicode_tolower */
1108