1
2 /*===========================================================================*/
3
4 /*
5 * Copyright (C) 1997 Jason Hutchens
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the license or (at your option)
10 * any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 * or FITNESS FOR A PARTICULAR PURPOSE. See the Gnu Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 /*===========================================================================*/
23
24 /*
25 * $Id: hal.c,v 1.6 2001/08/21 06:18:37 devinteske Exp $
26 *
27 * File: megahal.c
28 *
29 * Program: MegaHAL v8
30 *
31 * Purpose: To simulate a natural language conversation with a psychotic
32 * computer. This is achieved by learning from the user's
33 * input using a third-order Markov model on the word level.
34 * Words are considered to be sequences of characters separated
35 * by whitespace and punctuation. Replies are generated
36 * randomly based on a keyword, and they are scored using
37 * measures of surprise.
38 *
39 * Author: Mr. Jason L. Hutchens
40 *
41 * WWW: http://ciips.ee.uwa.edu.au/~hutch/hal/
42 *
43 * E-Mail: hutch@ciips.ee.uwa.edu.au
44 *
45 * Contact: The Centre for Intelligent Information Processing Systems
46 * Department of Electrical and Electronic Engineering
47 * The University of Western Australia
48 * AUSTRALIA 6907
49 *
50 * Phone: +61-8-9380-3856
51 *
52 * Facsimile: +61-8-9380-1168
53 *
54 * Notes: This file is best viewed with tabstops set to three spaces.
55 */
56
57 /*===========================================================================*/
58
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <stdarg.h>
62 #include <string.h>
63 #include <unistd.h>
64 #include <math.h>
65 #include <time.h>
66 #include <ctype.h>
67 #include "hx.h"
68 #include "xmalloc.h"
69
70 /*===========================================================================*/
71
72 #define P_THINK 40
73 #define D_KEY 100000
74 #define V_KEY 50000
75 #define D_THINK 500000
76 #define V_THINK 250000
77
78 #ifndef MIN
79 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
80 #endif
81
82 #define COOKIE "MegaHALv8"
83
84 /*===========================================================================*/
85
86 typedef enum {FALSE, TRUE} bool;
87
88 typedef struct {
89 u_int8_t length;
90 char *word;
91 } STRING;
92
93 typedef struct {
94 u_int32_t size;
95 STRING *entry;
96 u_int16_t *index;
97 } DICTIONARY;
98
99 typedef struct {
100 u_int16_t size;
101 STRING *from;
102 STRING *to;
103 } SWAP;
104
105 typedef struct NODE {
106 u_int16_t symbol;
107 u_int32_t usage;
108 u_int16_t count;
109 u_int16_t branch;
110 struct NODE **tree;
111 } TREE;
112
113 typedef struct {
114 u_int8_t order;
115 TREE *forward;
116 TREE *backward;
117 TREE **context;
118 DICTIONARY *dictionary;
119 } MODEL;
120
121 /*===========================================================================*/
122
123 void add_aux (MODEL *, DICTIONARY *, STRING);
124 void add_key (MODEL *, DICTIONARY *, STRING);
125 void add_node (TREE *, TREE *, int);
126 void add_swap (SWAP *, char *, char *);
127 TREE *add_symbol (TREE *, u_int16_t);
128 u_int16_t add_word (DICTIONARY *, STRING);
129 int babble (MODEL *, DICTIONARY *, DICTIONARY *);
130 bool boundary (char *, unsigned int);
131 void capitalize (char *);
132 void delay (struct htlc_conn *, char *);
133 bool dissimilar (DICTIONARY *, DICTIONARY *);
134 void error (const char *, const char *, ...);
135 float evaluate_reply (MODEL *, DICTIONARY *, DICTIONARY *);
136 void exithal (void);
137 TREE *find_symbol (TREE *, int);
138 TREE *find_symbol_add (TREE *, int);
139 u_int16_t find_word (DICTIONARY *, STRING);
140 void free_dictionary (DICTIONARY *);
141 char *generate_reply (MODEL *, DICTIONARY *);
142 void initialize_context (MODEL *);
143 void initialize_dictionary (DICTIONARY *);
144 void initialize_error (const char *);
145 DICTIONARY *initialize_list (const char *);
146 void initialize_status (const char *);
147 SWAP *initialize_swap (const char *);
148 void learn (MODEL *, DICTIONARY *);
149 void load_dictionary (FILE *, DICTIONARY *);
150 bool load_model (const char *, MODEL *);
151 void load_tree (FILE *, TREE *);
152 void load_word (FILE *, DICTIONARY *);
153 void make_greeting (DICTIONARY *);
154 DICTIONARY *make_keywords (MODEL *, DICTIONARY *);
155 char *make_output (DICTIONARY *);
156 void make_words (char *, DICTIONARY *);
157 DICTIONARY *new_dictionary (void);
158 MODEL *new_model (int);
159 TREE *new_node (void);
160 SWAP *new_swap (void);
161 void print_header (FILE *);
162 DICTIONARY *reply (MODEL *, DICTIONARY *);
163 void save_dictionary (FILE *, DICTIONARY *);
164 void save_model (const char *, MODEL *);
165 void save_tree (FILE *, TREE *);
166 void save_word (FILE *, STRING);
167 int search_dictionary (DICTIONARY *, STRING, bool *);
168 int search_node (TREE *, int, bool *);
169 int seed (MODEL *, DICTIONARY *);
170 void show_dictionary (DICTIONARY *);
171 void status (const char *, ...);
172 void train (MODEL *, const char *);
173 void update_context (MODEL *, int);
174 void update_model (MODEL *, int);
175 void upper (char *);
176 int wordcmp (STRING, STRING);
177 bool word_exists (DICTIONARY *, STRING);
178 void write_input (char *, char *);
179 void write_output (struct htlc_conn *, char *);
180
181 /*===========================================================================*/
182 #define ORDER 5
183 #define TIMEOUT 2
184
185 int hal_active = 0;
186 bool used_key, typing_delay = TRUE;
187 DICTIONARY *g_words = 0, *g_ban = 0, *g_aux = 0, *g_grt = 0;
188 MODEL *g_model = 0;
189 SWAP *g_swp = 0;
190 FILE *errorfp, *statusfp;
191
192 #include "getopt.h"
193
194 static struct option hal_opts[] = {
195 {"delay", 0, 0, 'd'},
196 {"quit", 0, 0, 'q'},
197 {"restart", 0, 0, 'r'},
198 {"save", 0, 0, 's'},
199 {0, 0, 0, 0}
200 };
201
202 void
cmd_hal(int argc,char ** argv,char * str,struct htlc_conn * htlc,struct hx_chat * chat)203 cmd_hal (int argc, char **argv, char *str, struct htlc_conn *htlc, struct hx_chat *chat)
204 {
205 int longind, o;
206 struct opt_r opt;
207
208 if (str) {} /* removes unused parameter compiler warning */
209 if (argc < 2) {
210 usage: hx_printf(htlc, chat, "usage: %s [-sdrq] [--save] [--delay] [--restart] [--quit]\n", argv[0]);
211 return;
212 }
213
214 opt.ind = 0;
215 opt.err_printf = 0;
216 while ((o = getopt_long_r(argc, argv, "dqrs", hal_opts, &longind, &opt)) != EOF) {
217 if (o == 0)
218 o = hal_opts[longind].val;
219 switch (o) {
220 case 's':
221 if (g_model) {
222 save_model("megahal.brn", g_model);
223 make_greeting(g_words);
224 }
225 break;
226 case 'd':
227 typing_delay = typing_delay == TRUE ? FALSE : TRUE;
228 break;
229 case 'r':
230 hal_active = 1;
231 break;
232 case 'q':
233 if (g_model)
234 save_model("megahal.brn", g_model);
235 exithal();
236 break;
237 default:
238 goto usage;
239 }
240 }
241 }
242
243 void
hal_rcv(struct htlc_conn * htlc,char * input_p,char * user)244 hal_rcv (struct htlc_conn *htlc, char *input_p, char *user)
245 {
246 char *output, *input = xstrdup(input_p);
247
248 if (!g_words) {
249 errorfp = stderr;
250 statusfp = stdout;
251 /*
252 * Create a dictionary which will be used to hold the segmented
253 * version of the user's input.
254 */
255 g_words = new_dictionary();
256
257 /*
258 * Do some initialisation
259 */
260 initialize_error("megahal.log");
261 srandom(time(0) + clock() + getpid());
262
263 /*
264 * Create a language model.
265 */
266 g_model = new_model(ORDER);
267
268 /*
269 * Train the model on a text if one exists
270 */
271 if (load_model("megahal.brn", g_model) == FALSE)
272 train(g_model, "megahal.trn");
273
274 /*
275 * Read a dictionary containing banned keywords, auxiliary keywords,
276 * greeting keywords and swap keywords
277 */
278 g_ban = initialize_list("megahal.ban");
279 g_aux = initialize_list("megahal.aux");
280 g_grt = initialize_list("megahal.grt");
281 g_swp = initialize_swap("megahal.swp");
282
283 initialize_status("megahal.txt");
284 }
285
286 write_input(input, user);
287 upper(input);
288 make_words(input, g_words);
289 learn(g_model, g_words);
290 output = generate_reply(g_model, g_words);
291 write_output(htlc, output);
292 xfree(output);
293 xfree(input);
294 }
295
296 /*---------------------------------------------------------------------------*/
297
298 /*
299 * Function: exithal
300 *
301 * Purpose: Terminate the program.
302 */
303 void
exithal(void)304 exithal (void)
305 {
306 hal_active = 0;
307 }
308
309 /*---------------------------------------------------------------------------*/
310
311 /*
312 * Function: initialize_error
313 *
314 * Purpose: Close the current error file pointer, and open a new one.
315 */
316 void
initialize_error(const char * filename)317 initialize_error (const char *filename)
318 {
319 if (errorfp != stderr)
320 fclose(errorfp);
321 if (!filename || !(errorfp = fopen(filename, "a"))) {
322 errorfp = stderr;
323 return;
324 }
325
326 print_header(errorfp);
327 }
328
329 /*---------------------------------------------------------------------------*/
330
331 /*
332 * Function: error
333 *
334 * Purpose: Print the specified message to the error file.
335 */
336 void
error(const char * title,const char * fmt,...)337 error (const char *title, const char *fmt, ...)
338 {
339 va_list argp;
340
341 fprintf(errorfp, "%s: ", title);
342 va_start(argp, fmt);
343 vfprintf(errorfp, fmt, argp);
344 va_end(argp);
345 fprintf(errorfp, ".\n");
346 fflush(errorfp);
347 }
348
349 /*---------------------------------------------------------------------------*/
350
351 /*
352 * Function: initialize_status
353 *
354 * Purpose: Close the current status file pointer, and open a new one.
355 */
356 void
initialize_status(const char * filename)357 initialize_status (const char *filename)
358 {
359 if (statusfp != stdout)
360 fclose(statusfp);
361 if (!filename || !(statusfp = fopen(filename, "a"))) {
362 statusfp = stdout;
363 return;
364 }
365
366 print_header(statusfp);
367 }
368
369 /*---------------------------------------------------------------------------*/
370
371 /*
372 * Function: status
373 *
374 * Purpose Print the specified message to the status file.
375 */
376 void
status(const char * fmt,...)377 status (const char *fmt, ...)
378 {
379 va_list argp;
380
381 va_start(argp, fmt);
382 vfprintf(statusfp, fmt, argp);
383 va_end(argp);
384 fflush(statusfp);
385 }
386
387 /*---------------------------------------------------------------------------*/
388
389 /*
390 * Function: print_header
391 *
392 * Purpose: Display a copyright message and timestamp.
393 */
394 void
print_header(FILE * fp)395 print_header (FILE *fp)
396 {
397 time_t t;
398 char timestamp[1024];
399 struct tm *local;
400
401 t = time(0);
402 local = localtime(&t);
403 strftime(timestamp, 1024, "Start at: [%Y/%m/%d %H:%M:%S]", local);
404
405 fprintf(fp, "(c)1998 Cambridge Center For Behavioral Studies all rights reserved\n"
406 "[MegaHALv8][Jason Hutchens]\n"
407 "%s\n", timestamp);
408 fflush(fp);
409 }
410
411 /*---------------------------------------------------------------------------*/
412
413 /*
414 * Function: write_output
415 *
416 * Purpose: Display the output string.
417 */
418 void
write_output(struct htlc_conn * htlc,char * output)419 write_output (struct htlc_conn *htlc, char *output)
420 {
421 time_t t;
422 char timestamp[1024];
423 struct tm *local;
424
425 capitalize(output);
426 t = time(0);
427 local = localtime(&t);
428 strftime(timestamp, 1024, "HAL[%H:%M:%S]", local);
429
430 delay(htlc, output);
431
432 status("%s %s\n", timestamp, output);
433 }
434
435 /*---------------------------------------------------------------------------*/
436
437 /*
438 * Function: capitalize
439 *
440 * Purpose: Convert a string to look nice.
441 */
442 void
capitalize(char * str)443 capitalize (char *str)
444 {
445 register unsigned int i;
446 bool start = TRUE;
447
448 for (i = 0; i < 3 && str[i]; i++)
449 if (isalpha(str[i])) {
450 if (start == TRUE)
451 str[i] = toupper(str[i]);
452 else
453 str[i] = tolower(str[i]);
454 start = FALSE;
455 }
456
457 for (i = 3; str[i]; i++) {
458 if (isalpha(str[i])) {
459 if (start == TRUE)
460 str[i] = toupper(str[i]);
461 else
462 str[i] = tolower(str[i]);
463 start = FALSE;
464 }
465 if (isspace(str[i])) {
466 switch (str[i - 1]) {
467 case '?':
468 case '.':
469 case '!':
470 start = TRUE;
471 }
472 } else {
473 switch (str[i]) {
474 case '"':
475 start = TRUE;
476 }
477 }
478 }
479 }
480
481 /*---------------------------------------------------------------------------*/
482
483 /*
484 * Function: upper
485 *
486 * Purpose: Convert a string to its uppercase representation.
487 */
488 void
upper(char * str)489 upper (char *str)
490 {
491 register unsigned int i;
492
493 for (i = 0; str[i]; i++)
494 str[i] = toupper(str[i]);
495 }
496
497 /*---------------------------------------------------------------------------*/
498
499 /*
500 * Function: write_input
501 *
502 * Purpose: Log the user's input
503 */
504 void
write_input(char * input,char * user)505 write_input (char *input, char *user)
506 {
507 time_t t;
508 char timestamp[1024], tmp[1024];
509 struct tm *local;
510
511 t = time(0);
512 local = localtime(&t);
513 strftime(tmp, 1024, "[%H:%M:%S]", local);
514 sprintf(timestamp, "%s%s", user, tmp);
515
516 status("%s %s\n", timestamp, input);
517 }
518
519 /*---------------------------------------------------------------------------*/
520
521 /*
522 * Function: add_word
523 *
524 * Purpose: Add a word to a dictionary, and return the identifier
525 * assigned to the word. If the word already exists in
526 * the dictionary, then return its current identifier
527 * without adding it again.
528 */
529 u_int16_t
add_word(DICTIONARY * dictionary,STRING word)530 add_word (DICTIONARY *dictionary, STRING word)
531 {
532 register int i;
533 int position;
534 bool found;
535
536 /*
537 * If the word's already in the dictionary, there is no need to add it
538 */
539 position = search_dictionary(dictionary, word, &found);
540 if (found == TRUE)
541 goto succeed;
542
543 /*
544 * Increase the number of words in the dictionary
545 */
546 dictionary->size++;
547
548 /*
549 * Allocate one more entry for the word index
550 */
551 dictionary->index = xrealloc(dictionary->index, 2 * dictionary->size);
552
553 /*
554 * Allocate one more entry for the word array
555 */
556 dictionary->entry = xrealloc(dictionary->entry, sizeof(STRING) * dictionary->size);
557
558 /*
559 * Copy the new word into the word array
560 */
561 dictionary->entry[dictionary->size - 1].length = word.length;
562 dictionary->entry[dictionary->size - 1].word = xmalloc(word.length);
563 memcpy(dictionary->entry[dictionary->size - 1].word, word.word, word.length);
564
565 /*
566 * Shuffle the word index to keep it sorted alphabetically
567 */
568 for (i = dictionary->size - 1; i > position; i--)
569 dictionary->index[i] = dictionary->index[i - 1];
570
571 /*
572 * Copy the new symbol identifier into the word index
573 */
574 dictionary->index[position] = dictionary->size - 1;
575
576 succeed:
577 return dictionary->index[position];
578 }
579
580 /*---------------------------------------------------------------------------*/
581
582 /*
583 * Function: search_dictionary
584 *
585 * Purpose: Search the dictionary for the specified word, returning its
586 * position in the index if found, or the position where it
587 * should be inserted otherwise.
588 */
589 int
search_dictionary(DICTIONARY * dictionary,STRING word,bool * find)590 search_dictionary (DICTIONARY *dictionary, STRING word, bool *find)
591 {
592 int position, min, max, middle, compar;
593
594 /*
595 * If the dictionary is empty, then obviously the word won't be found
596 */
597 if (!dictionary->size) {
598 position = 0;
599 goto notfound;
600 }
601
602 /*
603 * Initialize the lower and upper bounds of the search
604 */
605 min = 0;
606 max = dictionary->size - 1;
607 /*
608 * Search repeatedly, halving the search space each time, until either
609 * the entry is found, or the search space becomes empty
610 */
611 while (TRUE) {
612 /*
613 * See whether the middle element of the search space is greater
614 * than, equal to, or less than the element being searched for.
615 */
616 middle = (min + max) / 2;
617 compar = wordcmp(word, dictionary->entry[dictionary->index[middle]]);
618 /*
619 * If it is equal then we have found the element. Otherwise we
620 * can halve the search space accordingly.
621 */
622 if (!compar) {
623 position = middle;
624 goto found;
625 } else if (compar > 0) {
626 if (max == middle) {
627 position = middle + 1;
628 goto notfound;
629 }
630 min = middle + 1;
631 } else {
632 if (min == middle) {
633 position = middle;
634 goto notfound;
635 }
636 max = middle - 1;
637 }
638 }
639
640 found:
641 *find = TRUE;
642
643 return position;
644
645 notfound:
646 *find = FALSE;
647
648 return position;
649 }
650
651 /*---------------------------------------------------------------------------*/
652
653 /*
654 * Function: find_word
655 *
656 * Purpose: Return the symbol corresponding to the word specified.
657 * We assume that the word with index zero is equal to a
658 * NULL word, indicating an error condition.
659 */
660 u_int16_t
find_word(DICTIONARY * dictionary,STRING word)661 find_word (DICTIONARY *dictionary, STRING word)
662 {
663 int position;
664 bool found;
665
666 position = search_dictionary(dictionary, word, &found);
667
668 if (found == TRUE)
669 return dictionary->index[position];
670 else
671 return 0;
672 }
673
674 /*---------------------------------------------------------------------------*/
675
676 /*
677 * Function: wordcmp
678 *
679 * Purpose: Compare two words, and return an integer indicating whether
680 * the first word is less than, equal to or greater than the
681 * second word.
682 */
683 int
wordcmp(STRING word1,STRING word2)684 wordcmp (STRING word1, STRING word2)
685 {
686 register int i;
687 int bound;
688
689 bound = MIN(word1.length,word2.length);
690
691 for (i = 0; i < bound; i++)
692 if (word1.word[i] != word2.word[i])
693 return (int)(word1.word[i] - word2.word[i]);
694
695 if (word1.length < word2.length)
696 return -1;
697 if (word1.length > word2.length)
698 return 1;
699
700 return 0;
701 }
702
703 /*---------------------------------------------------------------------------*/
704
705 /*
706 * Function: free_dictionary
707 *
708 * Purpose: Release the memory consumed by the dictionary.
709 */
710 void
free_dictionary(DICTIONARY * dictionary)711 free_dictionary (DICTIONARY *dictionary)
712 {
713 register u_int32_t i;
714
715 if (!dictionary->size)
716 return;
717 for (i = 0; i < dictionary->size; i++)
718 xfree(dictionary->entry[i].word);
719 xfree(dictionary->entry);
720 dictionary->entry = 0;
721 xfree(dictionary->index);
722 dictionary->index = 0;
723 dictionary->size = 0;
724
725 initialize_dictionary(dictionary);
726 }
727
728 /*---------------------------------------------------------------------------*/
729
730 /*
731 * Function: initialize_dictionary
732 *
733 * Purpose: Add dummy words to the dictionary.
734 */
735 void
initialize_dictionary(DICTIONARY * dictionary)736 initialize_dictionary (DICTIONARY *dictionary)
737 {
738 char err[] = "<ERROR>", fin[] = "<FIN>";
739 STRING word, end;
740
741 word.word = err;
742 word.length = 7;
743 end.word = fin;
744 end.length = 5;
745
746 add_word(dictionary, word);
747 add_word(dictionary, end);
748 }
749
750 /*---------------------------------------------------------------------------*/
751
752 /*
753 * Function: new_dictionary
754 *
755 * Purpose: Allocate room for a new dictionary.
756 */
757 DICTIONARY *
new_dictionary(void)758 new_dictionary (void)
759 {
760 DICTIONARY *dictionary;
761
762 dictionary = xmalloc(sizeof(*dictionary));
763
764 dictionary->size = 0;
765 dictionary->index = 0;
766 dictionary->entry = 0;
767
768 initialize_dictionary(dictionary);
769
770 return dictionary;
771 }
772
773 /*---------------------------------------------------------------------------*/
774
775 /*
776 * Function: save_dictionary
777 *
778 * Purpose: Save a dictionary to the specified file.
779 */
780 void
save_dictionary(FILE * fp,DICTIONARY * dictionary)781 save_dictionary (FILE *fp, DICTIONARY *dictionary)
782 {
783 register u_int32_t i;
784
785 fwrite(&(dictionary->size), sizeof(dictionary->size), 1, fp);
786 for (i = 0; i < dictionary->size; i++)
787 save_word(fp, dictionary->entry[i]);
788 }
789
790 /*---------------------------------------------------------------------------*/
791
792 /*
793 * Function: load_dictionary
794 *
795 * Purpose: Load a dictionary from the specified file.
796 */
797 void
load_dictionary(FILE * fp,DICTIONARY * dictionary)798 load_dictionary (FILE *fp, DICTIONARY *dictionary)
799 {
800 register u_int32_t i;
801 u_int32_t size;
802
803 fread(&size, sizeof(size), 1, fp);
804 for (i = 0; i < size; i++)
805 load_word(fp, dictionary);
806 }
807
808 /*---------------------------------------------------------------------------*/
809
810 /*
811 * Function: save_word
812 *
813 * Purpose: Save a dictionary word to a file.
814 */
815 void
save_word(FILE * fp,STRING word)816 save_word (FILE *fp, STRING word)
817 {
818 fwrite(&(word.length), sizeof(word.length), 1, fp);
819 fwrite(word.word, sizeof(*(word.word)), word.length, fp);
820 }
821
822 /*---------------------------------------------------------------------------*/
823
824 /*
825 * Function: load_word
826 *
827 * Purpose: Load a dictionary word from a file.
828 */
829 void
load_word(FILE * fp,DICTIONARY * dictionary)830 load_word (FILE *fp, DICTIONARY *dictionary)
831 {
832 char buf[0xff];
833 STRING word;
834
835 fread(&(word.length), sizeof(word.length), 1, fp);
836 word.word = buf;
837 fread(word.word, sizeof(*(word.word)), word.length, fp);
838 add_word(dictionary, word);
839 }
840
841 /*---------------------------------------------------------------------------*/
842
843 /*
844 * Function: new_node
845 *
846 * Purpose: Allocate a new node for the n-gram tree, and initialise
847 * its contents to sensible values.
848 */
849 TREE *
new_node(void)850 new_node (void)
851 {
852 TREE *node;
853
854 /*
855 * Allocate memory for the new node
856 */
857 node = xmalloc(sizeof(*node));
858
859 /*
860 * Initialise the contents of the node
861 */
862 node->symbol = 0;
863 node->usage = 0;
864 node->count = 0;
865 node->branch = 0;
866 node->tree = 0;
867
868 return node;
869 }
870
871 /*---------------------------------------------------------------------------*/
872
873 /*
874 * Function: new_model
875 *
876 * Purpose: Create and initialise a new ngram model.
877 */
878 MODEL *
new_model(int order)879 new_model (int order)
880 {
881 MODEL *model;
882
883 model = xmalloc(sizeof(*model));
884
885 model->order = order;
886 model->forward = new_node();
887 model->backward = new_node();
888 model->context = xmalloc(sizeof(TREE *) * (order + 2));
889 initialize_context(model);
890 model->dictionary = new_dictionary();
891
892 return model;
893 }
894
895 /*---------------------------------------------------------------------------*/
896
897 /*
898 * Function: update_model
899 *
900 * Purpose: Update the model with the specified symbol.
901 */
902 void
update_model(MODEL * model,int symbol)903 update_model (MODEL *model, int symbol)
904 {
905 register u_int16_t i;
906
907 /*
908 * Update all of the models in the current context with the specified
909 * symbol.
910 */
911 for (i = model->order + 1; i > 0; i--)
912 if (model->context[i - 1])
913 model->context[i] = add_symbol(model->context[i - 1], (u_int16_t)symbol);
914 }
915
916 /*---------------------------------------------------------------------------*/
917
918 /*
919 * Function: update_context
920 *
921 * Purpose: Update the context of the model without adding the symbol.
922 */
923 void
update_context(MODEL * model,int symbol)924 update_context (MODEL *model, int symbol)
925 {
926 register u_int16_t i;
927
928 for (i = model->order + 1; i > 0; i--)
929 if (model->context[i - 1])
930 model->context[i] = find_symbol(model->context[i - 1], symbol);
931 }
932
933 /*---------------------------------------------------------------------------*/
934
935 /*
936 * Function: add_symbol
937 *
938 * Purpose: Update the statistics of the specified tree with the
939 * specified symbol, which may mean growing the tree if the
940 * symbol hasn't been seen in this context before.
941 */
942 TREE *
add_symbol(TREE * tree,u_int16_t symbol)943 add_symbol (TREE *tree, u_int16_t symbol)
944 {
945 TREE *node = 0;
946
947 /*
948 * Search for the symbol in the subtree of the tree node.
949 */
950 node = find_symbol_add(tree, symbol);
951
952 /*
953 *Increment the symbol counts
954 */
955 if (node->count < 65535) {
956 node->count++;
957 tree->usage++;
958 }
959
960 return node;
961 }
962
963 /*---------------------------------------------------------------------------*/
964
965 /*
966 * Function: find_symbol
967 *
968 * Purpose: Return a pointer to the child node, if one exists, which
969 * contains the specified symbol.
970 */
971 TREE *
find_symbol(TREE * node,int symbol)972 find_symbol (TREE *node, int symbol)
973 {
974 register int i;
975 TREE *found = 0;
976 bool found_symbol = FALSE;
977
978 /*
979 * Perform a binary search for the symbol.
980 */
981 i = search_node(node, symbol, &found_symbol);
982 if (found_symbol == TRUE)
983 found = node->tree[i];
984
985 return found;
986 }
987
988 /*---------------------------------------------------------------------------*/
989
990 /*
991 * Function: find_symbol_add
992 *
993 * Purpose: This function is conceptually similar to find_symbol,
994 * apart from the fact that if the symbol is not found,
995 * a new node is automatically allocated and added to the
996 * tree.
997 */
998 TREE *
find_symbol_add(TREE * node,int symbol)999 find_symbol_add (TREE *node, int symbol)
1000 {
1001 register int i;
1002 TREE *found = 0;
1003 bool found_symbol = FALSE;
1004
1005 /*
1006 * Perform a binary search for the symbol. If the symbol isn't found,
1007 * attach a new sub-node to the tree node so that it remains sorted.
1008 */
1009 i = search_node(node, symbol, &found_symbol);
1010 if (found_symbol == TRUE) {
1011 found = node->tree[i];
1012 } else {
1013 found = new_node();
1014 found->symbol = symbol;
1015 add_node(node, found, i);
1016 }
1017
1018 return found;
1019 }
1020
1021 /*---------------------------------------------------------------------------*/
1022
1023 /*
1024 * Function: add_node
1025 *
1026 * Purpose: Attach a new child node to the sub-tree of the tree
1027 * specified.
1028 */
1029 void
add_node(TREE * tree,TREE * node,int position)1030 add_node (TREE *tree, TREE *node, int position)
1031 {
1032 register int i;
1033
1034 /*
1035 * Allocate room for one more child node, which may mean allocating
1036 * the sub-tree from scratch.
1037 */
1038 tree->tree = xrealloc(tree->tree, sizeof(TREE *) * (tree->branch + 1));
1039
1040 /*
1041 * Shuffle the nodes down so that we can insert the new node at the
1042 * subtree index given by position.
1043 */
1044 for (i = tree->branch; i > position; i--)
1045 tree->tree[i] = tree->tree[i - 1];
1046
1047 /*
1048 * Add the new node to the sub-tree.
1049 */
1050 tree->tree[position] = node;
1051 tree->branch++;
1052 }
1053
1054 /*---------------------------------------------------------------------------*/
1055
1056 /*
1057 * Function: search_node
1058 *
1059 * Purpose: Perform a binary search for the specified symbol on the
1060 * subtree of the given node. Return the position of the
1061 * child node in the subtree if the symbol was found, or the
1062 * position where it should be inserted to keep the subtree
1063 * sorted if it wasn't.
1064 */
1065 int
search_node(TREE * node,int symbol,bool * found_symbol)1066 search_node (TREE *node, int symbol, bool *found_symbol)
1067 {
1068 register int position;
1069 int min, max, middle, compar;
1070
1071 /*
1072 * Handle the special case where the subtree is empty.
1073 */
1074 if (!node->branch) {
1075 position = 0;
1076 goto notfound;
1077 }
1078
1079 /*
1080 * Perform a binary search on the subtree.
1081 */
1082 min = 0;
1083 max = node->branch - 1;
1084 while (TRUE) {
1085 middle = (min + max) / 2;
1086 compar = symbol-node->tree[middle]->symbol;
1087 if (!compar) {
1088 position = middle;
1089 goto found;
1090 } else if (compar > 0) {
1091 if (max == middle) {
1092 position = middle + 1;
1093 goto notfound;
1094 }
1095 min = middle + 1;
1096 } else {
1097 if (min == middle) {
1098 position = middle;
1099 goto notfound;
1100 }
1101 max = middle - 1;
1102 }
1103 }
1104
1105 found:
1106 *found_symbol = TRUE;
1107
1108 return position;
1109
1110 notfound:
1111 *found_symbol = FALSE;
1112
1113 return position;
1114 }
1115
1116 /*---------------------------------------------------------------------------*/
1117
1118 /*
1119 * Function: initialize_context
1120 *
1121 * Purpose: Set the context of the model to a default value.
1122 */
1123 void
initialize_context(MODEL * model)1124 initialize_context (MODEL *model)
1125 {
1126 register u_int16_t i;
1127
1128 for (i = 0; i <= model->order; i++)
1129 model->context[i] = 0;
1130 }
1131
1132 /*---------------------------------------------------------------------------*/
1133
1134 /*
1135 * Function: learn
1136 *
1137 * Purpose: Learn from the user's input.
1138 */
1139 void
learn(MODEL * model,DICTIONARY * words)1140 learn (MODEL *model, DICTIONARY *words)
1141 {
1142 register int i;
1143 u_int16_t symbol;
1144
1145 /*
1146 * We only learn from inputs which are long enough
1147 */
1148 if (words->size <= model->order)
1149 return;
1150
1151 /*
1152 * Train the model in the forwards direction. Start by initializing
1153 * the context of the model.
1154 */
1155 initialize_context(model);
1156 model->context[0] = model->forward;
1157 for (i = 0; i < (signed)words->size; i++) {
1158 /*
1159 * Add the symbol to the model's dictionary if necessary, and then
1160 * update the forward model accordingly.
1161 */
1162 symbol = add_word(model->dictionary, words->entry[i]);
1163 update_model(model, symbol);
1164 }
1165 /*
1166 * Add the sentence-terminating symbol.
1167 */
1168 update_model(model, 1);
1169
1170 /*
1171 * Train the model in the backwards direction. Start by initializing
1172 * the context of the model.
1173 */
1174 initialize_context(model);
1175 model->context[0] = model->backward;
1176 for (i = words->size - 1; i >= 0; --i) {
1177 /*
1178 * Find the symbol in the model's dictionary, and then update
1179 * the backward model accordingly.
1180 */
1181 symbol = find_word(model->dictionary, words->entry[i]);
1182 update_model(model, symbol);
1183 }
1184 /*
1185 * Add the sentence-terminating symbol.
1186 */
1187 update_model(model, 1);
1188
1189 return;
1190 }
1191
1192 /*---------------------------------------------------------------------------*/
1193
1194 /*
1195 * Function: train
1196 *
1197 * Purpose: Infer a MegaHAL brain from the contents of a text file.
1198 */
1199 void
train(MODEL * model,const char * filename)1200 train (MODEL *model, const char *filename)
1201 {
1202 FILE *fp;
1203 char buffer[1024];
1204 DICTIONARY *words = 0;
1205
1206 if (!filename || !(fp = fopen(filename, "r")))
1207 return;
1208
1209 words = new_dictionary();
1210
1211 while (fgets(buffer, sizeof(buffer), fp)) {
1212 if (buffer[0] == '#')
1213 continue;
1214 buffer[strlen(buffer) - 1] = 0;
1215 upper(buffer);
1216 make_words(buffer, words);
1217 learn(model, words);
1218 }
1219 fclose(fp);
1220
1221 xfree(words->entry);
1222 xfree(words);
1223 }
1224
1225 /*---------------------------------------------------------------------------*/
1226
1227 /*
1228 * Function: show_dictionary
1229 *
1230 * Purpose: Display the dictionary for training purposes.
1231 */
1232 void
show_dictionary(DICTIONARY * dictionary)1233 show_dictionary (DICTIONARY *dictionary)
1234 {
1235 register u_int32_t i;
1236 FILE *fp;
1237
1238 if (!(fp = fopen("megahal.dic", "w"))) {
1239 error("show_dictionary", "Unable to open file");
1240 return;
1241 }
1242
1243 for (i = 0; i < dictionary->size; i++)
1244 fprintf(fp, "%.*s\n", dictionary->entry[i].length, dictionary->entry[i].word);
1245
1246 fclose(fp);
1247 }
1248
1249 /*---------------------------------------------------------------------------*/
1250
1251 /*
1252 * Function: save_model
1253 *
1254 * Purpose: Save the current state to a MegaHAL brain file.
1255 */
1256 void
save_model(const char * filename,MODEL * model)1257 save_model (const char *filename, MODEL *model)
1258 {
1259 FILE *fp;
1260
1261 show_dictionary(model->dictionary);
1262
1263 if (!filename || !(fp = fopen(filename, "w"))) {
1264 error("save_model", "Unable to open file `%s'", filename);
1265 return;
1266 }
1267
1268 fwrite(COOKIE, sizeof(char), sizeof(COOKIE) - 1, fp);
1269 fwrite(&(model->order), sizeof(model->order), 1, fp);
1270 save_tree(fp, model->forward);
1271 save_tree(fp, model->backward);
1272 save_dictionary(fp, model->dictionary);
1273
1274 fclose(fp);
1275 }
1276
1277 /*---------------------------------------------------------------------------*/
1278
1279 /*
1280 * Function: save_tree
1281 *
1282 * Purpose: Save a tree structure to the specified file.
1283 */
1284 void
save_tree(FILE * fp,TREE * node)1285 save_tree (FILE *fp, TREE *node)
1286 {
1287 register u_int16_t i;
1288
1289 fwrite(&(node->symbol), sizeof(node->symbol), 1, fp);
1290 fwrite(&(node->usage), sizeof(node->usage), 1, fp);
1291 fwrite(&(node->count), sizeof(node->count), 1, fp);
1292 fwrite(&(node->branch), sizeof(node->branch), 1, fp);
1293
1294 for (i = 0; i < node->branch; i++)
1295 save_tree(fp, node->tree[i]);
1296 }
1297
1298 /*---------------------------------------------------------------------------*/
1299
1300 /*
1301 * Function: load_tree
1302 *
1303 * Purpose: Load a tree structure from the specified file.
1304 */
1305 void
load_tree(FILE * fp,TREE * node)1306 load_tree (FILE *fp, TREE *node)
1307 {
1308 register u_int16_t i;
1309
1310 fread(&(node->symbol), sizeof(node->symbol), 1, fp);
1311 fread(&(node->usage), sizeof(node->usage), 1, fp);
1312 fread(&(node->count), sizeof(node->count), 1, fp);
1313 fread(&(node->branch), sizeof(node->branch), 1, fp);
1314
1315 if (!node->branch)
1316 return;
1317
1318 node->tree = xmalloc(sizeof(TREE *) * node->branch);
1319
1320 for (i = 0; i < node->branch; i++) {
1321 node->tree[i] = new_node();
1322 load_tree(fp, node->tree[i]);
1323 }
1324 }
1325
1326 /*---------------------------------------------------------------------------*/
1327
1328 /*
1329 * Function: load_model
1330 *
1331 * Purpose: Load a model into memory.
1332 */
1333 bool
load_model(const char * filename,MODEL * model)1334 load_model (const char *filename, MODEL *model)
1335 {
1336 FILE *fp;
1337 char cookie[sizeof(COOKIE)];
1338
1339 if (!filename || !(fp = fopen(filename, "r"))) {
1340 error("load_model", "Unable to open file `%s'", filename);
1341 return FALSE;
1342 }
1343
1344 fread(cookie, sizeof(*cookie), sizeof(COOKIE) - 1, fp);
1345 if (memcmp(cookie, COOKIE, sizeof(COOKIE) - 1)) {
1346 error("load_model", "File `%s' is not a MegaHAL brain (bad cookie %.*s)",
1347 filename, sizeof(COOKIE) - 1, cookie);
1348 fclose(fp);
1349 return FALSE;
1350 }
1351
1352 fread(&(model->order), sizeof(model->order), 1, fp);
1353 load_tree(fp, model->forward);
1354 load_tree(fp, model->backward);
1355 load_dictionary(fp, model->dictionary);
1356
1357 fclose(fp);
1358
1359 return TRUE;
1360 }
1361
1362 static char period_str[] = ".";
1363
1364 /*---------------------------------------------------------------------------*/
1365
1366 /*
1367 * Function: make_words
1368 *
1369 * Purpose: Break a string into an array of words.
1370 */
1371 void
make_words(char * input,DICTIONARY * words)1372 make_words (char *input, DICTIONARY *words)
1373 {
1374 unsigned int offset = 0;
1375
1376 /*
1377 * Clear the entries in the dictionary
1378 */
1379 words->size = 0;
1380
1381 /*
1382 * If the string is empty then do nothing, for it contains no words.
1383 */
1384 if (!input[0])
1385 return;
1386
1387 /*
1388 * Loop forever.
1389 */
1390 for (;;) {
1391 /*
1392 * If the current character is of the same type as the previous
1393 * character, then include it in the word. Otherwise, terminate
1394 * the current word.
1395 */
1396 if (boundary(input, offset)) {
1397 /*
1398 * Add the word to the dictionary
1399 */
1400 words->entry = xrealloc(words->entry, (words->size + 1) * sizeof(STRING));
1401 words->entry[words->size].length = offset;
1402 words->entry[words->size].word = input;
1403 words->size++;
1404 if (offset == strlen(input))
1405 break;
1406 input += offset;
1407 offset = 0;
1408 } else {
1409 offset++;
1410 }
1411 }
1412
1413 /*
1414 * If the last word isn't punctuation, then replace it with a
1415 * full-stop character.
1416 */
1417 if (isalnum(words->entry[words->size - 1].word[0])) {
1418 words->entry = xrealloc(words->entry, (words->size + 1) * sizeof(STRING));
1419 words->entry[words->size].length = 1;
1420 words->entry[words->size].word = period_str;
1421 words->size++;
1422 } else if (!strchr("!.?", words->entry[words->size - 1].word[0])) {
1423 words->entry[words->size - 1].length = 1;
1424 words->entry[words->size - 1].word = period_str;
1425 }
1426 }
1427
1428 /*---------------------------------------------------------------------------*/
1429 /*
1430 * Function: boundary
1431 *
1432 * Purpose: Return whether or not a word boundary exists in a string
1433 * at the specified location.
1434 */
1435 bool
boundary(char * str,unsigned int position)1436 boundary (char *str, unsigned int position)
1437 {
1438 if (!position)
1439 return FALSE;
1440
1441 if (position == strlen(str))
1442 return TRUE;
1443
1444 if (
1445 str[position] == '\'' &&
1446 isalpha(str[position - 1]) &&
1447 isalpha(str[position + 1])
1448 )
1449 return FALSE;
1450
1451 if (
1452 position > 1 &&
1453 str[position - 1] == '\'' &&
1454 isalpha(str[position - 2]) &&
1455 isalpha(str[position])
1456 )
1457 return FALSE;
1458
1459 if (
1460 isalpha(str[position]) &&
1461 !isalpha(str[position - 1])
1462 )
1463 return TRUE;
1464
1465 if (
1466 !isalpha(str[position]) &&
1467 isalpha(str[position - 1])
1468 )
1469 return TRUE;
1470
1471 if (isdigit(str[position]) != isdigit(str[position - 1]))
1472 return TRUE;
1473
1474 return FALSE;
1475 }
1476
1477 /*---------------------------------------------------------------------------*/
1478 /*
1479 * Function: make_greeting
1480 *
1481 * Purpose: Put some special words into the dictionary so that the
1482 * program will respond as if to a new judge.
1483 */
1484 void
make_greeting(DICTIONARY * words)1485 make_greeting (DICTIONARY *words)
1486 {
1487 words->size = 0;
1488 if (g_grt->size > 2)
1489 add_word(words, g_grt->entry[random() % (g_grt->size - 2) + 2]);
1490 }
1491
1492 /*---------------------------------------------------------------------------*/
1493 /*
1494 * Function: generate_reply
1495 *
1496 * Purpose: Take a string of user input and return a string of output
1497 * which may vaguely be construed as containing a reply to
1498 * whatever is in the input string.
1499 */
1500 char *
generate_reply(MODEL * model,DICTIONARY * words)1501 generate_reply (MODEL *model, DICTIONARY *words)
1502 {
1503 static DICTIONARY *dummy = 0;
1504 DICTIONARY *replywords, *keywords;
1505 float surprise, max_surprise;
1506 char *output = 0;
1507 int count, basetime;
1508
1509 /*
1510 * Create an array of keywords from the words in the user's input
1511 */
1512 keywords = make_keywords(model, words);
1513
1514 /*
1515 * Make sure some sort of reply exists
1516 */
1517 if (!dummy)
1518 dummy = new_dictionary();
1519 replywords = reply(model, dummy);
1520 if (dissimilar(words, replywords) == TRUE)
1521 output = make_output(replywords);
1522
1523 /*
1524 * Loop for the specified waiting period, generating and evaluating
1525 * replies
1526 */
1527 max_surprise = (float)-1.0;
1528 count = 0;
1529 basetime = time(0);
1530 do {
1531 replywords = reply(model, keywords);
1532 surprise = evaluate_reply(model, keywords, replywords);
1533 ++count;
1534 if ((surprise > max_surprise) && (dissimilar(words, replywords) == TRUE)) {
1535 max_surprise = surprise;
1536 output = make_output(replywords);
1537 }
1538 } while ((time(0) - basetime) < TIMEOUT);
1539
1540 /*
1541 * Return the best answer we generated
1542 */
1543 return output ? output : xstrdup("I forgot what i was gonna say!");
1544 }
1545
1546 /*---------------------------------------------------------------------------*/
1547
1548 /*
1549 * Function: dissimilar
1550 *
1551 * Purpose: Return TRUE or FALSE depending on whether the dictionaries
1552 * are the same or not.
1553 */
1554 bool
dissimilar(DICTIONARY * words1,DICTIONARY * words2)1555 dissimilar (DICTIONARY *words1, DICTIONARY *words2)
1556 {
1557 register unsigned int i;
1558
1559 if (words1->size != words2->size)
1560 return TRUE;
1561 for (i = 0; i < words1->size; i++)
1562 if (wordcmp(words1->entry[i], words2->entry[i]))
1563 return TRUE;
1564
1565 return FALSE;
1566 }
1567
1568 /*---------------------------------------------------------------------------*/
1569
1570 /*
1571 * Function: make_keywords
1572 *
1573 * Purpose: Put all the interesting words from the user's input into
1574 * a keywords dictionary, which will be used when generating
1575 * a reply.
1576 */
1577 DICTIONARY *
make_keywords(MODEL * model,DICTIONARY * words)1578 make_keywords (MODEL *model, DICTIONARY *words)
1579 {
1580 static DICTIONARY *keys = 0;
1581 register unsigned int i, j;
1582 int c;
1583
1584 if (!keys)
1585 keys = new_dictionary();
1586 else
1587 free_dictionary(keys);
1588
1589 for (i = 0; i < words->size; i++) {
1590 /*
1591 * Find the symbol ID of the word. If it doesn't exist in
1592 * the model, or if it begins with a non-alphanumeric
1593 * character, or if it is in the exclusion array, then
1594 * skip over it.
1595 */
1596 c = 0;
1597 for (j = 0; j < g_swp->size; j++)
1598 if (!wordcmp(g_swp->from[j], words->entry[i])) {
1599 add_key(model, keys, g_swp->to[j]);
1600 c = 1;
1601 }
1602 if (!c)
1603 add_key(model, keys, words->entry[i]);
1604 }
1605
1606 if (keys->size > 2)
1607 for (i = 0; i < words->size; i++) {
1608 c = 0;
1609 for (j = 0; j < g_swp->size; j++)
1610 if (!wordcmp(g_swp->from[j], words->entry[i])) {
1611 add_aux(model, keys, g_swp->to[j]);
1612 c = 1;
1613 }
1614 if (!c)
1615 add_aux(model, keys, words->entry[i]);
1616 }
1617
1618 return keys;
1619 }
1620
1621 /*---------------------------------------------------------------------------*/
1622
1623 /*
1624 * Function: add_key
1625 *
1626 * Purpose: Add a word to the keyword dictionary.
1627 */
1628 void
add_key(MODEL * model,DICTIONARY * keys,STRING word)1629 add_key (MODEL *model, DICTIONARY *keys, STRING word)
1630 {
1631 int symbol;
1632
1633 symbol = find_word(model->dictionary, word);
1634 if (!symbol)
1635 return;
1636 if (!isalnum(word.word[0]))
1637 return;
1638 symbol = find_word(g_ban, word);
1639 if (symbol)
1640 return;
1641 symbol = find_word(g_aux, word);
1642 if (symbol)
1643 return;
1644
1645 add_word(keys, word);
1646 }
1647
1648 /*---------------------------------------------------------------------------*/
1649
1650 /*
1651 * Function: add_aux
1652 *
1653 * Purpose: Add an auxilliary keyword to the keyword dictionary.
1654 */
1655 void
add_aux(MODEL * model,DICTIONARY * keys,STRING word)1656 add_aux (MODEL *model, DICTIONARY *keys, STRING word)
1657 {
1658 int symbol;
1659
1660 symbol = find_word(model->dictionary, word);
1661 if (!symbol)
1662 return;
1663 if (!isalnum(word.word[0]))
1664 return;
1665 symbol = find_word(g_aux, word);
1666 if (!symbol)
1667 return;
1668 add_word(keys, word);
1669 }
1670
1671 /*---------------------------------------------------------------------------*/
1672
1673 /*
1674 * Function: reply
1675 *
1676 * Purpose: Generate a dictionary of reply words appropriate to the
1677 * given dictionary of keywords.
1678 */
1679 DICTIONARY *
reply(MODEL * model,DICTIONARY * keys)1680 reply (MODEL *model, DICTIONARY *keys)
1681 {
1682 DICTIONARY *replies;
1683 register int i;
1684 int symbol;
1685 bool start = TRUE;
1686
1687 replies = new_dictionary();
1688 replies->size = 0;
1689
1690 /*
1691 * Start off by making sure that the model's context is empty.
1692 */
1693 initialize_context(model);
1694 model->context[0] = model->forward;
1695 used_key = FALSE;
1696
1697 /*
1698 * Generate the reply in the forward direction.
1699 */
1700 while (TRUE) {
1701 /*
1702 * Get a random symbol from the current context.
1703 */
1704 if (start == TRUE)
1705 symbol = seed(model, keys);
1706 else
1707 symbol = babble(model, keys, replies);
1708 if (!symbol || symbol == 1)
1709 break;
1710 start = FALSE;
1711
1712 /*
1713 * Append the symbol to the reply dictionary.
1714 */
1715 replies->entry = xrealloc(replies->entry, (replies->size + 1) * sizeof(STRING));
1716
1717 replies->entry[replies->size].length =
1718 model->dictionary->entry[symbol].length;
1719 replies->entry[replies->size].word =
1720 model->dictionary->entry[symbol].word;
1721 replies->size++;
1722
1723 /*
1724 * Extend the current context of the model with the current symbol.
1725 */
1726 update_context(model, symbol);
1727 }
1728
1729 /*
1730 * Start off by making sure that the model's context is empty.
1731 */
1732 initialize_context(model);
1733 model->context[0] = model->backward;
1734
1735 /*
1736 * Re-create the context of the model from the current reply
1737 * dictionary so that we can generate backwards to reach the
1738 * beginning of the string.
1739 */
1740 if (replies->size > 0)
1741 for (i = MIN(replies->size - 1, model->order); i >= 0; --i) {
1742 symbol = find_word(model->dictionary, replies->entry[i]);
1743 update_context(model, symbol);
1744 }
1745
1746 /*
1747 * Generate the reply in the backward direction.
1748 */
1749 while (TRUE) {
1750 /*
1751 * Get a random symbol from the current context.
1752 */
1753 symbol = babble(model, keys, replies);
1754 if (!symbol || symbol == 1)
1755 break;
1756
1757 /*
1758 * Prepend the symbol to the reply dictionary.
1759 */
1760 replies->entry = xrealloc(replies->entry, (replies->size + 1) * sizeof(STRING));
1761
1762 /*
1763 * Shuffle everything up for the prepend.
1764 */
1765 for (i = replies->size; i > 0; --i) {
1766 replies->entry[i].length = replies->entry[i - 1].length;
1767 replies->entry[i].word = replies->entry[i - 1].word;
1768 }
1769
1770 replies->entry[0].length = model->dictionary->entry[symbol].length;
1771 replies->entry[0].word = model->dictionary->entry[symbol].word;
1772 replies->size++;
1773
1774 /*
1775 * Extend the current context of the model with the current symbol.
1776 */
1777 update_context(model, symbol);
1778 }
1779
1780 return replies;
1781 }
1782
1783 /*---------------------------------------------------------------------------*/
1784
1785 /*
1786 * Function: evaluate_reply
1787 *
1788 * Purpose: Measure the average surprise of keywords relative to the
1789 * language model.
1790 */
1791 float
evaluate_reply(MODEL * model,DICTIONARY * keys,DICTIONARY * words)1792 evaluate_reply (MODEL *model, DICTIONARY *keys, DICTIONARY *words)
1793 {
1794 register unsigned int j;
1795 register int i;
1796 int symbol, count, num = 0;
1797 float probability, entropy = (float)0.0;
1798 TREE *node;
1799
1800 if (words->size <= 0)
1801 return (float)0.0;
1802 initialize_context(model);
1803 model->context[0] = model->forward;
1804 for (i = 0; i < (signed)words->size; i++) {
1805 symbol = find_word(model->dictionary, words->entry[i]);
1806 if (find_word(keys, words->entry[i])) {
1807 probability = (float)0.0;
1808 count = 0;
1809 ++num;
1810 for (j = 0; j < model->order; j++)
1811 if (model->context[j]) {
1812 node = find_symbol(model->context[j], symbol);
1813 probability += (float)(node->count) /
1814 (float)(model->context[j]->usage);
1815 ++count;
1816 }
1817 if (count > 0.0)
1818 entropy -= (float)log(probability / (float)count);
1819 }
1820 update_context(model, symbol);
1821 }
1822
1823 initialize_context(model);
1824 model->context[0] = model->backward;
1825 for (i = words->size - 1; i >= 0; --i) {
1826 symbol = find_word(model->dictionary, words->entry[i]);
1827 if (find_word(keys, words->entry[i])) {
1828 probability = (float)0.0;
1829 count = 0;
1830 ++num;
1831 for (j = 0; j < model->order; j++)
1832 if (model->context[j]) {
1833 node = find_symbol(model->context[j], symbol);
1834 probability += (float)(node->count) /
1835 (float)(model->context[j]->usage);
1836 ++count;
1837 }
1838 if (count > 0.0)
1839 entropy -= (float)log(probability / (float)count);
1840 }
1841 update_context(model, symbol);
1842 }
1843
1844 if (num >= 8)
1845 entropy /= (float)sqrt(num - 1);
1846 if (num >= 16)
1847 entropy /= (float)num;
1848
1849 return entropy;
1850 }
1851
1852 /*---------------------------------------------------------------------------*/
1853
1854 /*
1855 * Function: make_output
1856 *
1857 * Purpose: Generate a string from the dictionary of reply words.
1858 */
1859 char *
make_output(DICTIONARY * words)1860 make_output (DICTIONARY *words)
1861 {
1862 char *output = 0;
1863 register unsigned int i, len;
1864
1865 if (!words->size)
1866 return xstrdup("I am utterly speechless!");
1867
1868 len = 1;
1869 for (i = 0; i < words->size; i++)
1870 len += words->entry[i].length;
1871 output = xmalloc(len + 1);
1872 len = 0;
1873 for (i = 0; i < words->size; i++) {
1874 memcpy(&(output[len]), words->entry[i].word, words->entry[i].length);
1875 len += words->entry[i].length;
1876 }
1877 output[len] = 0;
1878
1879 return output;
1880 }
1881
1882 /*---------------------------------------------------------------------------*/
1883
1884 /*
1885 * Function: babble
1886 *
1887 * Purpose: Return a random symbol from the current context, or a
1888 * zero symbol identifier if we've reached either the
1889 * start or end of the sentence. Select the symbol based
1890 * on probabilities, favouring keywords. In all cases,
1891 * use the longest available context to choose the symbol.
1892 */
1893 int
babble(MODEL * model,DICTIONARY * keys,DICTIONARY * words)1894 babble (MODEL *model, DICTIONARY *keys, DICTIONARY *words)
1895 {
1896 TREE *node = 0;
1897 register int i;
1898 int count, symbol = 0;
1899
1900 /*
1901 * Select the longest available context.
1902 */
1903 for (i = 0; i <= model->order; i++)
1904 if (model->context[i])
1905 node = model->context[i];
1906
1907 if (!node || !node->branch)
1908 return 0;
1909
1910 /*
1911 * Choose a symbol at random from this context.
1912 */
1913 i = random() % node->branch;
1914 count = random() % node->usage;
1915 while (count >= 0) {
1916 /*
1917 * If the symbol occurs as a keyword, then use it. Only use an
1918 * auxilliary keyword if a normal keyword has already been used.
1919 */
1920 symbol = node->tree[i]->symbol;
1921
1922 if (
1923 (find_word(keys, model->dictionary->entry[symbol])) &&
1924 (used_key == TRUE ||
1925 (!find_word(g_aux, model->dictionary->entry[symbol]))) &&
1926 (word_exists(words, model->dictionary->entry[symbol]) == FALSE)
1927 ) {
1928 used_key = TRUE;
1929 break;
1930 }
1931 count -= node->tree[i]->count;
1932 i = i >= (node->branch - 1) ? 0 : i + 1;
1933 }
1934
1935 return symbol;
1936 }
1937
1938 /*---------------------------------------------------------------------------*/
1939
1940 /*
1941 * Function: word_exists
1942 *
1943 * Purpose: A silly brute-force searcher for the reply string.
1944 */
1945 bool
word_exists(DICTIONARY * dictionary,STRING word)1946 word_exists (DICTIONARY *dictionary, STRING word)
1947 {
1948 register u_int32_t i;
1949
1950 for (i = 0; i < dictionary->size; i++)
1951 if (!wordcmp(dictionary->entry[i], word))
1952 return TRUE;
1953
1954 return FALSE;
1955 }
1956
1957 /*---------------------------------------------------------------------------*/
1958
1959 /*
1960 * Function: seed
1961 *
1962 * Purpose: Seed the reply by guaranteeing that it contains a
1963 * keyword, if one exists.
1964 */
1965 int
seed(MODEL * model,DICTIONARY * keys)1966 seed (MODEL *model, DICTIONARY *keys)
1967 {
1968 register unsigned int i, stop;
1969 int symbol;
1970
1971 if (!model->context[0]->branch)
1972 symbol = 0;
1973 else
1974 symbol = random() % model->context[0]->branch;
1975
1976 if (keys->size > 2) {
1977 do {
1978 i = random() % keys->size;
1979 } while (i < 2);
1980 stop = i;
1981 while (TRUE) {
1982 if (
1983 (find_word(model->dictionary, keys->entry[i])) &&
1984 (!find_word(g_aux, keys->entry[i]))
1985 ) {
1986 symbol = find_word(model->dictionary, keys->entry[i]);
1987 return symbol;
1988 }
1989 i++;
1990 if (i == keys->size)
1991 i = 2;
1992 if (i == stop)
1993 return symbol;
1994 }
1995 }
1996
1997 return symbol;
1998 }
1999
2000 /*---------------------------------------------------------------------------*/
2001
2002 /*
2003 * Function: new_swap
2004 *
2005 * Purpose: Allocate a new swap structure.
2006 */
2007 SWAP *
new_swap(void)2008 new_swap (void)
2009 {
2010 SWAP *list;
2011
2012 list = xmalloc(sizeof(*list));
2013 list->size = 0;
2014 list->from = 0;
2015 list->to = 0;
2016
2017 return list;
2018 }
2019
2020 /*---------------------------------------------------------------------------*/
2021
2022 /*
2023 * Function: add_swap
2024 *
2025 * Purpose: Add a new entry to the swap structure.
2026 */
2027 void
add_swap(SWAP * list,char * s,char * d)2028 add_swap (SWAP *list, char *s, char *d)
2029 {
2030 list->size++;
2031
2032 list->from = xrealloc(list->from, sizeof(STRING) * (list->size));
2033 list->to = xrealloc(list->to, sizeof(STRING) * (list->size));
2034
2035 list->from[list->size - 1].length = strlen(s);
2036 list->from[list->size - 1].word = xstrdup(s);
2037 list->to[list->size - 1].length = strlen(d);
2038 list->to[list->size - 1].word = xstrdup(d);
2039 }
2040
2041 /*---------------------------------------------------------------------------*/
2042
2043 /*
2044 * Function: initialize_swap
2045 *
2046 * Purpose: Read a swap structure from a file.
2047 */
2048 SWAP *
initialize_swap(const char * filename)2049 initialize_swap (const char *filename)
2050 {
2051 SWAP *list;
2052 FILE *fp;
2053 char buffer[1024], *from, *to;
2054
2055 list = new_swap();
2056
2057 if (!filename || !(fp = fopen(filename, "r")))
2058 return list;
2059
2060 while (fgets(buffer, sizeof(buffer), fp)) {
2061 if (buffer[0] == '#')
2062 continue;
2063 from = strtok(buffer, "\t ");
2064 to = strtok(0, "\t \n#");
2065 add_swap(list, from, to);
2066 }
2067
2068 fclose(fp);
2069
2070 return list;
2071 }
2072
2073 /*---------------------------------------------------------------------------*/
2074
2075 /*
2076 * Function: initialize_list
2077 *
2078 * Purpose: Read a dictionary from a file.
2079 */
2080 DICTIONARY
initialize_list(const char * filename)2081 *initialize_list (const char *filename)
2082 {
2083 DICTIONARY *list;
2084 FILE *fp;
2085 STRING word;
2086 char *string, buffer[1024];
2087
2088 list = new_dictionary();
2089
2090 if (!filename || !(fp = fopen(filename, "r")))
2091 return list;
2092
2093 while (fgets(buffer, sizeof(buffer), fp)) {
2094 if (buffer[0] == '#')
2095 continue;
2096 string = strtok(buffer, "\t \n#");
2097 if (string && string[0]) {
2098 word.length = strlen(string);
2099 word.word = xstrdup(buffer);
2100 add_word(list, word);
2101 }
2102 }
2103
2104 fclose(fp);
2105
2106 return list;
2107 }
2108
2109 /*---------------------------------------------------------------------------*/
2110
2111 /*
2112 * Function: delay
2113 *
2114 * Purpose: Display the string to stdout as if it was typed by a human.
2115 */
2116 void
delay(struct htlc_conn * htlc,char * str)2117 delay (struct htlc_conn *htlc, char *str)
2118 {
2119 register int i;
2120 register char *out;
2121
2122 /*
2123 * Don't simulate typing if the feature is turned off
2124 */
2125 if (typing_delay == FALSE) {
2126 hx_send_chat(htlc, 0, str);
2127 return;
2128 }
2129
2130 out = xmalloc(strlen(str) + 1);
2131 for (i = 0; str[i]; i++) {
2132 /*
2133 * Standard keyboard delay
2134 */
2135 usleep(D_KEY + random() % V_KEY - random() % V_KEY);
2136 out[i] = str[i];
2137
2138 /*
2139 * A random thinking delay
2140 */
2141 if ((!isalnum(str[i])) && ((random() % 100) < P_THINK))
2142 usleep(D_THINK + random() % V_THINK - random() % V_THINK);
2143 }
2144 out[i] = 0;
2145 hx_send_chat(htlc, 0, out);
2146 xfree(out);
2147 }
2148
2149 /*===========================================================================*/
2150
2151 /*
2152 * Revision 1.8 1997/12/24 03:17:01 hutch
2153 * More bug fixes, and hopefully the final contest version!
2154 *
2155 * Revision 1.7 1997/12/22 13:18:09 hutch
2156 * A few more bug fixes, and non-repeating implemented.
2157 *
2158 * Revision 1.6 1997/12/22 04:27:04 hutch
2159 * A few minor bug fixes.
2160 *
2161 * Revision 1.5 1997/12/15 04:35:59 hutch
2162 * Final Loebner version!
2163 *
2164 * Revision 1.4 1997/12/11 05:45:29 hutch
2165 * the almost finished version.
2166 *
2167 * Revision 1.3 1997/12/10 09:08:09 hutch
2168 * Now Loebner complient (tm)
2169 *
2170 * Revision 1.2 1997/12/08 06:22:32 hutch
2171 * Tidied up.
2172 *
2173 * Revision 1.1 1997/12/05 07:11:44 hutch
2174 * Initial revision
2175 *
2176 * Revision 1.7 1997/12/04 07:07:13 hutch
2177 * Added load and save functions, and tidied up some code/
2178 *
2179 * Revision 1.6 1997/12/02 08:34:47 hutch
2180 * Added the ban, aux and swp functions.
2181 *
2182 * Revision 1.5 1997/12/02 06:03:04 hutch
2183 * Updated to use a special terminating symbol, and to store only
2184 * branches of maximum depth, as they are the only ones used in
2185 * the reply.
2186 *
2187 * Revision 1.4 1997/10/28 09:23:12 hutch
2188 * MegaHAL is babbling nicely, but without keywords.
2189 *
2190 * Revision 1.3 1997/10/15 09:04:03 hutch
2191 * MegaHAL can parrot back whatever the user says.
2192 *
2193 * Revision 1.2 1997/07/21 04:03:28 hutch
2194 * Fully working.
2195 *
2196 * Revision 1.1 1997/07/15 01:55:25 hutch
2197 * Initial revision
2198 *
2199 * Revision 1.1 1997/07/15 01:54:21 hutch
2200 * Initial revision
2201 */
2202
2203 /*===========================================================================*/
2204
2205