1 /*************************************************************************/
2 /* Copyright (c) 2014 Amir Plivatsky                                     */
3 /* All rights reserved                                                   */
4 /*                                                                       */
5 /* Use of the link grammar parsing system is subject to the terms of the */
6 /* license set forth in the LICENSE file included with this software.    */
7 /* This license allows free redistribution and use in source and binary  */
8 /* forms, with or without modification, subject to certain conditions.   */
9 /*                                                                       */
10 /*************************************************************************/
11 
12 #include <stdlib.h>
13 
14 #ifdef USE_WORDGRAPH_DISPLAY
15 #include <stdio.h>
16 #include <errno.h>
17 #include <stdint.h>
18 #ifdef HAVE_FORK
19 #include <unistd.h>    /* fork() and execl() */
20 #include <sys/wait.h>  /* waitpid() */
21 #endif
22 #ifdef HAVE_PRCTL
23 #include <sys/prctl.h> /* prctl() */
24 #endif
25 #include <signal.h>    /* SIG* */
26 
27 #include "print/print-util.h" /* for append_string */
28 #include "utilities.h" /* for dyn_str functions and UNREACHABLE */
29 #endif /* USE_WORDGRAPH_DISPLAY */
30 
31 #include "api-structures.h"
32 #include "error.h"
33 #include "string-set.h"
34 #include "tok-structures.h"
35 #include "wordgraph.h"
36 
37 #ifdef __APPLE__
38 #define POPEN_DOT
39 #endif /* __APPLE__ */
40 
41 #if USE_WORDGRAPH_DISPLAY || defined(DEBUG)
gword_morpheme(Sentence sent,const Gword * w)42 GNUC_UNUSED const char *gword_morpheme(Sentence sent, const Gword *w)
43 {
44 	const char *mt;
45 	char buff[64];
46 
47 	switch (w->morpheme_type)
48 	{
49 		case MT_INVALID:
50 			mt = "MT_INVALID";
51 			break;
52 		case MT_WORD:
53 			mt = "MT_WORD";
54 			break;
55 		case MT_FEATURE:
56 			mt = "MT_FEATURE";
57 			break;
58 		case MT_INFRASTRUCTURE:
59 			mt = "MT_I-S";
60 			break;
61 		case MT_WALL:
62 			mt = "MT_WALL";
63 			break;
64 		case MT_EMPTY:
65 			mt = "MT_EMPTY";
66 			break;
67 		case MT_UNKNOWN:
68 			mt = "MT_UNKNOWN";
69 			break;
70 		case MT_TEMPLATE:
71 			mt = "MT_TEMPLATE";
72 			break;
73 		case MT_ROOT:
74 			mt = "MT_ROOT";
75 			break;
76 		case MT_CONTR:
77 			mt = "MT_CONTR";
78 			break;
79 		case MT_PUNC:
80 			mt = "MT_PUNC";
81 			break;
82 		case MT_STEM:
83 			mt = "MT_STEM";
84 			break;
85 		case MT_PREFIX:
86 			mt = "MT_PREFIX";
87 			break;
88 		case MT_MIDDLE:
89 			mt = "MT_MIDDLE";
90 			break;
91 		case MT_SUFFIX:
92 			mt = "MT_SUFFIX";
93 			break;
94 		default:
95 			/* No truncation is expected. */
96 			snprintf(buff, sizeof(buff), "MT_%d", (int)w->morpheme_type);
97 			mt = string_set_add(buff, sent->string_set);
98 	}
99 
100 	return mt;
101 }
102 #endif /* USE_WORDGRAPH_DISPLAY || defined(DEBUG) */
103 
104 #if USE_WORDGRAPH_DISPLAY
105 /* === Wordgraph graphical representation === */
106 
/**
 * Append a legend cluster to the dot description in wgd.
 * The legend shows the layout of a subword node and a table of the
 * status-flag abbreviations. If WGR_SUB is set in mode, the sample node
 * is wrapped in a sample unsplit-word cluster.
 */
static void wordgraph_legend(dyn_str *wgd, unsigned int mode)
{
	static const struct
	{
		const char *abbrev;
		const char *meaning;
	} status_flags[] = {
		{ "RE", "Matched a regex" },
		{ "SP", "Result of spell guess" },
		{ "RU", "Separated run-on word" },
		{ "HA", "Has an alternative" },
		{ "UNS", "Also unsplit_word" },
		{ "IN", "In the dict file" },
		{ "FI", "First char is uppercase" }
	};
	const size_t num_flags = sizeof(status_flags)/sizeof(status_flags[0]);

	/* The sample-cluster wrapper is emitted only in WGR_SUB mode. */
	const char *cluster_open = "";
	const char *cluster_close = "";
	if (mode & WGR_SUB)
	{
		cluster_open = "subgraph cluster_unsplit_word {\n"
		               "label=\"ordinal-number unsplit-word\";\n";
		cluster_close = "}\n";
	}

	append_string(wgd,
		"subgraph cluster_legend {\n"
		"label=Legend;\n"
		"%s"
		"legend [label=\"subword\\n(status-flags)\\nmorpheme-type\"];\n"
		"legend [xlabel=\"ordinal-number\\ndebug-label\"];\n"
		"%s"
		"legend_width [width=4.5 height=0 shape=none label=<\n"
		"<table border='0' cellborder='1' cellspacing='0'>\n"
		"<tr><td colspan='2'>status-flags</td></tr>\n",
		cluster_open, cluster_close);

	/* One table row per status flag. */
	for (size_t k = 0; k < num_flags; k++)
	{
		append_string(wgd,
		  "<tr><td align='left'>%s</td><td align='left'>%s</td></tr>\n",
		  status_flags[k].abbrev, status_flags[k].meaning);
	}

	append_string(wgd,
		"</table>>];"
		"}\n"
		"subgraph cluster_legend_top_space {\n"
			"style=invis legend_dummy [style=invis height=0 shape=box]\n"
		"};\n"
	);
}
150 
151 /**
152  * Graph node name: Add "Sentence:" for the main node.
153  * Also escape " and \ with a \.
154  */
wlabel(Sentence sent,const Gword * w)155 static const char *wlabel(Sentence sent, const Gword *w)
156 {
157 	const char *s;
158 	const char sentence_label[] = "Sentence:\\n";
159 	dyn_str *l = dyn_str_new();
160 	char c0[] = "\0\0";
161 
162 	assert((NULL != w) && (NULL != w->subword), "Word must exist");
163 	if ('\0' == *w->subword)
164 		 return string_set_add("(nothing)", sent->string_set);
165 
166 	if (w == sent->wordgraph) dyn_strcat(l, sentence_label);
167 
168 	for (s = w->subword; *s; s++)
169 	{
170 		switch (*s)
171 		{
172 			case '\"':
173 				dyn_strcat(l, "\\\"");
174 				break;
175 			case '\\':
176 				dyn_strcat(l, "\\");
177 				break;
178 			default:
179 				*c0 = *s;
180 				dyn_strcat(l, c0);
181 		}
182 	}
183 
184 	char *label_str = dyn_str_take(l);
185 	s = string_set_add(label_str, sent->string_set);
186 	free(label_str);
187 	return s;
188 }
189 
190 /**
191  *  Generate the wordgraph in dot(1) format, for debug.
192  */
wordgraph2dot(Sentence sent,unsigned int mode,const char * modestr)193 static dyn_str *wordgraph2dot(Sentence sent, unsigned int mode, const char *modestr)
194 {
195 	const Gword *w;
196 	Gword	**wp;
197 	dyn_str *wgd = dyn_str_new(); /* the wordgraph in dot representation */
198 	char nn[2*sizeof(char *) + 2 + 2 + 1]; /* \"%p\" node name: "0x..."+NUL*/
199 
200 	/* This function is called only if we have a wordgraph, in which case
201 	 * chain_next is non-NULL. So stop static analyzers to complain that
202 	 * it can be possibly NULL. */
203 	UNREACHABLE(NULL == sent->wordgraph->chain_next);
204 
205 	append_string(wgd, "# Mode: %s\n", modestr);
206 	dyn_strcat(wgd, "digraph G {\nsize =\"30,20\";\nrankdir=LR;\n");
207 	if ((mode & (WGR_SUB)) && !(mode & WGR_COMPACT))
208 		dyn_strcat(wgd, "newrank=true;\n");
209 	if (mode & WGR_LEGEND) wordgraph_legend(wgd, mode);
210 	append_string(wgd, "\"%p\" [shape=box,style=filled,color=\".7 .3 1.0\"];\n",
211 	              sent->wordgraph);
212 
213 	for (w = sent->wordgraph; w; w = w->chain_next)
214 	{
215 		bool show_node;
216 
217 		if (!(mode & WGR_UNSPLIT) && (MT_INFRASTRUCTURE != w->morpheme_type))
218 		{
219 			Gword *wu;
220 
221 			show_node = false;
222 			/* In this mode nodes that are only unsplit_word are not shown. */
223 			for (wu = sent->wordgraph; wu; wu = wu->chain_next)
224 			{
225 				if (NULL != wu->next)
226 				{
227 					for (wp = wu->next; *wp; wp++)
228 					{
229 						if (w == *wp)
230 						{
231 							show_node = true;
232 							break;
233 						}
234 					}
235 				}
236 			}
237 
238 			if (!show_node) continue;
239 		}
240 
241 		snprintf(nn, sizeof(nn), "\"%p\"", w);
242 
243 		/* Subword node format:
244 		 *                     +------------------+
245 		 *                     +                  +
246 		 *                     +    w->subword    +
247 		 *                     +    (w->flags)    +
248 		 *                     + w->morpheme_type +
249 		 *                     +                  +
250 		 *                     +------------------+
251 		 *          w->node_num  } <- external node label
252 		 *           w->label    }
253 		 *
254 		 * The flags and morpheme type are printed symbolically.
255 		 * The node_num field is the ordinal number of word creation.
256 		 * The label shows the code positions that created the subword.
257 		 * The external node label may appear at other positions near the node.
258 		 *
259 		 * FIXME: Use HTML labels.
260 		 */
261 
262 		append_string(wgd, "%s [label=\"%s\\n(%s)\\n%s\"];\n", nn,
263 			wlabel(sent, w), gword_status(sent, w), gword_morpheme(sent, w));
264 
265 		if (!(mode & WGR_DBGLABEL))
266 		{
267 			append_string(wgd, "%s [xlabel=\"%zu",
268 							  nn, w->node_num);
269 		}
270 		else
271 		{
272 			append_string(wgd, "%s [xlabel=\"%zu\\n%s",
273 							  nn, w->node_num, w->label);
274 		}
275 
276 		/* For debugging this function: display also hex node names. */
277 		if (mode & WGR_DOTDEBUG)
278 			append_string(wgd, "\\n%p-%s", w, wlabel(sent, w));
279 
280 		dyn_strcat(wgd, "\"];\n");
281 
282 		if (NULL != w->next)
283 		{
284 			for (wp = w->next; *wp; wp++)
285 			{
286 				append_string(wgd, "%s->\"%p\" [label=next color=red];\n",
287 				              nn, *wp);
288 			}
289 		}
290 		if (mode & WGR_PREV)
291 		{
292 			if (NULL != w->prev)
293 			{
294 				for (wp = w->prev; *wp; wp++)
295 				{
296 					append_string(wgd, "%s->\"%p\" [label=prev color=blue];\n",
297 					              nn, *wp);
298 				}
299 			}
300 		}
301 		if (mode & WGR_UNSPLIT)
302 		{
303 			if (!(mode & WGR_SUB) && (NULL != w->unsplit_word))
304 			{
305 				append_string(wgd, "%s->\"%p\" [label=unsplit];\n",
306 				              nn, w->unsplit_word);
307 			}
308 		}
309 	}
310 
311 	if (mode & WGR_SUB)
312 	{
313 		const Gword *old_unsplit = NULL;
314 
315 		for (w = sent->wordgraph; w; w = w->chain_next)
316 		{
317 			if (NULL != w->unsplit_word)
318 			{
319 				if (w->unsplit_word != old_unsplit)
320 				{
321 					if (NULL != old_unsplit) dyn_strcat(wgd, "}\n");
322 					append_string(wgd, "subgraph \"cluster-%p\" {", w->unsplit_word);
323 					append_string(wgd, "label=\"%zu %s\"; \n",
324 						w->unsplit_word->node_num, wlabel(sent, w->unsplit_word));
325 
326 					old_unsplit = w->unsplit_word;
327 				}
328 				snprintf(nn, sizeof(nn), "\"%p\"", w);
329 				if (strstr(dyn_str_value(wgd), nn))
330 					append_string(wgd, "\"%p\"; ", w);
331 			}
332 		}
333 		dyn_strcat(wgd, "}\n");
334 	}
335 	else
336 	{
337 #ifdef WGR_SHOW_TERMINATOR_AT_LHS /* not defined - not useful */
338 		const Gword *terminating_node = NULL;
339 #endif
340 
341 		dyn_strcat(wgd, "{rank=same; ");
342 		for (w = sent->wordgraph->chain_next; w; w = w->chain_next)
343 		{
344 			snprintf(nn, sizeof(nn), "\"%p\"", w);
345 			if (IS_SENTENCE_WORD(sent, w) &&
346 			    ((mode & WGR_UNSPLIT) || strstr(dyn_str_value(wgd), nn)))
347 			{
348 				append_string(wgd, "%s; ", nn);
349 			}
350 
351 #ifdef WGR_SHOW_TERMINATOR_AT_LHS
352 			if (NULL == w->next) terminating_node = w;
353 #endif
354 		}
355 		dyn_strcat(wgd, "}\n");
356 
357 #ifdef WGR_SHOW_TERMINATOR_AT_LHS
358 		if (terminating_node)
359 			append_string(wgd, "{rank=sink; \"%p\"}\n", terminating_node);
360 #endif
361 	}
362 
363 	dyn_strcat(wgd, "\n}\n");
364 
365 	return wgd;
366 }
367 
368 #if defined(HAVE_FORK) && !defined(POPEN_DOT)
369 static pid_t pid; /* XXX not reentrant */
370 
371 #ifndef HAVE_PRCTL
372 /**
373  * Cancel the wordgraph viewers, to be used if there is fork() but no prctl().
374  */
wordgraph_show_cancel(void)375 static void wordgraph_show_cancel(void)
376 {
377 		kill(pid, SIGTERM);
378 }
#endif /* !HAVE_PRCTL */
#endif /* HAVE_FORK && !POPEN_DOT */
381 
/* The graphviz command to invoke. It can be overridden at build time. */
#ifndef DOT_COMMAND
#define DOT_COMMAND "dot"
#endif

/* The output driver option given to the graphviz command. */
#ifndef DOT_DRIVER
#define DOT_DRIVER "-Txlib"
#endif

/* In case files are used, their names are fixed. So more than one thread
 * (or program) cannot use the word-graph display at the same time. This
 * can be corrected, even though there is not much point in doing that
 * (displaying the word-graph is for debug). */
#define DOT_FILENAME "lg-wg.vg"

/* Command lines for popen(): POPEN_DOT_CMD uses the graphviz driver
 * directly, POPEN_DOT_CMD_NATIVE converts to a JPEG file and shows it with
 * the image viewer of the platform (when one is defined below). */
#define POPEN_DOT_CMD DOT_COMMAND " " DOT_DRIVER
#ifndef POPEN_DOT_CMD_NATIVE
#  ifdef _WIN32
#    ifndef IMAGE_VIEWER
#      define IMAGE_VIEWER "rundll32 PhotoViewer,ImageView_Fullscreen"
#    endif
#    define WGJPG "%TEMP%\\lg-wg.jpg"
#    define POPEN_DOT_CMD_NATIVE \
				DOT_COMMAND " -Tjpg>" WGJPG "&" IMAGE_VIEWER " " WGJPG "&del " WGJPG
#  elif defined(__APPLE__)
#    ifndef IMAGE_VIEWER
#      define IMAGE_VIEWER "open -W"
#    endif
#    define WGJPG "$TMPDIR/lg-wg.jpg"
#    define POPEN_DOT_CMD_NATIVE \
				DOT_COMMAND " -Tjpg>" WGJPG ";" IMAGE_VIEWER " " WGJPG ";rm " WGJPG
#  else
#    define POPEN_DOT_CMD_NATIVE POPEN_DOT_CMD
#  endif
#endif
416 
417 #if !defined HAVE_FORK || defined POPEN_DOT
418 #ifdef _MSC_VER
419 #define popen _popen
420 #define pclose _pclose
421 #endif
/**
 * popen a command and write the given input to it.
 * If the system doesn't have fork(), popen() is used to launch "dot".
 * This implementation is inferior to the one below that uses fork(),
 * in which the window remains open and is updated automatically when
 * new sentences are entered. With popen(), the program blocks at
 * pclose() and the user needs to close the window after each sentence.
 *
 * cmd:  the command line to launch.
 * wgds: the text to write to the command.
 * Returns true if popen(), fputs() and pclose() all succeeded.
 */
static bool x_popen(const char *cmd, const char *wgds)
{
	lgdebug(+3, "Invoking: %s\n", cmd);

	FILE *const pipe_out = popen(cmd, "w");
	if (NULL == pipe_out)
	{
		prt_error("Error: popen of '%s' failed: %s\n", cmd, strerror(errno));
		return false;
	}

	bool ok = true;

	if (EOF == fputs(wgds, pipe_out)) /* see default_error_handler() */
	{
		prt_error("Error: x_popen(): fputs() error: %s\n", strerror(errno));
		ok = false;
	}

	/* The pipe is closed even if the write failed. */
	if (-1 == pclose(pipe_out))
	{
		prt_error("Error: x_popen(): pclose() error: %s\n", strerror(errno));
		ok = false;
	}

	return ok;
}
457 #else
x_forkexec(const char * const argv[],pid_t * vpid,const char err[])458 static bool x_forkexec(const char *const argv[], pid_t *vpid, const char err[])
459 {
460 	/* Fork/exec a graph viewer, and leave it in the background until we exit.
461 	 * On exit, send SIGHUP. If prctl() is not available and the program
462 	 * crashes, then it is left to the user to exit the viewer. */
463 	if (0 < *vpid)
464 	{
465 		pid_t rpid = waitpid(*vpid, NULL, WNOHANG);
466 
467 		if (0 == rpid) return true; /* viewer still active */
468 		if (-1 == rpid)
469 		{
470 			prt_error("Error: waitpid(%d): %s\n", *vpid, strerror(errno));
471 			*vpid = 0;
472 			return false;
473 		}
474 	}
475 
476 	*vpid = fork();
477 	switch (*vpid)
478 	{
479 		case -1:
480 			prt_error("Error: fork(): %s\n", strerror(errno));
481 			return false;
482 		case 0:
483 #ifdef HAVE_PRCTL
484 			if (-1 == prctl(PR_SET_PDEATHSIG, SIGHUP))
485 			{
486 					prt_error("Error: prctl: %s\n", strerror(errno));
487 					/* Non-fatal error - continue. */
488 			}
489 #endif
490 			/* Not closing fd 0/1/2, to allow interaction with the program */
491 			execvp(argv[0], (char **)argv);
492 			prt_error("Error: execlp of %s: %s%s\n",
493 			          argv[0], strerror(errno), (ENOENT == errno) ? err : "");
494 			_exit(1);
495 		default:
496 #ifndef HAVE_PRCTL
497 			if (0 != atexit(wordgraph_show_cancel))
498 			{
499 				 prt_error("Warning: atexit(wordgraph_show_cancel) failed.\n");
500 				/* Non-fatal error - continue. */
501 			}
502 #endif
503 			break;
504 	}
505 
506 	return true;
507 }
508 #endif /* !defined HAVE_FORK || defined POPEN_DOT */
509 
/* The directory for temporary files: taken from the environment if set,
 * else a fixed fallback. Note that getenv() is evaluated on each use. */
#if defined(_WIN32)
#define TMPDIR ((NULL != getenv("TEMP")) ? getenv("TEMP") : ".")
#else
#define TMPDIR ((NULL != getenv("TMPDIR")) ? getenv("TMPDIR") : "/tmp")
#endif

/* Set fn to "fn1/fn2", in memory obtained by alloca() in the calling
 * function. The arguments fn1 and fn2 are evaluated more than once. */
#define concatfn(fn, fn1, fn2) \
	((fn) = alloca(strlen(fn1) + 1 + strlen(fn2) + 1), \
	 strcpy((fn), (fn1)), \
	 strcat((fn), "/"), \
	 strcat((fn), (fn2)))
519 
wordgraph_unlink_xtmpfile(void)520 static void wordgraph_unlink_xtmpfile(void)
521 {
522 	char *fn;
523 
524 	if (!test_enabled("gvfile"))
525 	{
526 		concatfn(fn, TMPDIR, DOT_FILENAME);
527 		if (unlink(fn) == -1)
528 			prt_error("Warning: Cannot unlink %s: %s\n", fn, strerror(errno));
529 	}
530 }
531 
532 /**
533  * Display the word-graph in the indicated mode.
534  * This is for debug and inspiration. It is not reentrant due to the
535  * static pid and the possibly created fixed filenames.
536  * When Using X11, a "dot -Txlib" program is launched on the graph
537  * description file.  The xlib driver refreshes the graph when the file is
538  * changed, displaying additional sentences in the same window.  The viewer
539  * program exits on program end (see the comments in the code).  When
540  * compiled with MSVC or MINGW, the system PhotoViewer is used by default,
541  * unless !wg:x is used (for using X11 when available).
542  *
543  * The "dot" and the "PhotoViewer" programs must be in the PATH.
544  *
545  * FIXME? "dot" may get a SEGV due to memory corruptions in it (a known
546  * problem - exists even in 2.38). This can be worked-around by trying it
547  * again until it succeeds (but the window size, if changed by the user,
548  * will not be preserved).
549  *
550  * modestr: a graph display mode as defined in wordgraph.h (default "ldu").
551  */
sentence_display_wordgraph(Sentence sent,const char * modestr)552 bool sentence_display_wordgraph(Sentence sent, const char *modestr)
553 {
554 	dyn_str *wgd;
555 	char *gvf_name = NULL;
556 	bool generate_gvfile = test_enabled("gvfile"); /* keep it for debug */
557 	char *wgds;
558 	bool gvfile = false;
559 	uint32_t mode = 0;
560 	const char *mp;
561 	bool rc = true;
562 
563 	for (mp = modestr; '\0' != *mp && ',' != *mp; mp++)
564 	{
565 		if ((*mp >= 'a') && (*mp <= 'z')) mode |= 1<<(*mp-'a');
566 	}
567 	if ((0 == mode) || (WGR_X11 == mode))
568 		mode |= WGR_LEGEND|WGR_DBGLABEL|WGR_UNSPLIT;
569 
570 	wgd = wordgraph2dot(sent, mode, modestr);
571 	wgds = dyn_str_take(wgd);
572 
573 #if defined(HAVE_FORK) && !defined(POPEN_DOT)
574 	gvfile = true;
575 #endif
576 
577 	if (gvfile || generate_gvfile)
578 	{
579 		FILE *gvf;
580 		bool gvf_error = false;
581 		static bool wordgraph_unlink_xtmpfile_needed = true;
582 
583 		concatfn(gvf_name, TMPDIR, DOT_FILENAME);
584 		gvf = fopen(gvf_name, "w");
585 		if (NULL == gvf)
586 		{
587 			prt_error("Error: %s(): fopen() of %s failed: %s\n",
588 						 __func__, gvf_name, strerror(errno));
589 			gvf_error = true;
590 		}
591 		else
592 		{
593 			if (fputs(wgds, gvf) == EOF)
594 			{
595 				gvf_error = true;
596 				prt_error("Error: %s(): fputs() to %s failed: %s\n",
597 							 __func__, gvf_name, strerror(errno));
598 			}
599 			if (fclose(gvf) == EOF)
600 			{
601 				gvf_error = true;
602 				prt_error("Error: %s(): fclose() of %s failed: %s\n",
603 							  __func__, gvf_name, strerror(errno));
604 			}
605 		}
606 		if (gvf_error && gvfile) /* we need it - cannot continue */
607 		{
608 			rc = false;
609 			goto finish;
610 		}
611 
612 		if (wordgraph_unlink_xtmpfile_needed)
613 		{
614 			/* The filename is fixed - removal needed only once. */
615 			wordgraph_unlink_xtmpfile_needed = false;
616 			atexit(wordgraph_unlink_xtmpfile);
617 		}
618 	}
619 
620 #ifdef _WIN32
621 #define EXITKEY "ALT-F4"
622 #elif __APPLE__
623 #define EXITKEY "⌘-Q"
624 #endif
625 
626 #ifdef EXITKEY
627 	prt_error("Press "EXITKEY" in the graphical display window to continue\n");
628 #endif
629 
630 #if !defined HAVE_FORK || defined POPEN_DOT
631 	rc = x_popen((mode & WGR_X11)? POPEN_DOT_CMD : POPEN_DOT_CMD_NATIVE, wgds);
632 #else
633 	{
634 		assert(NULL != gvf_name, "DOT filename not initialized (#define mess?)");
635 		const char *const args[] = { DOT_COMMAND, DOT_DRIVER, gvf_name, NULL };
636 		const char notfound[] =
637 			" (command not in PATH; \"graphviz\" package not installed?).";
638 		rc = x_forkexec(args, &pid, notfound);
639 	}
640 #endif
641 
642 finish:
643 	free(wgds);
644 	return rc;
645 }
646 #else
/**
 * Replacement used when the library is built without wordgraph-display
 * support: issue an error message and fail. The arguments are not used.
 */
bool sentence_display_wordgraph(Sentence sent, const char *modestr)
{
	prt_error("Error: Library not configured with wordgraph-display\n");
	return false;
}
652 #endif /* USE_WORDGRAPH_DISPLAY */
653