xref: /openbsd/usr.bin/mandoc/tag.c (revision 2b14f697)
/* $OpenBSD: tag.c,v 1.38 2023/11/24 04:48:02 schwarze Exp $ */
/*
 * Copyright (c) 2015, 2016, 2018, 2019, 2020, 2022, 2023
 *               Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Functions to tag syntax tree nodes.
 * For internal use by mandoc(1) validation modules only.
 */
21c0a657b3Sschwarze #include <sys/types.h>
22c0a657b3Sschwarze 
237d109111Sschwarze #include <assert.h>
243af42e7fSschwarze #include <limits.h>
25c0a657b3Sschwarze #include <stddef.h>
2624807e10Sschwarze #include <stdint.h>
27*2b14f697Sschwarze #include <stdio.h>
28c0a657b3Sschwarze #include <stdlib.h>
29c0a657b3Sschwarze #include <string.h>
30c0a657b3Sschwarze 
31c0a657b3Sschwarze #include "mandoc_aux.h"
32c4b66caeSschwarze #include "mandoc_ohash.h"
33*2b14f697Sschwarze #include "mandoc.h"
340ac7e6ecSschwarze #include "roff.h"
359226948cSschwarze #include "mdoc.h"
366e2a0df9Sschwarze #include "roff_int.h"
37c0a657b3Sschwarze #include "tag.h"
38c0a657b3Sschwarze 
/*
 * One entry of the tag table: a tag name together with
 * the syntax tree nodes currently registered for it.
 */
struct tag_entry {
	/* Array of registered nodes; grown on demand by tag_put(). */
	struct roff_node **nodes;
	/* Number of slots allocated in nodes. */
	size_t	 maxnodes;
	/* Number of slots of nodes that are in use. */
	size_t	 nnodes;
	/* Priority of the registered nodes; lower numbers win. */
	int	 prio;
	/* NUL-terminated tag name; serves as the hash key. */
	char	 s[];
};
46c0a657b3Sschwarze 
47a875cd0eSschwarze static void		 tag_move_href(struct roff_man *,
48a875cd0eSschwarze 				struct roff_node *, const char *);
499226948cSschwarze static void		 tag_move_id(struct roff_node *);
509226948cSschwarze 
51c0a657b3Sschwarze static struct ohash	 tag_data;
52c0a657b3Sschwarze 
53c0a657b3Sschwarze 
54c0a657b3Sschwarze /*
550ac7e6ecSschwarze  * Set up the ohash table to collect nodes
562ae7e873Sschwarze  * where various marked-up terms are documented.
572ae7e873Sschwarze  */
580ac7e6ecSschwarze void
tag_alloc(void)590ac7e6ecSschwarze tag_alloc(void)
600ac7e6ecSschwarze {
61c4b66caeSschwarze 	mandoc_ohash_init(&tag_data, 4, offsetof(struct tag_entry, s));
620ac7e6ecSschwarze }
632ae7e873Sschwarze 
640ac7e6ecSschwarze void
tag_free(void)650ac7e6ecSschwarze tag_free(void)
660ac7e6ecSschwarze {
670ac7e6ecSschwarze 	struct tag_entry	*entry;
680ac7e6ecSschwarze 	unsigned int		 slot;
690ac7e6ecSschwarze 
704c8dba62Sschwarze 	if (tag_data.info.free == NULL)
714c8dba62Sschwarze 		return;
720ac7e6ecSschwarze 	entry = ohash_first(&tag_data, &slot);
730ac7e6ecSschwarze 	while (entry != NULL) {
740ac7e6ecSschwarze 		free(entry->nodes);
750ac7e6ecSschwarze 		free(entry);
760ac7e6ecSschwarze 		entry = ohash_next(&tag_data, &slot);
770ac7e6ecSschwarze 	}
780ac7e6ecSschwarze 	ohash_delete(&tag_data);
794c8dba62Sschwarze 	tag_data.info.free = NULL;
80c0a657b3Sschwarze }
81c0a657b3Sschwarze 
82c0a657b3Sschwarze /*
830ac7e6ecSschwarze  * Set a node where a term is defined,
847fbd65bbSschwarze  * unless the term is already defined at a lower priority.
85c0a657b3Sschwarze  */
86c0a657b3Sschwarze void
tag_put(const char * s,int prio,struct roff_node * n)870ac7e6ecSschwarze tag_put(const char *s, int prio, struct roff_node *n)
88c0a657b3Sschwarze {
89c0a657b3Sschwarze 	struct tag_entry	*entry;
90c220f9cfSschwarze 	struct roff_node	*nold;
91*2b14f697Sschwarze 	const char		*se, *src;
92*2b14f697Sschwarze 	char			*cpy;
93c4f5c5f1Sschwarze 	size_t			 len;
94c0a657b3Sschwarze 	unsigned int		 slot;
95*2b14f697Sschwarze 	int			 changed;
96c0a657b3Sschwarze 
977d109111Sschwarze 	assert(prio <= TAG_FALLBACK);
983af42e7fSschwarze 
997fbd65bbSschwarze 	/*
1007fbd65bbSschwarze 	 * If the node is already tagged, the existing tag is
1017fbd65bbSschwarze 	 * explicit and we are now about to add an implicit tag.
1027fbd65bbSschwarze 	 * Don't do that; just skip implicit tagging if the author
1037fbd65bbSschwarze 	 * specified an explicit tag.
1047fbd65bbSschwarze 	 */
1057fbd65bbSschwarze 
1067fbd65bbSschwarze 	if (n->flags & NODE_ID)
1077fbd65bbSschwarze 		return;
1087fbd65bbSschwarze 
1097fbd65bbSschwarze 	/* Determine the implicit tag. */
1107fbd65bbSschwarze 
111*2b14f697Sschwarze 	changed = 1;
1120ac7e6ecSschwarze 	if (s == NULL) {
1130ac7e6ecSschwarze 		if (n->child == NULL || n->child->type != ROFFT_TEXT)
1140ac7e6ecSschwarze 			return;
1150ac7e6ecSschwarze 		s = n->child->string;
116d55e8c17Sschwarze 		switch (s[0]) {
117d55e8c17Sschwarze 		case '-':
118d55e8c17Sschwarze 			s++;
119d55e8c17Sschwarze 			break;
120d55e8c17Sschwarze 		case '\\':
121d55e8c17Sschwarze 			switch (s[1]) {
122d55e8c17Sschwarze 			case '&':
123d55e8c17Sschwarze 			case '-':
124d55e8c17Sschwarze 			case 'e':
12552d50eacSschwarze 				s += 2;
126d55e8c17Sschwarze 				break;
127d55e8c17Sschwarze 			default:
128*2b14f697Sschwarze 				return;
129d55e8c17Sschwarze 			}
130d55e8c17Sschwarze 			break;
131d55e8c17Sschwarze 		default:
132*2b14f697Sschwarze 			changed = 0;
133d55e8c17Sschwarze 			break;
134d55e8c17Sschwarze 		}
1350ac7e6ecSschwarze 	}
1363af42e7fSschwarze 
1373af42e7fSschwarze 	/*
138*2b14f697Sschwarze 	 * Translate \- and ASCII_HYPH to plain '-'.
1391e876328Sschwarze 	 * Skip whitespace and escapes and whatever follows,
1403af42e7fSschwarze 	 * and if there is any, downgrade the priority.
1413af42e7fSschwarze 	 */
1423af42e7fSschwarze 
143*2b14f697Sschwarze 	cpy = mandoc_malloc(strlen(s) + 1);
144*2b14f697Sschwarze 	for (src = s, len = 0; *src != '\0'; src++, len++) {
145*2b14f697Sschwarze 		switch (*src) {
146*2b14f697Sschwarze 		case '\t':
147*2b14f697Sschwarze 		case ' ':
148*2b14f697Sschwarze 			changed = 1;
149*2b14f697Sschwarze 			break;
150*2b14f697Sschwarze 		case ASCII_HYPH:
151*2b14f697Sschwarze 			cpy[len] = '-';
152*2b14f697Sschwarze 			changed = 1;
153*2b14f697Sschwarze 			continue;
154*2b14f697Sschwarze 		case '\\':
155*2b14f697Sschwarze 			if (src[1] != '-')
156*2b14f697Sschwarze 				break;
157*2b14f697Sschwarze 			src++;
158*2b14f697Sschwarze 			changed = 1;
159*2b14f697Sschwarze 			/* FALLTHROUGH */
160*2b14f697Sschwarze 		default:
161*2b14f697Sschwarze 			cpy[len] = *src;
162*2b14f697Sschwarze 			continue;
163*2b14f697Sschwarze 		}
164*2b14f697Sschwarze 		break;
165*2b14f697Sschwarze 	}
1663af42e7fSschwarze 	if (len == 0)
167*2b14f697Sschwarze 		goto out;
168*2b14f697Sschwarze 	cpy[len] = '\0';
169730dc51fSschwarze 
170*2b14f697Sschwarze 	if (*src != '\0' && prio < TAG_WEAK)
1717d109111Sschwarze 		prio = TAG_WEAK;
1723af42e7fSschwarze 
173*2b14f697Sschwarze 	s = cpy;
174*2b14f697Sschwarze 	se = cpy + len;
1753af42e7fSschwarze 	slot = ohash_qlookupi(&tag_data, s, &se);
176c0a657b3Sschwarze 	entry = ohash_find(&tag_data, slot);
177730dc51fSschwarze 
178730dc51fSschwarze 	/* Build a new entry. */
179730dc51fSschwarze 
1800ac7e6ecSschwarze 	if (entry == NULL) {
1813af42e7fSschwarze 		entry = mandoc_malloc(sizeof(*entry) + len + 1);
182*2b14f697Sschwarze 		memcpy(entry->s, s, len + 1);
1830ac7e6ecSschwarze 		entry->nodes = NULL;
1840ac7e6ecSschwarze 		entry->maxnodes = entry->nnodes = 0;
185c0a657b3Sschwarze 		ohash_insert(&tag_data, slot, entry);
1860ac7e6ecSschwarze 	}
187730dc51fSschwarze 
1883af42e7fSschwarze 	/*
1890ac7e6ecSschwarze 	 * Lower priority numbers take precedence.
1900ac7e6ecSschwarze 	 * If a better entry is already present, ignore the new one.
1913af42e7fSschwarze 	 */
192ba3488f0Sschwarze 
1930ac7e6ecSschwarze 	else if (entry->prio < prio)
194*2b14f697Sschwarze 		goto out;
1950ac7e6ecSschwarze 
1960ac7e6ecSschwarze 	/*
1970ac7e6ecSschwarze 	 * If the existing entry is worse, clear it.
1980ac7e6ecSschwarze 	 * In addition, a tag with priority TAG_FALLBACK
1990ac7e6ecSschwarze 	 * is only used if the tag occurs exactly once.
2000ac7e6ecSschwarze 	 */
2010ac7e6ecSschwarze 
2020ac7e6ecSschwarze 	else if (entry->prio > prio || prio == TAG_FALLBACK) {
203c220f9cfSschwarze 		while (entry->nnodes > 0) {
204c220f9cfSschwarze 			nold = entry->nodes[--entry->nnodes];
205c220f9cfSschwarze 			nold->flags &= ~NODE_ID;
206c220f9cfSschwarze 			free(nold->tag);
207c220f9cfSschwarze 			nold->tag = NULL;
208c220f9cfSschwarze 		}
2097d109111Sschwarze 		if (prio == TAG_FALLBACK) {
2107d109111Sschwarze 			entry->prio = TAG_DELETE;
211*2b14f697Sschwarze 			goto out;
212ba3488f0Sschwarze 		}
213730dc51fSschwarze 	}
214730dc51fSschwarze 
2150ac7e6ecSschwarze 	/* Remember the new node. */
216730dc51fSschwarze 
2170ac7e6ecSschwarze 	if (entry->maxnodes == entry->nnodes) {
2180ac7e6ecSschwarze 		entry->maxnodes += 4;
2190ac7e6ecSschwarze 		entry->nodes = mandoc_reallocarray(entry->nodes,
2200ac7e6ecSschwarze 		    entry->maxnodes, sizeof(*entry->nodes));
221730dc51fSschwarze 	}
2220ac7e6ecSschwarze 	entry->nodes[entry->nnodes++] = n;
22362615e18Sschwarze 	entry->prio = prio;
2240ac7e6ecSschwarze 	n->flags |= NODE_ID;
225*2b14f697Sschwarze 	if (changed) {
226c220f9cfSschwarze 		assert(n->tag == NULL);
227c220f9cfSschwarze 		n->tag = mandoc_strndup(s, len);
2280ac7e6ecSschwarze 	}
229*2b14f697Sschwarze 
230*2b14f697Sschwarze  out:
231*2b14f697Sschwarze 	free(cpy);
232c0a657b3Sschwarze }
233c0a657b3Sschwarze 
234beabc24cSschwarze int
tag_exists(const char * tag)235beabc24cSschwarze tag_exists(const char *tag)
236c0a657b3Sschwarze {
237beabc24cSschwarze 	return ohash_find(&tag_data, ohash_qlookup(&tag_data, tag)) != NULL;
2381a557d3cSschwarze }
2399226948cSschwarze 
2409226948cSschwarze /*
2419226948cSschwarze  * For in-line elements, move the link target
2429226948cSschwarze  * to the enclosing paragraph when appropriate.
2439226948cSschwarze  */
2449226948cSschwarze static void
tag_move_id(struct roff_node * n)2459226948cSschwarze tag_move_id(struct roff_node *n)
2469226948cSschwarze {
2479226948cSschwarze 	struct roff_node *np;
2489226948cSschwarze 
2499226948cSschwarze 	np = n;
2509226948cSschwarze 	for (;;) {
2519226948cSschwarze 		if (np->prev != NULL)
2529226948cSschwarze 			np = np->prev;
2539226948cSschwarze 		else if ((np = np->parent) == NULL)
2549226948cSschwarze 			return;
2559226948cSschwarze 		switch (np->tok) {
2569226948cSschwarze 		case MDOC_It:
2579226948cSschwarze 			switch (np->parent->parent->norm->Bl.type) {
2589226948cSschwarze 			case LIST_column:
2599226948cSschwarze 				/* Target the ROFFT_BLOCK = <tr>. */
2609226948cSschwarze 				np = np->parent;
2619226948cSschwarze 				break;
2629226948cSschwarze 			case LIST_diag:
2639226948cSschwarze 			case LIST_hang:
2649226948cSschwarze 			case LIST_inset:
2659226948cSschwarze 			case LIST_ohang:
2669226948cSschwarze 			case LIST_tag:
2679226948cSschwarze 				/* Target the ROFFT_HEAD = <dt>. */
2689226948cSschwarze 				np = np->parent->head;
2699226948cSschwarze 				break;
2709226948cSschwarze 			default:
2719226948cSschwarze 				/* Target the ROFF_BODY = <li>. */
2729226948cSschwarze 				break;
2739226948cSschwarze 			}
2749226948cSschwarze 			/* FALLTHROUGH */
2759226948cSschwarze 		case MDOC_Pp:	/* Target the ROFFT_ELEM = <p>. */
276c220f9cfSschwarze 			if (np->tag == NULL) {
277c220f9cfSschwarze 				np->tag = mandoc_strdup(n->tag == NULL ?
278c220f9cfSschwarze 				    n->child->string : n->tag);
2799226948cSschwarze 				np->flags |= NODE_ID;
2809226948cSschwarze 				n->flags &= ~NODE_ID;
2819226948cSschwarze 			}
2829226948cSschwarze 			return;
2839226948cSschwarze 		case MDOC_Sh:
2849226948cSschwarze 		case MDOC_Ss:
2859226948cSschwarze 		case MDOC_Bd:
2869226948cSschwarze 		case MDOC_Bl:
2879226948cSschwarze 		case MDOC_D1:
2889226948cSschwarze 		case MDOC_Dl:
2899226948cSschwarze 		case MDOC_Rs:
2909226948cSschwarze 			/* Do not move past major blocks. */
2919226948cSschwarze 			return;
2929226948cSschwarze 		default:
2939226948cSschwarze 			/*
2949226948cSschwarze 			 * Move past in-line content and partial
2959226948cSschwarze 			 * blocks, for example .It Xo or .It Bq Er.
2969226948cSschwarze 			 */
2979226948cSschwarze 			break;
2989226948cSschwarze 		}
2999226948cSschwarze 	}
3009226948cSschwarze }
3019226948cSschwarze 
3029226948cSschwarze /*
303a875cd0eSschwarze  * When a paragraph is tagged and starts with text,
304a875cd0eSschwarze  * move the permalink to the first few words.
305a875cd0eSschwarze  */
306a875cd0eSschwarze static void
tag_move_href(struct roff_man * man,struct roff_node * n,const char * tag)307a875cd0eSschwarze tag_move_href(struct roff_man *man, struct roff_node *n, const char *tag)
308a875cd0eSschwarze {
309a875cd0eSschwarze 	char	*cp;
310a875cd0eSschwarze 
311a875cd0eSschwarze 	if (n == NULL || n->type != ROFFT_TEXT ||
312a875cd0eSschwarze 	    *n->string == '\0' || *n->string == ' ')
313a875cd0eSschwarze 		return;
314a875cd0eSschwarze 
315a875cd0eSschwarze 	cp = n->string;
316a875cd0eSschwarze 	while (cp != NULL && cp - n->string < 5)
317a875cd0eSschwarze 		cp = strchr(cp + 1, ' ');
318a875cd0eSschwarze 
319a875cd0eSschwarze 	/* If the first text node is longer, split it. */
320a875cd0eSschwarze 
321a875cd0eSschwarze 	if (cp != NULL && cp[1] != '\0') {
322a875cd0eSschwarze 		man->last = n;
323a875cd0eSschwarze 		man->next = ROFF_NEXT_SIBLING;
324a875cd0eSschwarze 		roff_word_alloc(man, n->line,
325a875cd0eSschwarze 		    n->pos + (cp - n->string), cp + 1);
326a875cd0eSschwarze 		man->last->flags = n->flags & ~NODE_LINE;
327a875cd0eSschwarze 		*cp = '\0';
328a875cd0eSschwarze 	}
329a875cd0eSschwarze 
330a875cd0eSschwarze 	assert(n->tag == NULL);
331a875cd0eSschwarze 	n->tag = mandoc_strdup(tag);
332a875cd0eSschwarze 	n->flags |= NODE_HREF;
333a875cd0eSschwarze }
334a875cd0eSschwarze 
335a875cd0eSschwarze /*
3369226948cSschwarze  * When all tags have been set, decide where to put
3379226948cSschwarze  * the associated permalinks, and maybe move some tags
3389226948cSschwarze  * to the beginning of the respective paragraphs.
3399226948cSschwarze  */
3409226948cSschwarze void
tag_postprocess(struct roff_man * man,struct roff_node * n)3416e2a0df9Sschwarze tag_postprocess(struct roff_man *man, struct roff_node *n)
3429226948cSschwarze {
3439226948cSschwarze 	if (n->flags & NODE_ID) {
3449226948cSschwarze 		switch (n->tok) {
3456e2a0df9Sschwarze 		case MDOC_Pp:
346a875cd0eSschwarze 			tag_move_href(man, n->next, n->tag);
3476e2a0df9Sschwarze 			break;
3489226948cSschwarze 		case MDOC_Bd:
349a875cd0eSschwarze 		case MDOC_D1:
350a875cd0eSschwarze 		case MDOC_Dl:
351a875cd0eSschwarze 			tag_move_href(man, n->child, n->tag);
352a875cd0eSschwarze 			break;
3539226948cSschwarze 		case MDOC_Bl:
3549226948cSschwarze 			/* XXX No permalink for now. */
3559226948cSschwarze 			break;
3569226948cSschwarze 		default:
3579226948cSschwarze 			if (n->type == ROFFT_ELEM || n->tok == MDOC_Fo)
3589226948cSschwarze 				tag_move_id(n);
3599226948cSschwarze 			if (n->tok != MDOC_Tg)
3609226948cSschwarze 				n->flags |= NODE_HREF;
361c220f9cfSschwarze 			else if ((n->flags & NODE_ID) == 0) {
3629226948cSschwarze 				n->flags |= NODE_NOPRT;
363c220f9cfSschwarze 				free(n->tag);
364c220f9cfSschwarze 				n->tag = NULL;
365c220f9cfSschwarze 			}
3669226948cSschwarze 			break;
3679226948cSschwarze 		}
3689226948cSschwarze 	}
3699226948cSschwarze 	for (n = n->child; n != NULL; n = n->next)
3706e2a0df9Sschwarze 		tag_postprocess(man, n);
3719226948cSschwarze }
372