1 /*
2  * latex_parse.c
3  *
4  * Parse LaTeX code.
5  *
6  * Copyright (c) Chris Putnam 2020
7  *
8  * Source code released under the GPL version 2
9  *
10  */
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include "bibdefs.h"
15 #include "is_ws.h"
16 #include "latex_parse.h"
17 
/*
 * latex_node -- a vertex in the parse graph.
 *
 * next_edge: edge holding the text that follows this node (and, through
 *            that edge, the next node in the same chain)
 * down_node: head of a nested chain hanging off this node, or NULL
 */
typedef struct latex_node latex_node;

struct latex_node {
	struct latex_edge *next_edge;
	struct latex_node *down_node;
};
22 
23 typedef struct latex_edge {
24 	struct latex_node *next_node;
25 	str text;
26 } latex_edge;
27 
28 static latex_node *
latex_node_new(void)29 latex_node_new( void )
30 {
31 	return ( latex_node * ) calloc( 1, sizeof( latex_node ) );
32 }
33 
34 static void
latex_node_delete(latex_node * n)35 latex_node_delete( latex_node *n )
36 {
37 	free( n );
38 }
39 
40 static latex_edge *
latex_edge_new(void)41 latex_edge_new( void )
42 {
43 	latex_edge *e;
44 	e = ( latex_edge * ) calloc( 1, sizeof( latex_edge ) );
45 	if ( e ) str_init( &(e->text) );
46 	return e;
47 }
48 
49 static void
latex_edge_delete(latex_edge * e)50 latex_edge_delete( latex_edge *e )
51 {
52 	str_free( &(e->text) );
53 	free( e );
54 }
55 
/*
 * is_unescaped() -- test for character c at p, not preceded by a backslash.
 *
 * p:      current position in the input
 * offset: index of p from the start of the input; when it is zero the
 *         byte before p is never read
 * c:      character to look for
 *
 * Returns 1 if *p is c and is not immediately preceded by '\\', else 0.
 */
static int
is_unescaped( char *p, unsigned long *offset, char c )
{
	int at_start = ( *offset == 0 );

	if ( *p != c ) return 0;

	return ( at_start || *(p-1) != '\\' );
}
63 
64 static int
build_latex_graph_r(str * in,unsigned long * offset,int * mathmode,int depth,latex_node ** node)65 build_latex_graph_r( str *in, unsigned long *offset, int *mathmode, int depth, latex_node **node )
66 {
67 	latex_node *newnode, *downnode;
68 	int status = BIBL_OK;
69 	latex_edge *newedge;
70 	char *p;
71 
72 	newnode = latex_node_new();
73 	if ( !newnode ) return BIBL_ERR_MEMERR;
74 
75 	newedge = latex_edge_new();
76 	if ( !newedge ) {
77 		latex_node_delete( newnode );
78 		return BIBL_ERR_MEMERR;
79 	}
80 
81 	*node = newnode;
82 
83 	newnode->next_edge = newedge;
84 
85 	p = str_cstr( in ) + *offset;
86 
87 	while ( *p ) {
88 		if ( is_unescaped( p, offset, '{' ) ) {
89 			*offset += 1;
90 			newnode = latex_node_new();
91 			if ( !newnode ) { status = BIBL_ERR_MEMERR; goto out; }
92 			newedge->next_node = newnode;
93 			newedge = latex_edge_new();
94 			if ( !newedge ) { status = BIBL_ERR_MEMERR; goto out; }
95 			newnode->next_edge = newedge;
96 			status = build_latex_graph_r( in, offset, mathmode, depth+1, &downnode );
97 			if ( status!=BIBL_OK ) goto out;
98 			newnode->down_node = downnode;
99 			p = str_cstr( in ) + *offset;
100 		}
101 		else if ( is_unescaped( p, offset, '}' ) ) {
102 			*offset += 1;
103 			if ( depth==0 ) {
104 				// Patch: Disable output logging
105 				p++;
106 				continue;
107 			}
108 			goto out;
109 		}
110 		else if ( is_unescaped( p, offset, '$' ) ) {
111 			*mathmode = !(*mathmode);
112 			*offset += 1;
113 			if ( *mathmode ) {
114 				newnode = latex_node_new();
115 				if ( !newnode ) { status = BIBL_ERR_MEMERR; goto out; }
116 				newedge->next_node = newnode;
117 				newedge = latex_edge_new();
118 				if ( !newedge ) { status = BIBL_ERR_MEMERR; goto out; }
119 				newnode->next_edge = newedge;
120 				status = build_latex_graph_r( in, offset, mathmode, depth+1, &downnode );
121 				if ( status!=BIBL_OK ) goto out;
122 				newnode->down_node = downnode;
123 				p = str_cstr( in ) + *offset;
124 			}
125 			else {
126 				if ( depth==0 ) {
127 					// Patch: Disable output logging
128 					p++;
129 					continue;
130 				}
131 				goto out;
132 			}
133 		}
134 		else {
135 			str_addchar( &(newedge->text), *p );
136 			p++;
137 			*offset += 1;
138 		}
139 	}
140 
141 	if ( depth!=0 ) {
142 		// Patch: Disable output logging
143 	}
144 
145 out:
146 	if ( status!=BIBL_OK || str_memerr( &(newedge->text) ) ) {
147 		latex_node_delete( newnode );
148 		latex_edge_delete( newedge );
149 		*node = NULL;
150 		return BIBL_ERR_MEMERR;
151 	}
152 
153 	return BIBL_OK;
154 }
155 
156 int
build_latex_graph(str * in,latex_node ** start)157 build_latex_graph( str *in, latex_node **start )
158 {
159 	unsigned long offset = 0;
160 	int mathmode = 0;
161 	latex_node *n;
162 
163 	n = latex_node_new();
164 	if ( !n ) return BIBL_ERR_MEMERR;
165 
166 	return build_latex_graph_r( in, &offset, &mathmode, 0, start );
167 }
168 
/*
 * latex_cmds_t -- one LaTeX command known to the parser.
 *
 * wbracket:     the bare command, e.g. "\\it"
 * wbracketsize: number of characters in wbracket
 * wobracket:    the command followed by one space, e.g. "\\it "
 * toreplace:    replacement text, or NULL when none is given
 */
typedef struct {
	const char *wbracket;
	int wbracketsize;
	const char *wobracket;
	const char *toreplace;
} latex_cmds_t;

/* Build one table row from a string literal; the length and the spaced
 * variant are derived from the literal so they cannot drift out of sync. */
#define LATEX_CMD( cmd, repl ) { cmd, (int) sizeof( cmd ) - 1, cmd " ", repl }

static const latex_cmds_t latex_cmds[] = {
	LATEX_CMD( "\\it",         NULL ),
	LATEX_CMD( "\\em",         NULL ),
	LATEX_CMD( "\\bf",         NULL ),
	LATEX_CMD( "\\small",      NULL ),
	/* 'textcomp' annotations */
	LATEX_CMD( "\\textit",     NULL ),
	LATEX_CMD( "\\textbf",     NULL ),
	LATEX_CMD( "\\textrm",     NULL ),
	LATEX_CMD( "\\textsl",     NULL ),
	LATEX_CMD( "\\textsc",     NULL ),
	LATEX_CMD( "\\textsf",     NULL ),
	LATEX_CMD( "\\texttt",     NULL ),
	LATEX_CMD( "\\emph",       NULL ),
	LATEX_CMD( "\\url",        NULL ),
	LATEX_CMD( "\\mbox",       NULL ),
	LATEX_CMD( "\\mkbibquote", NULL ),
	/* math functions */
	LATEX_CMD( "\\ln",         "ln" ),
	LATEX_CMD( "\\sin",        "sin" ),
	LATEX_CMD( "\\cos",        "cos" ),
	LATEX_CMD( "\\tan",        "tan" ),
};
static const int nlatex_cmds = sizeof( latex_cmds ) / sizeof( latex_cmds[0] );

static const latex_cmds_t math_cmds[] = {
	LATEX_CMD( "\\ln",     "ln" ),
	LATEX_CMD( "\\sin",    "sin" ),
	LATEX_CMD( "\\cos",    "cos" ),
	LATEX_CMD( "\\tan",    "tan" ),
	LATEX_CMD( "\\mathrm", "" ),
	LATEX_CMD( "\\rm",     "" ),
	LATEX_CMD( "\\LaTeX",  "LaTeX" ),
};
static const int nmath_cmds = sizeof( math_cmds ) / sizeof( math_cmds[0] );

#undef LATEX_CMD
211 
212 /* remove from "ABC \it{DEF}" --> parses to "ABC \it" */
213 static int
remove_latex_cmds_with_brackets(str * s)214 remove_latex_cmds_with_brackets( str *s )
215 {
216 	unsigned long offset;
217 	int i;
218 	for ( i=0; i<nlatex_cmds; ++i ) {
219 		if ( s->len < latex_cmds[i].wbracketsize ) continue;
220 		offset = s->len - latex_cmds[i].wbracketsize;
221 		if ( !strcmp( str_cstr( s ) + offset, latex_cmds[i].wbracket ) ) {
222 			str_trimend( s, latex_cmds[i].wbracketsize );
223 			return 1;
224 		}
225 	}
226 	return 0;
227 }
228 
229 /* remove from "{\it ABC}" */
230 static void
remove_latex_cmds_without_brackets(str * s)231 remove_latex_cmds_without_brackets( str *s )
232 {
233 	int i;
234 	for ( i=0; i<nlatex_cmds; ++i ) {
235 		str_findreplace( s, latex_cmds[i].wobracket, "" );
236 	}
237 }
238 
239 static void
remove_math_cmds(str * s)240 remove_math_cmds( str *s )
241 {
242 	int i;
243 	for ( i=0; i<nmath_cmds; ++i ) {
244 		str_findreplace( s, math_cmds[i].wbracket, math_cmds[i].toreplace );
245 	}
246 }
247 
248 static int
collapse_latex_graph(latex_node * n,str * out)249 collapse_latex_graph( latex_node *n, str *out )
250 {
251 	latex_edge *e;
252 	int status;
253 
254 	if ( n->down_node ) {
255 		status = collapse_latex_graph( n->down_node, out );
256 		if ( status!=BIBL_OK ) return status;
257 	}
258 
259 	e = n->next_edge;
260 	if ( e ) {
261 		if ( !remove_latex_cmds_with_brackets( &(e->text) ) )
262 			remove_latex_cmds_without_brackets( &(e->text) );
263 		remove_math_cmds( &(e->text) );
264 		str_strcat( out, &(e->text) );
265 		if ( str_memerr( &(e->text) ) ) return BIBL_ERR_MEMERR;
266 		if ( e->next_node ) {
267 			status = collapse_latex_graph( e->next_node, out );
268 			if ( status!=BIBL_OK ) return status;
269 		}
270 	}
271 
272 	return BIBL_OK;
273 }
274 
275 static int
string_from_latex_graph(latex_node * n,str * out)276 string_from_latex_graph( latex_node *n, str *out )
277 {
278 	int status;
279 
280 	status = collapse_latex_graph( n, out );
281 	if ( status!=BIBL_OK ) return status;
282 
283 	while( str_findreplace( out, "  ", " " ) ) {}
284 
285 	if ( str_memerr( out ) ) return BIBL_ERR_MEMERR;
286 	else return BIBL_OK;
287 }
288 
#if 0
/*
 * write_latex_graph() -- debugging aid, compiled out.
 *
 * Prints the chain starting at n to stdout: "+{...}" around a nested
 * chain, "." for a node without one, followed by the node's edge text.
 */
static void
write_latex_graph( latex_node *n )
{
	latex_edge *e;

	for ( ; n; n = ( e ) ? e->next_node : NULL ) {

		if ( !n->down_node ) printf( "." );
		else {
			printf( "+{" );
			write_latex_graph( n->down_node );
			printf( "}" );
		}

		e = n->next_edge;
		if ( e && str_has_value( &(e->text) ) )
			printf( "%s", str_cstr( &(e->text) ) );
	}
}
#endif
313 
314 int
latex_parse(str * in,str * out)315 latex_parse( str *in, str *out )
316 {
317 	latex_node *n;
318 	int status;
319 
320 	str_empty( out );
321 	if ( str_is_empty( in ) ) return BIBL_OK;
322 
323 	status = build_latex_graph( in, &n );
324 	if ( status!=BIBL_OK ) return status;
325 
326 	status = string_from_latex_graph( n, out );
327 	if ( status!=BIBL_OK ) return status;
328 
329 	str_trimendingws( out );
330 
331 	return BIBL_OK;
332 }
333 
334 int
latex_tokenize(slist * tokens,str * s)335 latex_tokenize( slist *tokens, str *s )
336 {
337 	int i, n = s->len, nbrackets = 0, status = BIBL_OK;
338 	str tok, *t;
339 
340 	str_init( &tok );
341 
342 	for ( i=0; i<n; ++i ) {
343 		if ( s->data[i]=='{' && ( i==0 || s->data[i-1]!='\\' ) ) {
344 			nbrackets++;
345 			str_addchar( &tok, '{' );
346 		} else if ( s->data[i]=='}' && ( i==0 || s->data[i-1]!='\\' ) ) {
347 			nbrackets--;
348 			str_addchar( &tok, '}' );
349 		} else if ( !is_ws( s->data[i] ) || nbrackets ) {
350 			str_addchar( &tok, s->data[i] );
351 		} else if ( is_ws( s->data[i] ) ) {
352 			if ( str_has_value( &tok ) ) {
353 				status = slist_add_ret( tokens, &tok, BIBL_OK, BIBL_ERR_MEMERR );
354 				if ( status!=BIBL_OK ) goto out;
355 			}
356 			str_empty( &tok );
357 		}
358 	}
359 	if ( str_has_value( &tok ) ) {
360 		if ( str_memerr( &tok ) ) { status = BIBL_ERR_MEMERR; goto out; }
361 		status = slist_add_ret( tokens, &tok, BIBL_OK, BIBL_ERR_MEMERR );
362 		if ( status!=BIBL_OK ) goto out;
363 	}
364 
365 	for ( i=0; i<tokens->n; ++i ) {
366 		t = slist_str( tokens, i );
367 		str_trimstartingws( t );
368 		str_trimendingws( t );
369 		if ( str_memerr( t ) ) { status = BIBL_ERR_MEMERR; goto out; }
370 	}
371 out:
372 	str_free( &tok );
373 	return status;
374 }
375