1 /*
2 * latex_parse.c
3 *
4 * Parse LaTeX code.
5 *
6 * Copyright (c) Chris Putnam 2020
7 *
8 * Source code released under the GPL version 2
9 *
10 */
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include "bibdefs.h"
15 #include "is_ws.h"
16 #include "latex_parse.h"
17
/*
 * A vertex of the parse graph.
 *
 * next_edge - edge holding the text that follows this node at the same
 *             nesting level (may be NULL)
 * down_node - head of the nested sub-graph built for a "{...}" or "$...$"
 *             group opened at this node (NULL if the node opens no group)
 */
typedef struct latex_node latex_node;
struct latex_node {
	struct latex_edge *next_edge;
	latex_node *down_node;
};
22
23 typedef struct latex_edge {
24 struct latex_node *next_node;
25 str text;
26 } latex_edge;
27
28 static latex_node *
latex_node_new(void)29 latex_node_new( void )
30 {
31 return ( latex_node * ) calloc( 1, sizeof( latex_node ) );
32 }
33
34 static void
latex_node_delete(latex_node * n)35 latex_node_delete( latex_node *n )
36 {
37 free( n );
38 }
39
40 static latex_edge *
latex_edge_new(void)41 latex_edge_new( void )
42 {
43 latex_edge *e;
44 e = ( latex_edge * ) calloc( 1, sizeof( latex_edge ) );
45 if ( e ) str_init( &(e->text) );
46 return e;
47 }
48
49 static void
latex_edge_delete(latex_edge * e)50 latex_edge_delete( latex_edge *e )
51 {
52 str_free( &(e->text) );
53 free( e );
54 }
55
/*
 * Test whether the character at p is c and is not preceded by a backslash.
 *
 * p      - current position inside the input buffer
 * offset - index of p within that buffer; the preceding byte is inspected
 *          only when *offset is greater than zero
 * c      - character to look for
 *
 * Returns 1 if *p equals c and the previous character is not '\\',
 * otherwise 0.
 */
static int
is_unescaped( char *p, unsigned long *offset, char c )
{
	int escaped;

	if ( *p != c ) return 0;

	escaped = ( *offset > 0 ) && ( p[-1] == '\\' );
	return !escaped;
}
63
64 static int
build_latex_graph_r(str * in,unsigned long * offset,int * mathmode,int depth,latex_node ** node)65 build_latex_graph_r( str *in, unsigned long *offset, int *mathmode, int depth, latex_node **node )
66 {
67 latex_node *newnode, *downnode;
68 int status = BIBL_OK;
69 latex_edge *newedge;
70 char *p;
71
72 newnode = latex_node_new();
73 if ( !newnode ) return BIBL_ERR_MEMERR;
74
75 newedge = latex_edge_new();
76 if ( !newedge ) {
77 latex_node_delete( newnode );
78 return BIBL_ERR_MEMERR;
79 }
80
81 *node = newnode;
82
83 newnode->next_edge = newedge;
84
85 p = str_cstr( in ) + *offset;
86
87 while ( *p ) {
88 if ( is_unescaped( p, offset, '{' ) ) {
89 *offset += 1;
90 newnode = latex_node_new();
91 if ( !newnode ) { status = BIBL_ERR_MEMERR; goto out; }
92 newedge->next_node = newnode;
93 newedge = latex_edge_new();
94 if ( !newedge ) { status = BIBL_ERR_MEMERR; goto out; }
95 newnode->next_edge = newedge;
96 status = build_latex_graph_r( in, offset, mathmode, depth+1, &downnode );
97 if ( status!=BIBL_OK ) goto out;
98 newnode->down_node = downnode;
99 p = str_cstr( in ) + *offset;
100 }
101 else if ( is_unescaped( p, offset, '}' ) ) {
102 *offset += 1;
103 if ( depth==0 ) {
104 // Patch: Disable output logging
105 p++;
106 continue;
107 }
108 goto out;
109 }
110 else if ( is_unescaped( p, offset, '$' ) ) {
111 *mathmode = !(*mathmode);
112 *offset += 1;
113 if ( *mathmode ) {
114 newnode = latex_node_new();
115 if ( !newnode ) { status = BIBL_ERR_MEMERR; goto out; }
116 newedge->next_node = newnode;
117 newedge = latex_edge_new();
118 if ( !newedge ) { status = BIBL_ERR_MEMERR; goto out; }
119 newnode->next_edge = newedge;
120 status = build_latex_graph_r( in, offset, mathmode, depth+1, &downnode );
121 if ( status!=BIBL_OK ) goto out;
122 newnode->down_node = downnode;
123 p = str_cstr( in ) + *offset;
124 }
125 else {
126 if ( depth==0 ) {
127 // Patch: Disable output logging
128 p++;
129 continue;
130 }
131 goto out;
132 }
133 }
134 else {
135 str_addchar( &(newedge->text), *p );
136 p++;
137 *offset += 1;
138 }
139 }
140
141 if ( depth!=0 ) {
142 // Patch: Disable output logging
143 }
144
145 out:
146 if ( status!=BIBL_OK || str_memerr( &(newedge->text) ) ) {
147 latex_node_delete( newnode );
148 latex_edge_delete( newedge );
149 *node = NULL;
150 return BIBL_ERR_MEMERR;
151 }
152
153 return BIBL_OK;
154 }
155
156 int
build_latex_graph(str * in,latex_node ** start)157 build_latex_graph( str *in, latex_node **start )
158 {
159 unsigned long offset = 0;
160 int mathmode = 0;
161 latex_node *n;
162
163 n = latex_node_new();
164 if ( !n ) return BIBL_ERR_MEMERR;
165
166 return build_latex_graph_r( in, &offset, &mathmode, 0, start );
167 }
168
/*
 * One entry of a LaTeX command table.
 *
 * wbracket     - command as it appears directly before a group,
 *                e.g. "\\it" in "\\it{...}"
 * wbracketsize - number of characters in wbracket
 * wobracket    - command followed by a space, as it appears inside a
 *                group, e.g. "\\it " in "{\\it ...}"
 * toreplace    - replacement text for the command, or NULL
 */
typedef struct latex_cmds_t {
	const char *wbracket;
	int         wbracketsize;
	const char *wobracket;
	const char *toreplace;
} latex_cmds_t;
175
176 static const latex_cmds_t latex_cmds[] = {
177 { "\\it", 3, "\\it ", NULL },
178 { "\\em", 3, "\\em ", NULL },
179 { "\\bf", 3, "\\bf ", NULL },
180 { "\\small", 6, "\\small ", NULL },
181 /* 'textcomp' annotations */
182 { "\\textit", 7, "\\textit ", NULL },
183 { "\\textbf", 7, "\\textbf ", NULL },
184 { "\\textrm", 7, "\\textrm ", NULL },
185 { "\\textsl", 7, "\\textsl ", NULL },
186 { "\\textsc", 7, "\\textsc ", NULL },
187 { "\\textsf", 7, "\\textsf ", NULL },
188 { "\\texttt", 7, "\\texttt ", NULL },
189 { "\\emph", 5, "\\emph ", NULL },
190 { "\\url", 4, "\\url ", NULL },
191 { "\\mbox", 5, "\\mbox ", NULL },
192 { "\\mkbibquote", 11, "\\mkbibquote ", NULL },
193 /* math functions */
194 { "\\ln", 3, "\\ln ", "ln" },
195 { "\\sin", 4, "\\sin ", "sin" },
196 { "\\cos", 4, "\\cos ", "cos" },
197 { "\\tan", 4, "\\tan ", "tan" },
198 };
199 static const int nlatex_cmds = sizeof( latex_cmds ) / sizeof( latex_cmds[0] );
200
201 static const latex_cmds_t math_cmds[] = {
202 { "\\ln", 3, "\\ln ", "ln" },
203 { "\\sin", 4, "\\sin ", "sin" },
204 { "\\cos", 4, "\\cos ", "cos" },
205 { "\\tan", 4, "\\tan ", "tan" },
206 { "\\mathrm", 7, "\\mathrm ", "" },
207 { "\\rm", 3, "\\rm ", "" },
208 { "\\LaTeX", 6, "\\LaTeX ", "LaTeX" },
209 };
210 static const int nmath_cmds = sizeof( math_cmds ) / sizeof( math_cmds[0] );
211
212 /* remove from "ABC \it{DEF}" --> parses to "ABC \it" */
213 static int
remove_latex_cmds_with_brackets(str * s)214 remove_latex_cmds_with_brackets( str *s )
215 {
216 unsigned long offset;
217 int i;
218 for ( i=0; i<nlatex_cmds; ++i ) {
219 if ( s->len < latex_cmds[i].wbracketsize ) continue;
220 offset = s->len - latex_cmds[i].wbracketsize;
221 if ( !strcmp( str_cstr( s ) + offset, latex_cmds[i].wbracket ) ) {
222 str_trimend( s, latex_cmds[i].wbracketsize );
223 return 1;
224 }
225 }
226 return 0;
227 }
228
229 /* remove from "{\it ABC}" */
230 static void
remove_latex_cmds_without_brackets(str * s)231 remove_latex_cmds_without_brackets( str *s )
232 {
233 int i;
234 for ( i=0; i<nlatex_cmds; ++i ) {
235 str_findreplace( s, latex_cmds[i].wobracket, "" );
236 }
237 }
238
239 static void
remove_math_cmds(str * s)240 remove_math_cmds( str *s )
241 {
242 int i;
243 for ( i=0; i<nmath_cmds; ++i ) {
244 str_findreplace( s, math_cmds[i].wbracket, math_cmds[i].toreplace );
245 }
246 }
247
248 static int
collapse_latex_graph(latex_node * n,str * out)249 collapse_latex_graph( latex_node *n, str *out )
250 {
251 latex_edge *e;
252 int status;
253
254 if ( n->down_node ) {
255 status = collapse_latex_graph( n->down_node, out );
256 if ( status!=BIBL_OK ) return status;
257 }
258
259 e = n->next_edge;
260 if ( e ) {
261 if ( !remove_latex_cmds_with_brackets( &(e->text) ) )
262 remove_latex_cmds_without_brackets( &(e->text) );
263 remove_math_cmds( &(e->text) );
264 str_strcat( out, &(e->text) );
265 if ( str_memerr( &(e->text) ) ) return BIBL_ERR_MEMERR;
266 if ( e->next_node ) {
267 status = collapse_latex_graph( e->next_node, out );
268 if ( status!=BIBL_OK ) return status;
269 }
270 }
271
272 return BIBL_OK;
273 }
274
275 static int
string_from_latex_graph(latex_node * n,str * out)276 string_from_latex_graph( latex_node *n, str *out )
277 {
278 int status;
279
280 status = collapse_latex_graph( n, out );
281 if ( status!=BIBL_OK ) return status;
282
283 while( str_findreplace( out, " ", " " ) ) {}
284
285 if ( str_memerr( out ) ) return BIBL_ERR_MEMERR;
286 else return BIBL_OK;
287 }
288
#if 0
/*
 * Debugging aid (compiled out): print the graph to stdout.
 *
 * A node with a nested group is shown as "+{...}", a node without one as
 * ".", each followed by the text of its edge.
 */
static void
write_latex_graph( latex_node *n )
{
	latex_edge *e;

	for ( ; n; n = e ? e->next_node : NULL ) {

		if ( !n->down_node ) printf( "." );
		else {
			printf( "+{" );
			write_latex_graph( n->down_node );
			printf( "}" );
		}

		e = n->next_edge;
		if ( e && str_has_value( &(e->text) ) )
			printf( "%s", str_cstr( &(e->text) ) );
	}
}
#endif
313
314 int
latex_parse(str * in,str * out)315 latex_parse( str *in, str *out )
316 {
317 latex_node *n;
318 int status;
319
320 str_empty( out );
321 if ( str_is_empty( in ) ) return BIBL_OK;
322
323 status = build_latex_graph( in, &n );
324 if ( status!=BIBL_OK ) return status;
325
326 status = string_from_latex_graph( n, out );
327 if ( status!=BIBL_OK ) return status;
328
329 str_trimendingws( out );
330
331 return BIBL_OK;
332 }
333
334 int
latex_tokenize(slist * tokens,str * s)335 latex_tokenize( slist *tokens, str *s )
336 {
337 int i, n = s->len, nbrackets = 0, status = BIBL_OK;
338 str tok, *t;
339
340 str_init( &tok );
341
342 for ( i=0; i<n; ++i ) {
343 if ( s->data[i]=='{' && ( i==0 || s->data[i-1]!='\\' ) ) {
344 nbrackets++;
345 str_addchar( &tok, '{' );
346 } else if ( s->data[i]=='}' && ( i==0 || s->data[i-1]!='\\' ) ) {
347 nbrackets--;
348 str_addchar( &tok, '}' );
349 } else if ( !is_ws( s->data[i] ) || nbrackets ) {
350 str_addchar( &tok, s->data[i] );
351 } else if ( is_ws( s->data[i] ) ) {
352 if ( str_has_value( &tok ) ) {
353 status = slist_add_ret( tokens, &tok, BIBL_OK, BIBL_ERR_MEMERR );
354 if ( status!=BIBL_OK ) goto out;
355 }
356 str_empty( &tok );
357 }
358 }
359 if ( str_has_value( &tok ) ) {
360 if ( str_memerr( &tok ) ) { status = BIBL_ERR_MEMERR; goto out; }
361 status = slist_add_ret( tokens, &tok, BIBL_OK, BIBL_ERR_MEMERR );
362 if ( status!=BIBL_OK ) goto out;
363 }
364
365 for ( i=0; i<tokens->n; ++i ) {
366 t = slist_str( tokens, i );
367 str_trimstartingws( t );
368 str_trimendingws( t );
369 if ( str_memerr( t ) ) { status = BIBL_ERR_MEMERR; goto out; }
370 }
371 out:
372 str_free( &tok );
373 return status;
374 }
375