1 /*
2  * bibtexin.c
3  *
4  * Copyright (c) Chris Putnam 2003-2020
5  * Copyright (c) Georgi N. Boshnakov 2020
6  *
7  * Program and source code released under the GPL version 2
8  *
9  */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <ctype.h>
14 #include "is_ws.h"
15 #include "intlist.h"
16 #include "str.h"
17 #include "utf8.h"
18 #include "str_conv.h"
19 #include "fields.h"
20 #include "slist.h"
21 #include "name.h"
22 #include "title.h"
23 #include "url.h"
24 #include "reftypes.h"
25 #include "latex_parse.h"
26 #include "bibformats.h"
27 #include "generic.h"
28 
/* Parallel lists used for BibTeX string substitution: process_string()
 * stores each macro name in 'find' and its replacement text at the same
 * index in 'replace'; replace_strings() looks a token up in 'find' and
 * copies the entry with the matching index from 'replace'.
 * Both lists are cleared together by bibtexin_initparams().
 */
static slist find    = { 0, 0, 0, NULL };
static slist replace = { 0, 0, 0, NULL };

/* Defined outside this file; bibtexin_initparams() stores them in
 * pm->all and pm->nall.
 */
extern variants bibtex_all[];
extern int bibtex_nall;
34 
35 /*****************************************************
 PUBLIC: int bibtexin_initparams()
37 *****************************************************/
38 
/* Forward declarations of the callbacks that bibtexin_initparams()
 * installs in the param struct (convertf, processf, cleanf, readf, typef).
 */
static int bibtexin_convertf( fields *bibin, fields *info, int reftype, param *p );
static int bibtexin_processf( fields *bibin, const char *data, const char *filename, long nref, param *p );
static int bibtexin_cleanf( bibl *bin, param *p );
static int bibtexin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset );
static int bibtexin_typef( fields *bibin, const char *filename, int nrefs, param *p );
44 
45 int
bibtexin_initparams(param * pm,const char * progname)46 bibtexin_initparams( param *pm, const char *progname )
47 {
48 	pm->readformat       = BIBL_BIBTEXIN;
49 	pm->charsetin        = BIBL_CHARSET_DEFAULT;
50 	pm->charsetin_src    = BIBL_SRC_DEFAULT;
51 	pm->latexin          = 1;
52 	pm->xmlin            = 0;
53 	pm->utf8in           = 0;
54 	pm->nosplittitle     = 0;
55 	pm->verbose          = 0;
56 	pm->addcount         = 0;
57 	pm->output_raw       = 0;
58 
59 	pm->readf    = bibtexin_readf;
60 	pm->processf = bibtexin_processf;
61 	pm->cleanf   = bibtexin_cleanf;
62 	pm->typef    = bibtexin_typef;
63 	pm->convertf = bibtexin_convertf;
64 	pm->all      = bibtex_all;
65 	pm->nall     = bibtex_nall;
66 
67 	slist_init( &(pm->asis) );
68 	slist_init( &(pm->corps) );
69 
70  	// TODO: these probably should be made parameters, as the others above;
71 	//       note that 'find' and 'replace' work in tandem, so both need to be cleared.
72 	slist_free( &find );
73 	slist_free( &replace );
74 
75 	if ( !progname ) pm->progname = NULL;
76 	else {
77 		pm->progname = strdup( progname );
78 		if ( pm->progname==NULL ) return BIBL_ERR_MEMERR;
79 	}
80 
81 	return BIBL_OK;
82 }
83 
84 /*****************************************************
85  PUBLIC: int bibtexin_readf()
86 *****************************************************/
87 
88 /*
89  * readf can "read too far", so we store this information in line, thus
90  * the next new text is in line, either from having read too far or
91  * from the next chunk obtained via str_fget()
92  *
93  * return 1 on success, 0 on error/end-of-file
94  *
95  */
96 static int
readmore(FILE * fp,char * buf,int bufsize,int * bufpos,str * line)97 readmore( FILE *fp, char *buf, int bufsize, int *bufpos, str *line )
98 {
99 	if ( line->len ) return 1;
100 	else return str_fget( fp, buf, bufsize, bufpos, line );
101 }
102 
103 /*
104  * readf()
105  *
106  * returns zero if cannot get reference and hit end of-file
107  * returns 1 if last reference in file, 2 if reference within file
108  */
109 static int
bibtexin_readf(FILE * fp,char * buf,int bufsize,int * bufpos,str * line,str * reference,int * fcharset)110 bibtexin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset )
111 {
112 	int haveref = 0;
113 	const char *p;
114 	*fcharset = CHARSET_UNKNOWN;
115 	while ( haveref!=2 && readmore( fp, buf, bufsize, bufpos, line ) ) {
116 		if ( line->len == 0 ) continue; /* blank line */
117 		p = &(line->data[0]);
118 		/* Recognize UTF8 BOM */
119 		if ( line->len > 2 &&
120 				(unsigned char)(p[0])==0xEF &&
121 				(unsigned char)(p[1])==0xBB &&
122 				(unsigned char)(p[2])==0xBF ) {
123 			*fcharset = CHARSET_UNICODE;
124 			p += 3;
125 		}
126 		p = skip_ws( p );
127 		if ( *p == '%' ) { /* commented out line */
128 			str_empty( line );
129 			continue;
130 		}
131 		if ( *p == '@' ) haveref++;
132 		if ( haveref && haveref<2 ) {
133 			str_strcatc( reference, p );
134 			str_addchar( reference, '\n' );
135 			str_empty( line );
136 		} else if ( !haveref ) str_empty( line );
137 
138 	}
139 	return haveref;
140 }
141 
142 /*****************************************************
143  PUBLIC: int bibtexin_processf()
144 *****************************************************/
145 
/* Identifies the text being parsed. In the code of this file it is filled
 * in by bibtexin_processf() and read when printing warnings with REprintf().
 */
typedef struct loc {
	const char *progname; /* copied from pm->progname */
	const char *filename; /* name of the input file */
	long nref;            /* reference number given to bibtexin_processf() */
} loc;
151 
152 /* process_bibtextype()
153  *
154  * extract 'article', 'book', etc. from:
155  *
156  * @article{...}
157  * @book(...)
158  *
159  * return pointer after '{' or '(' character
160  */
161 static const char*
process_bibtextype(const char * p,str * type)162 process_bibtextype( const char *p, str *type )
163 {
164 	str tmp;
165 
166 	str_init( &tmp );
167 
168 	if ( *p=='@' ) p++;
169 	p = skip_ws( p );
170 
171 	p = str_cpytodelim( &tmp, p, "{( \t\r\n", 0 );
172 	p = skip_ws( p );
173 
174 	if ( *p=='{' || *p=='(' ) p++;
175 	p = skip_ws( p );
176 
177 	if ( str_has_value( &tmp ) ) str_strcpy( type, &tmp );
178 	else str_empty( type );
179 
180 	str_free( &tmp );
181 
182 	return p;
183 }
184 
185 char *dummy_id = "dummyid";
186 
187 static const char *
process_bibtexid(const char * p,str * id)188 process_bibtexid( const char *p, str *id )
189 {
190 	const char *start_p = p;
191 	str tmp;
192 
193 	str_init( &tmp );
194 	p = str_cpytodelim( &tmp, p, ",", 1 );
195 
196 	if ( str_has_value( &tmp ) ) {
197 		if ( strchr( tmp.data, '=' ) ) {
198 			/* Endnote writes bibtex files w/o fields, try to
199 			 * distinguish via presence of an equal sign.... if
200 			 * it's there, assume that it's a tag/data pair instead
201 			 * and roll back.
202 			 */
203 			p = start_p;
204 			str_empty( id );
205 		} else {
206 			str_strcpy( id, &tmp );
207 		}
208 	} else {
209 	  // Georgi was: str_empty( id );
210 	  str_strcpyc( id, dummy_id );
211 	}
212 
213 	str_free( &tmp );
214 	return skip_ws( p );
215 }
216 
217 /* bibtex_tag()
218  *
219  * returns NULL on memory error, else position after tag+whitespace
220  */
221 static const char *
bibtex_tag(const char * p,str * tag)222 bibtex_tag( const char *p, str *tag )
223 {
224 	p = str_cpytodelim( tag, p, "= \t\r\n", 0 );
225 	if ( str_memerr( tag ) ) return NULL;
226 	return skip_ws( p );
227 }
228 
/* quotation_mark_is_escaped()
 *
 * A quotation mark at p does not open or close a quoted string when it
 * sits inside braces (nbraces is non-zero) or is directly preceded by a
 * backslash. startp is the start of the text, so p[-1] is only examined
 * when p is not the first character.
 *
 * Returns 1 if escaped, 0 otherwise.
 */
static int
quotation_mark_is_escaped( int nbraces, const char *p, const char *startp )
{
	return nbraces != 0 || ( p != startp && p[-1] == '\\' );
}
236 
/* brace_is_escaped()
 *
 * A brace at p does not change the nesting depth when it sits inside a
 * quoted string (nquotes is non-zero) or is directly preceded by a
 * backslash. startp is the start of the text, so p[-1] is only examined
 * when p is not the first character.
 *
 * Returns 1 if escaped, 0 otherwise.
 */
static int
brace_is_escaped( int nquotes, const char *p, const char *startp )
{
	return nquotes != 0 || ( p != startp && p[-1] == '\\' );
}
244 
/* char_is_escaped()
 *
 * Returns 1 when the current position is inside quotes or braces
 * (either counter is non-zero), 0 otherwise.
 */
static int
char_is_escaped( int nquotes, int nbraces )
{
	return ( nquotes || nbraces ) ? 1 : 0;
}
251 
252 static int
add_token(slist * tokens,str * token)253 add_token( slist *tokens, str *token )
254 {
255 	int status;
256 
257 	if ( str_memerr( token ) ) return BIBL_ERR_MEMERR;
258 
259 	status = slist_add( tokens, token );
260 	if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
261 
262 	str_empty( token );
263 
264 	return BIBL_OK;
265 }
266 
267 static const char *
bibtex_data(const char * p,slist * tokens,loc * currloc)268 bibtex_data( const char *p, slist *tokens, loc *currloc )
269 {
270 	int nbraces = 0, nquotes = 0;
271 	const char *startp = p;
272 	int status;
273 	str token;
274 
275 	str_init( &token );
276 
277 	while ( p && *p ) {
278 
279 		/* ...have we reached end-of-data? */
280 		if ( nquotes==0 && nbraces==0 ) {
281 			if ( *p==',' || *p=='=' || *p=='}' || *p==')' ) goto out;
282 		}
283 
284 		if ( *p=='\"' ) {
285 			str_addchar( &token, *p );
286 			if ( !quotation_mark_is_escaped( nbraces, p, startp ) ) {
287 				nquotes = !nquotes;
288 				if ( nquotes==0 ) {
289 					status = add_token( tokens, &token );
290 					if ( status!=BIBL_OK ) { p=NULL; goto out0; }
291 				}
292 			}
293 		}
294 
295 		else if ( *p=='{' ) {
296 			str_addchar( &token, *p );
297 			if ( !brace_is_escaped( nquotes, p, startp ) ) {
298 				nbraces++;
299 			}
300 		}
301 
302 		else if ( *p=='}' ) {
303 			str_addchar( &token, *p );
304 			if ( !brace_is_escaped( nquotes, p, startp ) ) {
305 				nbraces--;
306 				if ( nbraces==0 ) {
307 					status = add_token( tokens, &token );
308 					if ( status!=BIBL_OK ) { p=NULL; goto out0; }
309 				}
310 				if ( nbraces<0 ) {
311 					goto out;
312 				}
313 			}
314 		}
315 
316 		else if ( *p=='#' ) {
317 			if ( char_is_escaped( nquotes, nbraces ) ) {
318 				str_addchar( &token, *p );
319 			}
320 			/* ...this is a bibtex string concatentation token */
321 			else {
322 				if ( str_has_value( &token ) ) {
323 					status = add_token( tokens, &token );
324 					if ( status!=BIBL_OK ) { p=NULL; goto out0; }
325 				}
326 				status = slist_addc( tokens, "#" );
327 				if ( status!=SLIST_OK ) { p=NULL; goto out0; }
328 			}
329 		}
330 
331 		/* ...add escaped white-space and non-white-space to current token */
332 		else if ( !is_ws( *p ) || char_is_escaped( nquotes, nbraces ) ) {
333 			/* always add non-whitespace characters */
334 			if ( !is_ws( *p ) ) {
335 				str_addchar( &token, *p );
336 			}
337 			/* only add whitespace if token is non-empty; convert CR/LF to space */
338 			else if ( token.len!=0 ) {
339 				if ( *p!='\n' && *p!='\r' )
340 					str_addchar( &token, *p );
341 				else {
342 					str_addchar( &token, ' ' );
343 					while ( is_ws( *(p+1) ) ) p++;
344 				}
345 			}
346 		}
347 
348 		/* ...unescaped white-space marks the end of a token */
349 		else if ( is_ws( *p ) ) {
350 			if ( token.len ) {
351 				status = add_token( tokens, &token );
352 				if ( status!=BIBL_OK ) { p=NULL; goto out0; }
353 			}
354 		}
355 
356 		p++;
357 	}
358 out:
359 	if ( nbraces!=0 ) {
360 		REprintf( "%s: Mismatch in number of braces in file %s reference %ld.\n", currloc->progname, currloc->filename, currloc->nref );
361 	}
362 	if ( nquotes!=0 ) {
363 		REprintf( "%s: Mismatch in number of quotes in file %s reference %ld.\n", currloc->progname, currloc->filename, currloc->nref );
364 	}
365 	if ( str_has_value( &token ) ) {
366 		if ( str_memerr( &token ) ) { p = NULL; goto out; }
367 		status = slist_add( tokens, &token );
368 		if ( status!=SLIST_OK ) p = NULL;
369 	}
370 out0:
371 	str_free( &token );
372 	return p;
373 }
374 
375 #define NOT_ESCAPED    (0)
376 #define ESCAPED_QUOTES (1)
377 #define ESCAPED_BRACES (2)
378 
379 static int
token_is_escaped(str * s)380 token_is_escaped( str *s )
381 {
382 	if ( s->data[0]=='\"' && s->data[s->len-1]=='\"' ) return ESCAPED_QUOTES;
383 	if ( s->data[0]=='{'  && s->data[s->len-1]=='}'  ) return ESCAPED_BRACES;
384 	return NOT_ESCAPED;
385 }
386 
387 /* replace_strings()
388  *
389  * do bibtex string replacement for data tokens
390  */
391 static int
replace_strings(slist * tokens)392 replace_strings( slist *tokens )
393 {
394 	int i, n;
395 	str *s;
396 
397 	for ( i=0; i<tokens->n; ++i ) {
398 
399 		s = slist_str( tokens, i );
400 
401 		/* ...skip if token is protected by quotation marks or braces */
402 		if ( token_is_escaped( s ) ) continue;
403 
404 		/* ...skip if token is string concatentation symbol */
405 		if ( !str_strcmpc( s, "#" ) ) continue;
406 
407 		n = slist_find( &find, s );
408 		if ( slist_wasnotfound( &find, n ) ) continue;
409 
410 		str_strcpy( s, slist_str( &replace, n ) );
411 		if ( str_memerr( s ) ) return BIBL_ERR_MEMERR;
412 
413 	}
414 
415 	return BIBL_OK;
416 }
417 
418 static int
string_concatenate(slist * tokens,loc * currloc)419 string_concatenate( slist *tokens, loc *currloc )
420 {
421 	int i, status, esc_s, esc_t;
422 	str *s, *t;
423 
424 	i = 0;
425 	while ( i < tokens->n ) {
426 
427 		s = slist_str( tokens, i );
428 		if ( str_strcmpc( s, "#" ) ) {
429 			i++;
430 			continue;
431 		}
432 
433 		if ( i==0 || i==tokens->n-1 ) {
434 			REprintf( "%s: Warning: Stray string concatenation ('#' character) in file %s reference %ld\n",
435 					currloc->progname, currloc->filename, currloc->nref );
436 			status = slist_remove( tokens, i );
437 			if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
438 			continue;
439 		}
440 
441 		s = slist_str( tokens, i-1 );
442 		t = slist_str( tokens, i+1 );
443 
444 		esc_s = token_is_escaped( s );
445 		esc_t = token_is_escaped( t );
446 
447 		if ( esc_s != NOT_ESCAPED ) str_trimend( s, 1 );
448 		if ( esc_t != NOT_ESCAPED ) str_trimbegin( t, 1 );
449 		if ( esc_s != esc_t ) {
450 			if ( esc_s == NOT_ESCAPED ) {
451 				if ( esc_t == ESCAPED_QUOTES ) str_prepend( s, "\"" );
452 				else                           str_prepend( s, "{" );
453 			}
454 			else {
455 				if ( esc_t != NOT_ESCAPED ) str_trimend( t, 1 );
456 				if ( esc_s == ESCAPED_QUOTES ) str_addchar( t, '\"' );
457 				else                           str_addchar( t, '}' );
458 			}
459 		}
460 
461 		str_strcat( s, t );
462 		if ( str_memerr( s ) ) return BIBL_ERR_MEMERR;
463 
464 		/* ...remove concatenated string t */
465 		status = slist_remove( tokens, i+1 );
466 		if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
467 
468 		/* ...remove concatentation token '#' */
469 		status = slist_remove( tokens, i );
470 		if ( status!=SLIST_OK ) return BIBL_ERR_MEMERR;
471 
472 	}
473 
474 	return BIBL_OK;
475 }
476 
477 #define KEEP_QUOTES  (0)
478 #define STRIP_QUOTES (1)
479 
480 static int
merge_tokens_into_data(str * data,slist * tokens,int stripquotes)481 merge_tokens_into_data( str *data, slist *tokens, int stripquotes )
482 {
483 	int i, esc_s;
484 	str *s;
485 
486 	for ( i=0; i<tokens->n; i++ ) {
487 
488 		s     = slist_str( tokens, i );
489 		esc_s = token_is_escaped( s );
490 
491 		if ( ( esc_s == ESCAPED_BRACES ) ||
492 		     ( stripquotes == STRIP_QUOTES && esc_s == ESCAPED_QUOTES ) ) {
493 			str_trimbegin( s, 1 );
494 			str_trimend( s, 1 );
495 		}
496 
497 		str_strcat( data, s );
498 
499 	}
500 
501 	if ( str_memerr( data ) ) return BIBL_ERR_MEMERR;
502 	else return BIBL_OK;
503 }
504 
505 /* return NULL on memory error */
506 static const char *
process_bibtexline(const char * p,str * tag,str * data,uchar stripquotes,loc * currloc)507 process_bibtexline( const char *p, str *tag, str *data, uchar stripquotes, loc *currloc )
508 {
509 	slist tokens;
510 	int status;
511 
512 	str_empty( data );
513 
514 	slist_init( &tokens );
515 
516 	p = bibtex_tag( skip_ws( p ), tag );
517 	if ( p ) {
518 		if ( str_is_empty( tag ) ) {
519 			p = skip_line( p );
520 			goto out;
521 		}
522 	}
523 
524 	if ( p && *p=='=' ) {
525 		p = bibtex_data( p+1, &tokens, currloc );
526 	}
527 
528 	if ( p ) {
529 		status = replace_strings( &tokens );
530 		if ( status!=BIBL_OK ) p = NULL;
531 	}
532 
533 	if ( p ) {
534 		status = string_concatenate( &tokens, currloc );
535 		if ( status!=BIBL_OK ) p = NULL;
536 	}
537 
538 	if ( p ) {
539 		status = merge_tokens_into_data( data, &tokens, stripquotes );
540 		if ( status!=BIBL_OK ) p = NULL;
541 	}
542 
543 out:
544 	slist_free( &tokens );
545 	return p;
546 }
547 
548 /* process_ref()
549  *
550  */
551 static int
process_ref(fields * bibin,const char * p,loc * currloc)552 process_ref( fields *bibin, const char *p, loc *currloc )
553 {
554 	int fstatus, status = BIBL_OK;
555 	str type, id, tag, data;
556 
557 	strs_init( &type, &id, &tag, &data, NULL );
558 
559 	p = process_bibtextype( p, &type );
560 	p = process_bibtexid( p, &id );
561 
562 	if ( str_is_empty( &type ) || str_is_empty( &id ) ) goto out;
563 
564 	fstatus = fields_add( bibin, "INTERNAL_TYPE", str_cstr( &type ), LEVEL_MAIN );
565 	if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }
566 
567 	fstatus = fields_add( bibin, "REFNUM", str_cstr( &id ), LEVEL_MAIN );
568 	if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }
569 
570 	while ( *p ) {
571 
572 		p = process_bibtexline( p, &tag, &data, STRIP_QUOTES, currloc );
573 		if ( p==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
574 
575 		if ( !str_has_value( &tag ) || !str_has_value( &data ) ) continue;
576 
577 		fstatus = fields_add( bibin, str_cstr( &tag ), str_cstr( &data ), LEVEL_MAIN );
578 		if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }
579 
580 	}
581 out:
582 	strs_free( &type, &id, &tag, &data, NULL );
583 	return status;
584 }
585 
586 /* process_string()
587  *
588  * Handle lines like:
589  *
590  * '@STRING{TL = {Tetrahedron Lett.}}'
591  *
592  * p should point to just after '@STRING'
593  *
594  * In BibTeX, if a string is defined several times, the last one is kept.
595  *
596  */
597 static int
process_string(const char * p,loc * currloc)598 process_string( const char *p, loc *currloc )
599 {
600 	int n, status = BIBL_OK;
601 	str s1, s2, *t;
602 
603 	strs_init( &s1, &s2, NULL );
604 
605 	while ( *p && *p!='{' && *p!='(' ) p++;
606 	if ( *p=='{' || *p=='(' ) p++;
607 
608 	p = process_bibtexline( skip_ws( p ), &s1, &s2, KEEP_QUOTES, currloc );
609 	if ( p==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
610 
611 	if ( str_has_value( &s2 ) ) {
612 		str_findreplace( &s2, "\\ ", " " );
613 	} else {
614 		str_strcpyc( &s2, "" );
615 	}
616 
617 	if ( str_has_value( &s1 ) ) {
618 		n = slist_find( &find, &s1 );
619 		if ( n==-1 ) {
620 			status = slist_add_ret( &find,    &s1, BIBL_OK, BIBL_ERR_MEMERR );
621 			if ( status!=BIBL_OK ) goto out;
622 			status = slist_add_ret( &replace, &s2, BIBL_OK, BIBL_ERR_MEMERR );
623 			if ( status!=BIBL_OK ) goto out;
624 		} else {
625 			t = slist_set( &replace, n, &s2 );
626 			if ( t==NULL ) { status = BIBL_ERR_MEMERR; goto out; }
627 		}
628 	}
629 
630 out:
631 	strs_free( &s1, &s2, NULL );
632 	return status;
633 }
634 
635 /* bibtexin_processf()
636  *
637  * Handle '@STRING', '@reftype', and ignore '@COMMENT'
638  *                                   Georgi: also ignore @PREAMBLE
639  */
640 static int
bibtexin_processf(fields * bibin,const char * data,const char * filename,long nref,param * pm)641 bibtexin_processf( fields *bibin, const char *data, const char *filename, long nref, param *pm )
642 {
643 	loc currloc;
644 
645 	currloc.progname = pm->progname;
646 	currloc.filename = filename;
647 	currloc.nref     = nref;
648 
649 	if ( !strncasecmp( data, "@STRING", 7 ) ) {
650 		process_string( data+7, &currloc );
651 		return 0;
652 	} else if ( !strncasecmp( data, "@COMMENT", 8 ) || !strncasecmp( data, "@PREAMBLE", 9 )) {
653 	  // Georgi: added @PREAMBLE
654 	  //    todo: It could make sense to keep it for output to bibtex (or TeX related)
655 
656 		/* Not sure if these are real Bibtex, but not references */
657 		return 0;
658 	} else {
659 		process_ref( bibin, data, &currloc );
660 		return 1;
661 	}
662 }
663 
664 /*****************************************************
 PUBLIC: int bibtexin_cleanf()
666 *****************************************************/
667 
668 static int
is_url_tag(str * tag)669 is_url_tag( str *tag )
670 {
671 	if ( str_has_value( tag ) ) {
672 		if ( !strcasecmp( str_cstr( tag ), "url" ) ) return 1;
673 		if ( !strcasecmp( str_cstr( tag ), "file" ) ) return 1;
674 		if ( !strcasecmp( str_cstr( tag ), "doi" ) ) return 1;
675 		if ( !strcasecmp( str_cstr( tag ), "sentelink" ) ) return 1;
676 	}
677 	return 0;
678 }
679 
680 static int
is_name_tag(str * tag)681 is_name_tag( str *tag )
682 {
683 	if ( str_has_value( tag ) ) {
684 		if ( !strcasecmp( str_cstr( tag ), "author" ) ) return 1;
685 		if ( !strcasecmp( str_cstr( tag ), "editor" ) ) return 1;
686 		if ( !strcasecmp( str_cstr( tag ), "translator" ) ) return 1;
687 	}
688 	return 0;
689 }
690 
691 static int
bibtex_cleanvalue(str * value)692 bibtex_cleanvalue( str *value )
693 {
694 	int status;
695 	str parsed;
696 
697 	str_init( &parsed );
698 // REprintf("before clean: %s\n", value->data);
699 
700 	status = latex_parse( value, &parsed );
701 	if ( status!=BIBL_OK ) goto out;
702 
703 	str_strcpy( value, &parsed );
704 	if ( str_memerr( value ) ) status = BIBL_ERR_MEMERR;
705 
706 // REprintf("after clean: %s\n", value->data);
707 
708 out:
709 	str_free( &parsed );
710 	return status;
711 }
712 
713 static int
bibtex_matches_list(fields * bibout,char * tag,char * suffix,str * data,int level,slist * names,int * match)714 bibtex_matches_list( fields *bibout, char *tag, char *suffix, str *data, int level, slist *names, int *match )
715 {
716 	int n, fstatus;
717 	str mergedtag;
718 
719 	*match = 0;
720 
721 	n = slist_find( names, data );
722 	if ( slist_wasfound( names, n ) ) {
723 		str_initstrsc( &mergedtag, tag, suffix, NULL );
724 		fstatus = fields_add( bibout, str_cstr( &mergedtag ), str_cstr( data ), level );
725 		str_free( &mergedtag );
726 		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
727 		*match = 1;
728 	}
729 
730 	return BIBL_OK;
731 }
732 
733 static int
bibtex_matches_asis_or_corps(fields * bibin,int m,param * pm,int * match)734 bibtex_matches_asis_or_corps( fields *bibin, int m, param *pm, int *match )
735 {
736 	int status;
737 
738 	status = bibtex_matches_list( bibin, fields_tag( bibin, m, FIELDS_STRP ), ":ASIS", fields_value( bibin, m, FIELDS_STRP ), LEVEL_MAIN, &(pm->asis), match );
739 	if ( *match==1 || status!=BIBL_OK ) return status;
740 
741 	status = bibtex_matches_list( bibin, fields_tag( bibin, m, FIELDS_STRP ), ":CORP", fields_value( bibin, m, FIELDS_STRP ), LEVEL_MAIN, &(pm->corps), match );
742 	if ( *match==1 || status!=BIBL_OK ) return status;
743 
744 	return BIBL_OK;
745 }
746 
747 /* We need to:
748  *    (1) break names into LaTeX tokens (e.g. respect "{van der Hoff}" as a single name element)
749  *    (2) clean the values by removing brackets and things
750  *    (3) convert the character set before any name processing happens (else things like "\"O" get split up)
751  */
752 static int
bibtex_person_tokenize(fields * bibin,int m,param * pm,slist * tokens)753 bibtex_person_tokenize( fields *bibin, int m, param *pm, slist *tokens )
754 {
755 	int i, ok, status;
756 	str *s;
757 
758 // REprintf("person!\n");
759 	status = latex_tokenize( tokens, fields_value( bibin, m, FIELDS_STRP ) );
760 	if ( status!=BIBL_OK ) return status;
761 
762 
763 	for ( i=0; i<tokens->n; ++i ) {
764 
765 		s = slist_str( tokens, i );
766 
767 		// Georgi: removing since changes latex characters to unicode
768 		//         in names, see comments in bibtexin_cleanref() (in bibtexin.c
769 		//    TODO: check if this causes bad side effects, ideally correct
770 		//
771 		// Reinstating this, bad side effects
772 		status = bibtex_cleanvalue( s );
773 		if ( status!=BIBL_OK ) return status;
774 
775 		// !!! Georgi: conversion is here!
776 		// !!!
777 		// REprintf("\ns before str_convert: %s\n", s->data);
778 		  ok = str_convert( s, pm->charsetin,  1, pm->utf8in,  pm->xmlin,
779 				    // Georgi: change arg. latexout to 1
780 				    // TODO: make it argument to this function?
781 				    //       it should depend on --no-latex
782 				    // v1.3 - restoring latexout to 0
783 				    pm->charsetout, 0, pm->utf8out, pm->xmlout );
784 		  // REprintf("s after str_convert: %s\n", s->data);
785 		if ( !ok ) return BIBL_ERR_MEMERR;
786 
787 	}
788 
789 	return BIBL_OK;
790 }
791 
792 /* We need to:
793  *      (1) Build individual names
794  *      (2) Add them to the end of fields *bibin -- because of this, we have to look up the tag/data every time
795  *          because we can reallocate the raw data and make any pointers stale
796  */
797 static int
bibtex_person_add_names(fields * bibin,int m,slist * tokens)798 bibtex_person_add_names( fields *bibin, int m, slist *tokens )
799 {
800 	int begin, end, ok, n, etal;
801 
802 	etal = name_findetal( tokens );
803 // REprintf("person_add_names!\n");
804 
805 	begin = 0;
806 	n = tokens->n - etal;
807 	while ( begin < n ) {
808 
809 		end = begin + 1;
810 
811 		while ( end < n && strcasecmp( slist_cstr( tokens, end ), "and" ) )
812 			end++;
813 
814 
815 		if ( end - begin == 1 ) {
816 			ok = name_addsingleelement( bibin, fields_tag( bibin,m,FIELDS_CHRP), slist_cstr( tokens, begin ), LEVEL_MAIN, NAME_ASIS );
817 			if ( !ok ) return BIBL_ERR_MEMERR;
818 		} else {
819 			ok = name_addmultielement( bibin, fields_tag(bibin,m,FIELDS_CHRP), tokens, begin, end, LEVEL_MAIN );
820 			if ( !ok ) return BIBL_ERR_MEMERR;
821 		}
822 
823 		begin = end + 1;
824 
825 		/* Handle repeated 'and' errors: authors="G. F. Author and and B. K. Author" */
826 		while ( begin < n && !strcasecmp( slist_cstr( tokens, begin ), "and" ) )
827 			begin++;
828 
829 	}
830 
831 	if ( etal ) {
832 		ok = name_addsingleelement( bibin, fields_tag(bibin,m,FIELDS_CHRP), "et al.", LEVEL_MAIN, NAME_ASIS );
833 		if ( !ok ) return BIBL_ERR_MEMERR;
834 	}
835 
836 	return BIBL_OK;
837 }
838 
839 /* Keep looking up tag values--we can reallocate when we add new names here */
840 static int
bibtexin_person(fields * bibin,int m,param * pm)841 bibtexin_person( fields *bibin, int m, param *pm )
842 {
843 	int status, match = 0;
844 	slist tokens;
845 // REprintf("bibtexin_person!\n");
846 
847 	status = bibtex_matches_asis_or_corps( bibin, m, pm, &match );
848 	if ( status!=BIBL_OK || match==1 ) return status;
849 
850 	slist_init( &tokens );
851 
852 	status = bibtex_person_tokenize( bibin, m, pm, &tokens );
853 	if ( status!=BIBL_OK ) goto out;
854 
855 	status = bibtex_person_add_names( bibin, m, &tokens );
856 	if ( status!=BIBL_OK ) goto out;
857 
858 out:
859 	slist_free( &tokens );
860 	return status;
861 
862 }
863 
864 static int
bibtexin_cleanref(fields * bibin,param * pm)865 bibtexin_cleanref( fields *bibin, param *pm )
866 {
867      int i, n, fstatus, status = BIBL_OK;
868      str *tag, *value;
869      intlist toremove;
870 
871      intlist_init( &toremove );
872 
873      n = fields_num( bibin );
874      // REprintf("n = %d\n", n);
875 
876      // REprintf("n = %d\n" , n);
877      // for(i = 0; i < n; i++) {
878      //   REprintf("i = %d, value = %s\n", i, (bibin->value[i]).data);
879      // }
880 
881 
882 
883      for ( i=0; i<n; ++i ) {
884 
885 	  tag = fields_tag( bibin, i, FIELDS_STRP_NOUSE );
886           // REprintf("\ntag = %s\n", tag->data);
887 	  if ( is_url_tag( tag ) ) continue; /* protect url from parsing */
888 
889 	  /* Georgi:  protecting names, otherwise havoc ensues if the input is
890 	     in a different encoding;
891 	     TODO: test side effects of doing this.
892 	     delay names from undergoing any parsing */
893 	  /* 2020-09-26: but names need parsing since there may be more than one!
894 	     Commenting out to process properly names fields
895 
896 	     TODO: return to this and check again!
897 	     I commented this out because of encodings - do tests!
898 	     Amendment: run the nex two lines  but only if tag is not names tag
899 	     (actually, moved them to the else part)
900 	  */
901 	  // if ( is_name_tag( tag ) ) return BIBL_OK;
902 	  // if ( !is_name_tag( tag ) ){
903 	  value = fields_value( bibin, i, FIELDS_STRP_NOUSE );
904 	  if ( str_is_empty( value ) ) continue;
905 
906 	  // }
907 	  if ( is_name_tag( tag ) ) {
908 	       status = bibtexin_person( bibin, i, pm );
909                // REprintf("i = %d\n", i);
910                // REprintf("value = %s\n", (bibin->value[i]).data);
911 
912 	       if ( status!=BIBL_OK ) goto out;
913 
914 	       fstatus = intlist_add( &toremove, i );
915 	       if ( fstatus!=INTLIST_OK ) { status = BIBL_ERR_MEMERR; goto out; }
916                // REprintf("nout = %d\n" , fields_num( bibin ));
917 	       // goto out;
918 	  }
919 
920 	  // else {
921 	  // //         // REprintf("i = %d, value = %s\n", i, (bibin->value[i]).data);
922 	  // //
923 	  //          value = fields_value( bibin, i, FIELDS_STRP_NOUSE );
924 	  //          if ( str_is_empty( value ) ) continue;
925 	  // //
926 	  // //         // Georgi: bibtex_cleanvalue() drops $, {, }, for now just skip it
927 	  // //         //   TODO: fix bibtex_cleanvalue() to not do that when not necessary
928 	  // // 	// // REprintf("i = %d, value = %s\n", i, value->data);
929 	  // //         // status = bibtex_cleanvalue( value );
930 	  // // 	// // REprintf("i = %d, value = %s\n", i, (bibin->value[i]).data);
931 	  // // 	// if ( status!=BIBL_OK ) goto out;
932 	  // }
933 
934      }
935 
936 
937      // int nout = fields_num( bibin );
938      // if(nout > n) {
939      //   REprintf("nout = %d\n" , nout);
940      //   for(i = 0; i < nout; i++) {
941      //     REprintf("i = %d, value = %s\n", i, (bibin->value[i]).data);
942      //   }
943      //
944      // }
945 
946 
947      for ( i=toremove.n-1; i>=0; i-- ) {
948 	  fstatus = fields_remove( bibin, intlist_get( &toremove, i ) );
949 	  if ( fstatus!=FIELDS_OK ) { status = BIBL_ERR_MEMERR; goto out; }
950      }
951 
952 
953 out:
954 
955      intlist_free( &toremove );
956 
957 
958      // nout = fields_num( bibin );
959      // if(nout > n) {
960      //   REprintf("nout = %d\n" , nout);
961      //   for(i = 0; i < nout; i++) {
962      //     REprintf("i = %d, value = %s\n", i, (bibin->value[i]).data);
963      //   }
964      //
965      // }
966 
967      return status;
968 }
969 
970 static void
bibtexin_nocrossref(bibl * bin,long i,int n,param * p)971 bibtexin_nocrossref( bibl *bin, long i, int n, param *p )
972 {
973 	int n1 = fields_find( bin->ref[i], "REFNUM", LEVEL_ANY );
974 	if ( p->progname ) REprintf( "%s: ", p->progname );
975 	REprintf( "Cannot find cross-reference '%s'", (char*) fields_value( bin->ref[i], n, FIELDS_CHRP_NOUSE ) );
976 	if ( n1!=FIELDS_NOTFOUND ) REprintf( " for reference '%s'\n", (char*) fields_value( bin->ref[i], n1, FIELDS_CHRP_NOUSE ) );
977 	REprintf( "\n" );
978 }
979 
980 static int
bibtexin_crossref_oneref(fields * bibref,fields * bibcross)981 bibtexin_crossref_oneref( fields *bibref, fields *bibcross )
982 {
983 	int i, n, newlevel, ntype, fstatus;
984 	char *type, *newtag, *newvalue;
985 
986 	ntype = fields_find( bibref, "INTERNAL_TYPE", LEVEL_ANY );
987 	type = ( char * ) fields_value( bibref, ntype, FIELDS_CHRP_NOUSE );
988 
989 	n = fields_num( bibcross );
990 
991 	for ( i=0; i<n; ++i ) {
992 
993 		newtag = ( char * ) fields_tag( bibcross, i, FIELDS_CHRP_NOUSE );
994 		if ( !strcasecmp( newtag, "INTERNAL_TYPE" ) ) continue;
995 		if ( !strcasecmp( newtag, "REFNUM" ) ) continue;
996 		if ( !strcasecmp( newtag, "TITLE" ) ) {
997 			if ( !strcasecmp( type, "Inproceedings" ) ||
998 			     !strcasecmp( type, "Incollection" ) )
999 				newtag = "booktitle";
1000 		}
1001 
1002 		newvalue = ( char * ) fields_value( bibcross, i, FIELDS_CHRP_NOUSE );
1003 
1004 		newlevel = fields_level( bibcross, i ) + 1;
1005 
1006 		fstatus = fields_add( bibref, newtag, newvalue, newlevel );
1007 		if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
1008 	}
1009 
1010 	return BIBL_OK;
1011 }
1012 
1013 static int
bibtexin_crossref(bibl * bin,param * p)1014 bibtexin_crossref( bibl *bin, param *p )
1015 {
1016 	int i, n, ncross, status = BIBL_OK;
1017 	fields *bibref, *bibcross;
1018 
1019 	for ( i=0; i<bin->n; ++i ) {
1020 		bibref = bin->ref[i];
1021 		n = fields_find( bibref, "CROSSREF", LEVEL_ANY );
1022 		if ( n==FIELDS_NOTFOUND ) continue;
1023 		fields_set_used( bibref, n );
1024 		ncross = bibl_findref( bin, (char*) fields_value( bibref, n, FIELDS_CHRP_NOUSE ) );
1025 		if ( ncross==-1 ) {
1026 			bibtexin_nocrossref( bin, i, n, p );
1027 			continue;
1028 		}
1029 		bibcross = bin->ref[ncross];
1030 		status = bibtexin_crossref_oneref( bibref, bibcross );
1031 		if ( status!=BIBL_OK ) goto out;
1032 	}
1033 out:
1034 	return status;
1035 }
1036 
1037 static int
bibtexin_cleanf(bibl * bin,param * p)1038 bibtexin_cleanf( bibl *bin, param *p )
1039 {
1040 	int status;
1041 	long i;
1042 
1043         for ( i=0; i<bin->n; ++i ) {
1044 		status = bibtexin_cleanref( bin->ref[i], p );
1045 		if ( status!=BIBL_OK ) return status;
1046 	}
1047 	status = bibtexin_crossref( bin, p );
1048 	return status;
1049 }
1050 
1051 /*****************************************************
1052  PUBLIC: int bibtexin_typef()
1053 *****************************************************/
1054 
1055 static int
bibtexin_typef(fields * bibin,const char * filename,int nrefs,param * p)1056 bibtexin_typef( fields *bibin, const char *filename, int nrefs, param *p )
1057 {
1058 	int ntypename, nrefname, is_default;
1059 	char *refname = "", *typename = "";
1060 
1061 	ntypename = fields_find( bibin, "INTERNAL_TYPE", LEVEL_MAIN );
1062 	nrefname  = fields_find( bibin, "REFNUM",        LEVEL_MAIN );
1063 	if ( nrefname!=FIELDS_NOTFOUND )  refname  = fields_value( bibin, nrefname,  FIELDS_CHRP_NOUSE );
1064 	if ( ntypename!=FIELDS_NOTFOUND ) typename = fields_value( bibin, ntypename, FIELDS_CHRP_NOUSE );
1065 
1066 	return get_reftype( typename, nrefs, p->progname, p->all, p->nall, refname, &is_default, REFTYPE_CHATTY );
1067 }
1068 
1069 /*****************************************************
1070  PUBLIC: int bibtexin_convertf(), returns BIBL_OK or BIBL_ERR_MEMERR
1071 *****************************************************/
1072 
1073 /**** bibtexin_btorg ****/
1074 
1075 /*
1076  * BibTeX uses 'organization' in lieu of publisher if that field is missing.
1077  * Otherwise output as
1078  * <name type="corporate">
1079  *    <namePart>The organization</namePart>
1080  *    <role>
1081  *       <roleTerm authority="marcrelator" type="text">organizer of meeting</roleTerm>
1082  *    </role>
1083  * </name>
1084  */
1085 
1086 static int
bibtexin_btorg(fields * bibin,int m,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1087 bibtexin_btorg( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1088 {
1089 	int n, fstatus;
1090 	n = fields_find( bibin, "publisher", LEVEL_ANY );
1091 	if ( n==FIELDS_NOTFOUND )
1092 		fstatus = fields_add( bibout, "PUBLISHER", str_cstr( invalue ), level );
1093 	else
1094 		fstatus = fields_add( bibout, "ORGANIZER:CORP", str_cstr( invalue ), level );
1095 	if ( fstatus==FIELDS_OK ) return BIBL_OK;
1096 	else return BIBL_ERR_MEMERR;
1097 }
1098 
1099 /**** bibtexin_btsente() ****/
1100 
1101 /*
1102  * sentelink = {file://localhost/full/path/to/file.pdf,Sente,PDF}
1103  *
1104  * Sente is an academic reference manager for MacOSX and Apple iPad.
1105  */
1106 
1107 static int
bibtexin_btsente(fields * bibin,int n,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1108 bibtexin_btsente( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1109 {
1110 	int fstatus, status = BIBL_OK;
1111 	str link;
1112 
1113 	str_init( &link );
1114 	str_cpytodelim( &link, skip_ws( invalue->data ), ",", 0 );
1115 	str_trimendingws( &link );
1116 	if ( str_memerr( &link ) ) status = BIBL_ERR_MEMERR;
1117 
1118 	if ( status==BIBL_OK && link.len ) {
1119 		fstatus = fields_add( bibout, "FILEATTACH", str_cstr( &link ), level );
1120 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1121 	}
1122 
1123 	str_free( &link );
1124 	return status;
1125 }
1126 
1127 /**** bibtexin_linkedfile() ****/
1128 
/* Return the number of ':' characters in the NUL-terminated string 'p'. */
static int
count_colons( char *p )
{
	char *cursor;
	int total = 0;

	for ( cursor = p; *cursor != '\0'; ++cursor ) {
		if ( *cursor == ':' ) total += 1;
	}

	return total;
}
1139 
/*
 * Return the index of the first ':' in 'p', or the length of 'p'
 * when it contains no colon.
 */
static int
first_colon( char *p )
{
	/* strcspn() counts the leading characters that are not ':' */
	return ( int ) strcspn( p, ":" );
}
1147 
/*
 * Scan 'p' backwards from its final character and return the index of
 * the last ':' found at an index greater than zero.  The scan stops at
 * index 0 without testing it, so 0 is returned both when the only
 * colon is the first character and when there is no colon at all.
 */
static int
last_colon( char *p )
{
	int pos = ( int ) strlen( p ) - 1;

	for ( ; pos > 0; --pos ) {
		if ( p[pos] == ':' ) break;
	}

	return pos;
}
1155 
1156 /*
1157  * file={Description:/full/path/to/file.pdf:PDF}
1158  */
1159 static int
bibtexin_linkedfile(fields * bibin,int m,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1160 bibtexin_linkedfile( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1161 {
1162 	int fstatus, status = BIBL_OK;
1163 	char *p = str_cstr( invalue );
1164 	int i, n, n1, n2;
1165 	str link;
1166 
1167 	n = count_colons( p );
1168 	if ( n > 1 ) {
1169 		/* A DOS file can contain a colon ":C:/....pdf:PDF" */
1170 		/* Extract after 1st and up to last colons */
1171 		n1 = first_colon( p ) + 1;
1172 		n2 = last_colon( p );
1173 		str_init( &link );
1174 		for ( i=n1; i<n2; ++i ) {
1175 			str_addchar( &link, p[i] );
1176 		}
1177 		str_trimstartingws( &link );
1178 		str_trimendingws( &link );
1179 		if ( str_memerr( &link ) ) {
1180 			status = BIBL_ERR_MEMERR;
1181 			goto out;
1182 		}
1183 		if ( link.len ) {
1184 			fstatus = fields_add( bibout, "FILEATTACH", str_cstr( &link ), level );
1185 			if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1186 		}
1187 out:
1188 		str_free( &link );
1189 	} else {
1190 		/* This field isn't formatted properly, so just copy directly */
1191 		fstatus = fields_add( bibout, "FILEATTACH", p, level );
1192 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1193 	}
1194 	return status;
1195 
1196 }
1197 
1198 /**** bibtexin_howpublished() ****/
1199 
1200 /*    howpublished={},
1201  *
1202  * Normally indicates the manner in which something was
1203  * published in lieu of a formal publisher, so typically
1204  * 'howpublished' and 'publisher' will never be in the
1205  * same reference.
1206  *
 * Occasionally, people put Diploma thesis information
1208  * into the field, so check that first.
1209  *
1210  * Returns BIBL_OK or BIBL_ERR_MEMERR
1211  */
1212 
1213 static int
bibtexin_howpublished(fields * bibin,int n,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1214 bibtexin_howpublished( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1215 {
1216 	int fstatus, status = BIBL_OK;
1217 	if ( !strncasecmp( str_cstr( invalue ), "Diplom", 6 ) ) {
1218 		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Diploma thesis", level );
1219 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1220 	}
1221 	else if ( !strncasecmp( str_cstr( invalue ), "HSabilitation", 13 ) ) {
1222 		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Habilitation thesis", level );
1223 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1224 	}
1225 	else if ( !strncasecmp( str_cstr( invalue ), "Licentiate", 10 ) ) {
1226 		fstatus = fields_replace_or_add( bibout, "GENRE:BIBUTILS", "Licentiate thesis", level );
1227 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1228 	}
1229 	else if ( is_embedded_link( str_cstr( invalue ) ) ) {
1230 		status =  urls_split_and_add( str_cstr( invalue ), bibout, level );
1231 	}
1232 	else {
1233 		fstatus = fields_add( bibout, "PUBLISHER", str_cstr( invalue ), level );
1234 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1235 	}
1236 	return status;
1237 }
1238 
1239 /**** bibtexin_eprint() ****/
1240 
1241 /* Try to capture situations like
1242  *
1243  * eprint="1605.02026",
1244  * archivePrefix="arXiv",
1245  *
1246  * or
1247  *
1248  * eprint="13211131",
1249  * eprinttype="medline",
1250  *
1251  * If we don't know anything, concatenate archivePrefix:eprint
1252  * and push into URL. (Could be wrong)
1253  *
1254  * If no info, just push eprint into URL. (Could be wrong)
1255  */
1256 static int
process_eprint_with_prefix(fields * bibout,char * prefix,str * value,int level)1257 process_eprint_with_prefix( fields *bibout, char *prefix, str *value, int level )
1258 {
1259 	int fstatus, status = BIBL_OK;
1260 	str merge;
1261 
1262 	if ( !strcmp( prefix, "arXiv" ) ) {
1263 		fstatus = fields_add( bibout, "ARXIV", str_cstr( value ), level );
1264 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1265 	}
1266 
1267 	else if ( !strcmp( prefix, "jstor" ) ) {
1268 		fstatus = fields_add( bibout, "JSTOR", str_cstr( value ), level );
1269 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1270 	}
1271 
1272 	else if ( !strcmp( prefix, "medline" ) ) {
1273 		fstatus = fields_add( bibout, "MEDLINE", str_cstr( value ), level );
1274 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1275 	}
1276 
1277 	else if ( !strcmp( prefix, "pubmed" ) ) {
1278 		fstatus = fields_add( bibout, "PMID", str_cstr( value ), level );
1279 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1280 	}
1281 
1282 	/* ...if this is unknown prefix, merge prefix & eprint */
1283 	else {
1284 		str_init( &merge );
1285 		str_mergestrs( &merge, prefix, ":", str_cstr( value ), NULL );
1286 		fstatus = fields_add( bibout, "URL", str_cstr( &merge ), level );
1287 		if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
1288 		str_free( &merge );
1289 	}
1290 
1291 	return status;
1292 }
1293 static int
process_eprint_without_prefix(fields * bibout,str * value,int level)1294 process_eprint_without_prefix( fields *bibout, str *value, int level )
1295 {
1296 	int fstatus;
1297 
1298 	/* ...no archivePrefix, need to handle just 'eprint' tag */
1299 	fstatus = fields_add( bibout, "URL", str_cstr( value ), level );
1300 
1301 	if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR;
1302 	else return BIBL_OK;
1303 }
1304 
1305 static int
bibtexin_eprint(fields * bibin,int m,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1306 bibtexin_eprint( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1307 {
1308 	char *prefix;
1309 	int n;
1310 
1311 	/* ...do we have an archivePrefix too? */
1312 	n = fields_find( bibin, "ARCHIVEPREFIX", level );
1313 	if ( n==FIELDS_NOTFOUND ) n = fields_find( bibin, "EPRINTTYPE", level );
1314 	if ( n!=FIELDS_NOTFOUND ) {
1315 		prefix = fields_value( bibin, n, FIELDS_CHRP );
1316 		return process_eprint_with_prefix( bibout, prefix, invalue, level );
1317 	}
1318 
1319 	/* ...no we don't */
1320 	return process_eprint_without_prefix( bibout, invalue, level );
1321 }
1322 
1323 /**** bibtexin_keyword() ****/
1324 
1325 /* Split keywords="" with semicolons.
1326  * Commas are also frequently used, but will break
1327  * entries like:
1328  *       keywords="Microscopy, Confocal"
1329  * Returns BIBL_OK or BIBL_ERR_MEMERR
1330  */
1331 
1332 static int
bibtexin_keyword(fields * bibin,int m,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1333 bibtexin_keyword( fields *bibin, int m, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1334 {
1335 	int fstatus, status = BIBL_OK;
1336 	const char *p;
1337 	str keyword;
1338 
1339 	p = str_cstr( invalue );
1340 	str_init( &keyword );
1341 
1342 	while ( *p ) {
1343 		p = str_cpytodelim( &keyword, skip_ws( p ), ";", 1 );
1344 		str_trimendingws( &keyword );
1345 		if ( str_memerr( &keyword ) ) {
1346 			status = BIBL_ERR_MEMERR;
1347 			goto out;
1348 		}
1349 		if ( keyword.len ) {
1350 			fstatus = fields_add( bibout, "KEYWORD", str_cstr( &keyword ), level );
1351 			if ( fstatus!=FIELDS_OK ) {
1352 				status = BIBL_ERR_MEMERR;
1353 				goto out;
1354 			}
1355 		}
1356 	}
1357 out:
1358 	str_free( &keyword );
1359 	return status;
1360 }
1361 
1362 /**** bibtexin_title() ****/
1363 
1364 /* bibtexin_titleinbook_isbooktitle()
1365  *
1366  * Normally, the title field of inbook refers to the book.  The
1367  * section in a @inbook reference is untitled.  If it's titled,
1368  * the @incollection should be used.  For example, in:
1369  *
1370  * @inbook{
1371  *    title="xxx"
1372  * }
1373  *
1374  * the booktitle is "xxx".
1375  *
1376  * However, @inbook is frequently abused (and treated like
1377  * @incollection) so that title and booktitle are present
1378  * and title is now 'supposed' to refer to the section.  For example:
1379  *
1380  * @inbook{
1381  *     title="yyy",
1382  *     booktitle="xxx"
1383  * }
1384  *
1385  * Therefore report whether or not booktitle is present as well
1386  * as title in @inbook references.  If not, then make 'title'
1387  * correspond to the title of the book, not the section.
1388  *
1389  */
1390 static int
bibtexin_titleinbook_isbooktitle(fields * bibin,char * intag)1391 bibtexin_titleinbook_isbooktitle( fields *bibin, char *intag )
1392 {
1393 	int n;
1394 
1395 	/* ...look only at 'title="xxx"' elements */
1396 	if ( strcasecmp( intag, "TITLE" ) ) return 0;
1397 
1398 	/* ...look only at '@inbook' references */
1399 	n = fields_find( bibin, "INTERNAL_TYPE", LEVEL_ANY );
1400 	if ( n==FIELDS_NOTFOUND ) return 0;
1401 	if ( strcasecmp( fields_value( bibin, n, FIELDS_CHRP ), "INBOOK" ) ) return 0;
1402 
1403 	/* ...look to see if 'booktitle="yyy"' exists */
1404 	n = fields_find( bibin, "BOOKTITLE", LEVEL_ANY );
1405 	if ( n==FIELDS_NOTFOUND ) return 0;
1406 	else return 1;
1407 }
1408 
1409 static int
bibtexin_title(fields * bibin,int n,str * intag,str * invalue,int level,param * pm,char * outtag,fields * bibout)1410 bibtexin_title( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout )
1411 {
1412 	int ok;
1413 
1414 	if ( bibtexin_titleinbook_isbooktitle( bibin, intag->data ) ) level=LEVEL_MAIN;
1415 	ok = title_process( bibout, "TITLE", invalue->data, level, pm->nosplittitle );
1416 	if ( ok ) return BIBL_OK;
1417 	else return BIBL_ERR_MEMERR;
1418 }
1419 
1420 static void
bibtexin_notag(param * p,char * tag)1421 bibtexin_notag( param *p, char *tag )
1422 {
1423 	if ( p->verbose && strcmp( tag, "INTERNAL_TYPE" ) ) {
1424 		if ( p->progname ) REprintf( "%s: ", p->progname );
1425 		REprintf( "Cannot find tag '%s'\n", tag );
1426 	}
1427 }
1428 
1429 static int
bibtexin_convertf(fields * bibin,fields * bibout,int reftype,param * p)1430 bibtexin_convertf( fields *bibin, fields *bibout, int reftype, param *p )
1431 {
1432 	static int (*convertfns[NUM_REFTYPES])(fields *, int, str *, str *, int, param *, char *, fields *) = {
1433 		// [ 0 ... NUM_REFTYPES-1 ] = generic_null,
1434 		// [ SIMPLE       ] = generic_simple,
1435 		// [ TITLE        ] = bibtexin_title,
1436 		// [ PERSON       ] = generic_simple,
1437 		// [ PAGES        ] = generic_pages,
1438 		// [ KEYWORD      ] = bibtexin_keyword,
1439 		// [ EPRINT       ] = bibtexin_eprint,
1440 		// [ HOWPUBLISHED ] = bibtexin_howpublished,
1441 		// [ LINKEDFILE   ] = bibtexin_linkedfile,
1442 		// [ NOTES        ] = generic_notes,
1443 		// [ GENRE        ] = generic_genre,
1444 		// [ BT_SENTE     ] = bibtexin_btsente,
1445 		// [ BT_ORG       ] = bibtexin_btorg,
1446 		// [ URL          ] = generic_url
1447 
1448                 [ ALWAYS           ] = generic_null,  // (0)
1449 		[ DEFAULT          ] = generic_null,  // (1)
1450 		[ SKIP             ] = generic_null,  // (2)
1451 		[ SIMPLE           ] = generic_simple,  // (3)
1452 		[ TYPE             ] = generic_null,  // (4)
1453 		[ PERSON           ] = generic_simple, // (5)
1454 		[ DATE             ] = generic_null,  // (6)
1455 		[ PAGES            ] = generic_pages,  // (7)
1456 		[ SERIALNO         ] = generic_null,  // (8)
1457 		[ TITLE            ] = bibtexin_title, // (9)
1458 		[ NOTES            ] = generic_notes,  // (10)
1459 		[ DOI              ] = generic_null,  // (11)
1460 		[ HOWPUBLISHED     ] = bibtexin_howpublished,  // (12)
1461 		[ LINKEDFILE       ] = bibtexin_linkedfile,  // (13)
1462 		[ KEYWORD          ] = bibtexin_keyword, // (14)
1463 		[ URL              ] = generic_url,  // (15)
1464 		[ GENRE            ] = generic_genre, // (16)
1465 		[ BT_SENTE         ] = bibtexin_btsente,  // (17) /* Bibtex 'Sente' */
1466 		[ BT_EPRINT        ] = generic_null,  // (18) /* Bibtex 'Eprint' */
1467 		[ BT_ORG           ] = bibtexin_btorg, // (19) /* Bibtex Organization */
1468 		[ BLT_THESIS_TYPE  ] = generic_null, // (20) /* Biblatex Thesis Type */
1469 		[ BLT_SCHOOL       ] = generic_null,  // (21) /* Biblatex School */
1470 		[ BLT_EDITOR       ] = generic_null, // (22) /* Biblatex Editor */
1471 		[ BLT_SUBTYPE      ] = generic_null,  // (23) /* Biblatex entrysubtype */
1472 		[ BLT_SKIP         ] = generic_skip,  // (24) /* Biblatex Skip Entry */
1473 		[ EPRINT           ] = bibtexin_eprint // (25)
1474 	};
1475 
1476 	int process, level, i, nfields, status = BIBL_OK;
1477 	str *intag, *invalue;
1478 	char *outtag;
1479 
1480 	nfields = fields_num( bibin );
1481 	for ( i=0; i<nfields; ++i ) {
1482 
1483 		if ( fields_used( bibin, i ) )   continue; /* e.g. successful crossref */
1484 		if ( fields_no_tag( bibin, i ) )  continue;
1485 		if ( fields_no_value( bibin, i ) ) continue;
1486 
1487 		intag   = fields_tag( bibin, i, FIELDS_STRP );
1488 		invalue = fields_value( bibin, i, FIELDS_STRP );
1489 
1490 		if ( !translate_oldtag( str_cstr( intag ), reftype, p->all, p->nall, &process, &level, &outtag ) ) {
1491 			bibtexin_notag( p, str_cstr( intag ) );
1492 			continue;
1493 		}
1494 
1495 		status = convertfns[ process ] ( bibin, i, intag, invalue, level, p, outtag, bibout );
1496 		if ( status!=BIBL_OK ) return status;
1497 	}
1498 
1499 	if ( status==BIBL_OK && p->verbose ) fields_report_stderr( bibout );
1500 
1501 	return status;
1502 }
1503