1 /*	$NetBSD: label.y,v 1.2 2016/01/13 19:01:59 christos Exp $	*/
2 
3 /* -*- C++ -*-
4    Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
5    Free Software Foundation, Inc.
6      Written by James Clark (jjc@jclark.com)
7 
8 This file is part of groff.
9 
10 groff is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
14 
15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18 for more details.
19 
20 You should have received a copy of the GNU General Public License along
21 with groff; see the file COPYING.  If not, write to the Free Software
22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23 
24 %{
25 
26 #include "refer.h"
27 #include "refid.h"
28 #include "ref.h"
29 #include "token.h"
30 
31 int yylex();
32 void yyerror(const char *);
33 int yyparse();
34 
35 static const char *format_serial(char c, int n);
36 
37 struct label_info {
38   int start;
39   int length;
40   int count;
41   int total;
42   label_info(const string &);
43 };
44 
45 label_info *lookup_label(const string &label);
46 
47 struct expression {
48   enum {
49     // Does the tentative label depend on the reference?
50     CONTAINS_VARIABLE = 01,
51     CONTAINS_STAR = 02,
52     CONTAINS_FORMAT = 04,
53     CONTAINS_AT = 010
54   };
~expressionexpression55   virtual ~expression() { }
56   virtual void evaluate(int, const reference &, string &,
57 			substring_position &) = 0;
analyzeexpression58   virtual unsigned analyze() { return 0; }
59 };
60 
61 class at_expr : public expression {
62 public:
at_expr()63   at_expr() { }
64   void evaluate(int, const reference &, string &, substring_position &);
analyze()65   unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
66 };
67 
68 class format_expr : public expression {
69   char type;
70   int width;
71   int first_number;
72 public:
73   format_expr(char c, int w = 0, int f = 1)
type(c)74     : type(c), width(w), first_number(f) { }
75   void evaluate(int, const reference &, string &, substring_position &);
analyze()76   unsigned analyze() { return CONTAINS_FORMAT; }
77 };
78 
79 class field_expr : public expression {
80   int number;
81   char name;
82 public:
field_expr(char nm,int num)83   field_expr(char nm, int num) : number(num), name(nm) { }
84   void evaluate(int, const reference &, string &, substring_position &);
analyze()85   unsigned analyze() { return CONTAINS_VARIABLE; }
86 };
87 
88 class literal_expr : public expression {
89   string s;
90 public:
literal_expr(const char * ptr,int len)91   literal_expr(const char *ptr, int len) : s(ptr, len) { }
92   void evaluate(int, const reference &, string &, substring_position &);
93 };
94 
95 class unary_expr : public expression {
96 protected:
97   expression *expr;
98 public:
unary_expr(expression * e)99   unary_expr(expression *e) : expr(e) { }
~unary_expr()100   ~unary_expr() { delete expr; }
101   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()102   unsigned analyze() { return expr ? expr->analyze() : 0; }
103 };
104 
105 // This caches the analysis of an expression.
106 
107 class analyzed_expr : public unary_expr {
108   unsigned flags;
109 public:
110   analyzed_expr(expression *);
111   void evaluate(int, const reference &, string &, substring_position &);
analyze()112   unsigned analyze() { return flags; }
113 };
114 
115 class star_expr : public unary_expr {
116 public:
star_expr(expression * e)117   star_expr(expression *e) : unary_expr(e) { }
118   void evaluate(int, const reference &, string &, substring_position &);
analyze()119   unsigned analyze() {
120     return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
121 	    | CONTAINS_STAR);
122   }
123 };
124 
125 typedef void map_func(const char *, const char *, string &);
126 
127 class map_expr : public unary_expr {
128   map_func *func;
129 public:
map_expr(expression * e,map_func * f)130   map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
131   void evaluate(int, const reference &, string &, substring_position &);
132 };
133 
134 typedef const char *extractor_func(const char *, const char *, const char **);
135 
136 class extractor_expr : public unary_expr {
137   int part;
138   extractor_func *func;
139 public:
140   enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
extractor_expr(expression * e,extractor_func * f,int pt)141   extractor_expr(expression *e, extractor_func *f, int pt)
142     : unary_expr(e), part(pt), func(f) { }
143   void evaluate(int, const reference &, string &, substring_position &);
144 };
145 
146 class truncate_expr : public unary_expr {
147   int n;
148 public:
truncate_expr(expression * e,int i)149   truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
150   void evaluate(int, const reference &, string &, substring_position &);
151 };
152 
153 class separator_expr : public unary_expr {
154 public:
separator_expr(expression * e)155   separator_expr(expression *e) : unary_expr(e) { }
156   void evaluate(int, const reference &, string &, substring_position &);
157 };
158 
159 class binary_expr : public expression {
160 protected:
161   expression *expr1;
162   expression *expr2;
163 public:
binary_expr(expression * e1,expression * e2)164   binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
~binary_expr()165   ~binary_expr() { delete expr1; delete expr2; }
166   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()167   unsigned analyze() {
168     return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
169   }
170 };
171 
172 class alternative_expr : public binary_expr {
173 public:
alternative_expr(expression * e1,expression * e2)174   alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
175   void evaluate(int, const reference &, string &, substring_position &);
176 };
177 
178 class list_expr : public binary_expr {
179 public:
list_expr(expression * e1,expression * e2)180   list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
181   void evaluate(int, const reference &, string &, substring_position &);
182 };
183 
184 class substitute_expr : public binary_expr {
185 public:
substitute_expr(expression * e1,expression * e2)186   substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
187   void evaluate(int, const reference &, string &, substring_position &);
188 };
189 
190 class ternary_expr : public expression {
191 protected:
192   expression *expr1;
193   expression *expr2;
194   expression *expr3;
195 public:
ternary_expr(expression * e1,expression * e2,expression * e3)196   ternary_expr(expression *e1, expression *e2, expression *e3)
197     : expr1(e1), expr2(e2), expr3(e3) { }
~ternary_expr()198   ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
199   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()200   unsigned analyze() {
201     return ((expr1 ? expr1->analyze() : 0)
202 	    | (expr2 ? expr2->analyze() : 0)
203 	    | (expr3 ? expr3->analyze() : 0));
204   }
205 };
206 
207 class conditional_expr : public ternary_expr {
208 public:
conditional_expr(expression * e1,expression * e2,expression * e3)209   conditional_expr(expression *e1, expression *e2, expression *e3)
210     : ternary_expr(e1, e2, e3) { }
211   void evaluate(int, const reference &, string &, substring_position &);
212 };
213 
214 static expression *parsed_label = 0;
215 static expression *parsed_date_label = 0;
216 static expression *parsed_short_label = 0;
217 
218 static expression *parse_result;
219 
220 string literals;
221 
222 %}
223 
224 %union {
225   int num;
226   expression *expr;
227   struct { int ndigits; int val; } dig;
228   struct { int start; int len; } str;
229 }
230 
231 /* uppercase or lowercase letter */
232 %token <num> TOKEN_LETTER
233 /* literal characters */
234 %token <str> TOKEN_LITERAL
235 /* digit */
236 %token <num> TOKEN_DIGIT
237 
238 %type <expr> conditional
239 %type <expr> alternative
240 %type <expr> list
241 %type <expr> string
242 %type <expr> substitute
243 %type <expr> optional_conditional
244 %type <num> number
245 %type <dig> digits
246 %type <num> optional_number
247 %type <num> flag
248 
249 %%
250 
251 expr:
252 	optional_conditional
253 		{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
254 	;
255 
256 conditional:
257 	alternative
258 		{ $$ = $1; }
259 	| alternative '?' optional_conditional ':' conditional
260 		{ $$ = new conditional_expr($1, $3, $5); }
261 	;
262 
263 optional_conditional:
264 	/* empty */
265 		{ $$ = 0; }
266 	| conditional
267 		{ $$ = $1; }
268 	;
269 
270 alternative:
271 	list
272 		{ $$ = $1; }
273 	| alternative '|' list
274 		{ $$ = new alternative_expr($1, $3); }
275 	| alternative '&' list
276 		{ $$ = new conditional_expr($1, $3, 0); }
277 	;
278 
279 list:
280 	substitute
281 		{ $$ = $1; }
282 	| list substitute
283 		{ $$ = new list_expr($1, $2); }
284 	;
285 
286 substitute:
287 	string
288 		{ $$ = $1; }
289 	| substitute '~' string
290 		{ $$ = new substitute_expr($1, $3); }
291 	;
292 
293 string:
294 	'@'
295 		{ $$ = new at_expr; }
296 	| TOKEN_LITERAL
297 		{
298 		  $$ = new literal_expr(literals.contents() + $1.start,
299 					$1.len);
300 		}
301 	| TOKEN_LETTER
302 		{ $$ = new field_expr($1, 0); }
303 	| TOKEN_LETTER number
304 		{ $$ = new field_expr($1, $2 - 1); }
305 	| '%' TOKEN_LETTER
306 		{
307 		  switch ($2) {
308 		  case 'I':
309 		  case 'i':
310 		  case 'A':
311 		  case 'a':
312 		    $$ = new format_expr($2);
313 		    break;
314 		  default:
315 		    command_error("unrecognized format `%1'", char($2));
316 		    $$ = new format_expr('a');
317 		    break;
318 		  }
319 		}
320 
321 	| '%' digits
322 		{
323 		  $$ = new format_expr('0', $2.ndigits, $2.val);
324 		}
325 	| string '.' flag TOKEN_LETTER optional_number
326 		{
327 		  switch ($4) {
328 		  case 'l':
329 		    $$ = new map_expr($1, lowercase);
330 		    break;
331 		  case 'u':
332 		    $$ = new map_expr($1, uppercase);
333 		    break;
334 		  case 'c':
335 		    $$ = new map_expr($1, capitalize);
336 		    break;
337 		  case 'r':
338 		    $$ = new map_expr($1, reverse_name);
339 		    break;
340 		  case 'a':
341 		    $$ = new map_expr($1, abbreviate_name);
342 		    break;
343 		  case 'y':
344 		    $$ = new extractor_expr($1, find_year, $3);
345 		    break;
346 		  case 'n':
347 		    $$ = new extractor_expr($1, find_last_name, $3);
348 		    break;
349 		  default:
350 		    $$ = $1;
351 		    command_error("unknown function `%1'", char($4));
352 		    break;
353 		  }
354 		}
355 
356 	| string '+' number
357 		{ $$ = new truncate_expr($1, $3); }
358 	| string '-' number
359 		{ $$ = new truncate_expr($1, -$3); }
360 	| string '*'
361 		{ $$ = new star_expr($1); }
362 	| '(' optional_conditional ')'
363 		{ $$ = $2; }
364 	| '<' optional_conditional '>'
365 		{ $$ = new separator_expr($2); }
366 	;
367 
368 optional_number:
369 	/* empty */
370 		{ $$ = -1; }
371 	| number
372 		{ $$ = $1; }
373 	;
374 
375 number:
376 	TOKEN_DIGIT
377 		{ $$ = $1; }
378 	| number TOKEN_DIGIT
379 		{ $$ = $1*10 + $2; }
380 	;
381 
382 digits:
383 	TOKEN_DIGIT
384 		{ $$.ndigits = 1; $$.val = $1; }
385 	| digits TOKEN_DIGIT
386 		{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
387 	;
388 
389 
390 flag:
391 	/* empty */
392 		{ $$ = 0; }
393 	| '+'
394 		{ $$ = 1; }
395 	| '-'
396 		{ $$ = -1; }
397 	;
398 
399 %%
400 
401 /* bison defines const to be empty unless __STDC__ is defined, which it
402 isn't under cfront */
403 
404 #ifdef const
405 #undef const
406 #endif
407 
408 const char *spec_ptr;
409 const char *spec_end;
410 const char *spec_cur;
411 
412 static char uppercase_array[] = {
413   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
414   'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
415   'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
416   'Y', 'Z',
417 };
418 
419 static char lowercase_array[] = {
420   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
421   'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
422   'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
423   'y', 'z',
424 };
425 
yylex()426 int yylex()
427 {
428   while (spec_ptr < spec_end && csspace(*spec_ptr))
429     spec_ptr++;
430   spec_cur = spec_ptr;
431   if (spec_ptr >= spec_end)
432     return 0;
433   unsigned char c = *spec_ptr++;
434   if (csalpha(c)) {
435     yylval.num = c;
436     return TOKEN_LETTER;
437   }
438   if (csdigit(c)) {
439     yylval.num = c - '0';
440     return TOKEN_DIGIT;
441   }
442   if (c == '\'') {
443     yylval.str.start = literals.length();
444     for (; spec_ptr < spec_end; spec_ptr++) {
445       if (*spec_ptr == '\'') {
446 	if (++spec_ptr < spec_end && *spec_ptr == '\'')
447 	  literals += '\'';
448 	else {
449 	  yylval.str.len = literals.length() - yylval.str.start;
450 	  return TOKEN_LITERAL;
451 	}
452       }
453       else
454 	literals += *spec_ptr;
455     }
456     yylval.str.len = literals.length() - yylval.str.start;
457     return TOKEN_LITERAL;
458   }
459   return c;
460 }
461 
set_label_spec(const char * label_spec)462 int set_label_spec(const char *label_spec)
463 {
464   spec_cur = spec_ptr = label_spec;
465   spec_end = strchr(label_spec, '\0');
466   literals.clear();
467   if (yyparse())
468     return 0;
469   delete parsed_label;
470   parsed_label = parse_result;
471   return 1;
472 }
473 
set_date_label_spec(const char * label_spec)474 int set_date_label_spec(const char *label_spec)
475 {
476   spec_cur = spec_ptr = label_spec;
477   spec_end = strchr(label_spec, '\0');
478   literals.clear();
479   if (yyparse())
480     return 0;
481   delete parsed_date_label;
482   parsed_date_label = parse_result;
483   return 1;
484 }
485 
set_short_label_spec(const char * label_spec)486 int set_short_label_spec(const char *label_spec)
487 {
488   spec_cur = spec_ptr = label_spec;
489   spec_end = strchr(label_spec, '\0');
490   literals.clear();
491   if (yyparse())
492     return 0;
493   delete parsed_short_label;
494   parsed_short_label = parse_result;
495   return 1;
496 }
497 
yyerror(const char * message)498 void yyerror(const char *message)
499 {
500   if (spec_cur < spec_end)
501     command_error("label specification %1 before `%2'", message, spec_cur);
502   else
503     command_error("label specification %1 at end of string",
504 		  message, spec_cur);
505 }
506 
evaluate(int tentative,const reference & ref,string & result,substring_position &)507 void at_expr::evaluate(int tentative, const reference &ref,
508 		       string &result, substring_position &)
509 {
510   if (tentative)
511     ref.canonicalize_authors(result);
512   else {
513     const char *end, *start = ref.get_authors(&end);
514     if (start)
515       result.append(start, end - start);
516   }
517 }
518 
evaluate(int tentative,const reference & ref,string & result,substring_position &)519 void format_expr::evaluate(int tentative, const reference &ref,
520 			   string &result, substring_position &)
521 {
522   if (tentative)
523     return;
524   const label_info *lp = ref.get_label_ptr();
525   int num = lp == 0 ? ref.get_number() : lp->count;
526   if (type != '0')
527     result += format_serial(type, num + 1);
528   else {
529     const char *ptr = i_to_a(num + first_number);
530     int pad = width - strlen(ptr);
531     while (--pad >= 0)
532       result += '0';
533     result += ptr;
534   }
535 }
536 
format_serial(char c,int n)537 static const char *format_serial(char c, int n)
538 {
539   assert(n > 0);
540   static char buf[128]; // more than enough.
541   switch (c) {
542   case 'i':
543   case 'I':
544     {
545       char *p = buf;
546       // troff uses z and w to represent 10000 and 5000 in Roman
547       // numerals; I can find no historical basis for this usage
548       const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
549       if (n >= 40000)
550 	return i_to_a(n);
551       while (n >= 10000) {
552 	*p++ = s[0];
553 	n -= 10000;
554       }
555       for (int i = 1000; i > 0; i /= 10, s += 2) {
556 	int m = n/i;
557 	n -= m*i;
558 	switch (m) {
559 	case 3:
560 	  *p++ = s[2];
561 	  /* falls through */
562 	case 2:
563 	  *p++ = s[2];
564 	  /* falls through */
565 	case 1:
566 	  *p++ = s[2];
567 	  break;
568 	case 4:
569 	  *p++ = s[2];
570 	  *p++ = s[1];
571 	  break;
572 	case 8:
573 	  *p++ = s[1];
574 	  *p++ = s[2];
575 	  *p++ = s[2];
576 	  *p++ = s[2];
577 	  break;
578 	case 7:
579 	  *p++ = s[1];
580 	  *p++ = s[2];
581 	  *p++ = s[2];
582 	  break;
583 	case 6:
584 	  *p++ = s[1];
585 	  *p++ = s[2];
586 	  break;
587 	case 5:
588 	  *p++ = s[1];
589 	  break;
590 	case 9:
591 	  *p++ = s[2];
592 	  *p++ = s[0];
593 	}
594       }
595       *p = 0;
596       break;
597     }
598   case 'a':
599   case 'A':
600     {
601       char *p = buf;
602       // this is derived from troff/reg.c
603       while (n > 0) {
604 	int d = n % 26;
605 	if (d == 0)
606 	  d = 26;
607 	n -= d;
608 	n /= 26;
609 	*p++ = c == 'a' ? lowercase_array[d - 1] :
610 			       uppercase_array[d - 1];
611       }
612       *p-- = 0;
613       // Reverse it.
614       char *q = buf;
615       while (q < p) {
616 	char temp = *q;
617 	*q = *p;
618 	*p = temp;
619 	--p;
620 	++q;
621       }
622       break;
623     }
624   default:
625     assert(0);
626   }
627   return buf;
628 }
629 
evaluate(int,const reference & ref,string & result,substring_position &)630 void field_expr::evaluate(int, const reference &ref,
631 			  string &result, substring_position &)
632 {
633   const char *end;
634   const char *start = ref.get_field(name, &end);
635   if (start) {
636     start = nth_field(number, start, &end);
637     if (start)
638       result.append(start, end - start);
639   }
640 }
641 
evaluate(int,const reference &,string & result,substring_position &)642 void literal_expr::evaluate(int, const reference &,
643 			    string &result, substring_position &)
644 {
645   result += s;
646 }
647 
analyzed_expr(expression * e)648 analyzed_expr::analyzed_expr(expression *e)
649 : unary_expr(e), flags(e ? e->analyze() : 0)
650 {
651 }
652 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)653 void analyzed_expr::evaluate(int tentative, const reference &ref,
654 			     string &result, substring_position &pos)
655 {
656   if (expr)
657     expr->evaluate(tentative, ref, result, pos);
658 }
659 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)660 void star_expr::evaluate(int tentative, const reference &ref,
661 			 string &result, substring_position &pos)
662 {
663   const label_info *lp = ref.get_label_ptr();
664   if (!tentative
665       && (lp == 0 || lp->total > 1)
666       && expr)
667     expr->evaluate(tentative, ref, result, pos);
668 }
669 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)670 void separator_expr::evaluate(int tentative, const reference &ref,
671 			      string &result, substring_position &pos)
672 {
673   int start_length = result.length();
674   int is_first = pos.start < 0;
675   if (expr)
676     expr->evaluate(tentative, ref, result, pos);
677   if (is_first) {
678     pos.start = start_length;
679     pos.length = result.length() - start_length;
680   }
681 }
682 
evaluate(int tentative,const reference & ref,string & result,substring_position &)683 void map_expr::evaluate(int tentative, const reference &ref,
684 			string &result, substring_position &)
685 {
686   if (expr) {
687     string temp;
688     substring_position temp_pos;
689     expr->evaluate(tentative, ref, temp, temp_pos);
690     (*func)(temp.contents(), temp.contents() + temp.length(), result);
691   }
692 }
693 
evaluate(int tentative,const reference & ref,string & result,substring_position &)694 void extractor_expr::evaluate(int tentative, const reference &ref,
695 			      string &result, substring_position &)
696 {
697   if (expr) {
698     string temp;
699     substring_position temp_pos;
700     expr->evaluate(tentative, ref, temp, temp_pos);
701     const char *end, *start = (*func)(temp.contents(),
702 				      temp.contents() + temp.length(),
703 				      &end);
704     switch (part) {
705     case BEFORE:
706       if (start)
707 	result.append(temp.contents(), start - temp.contents());
708       else
709 	result += temp;
710       break;
711     case MATCH:
712       if (start)
713 	result.append(start, end - start);
714       break;
715     case AFTER:
716       if (start)
717 	result.append(end, temp.contents() + temp.length() - end);
718       break;
719     default:
720       assert(0);
721     }
722   }
723 }
724 
first_part(int len,const char * ptr,const char * end,string & result)725 static void first_part(int len, const char *ptr, const char *end,
726 			  string &result)
727 {
728   for (;;) {
729     const char *token_start = ptr;
730     if (!get_token(&ptr, end))
731       break;
732     const token_info *ti = lookup_token(token_start, ptr);
733     int counts = ti->sortify_non_empty(token_start, ptr);
734     if (counts && --len < 0)
735       break;
736     if (counts || ti->is_accent())
737       result.append(token_start, ptr - token_start);
738   }
739 }
740 
last_part(int len,const char * ptr,const char * end,string & result)741 static void last_part(int len, const char *ptr, const char *end,
742 		      string &result)
743 {
744   const char *start = ptr;
745   int count = 0;
746   for (;;) {
747     const char *token_start = ptr;
748     if (!get_token(&ptr, end))
749       break;
750     const token_info *ti = lookup_token(token_start, ptr);
751     if (ti->sortify_non_empty(token_start, ptr))
752       count++;
753   }
754   ptr = start;
755   int skip = count - len;
756   if (skip > 0) {
757     for (;;) {
758       const char *token_start = ptr;
759       if (!get_token(&ptr, end))
760 	assert(0);
761       const token_info *ti = lookup_token(token_start, ptr);
762       if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
763 	ptr = token_start;
764 	break;
765       }
766     }
767   }
768   first_part(len, ptr, end, result);
769 }
770 
evaluate(int tentative,const reference & ref,string & result,substring_position &)771 void truncate_expr::evaluate(int tentative, const reference &ref,
772 			     string &result, substring_position &)
773 {
774   if (expr) {
775     string temp;
776     substring_position temp_pos;
777     expr->evaluate(tentative, ref, temp, temp_pos);
778     const char *start = temp.contents();
779     const char *end = start + temp.length();
780     if (n > 0)
781       first_part(n, start, end, result);
782     else if (n < 0)
783       last_part(-n, start, end, result);
784   }
785 }
786 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)787 void alternative_expr::evaluate(int tentative, const reference &ref,
788 				string &result, substring_position &pos)
789 {
790   int start_length = result.length();
791   if (expr1)
792     expr1->evaluate(tentative, ref, result, pos);
793   if (result.length() == start_length && expr2)
794     expr2->evaluate(tentative, ref, result, pos);
795 }
796 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)797 void list_expr::evaluate(int tentative, const reference &ref,
798 			 string &result, substring_position &pos)
799 {
800   if (expr1)
801     expr1->evaluate(tentative, ref, result, pos);
802   if (expr2)
803     expr2->evaluate(tentative, ref, result, pos);
804 }
805 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)806 void substitute_expr::evaluate(int tentative, const reference &ref,
807 			       string &result, substring_position &pos)
808 {
809   int start_length = result.length();
810   if (expr1)
811     expr1->evaluate(tentative, ref, result, pos);
812   if (result.length() > start_length && result[result.length() - 1] == '-') {
813     // ought to see if pos covers the -
814     result.set_length(result.length() - 1);
815     if (expr2)
816       expr2->evaluate(tentative, ref, result, pos);
817   }
818 }
819 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)820 void conditional_expr::evaluate(int tentative, const reference &ref,
821 				string &result, substring_position &pos)
822 {
823   string temp;
824   substring_position temp_pos;
825   if (expr1)
826     expr1->evaluate(tentative, ref, temp, temp_pos);
827   if (temp.length() > 0) {
828     if (expr2)
829       expr2->evaluate(tentative, ref, result, pos);
830   }
831   else {
832     if (expr3)
833       expr3->evaluate(tentative, ref, result, pos);
834   }
835 }
836 
pre_compute_label()837 void reference::pre_compute_label()
838 {
839   if (parsed_label != 0
840       && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
841     label.clear();
842     substring_position temp_pos;
843     parsed_label->evaluate(1, *this, label, temp_pos);
844     label_ptr = lookup_label(label);
845   }
846 }
847 
compute_label()848 void reference::compute_label()
849 {
850   label.clear();
851   if (parsed_label)
852     parsed_label->evaluate(0, *this, label, separator_pos);
853   if (short_label_flag && parsed_short_label)
854     parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
855   if (date_as_label) {
856     string new_date;
857     if (parsed_date_label) {
858       substring_position temp_pos;
859       parsed_date_label->evaluate(0, *this, new_date, temp_pos);
860     }
861     set_date(new_date);
862   }
863   if (label_ptr)
864     label_ptr->count += 1;
865 }
866 
immediate_compute_label()867 void reference::immediate_compute_label()
868 {
869   if (label_ptr)
870     label_ptr->total = 2;	// force use of disambiguator
871   compute_label();
872 }
873 
merge_labels(reference ** v,int n,label_type type,string & result)874 int reference::merge_labels(reference **v, int n, label_type type,
875 			    string &result)
876 {
877   if (abbreviate_label_ranges)
878     return merge_labels_by_number(v, n, type, result);
879   else
880     return merge_labels_by_parts(v, n, type, result);
881 }
882 
merge_labels_by_number(reference ** v,int n,label_type type,string & result)883 int reference::merge_labels_by_number(reference **v, int n, label_type type,
884 				      string &result)
885 {
886   if (n <= 1)
887     return 0;
888   int num = get_number();
889   // Only merge three or more labels.
890   if (v[0]->get_number() != num + 1
891       || v[1]->get_number() != num + 2)
892     return 0;
893   int i;
894   for (i = 2; i < n; i++)
895     if (v[i]->get_number() != num + i + 1)
896       break;
897   result = get_label(type);
898   result += label_range_indicator;
899   result += v[i - 1]->get_label(type);
900   return i;
901 }
902 
get_separator_pos(label_type type)903 const substring_position &reference::get_separator_pos(label_type type) const
904 {
905   if (type == SHORT_LABEL && short_label_flag)
906     return short_separator_pos;
907   else
908     return separator_pos;
909 }
910 
get_label(label_type type)911 const string &reference::get_label(label_type type) const
912 {
913   if (type == SHORT_LABEL && short_label_flag)
914     return short_label;
915   else
916     return label;
917 }
918 
merge_labels_by_parts(reference ** v,int n,label_type type,string & result)919 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
920 				     string &result)
921 {
922   if (n <= 0)
923     return 0;
924   const string &lb = get_label(type);
925   const substring_position &sp = get_separator_pos(type);
926   if (sp.start < 0
927       || sp.start != v[0]->get_separator_pos(type).start
928       || memcmp(lb.contents(), v[0]->get_label(type).contents(),
929 		sp.start) != 0)
930     return 0;
931   result = lb;
932   int i = 0;
933   do {
934     result += separate_label_second_parts;
935     const substring_position &s = v[i]->get_separator_pos(type);
936     int sep_end_pos = s.start + s.length;
937     result.append(v[i]->get_label(type).contents() + sep_end_pos,
938 		  v[i]->get_label(type).length() - sep_end_pos);
939   } while (++i < n
940 	   && sp.start == v[i]->get_separator_pos(type).start
941 	   && memcmp(lb.contents(), v[i]->get_label(type).contents(),
942 		     sp.start) == 0);
943   return i;
944 }
945 
946 string label_pool;
947 
label_info(const string & s)948 label_info::label_info(const string &s)
949 : start(label_pool.length()), length(s.length()), count(0), total(1)
950 {
951   label_pool += s;
952 }
953 
954 static label_info **label_table = 0;
955 static int label_table_size = 0;
956 static int label_table_used = 0;
957 
lookup_label(const string & label)958 label_info *lookup_label(const string &label)
959 {
960   if (label_table == 0) {
961     label_table = new label_info *[17];
962     label_table_size = 17;
963     for (int i = 0; i < 17; i++)
964       label_table[i] = 0;
965   }
966   unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
967   label_info **ptr;
968   for (ptr = label_table + h;
969        *ptr != 0;
970        (ptr == label_table)
971        ? (ptr = label_table + label_table_size - 1)
972        : ptr--)
973     if ((*ptr)->length == label.length()
974 	&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
975 		  label.length()) == 0) {
976       (*ptr)->total += 1;
977       return *ptr;
978     }
979   label_info *result = *ptr = new label_info(label);
980   if (++label_table_used * 2 > label_table_size) {
981     // Rehash the table.
982     label_info **old_table = label_table;
983     int old_size = label_table_size;
984     label_table_size = next_size(label_table_size);
985     label_table = new label_info *[label_table_size];
986     int i;
987     for (i = 0; i < label_table_size; i++)
988       label_table[i] = 0;
989     for (i = 0; i < old_size; i++)
990       if (old_table[i]) {
991 	h = hash_string(label_pool.contents() + old_table[i]->start,
992 			old_table[i]->length);
993 	label_info **p;
994 	for (p = label_table + (h % label_table_size);
995 	     *p != 0;
996 	     (p == label_table)
997 	     ? (p = label_table + label_table_size - 1)
998 	     : --p)
999 	    ;
1000 	*p = old_table[i];
1001 	}
1002     a_delete old_table;
1003   }
1004   return result;
1005 }
1006 
clear_labels()1007 void clear_labels()
1008 {
1009   for (int i = 0; i < label_table_size; i++) {
1010     delete label_table[i];
1011     label_table[i] = 0;
1012   }
1013   label_table_used = 0;
1014   label_pool.clear();
1015 }
1016 
1017 static void consider_authors(reference **start, reference **end, int i);
1018 
compute_labels(reference ** v,int n)1019 void compute_labels(reference **v, int n)
1020 {
1021   if (parsed_label
1022       && (parsed_label->analyze() & expression::CONTAINS_AT)
1023       && sort_fields.length() >= 2
1024       && sort_fields[0] == 'A'
1025       && sort_fields[1] == '+')
1026     consider_authors(v, v + n, 0);
1027   for (int i = 0; i < n; i++)
1028     v[i]->compute_label();
1029 }
1030 
1031 
1032 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1033 where 0 <= i <= N if there exists a reference with a list of authors
1034 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1035 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1036 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1037 <B0,B1,...,BM>.  If a reference needs author i we only have to call
1038 need_author(j) for some j >= i such that the reference also needs
1039 author j. */
1040 
1041 /* This function handles 2 tasks:
1042 determine which authors are needed (cannot be elided with et al.);
1043 determine which authors can have only last names in the labels.
1044 
1045 References >= start and < end have the same first i author names.
1046 Also they're sorted by A+. */
1047 
consider_authors(reference ** start,reference ** end,int i)1048 static void consider_authors(reference **start, reference **end, int i)
1049 {
1050   if (start >= end)
1051     return;
1052   reference **p = start;
1053   if (i >= (*p)->get_nauthors()) {
1054     for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1055       ;
1056     if (p < end && i > 0) {
1057       // If we have an author list <A B C> and an author list <A B C D>,
1058       // then both lists need C.
1059       for (reference **q = start; q < end; q++)
1060 	(*q)->need_author(i - 1);
1061     }
1062     start = p;
1063   }
1064   while (p < end) {
1065     reference **last_name_start = p;
1066     reference **name_start = p;
1067     for (++p;
1068 	 p < end && i < (*p)->get_nauthors()
1069 	 && same_author_last_name(**last_name_start, **p, i);
1070 	 p++) {
1071       if (!same_author_name(**name_start, **p, i)) {
1072 	consider_authors(name_start, p, i + 1);
1073 	name_start = p;
1074       }
1075     }
1076     consider_authors(name_start, p, i + 1);
1077     if (last_name_start == name_start) {
1078       for (reference **q = last_name_start; q < p; q++)
1079 	(*q)->set_last_name_unambiguous(i);
1080     }
1081     // If we have an author list <A B C D> and <A B C E>, then the lists
1082     // need author D and E respectively.
1083     if (name_start > start || p < end) {
1084       for (reference **q = last_name_start; q < p; q++)
1085 	(*q)->need_author(i);
1086     }
1087   }
1088 }
1089 
same_author_last_name(const reference & r1,const reference & r2,int n)1090 int same_author_last_name(const reference &r1, const reference &r2, int n)
1091 {
1092   const char *ae1;
1093   const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1094   const char *ae2;
1095   const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1096   if (!as1 && !as2) return 1;	// they are the same
1097   if (!as1 || !as2) return 0;
1098   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1099 }
1100 
same_author_name(const reference & r1,const reference & r2,int n)1101 int same_author_name(const reference &r1, const reference &r2, int n)
1102 {
1103   const char *ae1;
1104   const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1105   const char *ae2;
1106   const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1107   if (!as1 && !as2) return 1;	// they are the same
1108   if (!as1 || !as2) return 0;
1109   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1110 }
1111 
1112 
set(int i)1113 void int_set::set(int i)
1114 {
1115   assert(i >= 0);
1116   int bytei = i >> 3;
1117   if (bytei >= v.length()) {
1118     int old_length = v.length();
1119     v.set_length(bytei + 1);
1120     for (int j = old_length; j <= bytei; j++)
1121       v[j] = 0;
1122   }
1123   v[bytei] |= 1 << (i & 7);
1124 }
1125 
get(int i)1126 int int_set::get(int i) const
1127 {
1128   assert(i >= 0);
1129   int bytei = i >> 3;
1130   return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1131 }
1132 
set_last_name_unambiguous(int i)1133 void reference::set_last_name_unambiguous(int i)
1134 {
1135   last_name_unambiguous.set(i);
1136 }
1137 
need_author(int n)1138 void reference::need_author(int n)
1139 {
1140   if (n > last_needed_author)
1141     last_needed_author = n;
1142 }
1143 
get_authors(const char ** end)1144 const char *reference::get_authors(const char **end) const
1145 {
1146   if (!computed_authors) {
1147     ((reference *)this)->computed_authors = 1;
1148     string &result = ((reference *)this)->authors;
1149     int na = get_nauthors();
1150     result.clear();
1151     for (int i = 0; i < na; i++) {
1152       if (last_name_unambiguous.get(i)) {
1153 	const char *e, *start = get_author_last_name(i, &e);
1154 	assert(start != 0);
1155 	result.append(start, e - start);
1156       }
1157       else {
1158 	const char *e, *start = get_author(i, &e);
1159 	assert(start != 0);
1160 	result.append(start, e - start);
1161       }
1162       if (i == last_needed_author
1163 	  && et_al.length() > 0
1164 	  && et_al_min_elide > 0
1165 	  && last_needed_author + et_al_min_elide < na
1166 	  && na >= et_al_min_total) {
1167 	result += et_al;
1168 	break;
1169       }
1170       if (i < na - 1) {
1171 	if (na == 2)
1172 	  result += join_authors_exactly_two;
1173 	else if (i < na - 2)
1174 	  result += join_authors_default;
1175 	else
1176 	  result += join_authors_last_two;
1177       }
1178     }
1179   }
1180   const char *start = authors.contents();
1181   *end = start + authors.length();
1182   return start;
1183 }
1184 
get_nauthors()1185 int reference::get_nauthors() const
1186 {
1187   if (nauthors < 0) {
1188     const char *dummy;
1189     int na;
1190     for (na = 0; get_author(na, &dummy) != 0; na++)
1191       ;
1192     ((reference *)this)->nauthors = na;
1193   }
1194   return nauthors;
1195 }
1196