1 #error "JIT code is no longer maintained -- cmdlist is almost as fast on ix86"
2 
3 //
4 //	JIT-compiled filter-running code.
5 //
6 //        Copyright (c) 2002-2003 Jim Peters <http://uazu.net/>.  This
7 //        file is released under the GNU Lesser General Public License
8 //        (LGPL) version 2.1 as published by the Free Software
9 //        Foundation.  See the file COPYING_LIB for details, or visit
10 //	  <http://www.fsf.org/licenses/licenses.html>.
11 //
12 //	The aim of this version of the filter-running code is to go as
13 //	fast as possible (without flattening the sub-filters together)
14 //	by generating the necessary code at run-time.
15 //
16 //	This runs the filter exactly as specified, without convolving
17 //	the sub-filters together or changing their order.  The only
18 //	rearrangement performed is making the IIR first coefficient
19 //	1.0, and gathering any lone 1-coefficient FIR filters together
20 //	into a single initial gain adjustment.  For this reason, the
21 //	routine runs fastest if IIR and FIR sub-filters are grouped
22 //	together in IIR/FIR pairs, as these can then share common
23 //	working buffers.
24 //
25 //	The generated code is cached, and is reused for more than one
26 //	filter if possible.  This means that a bank of 1000s of
27 //	filters of similar types will probably all end up sharing the
28 //	same generated routine, which improves processor cache and
29 //	memory usage.
30 //
31 //	Probably the generated code could be improved, but it is not
32 //	too bad.  Copying the buffer values using 'rep movsl' turned
33 //	out to be much faster than loading and storing the floating
34 //	point values individually whilst working through the buffer.
35 //
36 //	The generated code was tested for speed on a Celeron-900 and
37 //	on a Pentium-133.  It always beats the RF_CMDLIST option.  It
38 //	can be slightly slower than the RF_COMBINED option, but only
39 //	where that option gets a big advantage from flattening the
40 //	sub-filters.  For pre-flattened filters, it is faster.
41 //
42 //	The generated code can be dumped out at any point in .s format
43 //	using fid_run_dump().  This can be assembled using 'gas' and
44 //	then disassembled with 'objdump -d' to see all the generated
45 //	code.
46 //
47 //	Things that could be improved:
48 //
49 //	- Don't keep the fir running total on the stack at all times.
50 //	Instead create it at the first FIR operation.  This means
51 //	generating about 10 new special-case macros.  This would save
52 //	an add for every filter stage, and some of the messing around
53 //	at start and end currently done to set up / clean up this
54 //	value on the FP stack.
55 //
56 
57 typedef struct Routine Routine;
58 struct Routine {
59    Routine *nxt;	// Next in list, or 0
60    int ref;		// Reference count
61    int hash;		// Hash of routine
62    char *code;		// Routine itself
63    int len;		// Length of code in bytes
64 };
65 
66 typedef struct Run {
67    int magic;		// Magic: 0x64966325
68    int n_buf;		// Length of working buffer required in doubles
69    double *coef;	// Coefficient list
70    Routine *rout;	// Routine used
71 } Run;
72 
73 typedef struct RunBuf {
74    double *coef;	// Coefficient array
75    int mov_cnt;		// Number of 4-byte chunks to copy from &buf[1] to &buf[0]
76    double buf[0];	// Buffer itself
77 } RunBuf;
78 
79 static unsigned long int do_hash(unsigned char *, unsigned long int, unsigned long int);
80 #define HASH(p,len) ((int)do_hash((unsigned char *)p, (unsigned long int)len, 0))
81 
82 
83 //	Code generation
84 //
85 //	%edx is the working buffer pointer
86 //	%eax is the coefficient pointer
87 //	%ecx is the loop counter
88 //	floating point stack contains working values at the top, then
89 //	  previous buffer value, then running iir total, then running
90 //	  fir total
91 //
92 //	Codes in the add() string:
93 //
94 //	  %C  4-byte long value count for loop
95 //	  %L  Label -- remember this address for looping back to
96 //	  %R  1-byte relative jump back to %L address
97 //	  %D  1-byte relative address of buffer value.  If zero, this adjusts the
98 //		previous byte by ^=0x40 to make it a pure (%edx) form instead of 0(%edx)
99 //	  %D+ 1-byte relative address of buffer value as above, plus increment %edx
100 //		if we are getting close to the end of the range
101 //	  %A  1-byte relative address of coefficient value.  If zero does same as for %D.
102 //	  %A+ 1-byte relative address of coefficient value, plus %eax inc
103 //		if necessary
104 //	  %=  Insert code to update %edx and %eax to point to the given offsets
105 //
106 //	Startup code
107 //
108 //	  pushl %ebp
109 //	  movl %esp,%ebp
110 //	  movl 8(%ebp),%edx
111 //	  movl (%edx),%eax
112 //	  movl 4(%edx),%ecx
113 //	  fldz
114 //	  fldl 12(%ebp)
115 //	  fldl 8(%edx)
116 //	  fmull (%eax)
117 //	  leal 8(%edx),%edi
118 //	  leal 16(%edx),%esi
119 //	  cld
120 //        rep movsl
121 
122 #define STARTUP add("55 89E5 8B5508 8B02 8B4A04 D9EE DD450C DD4208 DC08 8D7A08 8D7210 FC F3A5")
123 
124 //	Return
125 //
126 //	  fstp %st(0)	// pop
127 //	  fstp %st(1)
128 //	  leave
129 //	  ret
130 
131 #define RETURN add("DDD8 DDD9 C9 C3")
132 
133 //	Looping
134 //
135 //	  movl $100,%ecx
136 //	.LXX
137 //	  ...
138 //	  loop .LXX
139 //
140 //	//WAS  decl %ecx
141 //	//WAS  testl %ecx,%ecx
142 //	//WAS  jg .LXX
143 
144 #define FOR(xx, nnd, nna) add("B9%C %= %L", xx, (nnd)*8, (nna)*8)
145 //WAS #define NEXT(nnd, nna) add("%= 49 85C9 7F%R", (nnd)*8, (nna)*8)
146 #define NEXT(nnd, nna) add("%= E2%R", (nnd)*8, (nna)*8)
147 
148 //	Fetching/storing buffer values
149 //
150 //	tmp= buf[n];
151 //	  fldl nn(%edx)
152 //
153 //	buf[nn]= iir;
154 //	  fld %st(1)
155 //	  fstpl nn(%edx)
156 
157 #define GETB(nn) add("DD42%D+", (nn)*8)
158 #define PUTB(nn) add("D9C1 DD5A%D+", (nn)*8)
159 
160 //	FIR element with following IIR element
161 //
162 //	fir -= 2 * tmp;
163 //	  fsub %st(0),%st(2)
164 //	  fsub %st(0),%st(2)
165 //	fir -= tmp;
166 //	  fsub %st(0),%st(2)
167 //	fir += tmp;
168 //	  fadd %st(0),%st(2)
169 //	fir += 2 * tmp;
170 //	  fadd %st(0),%st(2)
171 //	  fadd %st(0),%st(2)
172 //	fir += coef[nn] * tmp;
173 //	  fld %st(0)
174 //	  fmull nn(%eax)
175 //	  faddp %st(0),%st(3)
176 
177 #define FIRc_M2 add("DCEA DCEA")
178 #define FIRc_M1 add("DCEA")
179 #define FIRc_P1 add("DCC2")
180 #define FIRc_P2 add("DCC2 DCC2")
181 #define FIRc(nn) add("D9C0 DC48%A+ DEC3", (nn)*8)
182 
183 //	FIR element with no following IIR element
184 //
185 //	fir -= 2 * tmp;
186 //	  fsub %st(0),%st(2)
187 //	  fsubp %st(0),%st(2)
188 //	fir -= tmp;
189 //	  fsubp %st(0),%st(2)
190 //	fir += 0 * tmp;
191 //	  fstp %st(0),%st(0)	// Really I just want to pop the top value
192 //	fir += tmp;
193 //	  faddp %st(0),%st(2)
194 //	fir += 2 * tmp;
195 //	  fadd %st(0),%st(2)
196 //	  faddp %st(0),%st(2)
197 //	fir += coef[0] * tmp;
198 //	  fmull nn(%eax)
199 //	  faddp %st(0),%st(2)
200 
201 #define FIR_M2 add("DCEA DEEA")
202 #define FIR_M1 add("DEEA")
203 #define FIR_0 add("DDD8")
204 #define FIR_P1 add("DEC2")
205 #define FIR_P2 add("DCC2 DEC2")
206 #define FIR(nn) add("DC48%A+ DEC2", (nn)*8)
207 
208 //	IIR element
209 //
210 //	iir -= coef[nn] * tmp;
211 //	  fmull nn(%eax)
212 //	  fsubp %st(0),%st(1)
213 
214 #define IIR(nn) add("DC48%A+ DEE9", (nn)*8)
215 
216 //	Final FIR element of pure-FIR or mixed FIR-IIR stage
217 //
218 //	iir= fir + coef[nn] * iir; fir= 0;
219 //	  fxch
220 //	  fmull nn(%eax)
221 //	  faddp %st(2)
222 //	  fldz
223 //	  fstp %st(3)
224 //	iir= fir + 1.0 * iir; fir= 0;
225 //	  fxch
226 //	  faddp %st(2)
227 //	  fldz
228 //	  fstp %st(3)
229 //	iir= fir - 1.0 * iir; fir= 0;
230 //	  fxch
231 //	  fsubp %st(2)
232 //	  fldz
233 //	  fstp %st(3)
234 
235 #define FIREND(nn) add("D9C9 DC48%A+ DEC2 D9EE DDDB", (nn)*8)
236 #define FIREND_P1 add("D9C9 DEC2 D9EE DDDB")
237 #define FIREND_M1 add("D9C9 DEEA D9EE DDDB")
238 
239 //
240 //	Globals for handling routines
241 //
242 
243 static char *r_buf;	// Buffer address
244 static char *r_end;	// Current end of buffer
245 static char *r_cp;	// Current write-position
246 static char *r_lab;	// Current loop-back label, or 0
247 static int r_loop;	// Loop count
248 static int r_edx;	// %edx offset relative to initial position
249 static int r_eax;	// %eax offset relative to initial position
250 static Routine *r_list;	// List of routines or 0
251 
252 //
253 //	Add code to the current routine.  This uses global variables,
254 //	and so is not thread-safe.
255 //
256 
257 static void
add(char * fmt,...)258 add(char *fmt, ...) {
259    va_list ap;
260    int ch, val;
261    va_start(ap, fmt);
262 
263    if (r_end - r_cp < 32)
264       error("JIT error: routine buffer exceeded");
265 
266    while ((ch= *fmt++)) {
267       if (isspace(ch)) continue;
268       if (isdigit(ch) || (ch >= 'A' && ch <= 'F')) {
269 	 val= ch >= 'A' ? ch - 'A' + 10 : ch - '0';
270 	 ch= *fmt++;
271 	 if (!isdigit(ch) && !(ch >= 'A' && ch <= 'F'))
272 	    error("JIT error: Bad format for add() routine");
273 	 val= (val*16) + (ch >= 'A' ? ch - 'A' + 10 : ch - '0');
274 	 *r_cp++= val;
275 	 continue;
276       }
277       if (ch != '%')
278 	 error("JIT error: add() routine bad format string");
279       switch (ch= *fmt++) {
280        case 'C':
281 	  val= va_arg(ap, int);
282 	  r_loop= val;
283 	  *r_cp++= val;
284 	  *r_cp++= val>>8;
285 	  *r_cp++= val>>16;
286 	  *r_cp++= val>>24;
287 	  break;
288        case 'L':
289 	  if (r_lab) error("JIT error: two stacked %L formats");
290 	  r_lab= r_cp;
291 	  break;
292        case 'R':
293 	  if (!r_lab) error("JIT error: %R without matching %L");
294 	  val= r_lab - (r_cp+1);
295 	  if (val < -128) error("JIT error: %R too far from %L");
296 	  *r_cp++= val;
297 	  r_lab= 0;
298 	  break;
299        case 'D':
300 	  val= va_arg(ap, int) - r_edx;
301 	  if (val < -128 || val >= 128) error("JIT error: %%edx offset out of range");
302 	  if (val == 0)
303 	     r_cp[-1] ^= 0x40;
304 	  else
305 	     *r_cp++= val;
306 	  if (*fmt == '+') {
307 	     fmt++;
308 	     if (val >= 120) {
309 		*r_cp++= 0x83;	// addl $120,%edx
310 		*r_cp++= 0xC2;
311 		*r_cp++= 0x78;
312 		r_edx += 120;
313 	     }
314 	  }
315 	  break;
316        case 'A':
317 	  val= va_arg(ap, int) - r_eax;
318 	  if (val < -128 || val >= 128) error("JIT error: %%eax offset out of range");
319 	  if (val == 0)
320 	     r_cp[-1] ^= 0x40;
321 	  else
322 	     *r_cp++= val;
323 	  if (*fmt == '+') {
324 	     fmt++;
325 	     if (val >= 120) {
326 		*r_cp++= 0x83;	// addl $120,%eax
327 		*r_cp++= 0xC0;
328 		*r_cp++= 0x78;
329 		r_eax += 120;
330 	     }
331 	  }
332 	  break;
333        case '=':
334 	  val= va_arg(ap, int) - r_edx;
335 	  if (val != 0) {
336 	     if (val < -128 || val >= 128)
337 		error("JIT error: %%= adjust for %%edx is out of range");
338 	     *r_cp++= 0x83;	// addl $120,%edx
339 	     *r_cp++= 0xC2;
340 	     *r_cp++= val;
341 	     r_edx += val * (r_lab ? r_loop : 1);
342 	  }
343 	  val= va_arg(ap, int) - r_eax;
344 	  if (val != 0) {
345 	     if (val < -128 || val >= 128)
346 		error("JIT error: %%= adjust for %%edx is out of range");
347 	     *r_cp++= 0x83;	// addl $120,%edx
348 	     *r_cp++= 0xC0;
349 	     *r_cp++= val;
350 	     r_eax += val * (r_lab ? r_loop : 1);
351 	  }
352 	  break;
353        default:
354 	  error("JIT error: bad format for add()");
355       }
356    }
357 }
358 
359 
360 //
361 //	Create an instance of a filter, ready to run.  This returns a
362 //	void* handle, and a JIT-compiled function to call to execute
363 //	the filter.  (The functions are cached, so if several versions
364 //	of the same filter are generated with different parameters, it
365 //	is likely that the same routine will end up servicing all of
366 //	them).
367 //
368 //	Working buffers for the filter instances must be allocated
369 //	separately using fid_run_newbuf().  This allows many
370 //	simultaneous instances of the same filter to be run.
371 //
372 //	The sub-filters are executed in the precise order that they
373 //	are given.  This may lead to some inefficiency, because
374 //	normally when an IIR filter is followed by an FIR filter, the
375 //	buffers can be shared.  So, if the sub-filters are not in
376 //	IIR/FIR pairs, then extra memory accesses are required.
377 //
378 //	In any case, factors are extracted from IIR filters (so that
379 //	the first coefficient is 1), and single-element FIR filters
380 //	are merged into the global gain factor, and are ignored.
381 //
382 //	The returned handle must be released using fid_run_free().
383 //
384 
385 // Loop the generated code above LOOP repeats (8)
386 #define LOOP 8
387 
388 void *
fid_run_new(FidFilter * filt,double (** funcpp)(void *,double))389 fid_run_new(FidFilter *filt, double (**funcpp)(void *,double)) {
390    FidFilter *ff;
391    double *dp;
392    double gain= 1.0;
393    int a, val;
394    double *coef_tmp;
395    char *rout_tmp;
396    int coef_cnt, coef_max;
397    int rout_cnt, rout_max;
398    int filt_cnt= 0;
399    Run *rr;
400    int o_buf= 1;	// Current offset into working buffer
401    int o_coef= 1;	// Current offset into coefficient array
402    int hash;
403    Routine *rout;
404 
405    for (ff= filt; ff->len; ff= FFNEXT(ff))
406       filt_cnt += ff->len;
407 
408    // Allocate rough worst-case sizes for temporary arrays
409    coef_tmp= ALLOC_ARR(coef_max= filt_cnt+1, double);
410    rout_tmp= ALLOC_ARR(rout_max= filt_cnt * 16 + 20 + 32, char);
411    dp= coef_tmp+o_coef;	// Leave space to put gain back in later
412 
413    // Setup JIT globals
414    r_buf= rout_tmp;
415    r_end= rout_tmp + rout_max;
416    r_cp= r_buf;
417    r_lab= 0;
418    r_loop= 0;
419    r_edx= 0;
420    r_eax= 0;
421 
422    STARTUP;	// Setup iir/fir running totals on stack, apply gain
423 
424    // Generate command and coefficient lists
425    while (filt->len) {
426       int n_iir, n_fir, cnt;
427       double *iir, *fir;
428       double adj;
429       if (filt->typ == 'F' && filt->len == 1) {
430 	 gain *= filt->val[0];
431 	 filt= FFNEXT(filt);
432 	 continue;
433       }
434       if (filt->typ == 'F') {
435 	 iir= 0; n_iir= 0;
436 	 fir= filt->val; n_fir= filt->len;
437 	 filt= FFNEXT(filt);
438       } else if (filt->typ == 'I') {
439          iir= filt->val; n_iir= filt->len;
440          fir= 0; n_fir= 0;
441          filt= FFNEXT(filt);
442          while (filt->typ == 'F' && filt->len == 1) {
443             gain *= filt->val[0];
444             filt= FFNEXT(filt);
445          }
446          if (filt->typ == 'F') {
447             fir= filt->val; n_fir= filt->len;
448             filt= FFNEXT(filt);
449          }
450       } else
451          error("Internal error: fid_run_new can only handle IIR + FIR types");
452 
453       // Okay, we now have an IIR/FIR pair to process, possibly with
454       // n_iir or n_fir == 0 if one half is missing
455       cnt= n_iir > n_fir ? n_iir : n_fir;
456       if (n_iir) {
457 	 adj= 1.0 / iir[0];
458 	 gain *= adj;
459       }
460 
461       // Sort out any trailing IIR coefficients where there are more
462       // IIR than FIR
463       if (cnt > n_fir) {
464 	 a= cnt - (n_fir ? n_fir : 1);
465 	 if (a >= LOOP) {
466 	    FOR(a, o_buf, o_coef);
467 	    IIR(o_coef); o_coef++;
468 	    GETB(o_buf); o_buf++;
469 	    NEXT(o_buf, o_coef);
470 	    o_buf += (a-1);
471 	    o_coef += (a-1);
472 	    while (a-- > 0) *dp++= iir[--cnt] * adj;
473 	 } else while (a-- > 0) {
474 	    *dp++= iir[--cnt] * adj;
475 	    IIR(o_coef); o_coef++;
476 	    GETB(o_buf); o_buf++;
477 	 }
478       }
479 
480       // Sort out any trailing FIR coefficients where there are more
481       // FIR than IIR
482       if (cnt > n_iir) {
483 	 a= cnt - (n_iir ? n_iir : 1);
484 	 if (a >= LOOP) {
485 	    FOR(a, o_buf, o_coef);
486 	    FIR(o_coef); o_coef++;
487 	    GETB(o_buf); o_buf++;
488 	    NEXT(o_buf, o_coef);
489 	    o_buf += (a-1);
490 	    o_coef += (a-1);
491 	    while (a-- > 0) *dp++= fir[--cnt];
492 	 } else while (a-- > 0) {
493 	    val= fir[--cnt];
494 	    if (val == -2.0) FIR_M2;
495 	    else if (val == -1.0) FIR_M1;
496 	    else if (val == 0.0) FIR_0;
497 	    else if (val == 1.0) FIR_P1;
498 	    else if (val == 2.0) FIR_P2;
499 	    else { *dp++= val; FIR(o_coef); o_coef++; }
500 	    GETB(o_buf); o_buf++;
501 	 }
502       }
503 
504       // Sort out any common IIR/FIR coefficients remaining
505       if (cnt > 1) {
506 	 a= cnt - 1;
507 	 if (a >= LOOP) {
508 	    FOR(a, o_buf, o_coef);
509 	    FIRc(o_coef); o_coef++;
510 	    IIR(o_coef); o_coef++;
511 	    GETB(o_buf); o_buf++;
512 	    NEXT(o_buf, o_coef);
513 	    o_buf += (a-1);
514 	    o_coef += 2 * (a-1);
515 	    while (a-- > 0) {
516 	       *dp++= fir[--cnt] * adj;
517 	       *dp++= iir[cnt] * adj;
518 	    }
519 	 } else while (a-- > 0) {
520 	    val= fir[--cnt];
521 	    if (val == -2.0) FIRc_M2;
522 	    else if (val == -1.0) FIRc_M1;
523 	    else if (val == 0.0) ;
524 	    else if (val == 1.0) FIRc_P1;
525 	    else if (val == 2.0) FIRc_P2;
526 	    else { *dp++= val; FIRc(o_coef); o_coef++; }
527 
528 	    *dp++= iir[cnt] * adj;
529 	    IIR(o_coef); o_coef++;
530 	    GETB(o_buf); o_buf++;
531 	 }
532       }
533 
534       // Handle the final element, according to whether there was any
535       // FIR activity in this filter stage
536       PUTB(o_buf-1);
537       if (n_fir) {
538 	 if (fir[0] == 1.0) { FIREND_P1; }
539 	 else if (fir[0] == -1.0) { FIREND_M1; }
540 	 else { *dp++= fir[0]; FIREND(o_coef); o_coef++; }
541       }
542    }
543 
544    coef_tmp[0]= gain;
545    RETURN;
546 
547    // Sanity checks
548    coef_cnt= dp-coef_tmp;
549    rout_cnt= r_cp-r_buf;
550    if (coef_cnt > coef_max ||
551        rout_cnt > rout_max)
552       error("fid_run_new internal error; arrays exceeded");
553 
554    // Now generate a hash of the code we've created, and see if we've
555    // already got a cached version of that routine
556    hash= HASH(rout_tmp, rout_cnt);
557    for (rout= r_list; rout; rout= rout->nxt) {
558       if (rout->hash == hash &&
559 	  rout->len == rout_cnt &&
560 	  0 == memcmp(rout->code, rout_tmp, rout_cnt))
561 	 break;
562    }
563    if (!rout) {
564       rout= Alloc(sizeof(Routine) + rout_cnt);
565       rout->nxt= r_list; r_list= rout;
566       rout->ref= 0;
567       rout->hash= hash;
568       rout->code= (char*)(rout+1);
569       rout->len= rout_cnt;
570       memcpy(rout->code, rout_tmp, rout_cnt);
571       // Maybe flush caches at this point on processors other than x86
572    }
573    free(rout_tmp);
574 
575    // Allocate the final Run structure to return
576    rr= (Run*)Alloc(sizeof(Run) +
577 		   coef_cnt*sizeof(double));
578    rr->magic= 0x64966325;
579    rr->n_buf= o_buf;
580    rr->coef= (double*)(rr+1);
581    memcpy(rr->coef, coef_tmp, coef_cnt*sizeof(double));
582    rr->rout= rout;
583    rout->ref++;
584 
585    free(coef_tmp);
586 
587    *funcpp= (void*)rout->code;
588    return rr;
589 }
590 
591 //
592 //	Create a new instance of the given filter
593 //
594 
595 void *
fid_run_newbuf(void * run)596 fid_run_newbuf(void *run) {
597    Run *rr= run;
598    RunBuf *rb;
599 
600    if (rr->magic != 0x64966325)
601       error("Bad handle passed to fid_run_newbuf()");
602 
603    rb= (RunBuf*)ALLOC_ARR(rr->n_buf, double);
604    rb->coef= rr->coef;
605    rb->mov_cnt= (rr->n_buf-1) * sizeof(double) / 4;
606 
607    return rb;
608 }
609 
610 //
611 //	Delete an instance
612 //
613 
614 void
fid_run_freebuf(void * runbuf)615 fid_run_freebuf(void *runbuf) {
616    free(runbuf);
617 }
618 
619 //
620 //	Delete the filter
621 //
622 
623 void
fid_run_free(void * run)624 fid_run_free(void *run) {
625    Routine *rout= ((Run*)run)->rout;
626    rout->ref--;
627    if (!rout->ref) {
628       // Delete the routine out of the cache
629       Routine *p, **prvp;
630       for (prvp= &r_list; (p= *prvp); prvp= &p->nxt)
631 	 if (p == rout) {
632 	    *prvp= p->nxt;
633 	    break;
634 	 }
635       free(rout);
636    }
637    free(run);
638 }
639 
640 //
641 //	Dump all the routines in memory
642 //
643 
644 void
fid_run_dump(FILE * out)645 fid_run_dump(FILE *out) {
646    Routine *rr;
647    int a, cnt= 0;
648    fprintf(out,
649 	   "	.file	\"fid_run_dump.s\"\n"
650 	   "	.version	\"01.01\"\n"
651 	   ".text\n"
652 	   "	.align 4\n");
653    for (rr= r_list; rr; rr= rr->nxt, cnt++) {
654       fprintf(out,
655 	      ".globl	process_%d\n"
656 	      "	.type	process_%d,@function\n"
657 	      "process_%d:\n",
658 	      cnt, cnt, cnt);
659       for (a= 0; a<rr->len; a++)
660 	 fprintf(out, "	.byte 0x%02X\n", 255&rr->code[a]);
661       fprintf(out,
662 	      ".Lfe1%d:\n"
663 	      "	.size	process_%d,.Lfe1%d-process_%d\n",
664 	      cnt, cnt, cnt, cnt);
665    }
666 }
667 
668 
669 //
670 //	Hashing function.  Overkill for this job, but might as well
671 //	use a good one as it's available.  See below for credits.
672 //
673 
674 typedef  unsigned long  int  ub4;   /* unsigned 4-byte quantities */
675 typedef  unsigned       char ub1;   /* unsigned 1-byte quantities */
676 
677 #define hashsize(n) ((ub4)1<<(n))
678 #define hashmask(n) (hashsize(n)-1)
679 
680 /*
681 --------------------------------------------------------------------
682 mix -- mix 3 32-bit values reversibly.
683 For every delta with one or two bits set, and the deltas of all three
684   high bits or all three low bits, whether the original value of a,b,c
685   is almost all zero or is uniformly distributed,
686   * If mix() is run forward or backward, at least 32 bits in a,b,c
687   have at least 1/4 probability of changing.
688   * If mix() is run forward, every bit of c will change between 1/3 and
689   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
690 mix() was built out of 36 single-cycle latency instructions in a
691   structure that could supported 2x parallelism, like so:
692       a -= b;
693       a -= c; x = (c>>13);
694       b -= c; a ^= x;
695       b -= a; x = (a<<8);
696       c -= a; b ^= x;
697       c -= b; x = (b>>13);
698       ...
699   Unfortunately, superscalar Pentiums and Sparcs can't take advantage
700   of that parallelism.  They've also turned some of those single-cycle
701   latency instructions into multi-cycle latency instructions.  Still,
702   this is the fastest good hash I could find.  There were about 2^^68
703   to choose from.  I only looked at a billion or so.
704 --------------------------------------------------------------------
705 */
706 #define mix(a,b,c) \
707 { \
708   a -= b; a -= c; a ^= (c>>13); \
709   b -= c; b -= a; b ^= (a<<8); \
710   c -= a; c -= b; c ^= (b>>13); \
711   a -= b; a -= c; a ^= (c>>12);  \
712   b -= c; b -= a; b ^= (a<<16); \
713   c -= a; c -= b; c ^= (b>>5); \
714   a -= b; a -= c; a ^= (c>>3);  \
715   b -= c; b -= a; b ^= (a<<10); \
716   c -= a; c -= b; c ^= (b>>15); \
717 }
718 
719 /*
720 --------------------------------------------------------------------
721 hash() -- hash a variable-length key into a 32-bit value
722   k       : the key (the unaligned variable-length array of bytes)
723   len     : the length of the key, counting by bytes
724   initval : can be any 4-byte value
725 Returns a 32-bit value.  Every bit of the key affects every bit of
726 the return value.  Every 1-bit and 2-bit delta achieves avalanche.
727 About 6*len+35 instructions.
728 
729 The best hash table sizes are powers of 2.  There is no need to do
730 mod a prime (mod is sooo slow!).  If you need less than 32 bits,
731 use a bitmask.  For example, if you need only 10 bits, do
732   h = (h & hashmask(10));
733 In which case, the hash table should have hashsize(10) elements.
734 
735 If you are hashing n strings (ub1 **)k, do it like this:
736   for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
737 
738 By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
739 code any way you wish, private, educational, or commercial.  It's free.
740 
741 See http://burtleburtle.net/bob/hash/evahash.html
742 Use for hash table lookup, or anything where one collision in 2^^32 is
743 acceptable.  Do NOT use for cryptographic purposes.
744 --------------------------------------------------------------------
745 */
746 
747 static ub4
do_hash(register ub1 * k,register ub4 length,register ub4 initval)748 do_hash(register ub1 *k,        /* the key */
749 	register ub4  length,   /* the length of the key */
750 	register ub4  initval)  /* the previous hash, or an arbitrary value */
751 {
752    register ub4 a,b,c,len;
753 
754    /* Set up the internal state */
755    len = length;
756    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
757    c = initval;         /* the previous hash value */
758 
759    /*---------------------------------------- handle most of the key */
760    while (len >= 12)
761       {
762 	 a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
763 	 b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
764 	 c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
765 	 mix(a,b,c);
766 	 k += 12; len -= 12;
767       }
768 
769    /*------------------------------------- handle the last 11 bytes */
770    c += length;
771    switch(len)              /* all the case statements fall through */
772       {
773        case 11: c+=((ub4)k[10]<<24);
774        case 10: c+=((ub4)k[9]<<16);
775        case 9 : c+=((ub4)k[8]<<8);
776 	  /* the first byte of c is reserved for the length */
777        case 8 : b+=((ub4)k[7]<<24);
778        case 7 : b+=((ub4)k[6]<<16);
779        case 6 : b+=((ub4)k[5]<<8);
780        case 5 : b+=k[4];
781        case 4 : a+=((ub4)k[3]<<24);
782        case 3 : a+=((ub4)k[2]<<16);
783        case 2 : a+=((ub4)k[1]<<8);
784        case 1 : a+=k[0];
785 	  /* case 0: nothing left to add */
786       }
787    mix(a,b,c);
788    /*-------------------------------------------- report the result */
789    return c;
790 }
791 
792 // END //
793