1 #error "JIT code is no longer maintained -- cmdlist is almost as fast on ix86"
2
3 //
4 // JIT-compiled filter-running code.
5 //
6 // Copyright (c) 2002-2003 Jim Peters <http://uazu.net/>. This
7 // file is released under the GNU Lesser General Public License
8 // (LGPL) version 2.1 as published by the Free Software
9 // Foundation. See the file COPYING_LIB for details, or visit
10 // <http://www.fsf.org/licenses/licenses.html>.
11 //
12 // The aim of this version of the filter-running code is to go as
13 // fast as possible (without flattening the sub-filters together)
14 // by generating the necessary code at run-time.
15 //
16 // This runs the filter exactly as specified, without convolving
17 // the sub-filters together or changing their order. The only
18 // rearrangement performed is making the IIR first coefficient
19 // 1.0, and gathering any lone 1-coefficient FIR filters together
20 // into a single initial gain adjustment. For this reason, the
21 // routine runs fastest if IIR and FIR sub-filters are grouped
22 // together in IIR/FIR pairs, as these can then share common
23 // working buffers.
24 //
25 // The generated code is cached, and is reused for more than one
26 // filter if possible. This means that a bank of 1000s of
27 // filters of similar types will probably all end up sharing the
28 // same generated routine, which improves processor cache and
29 // memory usage.
30 //
31 // Probably the generated code could be improved, but it is not
32 // too bad. Copying the buffer values using 'rep movsl' turned
33 // out to be much faster than loading and storing the floating
34 // point values individually whilst working through the buffer.
35 //
36 // The generated code was tested for speed on a Celeron-900 and
37 // on a Pentium-133. It always beats the RF_CMDLIST option. It
38 // can be slightly slower than the RF_COMBINED option, but only
39 // where that option gets a big advantage from flattening the
40 // sub-filters. For pre-flattened filters, it is faster.
41 //
42 // The generated code can be dumped out at any point in .s format
43 // using fid_run_dump(). This can be assembled using 'gas' and
44 // then disassembled with 'objdump -d' to see all the generated
45 // code.
46 //
47 // Things that could be improved:
48 //
49 // - Don't keep the fir running total on the stack at all times.
50 // Instead create it at the first FIR operation. This means
51 // generating about 10 new special-case macros. This would save
52 // an add for every filter stage, and some of the messing around
53 // at start and end currently done to set up / clean up this
54 // value on the FP stack.
55 //
56
57 typedef struct Routine Routine;
58 struct Routine {
59 Routine *nxt; // Next in list, or 0
60 int ref; // Reference count
61 int hash; // Hash of routine
62 char *code; // Routine itself
63 int len; // Length of code in bytes
64 };
65
66 typedef struct Run {
67 int magic; // Magic: 0x64966325
68 int n_buf; // Length of working buffer required in doubles
69 double *coef; // Coefficient list
70 Routine *rout; // Routine used
71 } Run;
72
73 typedef struct RunBuf {
74 double *coef; // Coefficient array
75 int mov_cnt; // Number of 4-byte chunks to copy from &buf[1] to &buf[0]
76 double buf[0]; // Buffer itself
77 } RunBuf;
78
79 static unsigned long int do_hash(unsigned char *, unsigned long int, unsigned long int);
80 #define HASH(p,len) ((int)do_hash((unsigned char *)p, (unsigned long int)len, 0))
81
82
83 // Code generation
84 //
85 // %edx is the working buffer pointer
86 // %eax is the coefficient pointer
87 // %ecx is the loop counter
88 // floating point stack contains working values at the top, then
89 // previous buffer value, then running iir total, then running
90 // fir total
91 //
92 // Codes in the add() string:
93 //
94 // %C 4-byte long value count for loop
95 // %L Label -- remember this address for looping back to
96 // %R 1-byte relative jump back to %L address
97 // %D 1-byte relative address of buffer value. If zero, this adjusts the
98 // previous byte by ^=0x40 to make it a pure (%edx) form instead of 0(%edx)
99 // %D+ 1-byte relative address of buffer value as above, plus increment %edx
100 // if we are getting close to the end of the range
101 // %A 1-byte relative address of coefficient value. If zero does same as for %D.
102 // %A+ 1-byte relative address of coefficient value, plus %eax inc
103 // if necessary
104 // %= Insert code to update %edx and %eax to point to the given offsets
105 //
106 // Startup code
107 //
108 // pushl %ebp
109 // movl %esp,%ebp
110 // movl 8(%ebp),%edx
111 // movl (%edx),%eax
112 // movl 4(%edx),%ecx
113 // fldz
114 // fldl 12(%ebp)
115 // fldl 8(%edx)
116 // fmull (%eax)
117 // leal 8(%edx),%edi
118 // leal 16(%edx),%esi
119 // cld
120 // rep movsl
121
122 #define STARTUP add("55 89E5 8B5508 8B02 8B4A04 D9EE DD450C DD4208 DC08 8D7A08 8D7210 FC F3A5")
123
124 // Return
125 //
126 // fstp %st(0) // pop
127 // fstp %st(1)
128 // leave
129 // ret
130
131 #define RETURN add("DDD8 DDD9 C9 C3")
132
133 // Looping
134 //
135 // movl $100,%ecx
136 // .LXX
137 // ...
138 // loop .LXX
139 //
140 // //WAS decl %ecx
141 // //WAS testl %ecx,%ecx
142 // //WAS jg .LXX
143
144 #define FOR(xx, nnd, nna) add("B9%C %= %L", xx, (nnd)*8, (nna)*8)
145 //WAS #define NEXT(nnd, nna) add("%= 49 85C9 7F%R", (nnd)*8, (nna)*8)
146 #define NEXT(nnd, nna) add("%= E2%R", (nnd)*8, (nna)*8)
147
148 // Fetching/storing buffer values
149 //
150 // tmp= buf[n];
151 // fldl nn(%edx)
152 //
153 // buf[nn]= iir;
154 // fld %st(1)
155 // fstpl nn(%edx)
156
157 #define GETB(nn) add("DD42%D+", (nn)*8)
158 #define PUTB(nn) add("D9C1 DD5A%D+", (nn)*8)
159
160 // FIR element with following IIR element
161 //
162 // fir -= 2 * tmp;
163 // fsub %st(0),%st(2)
164 // fsub %st(0),%st(2)
165 // fir -= tmp;
166 // fsub %st(0),%st(2)
167 // fir += tmp;
168 // fadd %st(0),%st(2)
169 // fir += 2 * tmp;
170 // fadd %st(0),%st(2)
171 // fadd %st(0),%st(2)
172 // fir += coef[nn] * tmp;
173 // fld %st(0)
174 // fmull nn(%eax)
175 // faddp %st(0),%st(3)
176
177 #define FIRc_M2 add("DCEA DCEA")
178 #define FIRc_M1 add("DCEA")
179 #define FIRc_P1 add("DCC2")
180 #define FIRc_P2 add("DCC2 DCC2")
181 #define FIRc(nn) add("D9C0 DC48%A+ DEC3", (nn)*8)
182
183 // FIR element with no following IIR element
184 //
185 // fir -= 2 * tmp;
186 // fsub %st(0),%st(2)
187 // fsubp %st(0),%st(2)
188 // fir -= tmp;
189 // fsubp %st(0),%st(2)
190 // fir += 0 * tmp;
191 // fstp %st(0),%st(0) // Really I just want to pop the top value
192 // fir += tmp;
193 // faddp %st(0),%st(2)
194 // fir += 2 * tmp;
195 // fadd %st(0),%st(2)
196 // faddp %st(0),%st(2)
197 // fir += coef[0] * tmp;
198 // fmull nn(%eax)
199 // faddp %st(0),%st(2)
200
201 #define FIR_M2 add("DCEA DEEA")
202 #define FIR_M1 add("DEEA")
203 #define FIR_0 add("DDD8")
204 #define FIR_P1 add("DEC2")
205 #define FIR_P2 add("DCC2 DEC2")
206 #define FIR(nn) add("DC48%A+ DEC2", (nn)*8)
207
208 // IIR element
209 //
210 // iir -= coef[nn] * tmp;
211 // fmull nn(%eax)
212 // fsubp %st(0),%st(1)
213
214 #define IIR(nn) add("DC48%A+ DEE9", (nn)*8)
215
216 // Final FIR element of pure-FIR or mixed FIR-IIR stage
217 //
218 // iir= fir + coef[nn] * iir; fir= 0;
219 // fxch
220 // fmull nn(%eax)
221 // faddp %st(2)
222 // fldz
223 // fstp %st(3)
224 // iir= fir + 1.0 * iir; fir= 0;
225 // fxch
226 // faddp %st(2)
227 // fldz
228 // fstp %st(3)
229 // iir= fir - 1.0 * iir; fir= 0;
230 // fxch
231 // fsubp %st(2)
232 // fldz
233 // fstp %st(3)
234
235 #define FIREND(nn) add("D9C9 DC48%A+ DEC2 D9EE DDDB", (nn)*8)
236 #define FIREND_P1 add("D9C9 DEC2 D9EE DDDB")
237 #define FIREND_M1 add("D9C9 DEEA D9EE DDDB")
238
239 //
240 // Globals for handling routines
241 //
242
243 static char *r_buf; // Buffer address
244 static char *r_end; // Current end of buffer
245 static char *r_cp; // Current write-position
246 static char *r_lab; // Current loop-back label, or 0
247 static int r_loop; // Loop count
248 static int r_edx; // %edx offset relative to initial position
249 static int r_eax; // %eax offset relative to initial position
250 static Routine *r_list; // List of routines or 0
251
252 //
253 // Add code to the current routine. This uses global variables,
254 // and so is not thread-safe.
255 //
256
257 static void
add(char * fmt,...)258 add(char *fmt, ...) {
259 va_list ap;
260 int ch, val;
261 va_start(ap, fmt);
262
263 if (r_end - r_cp < 32)
264 error("JIT error: routine buffer exceeded");
265
266 while ((ch= *fmt++)) {
267 if (isspace(ch)) continue;
268 if (isdigit(ch) || (ch >= 'A' && ch <= 'F')) {
269 val= ch >= 'A' ? ch - 'A' + 10 : ch - '0';
270 ch= *fmt++;
271 if (!isdigit(ch) && !(ch >= 'A' && ch <= 'F'))
272 error("JIT error: Bad format for add() routine");
273 val= (val*16) + (ch >= 'A' ? ch - 'A' + 10 : ch - '0');
274 *r_cp++= val;
275 continue;
276 }
277 if (ch != '%')
278 error("JIT error: add() routine bad format string");
279 switch (ch= *fmt++) {
280 case 'C':
281 val= va_arg(ap, int);
282 r_loop= val;
283 *r_cp++= val;
284 *r_cp++= val>>8;
285 *r_cp++= val>>16;
286 *r_cp++= val>>24;
287 break;
288 case 'L':
289 if (r_lab) error("JIT error: two stacked %L formats");
290 r_lab= r_cp;
291 break;
292 case 'R':
293 if (!r_lab) error("JIT error: %R without matching %L");
294 val= r_lab - (r_cp+1);
295 if (val < -128) error("JIT error: %R too far from %L");
296 *r_cp++= val;
297 r_lab= 0;
298 break;
299 case 'D':
300 val= va_arg(ap, int) - r_edx;
301 if (val < -128 || val >= 128) error("JIT error: %%edx offset out of range");
302 if (val == 0)
303 r_cp[-1] ^= 0x40;
304 else
305 *r_cp++= val;
306 if (*fmt == '+') {
307 fmt++;
308 if (val >= 120) {
309 *r_cp++= 0x83; // addl $120,%edx
310 *r_cp++= 0xC2;
311 *r_cp++= 0x78;
312 r_edx += 120;
313 }
314 }
315 break;
316 case 'A':
317 val= va_arg(ap, int) - r_eax;
318 if (val < -128 || val >= 128) error("JIT error: %%eax offset out of range");
319 if (val == 0)
320 r_cp[-1] ^= 0x40;
321 else
322 *r_cp++= val;
323 if (*fmt == '+') {
324 fmt++;
325 if (val >= 120) {
326 *r_cp++= 0x83; // addl $120,%eax
327 *r_cp++= 0xC0;
328 *r_cp++= 0x78;
329 r_eax += 120;
330 }
331 }
332 break;
333 case '=':
334 val= va_arg(ap, int) - r_edx;
335 if (val != 0) {
336 if (val < -128 || val >= 128)
337 error("JIT error: %%= adjust for %%edx is out of range");
338 *r_cp++= 0x83; // addl $120,%edx
339 *r_cp++= 0xC2;
340 *r_cp++= val;
341 r_edx += val * (r_lab ? r_loop : 1);
342 }
343 val= va_arg(ap, int) - r_eax;
344 if (val != 0) {
345 if (val < -128 || val >= 128)
346 error("JIT error: %%= adjust for %%edx is out of range");
347 *r_cp++= 0x83; // addl $120,%edx
348 *r_cp++= 0xC0;
349 *r_cp++= val;
350 r_eax += val * (r_lab ? r_loop : 1);
351 }
352 break;
353 default:
354 error("JIT error: bad format for add()");
355 }
356 }
357 }
358
359
360 //
361 // Create an instance of a filter, ready to run. This returns a
362 // void* handle, and a JIT-compiled function to call to execute
363 // the filter. (The functions are cached, so if several versions
364 // of the same filter are generated with different parameters, it
365 // is likely that the same routine will end up servicing all of
366 // them).
367 //
368 // Working buffers for the filter instances must be allocated
369 // separately using fid_run_newbuf(). This allows many
370 // simultaneous instances of the same filter to be run.
371 //
372 // The sub-filters are executed in the precise order that they
373 // are given. This may lead to some inefficiency, because
374 // normally when an IIR filter is followed by an FIR filter, the
375 // buffers can be shared. So, if the sub-filters are not in
376 // IIR/FIR pairs, then extra memory accesses are required.
377 //
378 // In any case, factors are extracted from IIR filters (so that
379 // the first coefficient is 1), and single-element FIR filters
380 // are merged into the global gain factor, and are ignored.
381 //
382 // The returned handle must be released using fid_run_free().
383 //
384
385 // Loop the generated code above LOOP repeats (8)
386 #define LOOP 8
387
388 void *
fid_run_new(FidFilter * filt,double (** funcpp)(void *,double))389 fid_run_new(FidFilter *filt, double (**funcpp)(void *,double)) {
390 FidFilter *ff;
391 double *dp;
392 double gain= 1.0;
393 int a, val;
394 double *coef_tmp;
395 char *rout_tmp;
396 int coef_cnt, coef_max;
397 int rout_cnt, rout_max;
398 int filt_cnt= 0;
399 Run *rr;
400 int o_buf= 1; // Current offset into working buffer
401 int o_coef= 1; // Current offset into coefficient array
402 int hash;
403 Routine *rout;
404
405 for (ff= filt; ff->len; ff= FFNEXT(ff))
406 filt_cnt += ff->len;
407
408 // Allocate rough worst-case sizes for temporary arrays
409 coef_tmp= ALLOC_ARR(coef_max= filt_cnt+1, double);
410 rout_tmp= ALLOC_ARR(rout_max= filt_cnt * 16 + 20 + 32, char);
411 dp= coef_tmp+o_coef; // Leave space to put gain back in later
412
413 // Setup JIT globals
414 r_buf= rout_tmp;
415 r_end= rout_tmp + rout_max;
416 r_cp= r_buf;
417 r_lab= 0;
418 r_loop= 0;
419 r_edx= 0;
420 r_eax= 0;
421
422 STARTUP; // Setup iir/fir running totals on stack, apply gain
423
424 // Generate command and coefficient lists
425 while (filt->len) {
426 int n_iir, n_fir, cnt;
427 double *iir, *fir;
428 double adj;
429 if (filt->typ == 'F' && filt->len == 1) {
430 gain *= filt->val[0];
431 filt= FFNEXT(filt);
432 continue;
433 }
434 if (filt->typ == 'F') {
435 iir= 0; n_iir= 0;
436 fir= filt->val; n_fir= filt->len;
437 filt= FFNEXT(filt);
438 } else if (filt->typ == 'I') {
439 iir= filt->val; n_iir= filt->len;
440 fir= 0; n_fir= 0;
441 filt= FFNEXT(filt);
442 while (filt->typ == 'F' && filt->len == 1) {
443 gain *= filt->val[0];
444 filt= FFNEXT(filt);
445 }
446 if (filt->typ == 'F') {
447 fir= filt->val; n_fir= filt->len;
448 filt= FFNEXT(filt);
449 }
450 } else
451 error("Internal error: fid_run_new can only handle IIR + FIR types");
452
453 // Okay, we now have an IIR/FIR pair to process, possibly with
454 // n_iir or n_fir == 0 if one half is missing
455 cnt= n_iir > n_fir ? n_iir : n_fir;
456 if (n_iir) {
457 adj= 1.0 / iir[0];
458 gain *= adj;
459 }
460
461 // Sort out any trailing IIR coefficients where there are more
462 // IIR than FIR
463 if (cnt > n_fir) {
464 a= cnt - (n_fir ? n_fir : 1);
465 if (a >= LOOP) {
466 FOR(a, o_buf, o_coef);
467 IIR(o_coef); o_coef++;
468 GETB(o_buf); o_buf++;
469 NEXT(o_buf, o_coef);
470 o_buf += (a-1);
471 o_coef += (a-1);
472 while (a-- > 0) *dp++= iir[--cnt] * adj;
473 } else while (a-- > 0) {
474 *dp++= iir[--cnt] * adj;
475 IIR(o_coef); o_coef++;
476 GETB(o_buf); o_buf++;
477 }
478 }
479
480 // Sort out any trailing FIR coefficients where there are more
481 // FIR than IIR
482 if (cnt > n_iir) {
483 a= cnt - (n_iir ? n_iir : 1);
484 if (a >= LOOP) {
485 FOR(a, o_buf, o_coef);
486 FIR(o_coef); o_coef++;
487 GETB(o_buf); o_buf++;
488 NEXT(o_buf, o_coef);
489 o_buf += (a-1);
490 o_coef += (a-1);
491 while (a-- > 0) *dp++= fir[--cnt];
492 } else while (a-- > 0) {
493 val= fir[--cnt];
494 if (val == -2.0) FIR_M2;
495 else if (val == -1.0) FIR_M1;
496 else if (val == 0.0) FIR_0;
497 else if (val == 1.0) FIR_P1;
498 else if (val == 2.0) FIR_P2;
499 else { *dp++= val; FIR(o_coef); o_coef++; }
500 GETB(o_buf); o_buf++;
501 }
502 }
503
504 // Sort out any common IIR/FIR coefficients remaining
505 if (cnt > 1) {
506 a= cnt - 1;
507 if (a >= LOOP) {
508 FOR(a, o_buf, o_coef);
509 FIRc(o_coef); o_coef++;
510 IIR(o_coef); o_coef++;
511 GETB(o_buf); o_buf++;
512 NEXT(o_buf, o_coef);
513 o_buf += (a-1);
514 o_coef += 2 * (a-1);
515 while (a-- > 0) {
516 *dp++= fir[--cnt] * adj;
517 *dp++= iir[cnt] * adj;
518 }
519 } else while (a-- > 0) {
520 val= fir[--cnt];
521 if (val == -2.0) FIRc_M2;
522 else if (val == -1.0) FIRc_M1;
523 else if (val == 0.0) ;
524 else if (val == 1.0) FIRc_P1;
525 else if (val == 2.0) FIRc_P2;
526 else { *dp++= val; FIRc(o_coef); o_coef++; }
527
528 *dp++= iir[cnt] * adj;
529 IIR(o_coef); o_coef++;
530 GETB(o_buf); o_buf++;
531 }
532 }
533
534 // Handle the final element, according to whether there was any
535 // FIR activity in this filter stage
536 PUTB(o_buf-1);
537 if (n_fir) {
538 if (fir[0] == 1.0) { FIREND_P1; }
539 else if (fir[0] == -1.0) { FIREND_M1; }
540 else { *dp++= fir[0]; FIREND(o_coef); o_coef++; }
541 }
542 }
543
544 coef_tmp[0]= gain;
545 RETURN;
546
547 // Sanity checks
548 coef_cnt= dp-coef_tmp;
549 rout_cnt= r_cp-r_buf;
550 if (coef_cnt > coef_max ||
551 rout_cnt > rout_max)
552 error("fid_run_new internal error; arrays exceeded");
553
554 // Now generate a hash of the code we've created, and see if we've
555 // already got a cached version of that routine
556 hash= HASH(rout_tmp, rout_cnt);
557 for (rout= r_list; rout; rout= rout->nxt) {
558 if (rout->hash == hash &&
559 rout->len == rout_cnt &&
560 0 == memcmp(rout->code, rout_tmp, rout_cnt))
561 break;
562 }
563 if (!rout) {
564 rout= Alloc(sizeof(Routine) + rout_cnt);
565 rout->nxt= r_list; r_list= rout;
566 rout->ref= 0;
567 rout->hash= hash;
568 rout->code= (char*)(rout+1);
569 rout->len= rout_cnt;
570 memcpy(rout->code, rout_tmp, rout_cnt);
571 // Maybe flush caches at this point on processors other than x86
572 }
573 free(rout_tmp);
574
575 // Allocate the final Run structure to return
576 rr= (Run*)Alloc(sizeof(Run) +
577 coef_cnt*sizeof(double));
578 rr->magic= 0x64966325;
579 rr->n_buf= o_buf;
580 rr->coef= (double*)(rr+1);
581 memcpy(rr->coef, coef_tmp, coef_cnt*sizeof(double));
582 rr->rout= rout;
583 rout->ref++;
584
585 free(coef_tmp);
586
587 *funcpp= (void*)rout->code;
588 return rr;
589 }
590
591 //
592 // Create a new instance of the given filter
593 //
594
595 void *
fid_run_newbuf(void * run)596 fid_run_newbuf(void *run) {
597 Run *rr= run;
598 RunBuf *rb;
599
600 if (rr->magic != 0x64966325)
601 error("Bad handle passed to fid_run_newbuf()");
602
603 rb= (RunBuf*)ALLOC_ARR(rr->n_buf, double);
604 rb->coef= rr->coef;
605 rb->mov_cnt= (rr->n_buf-1) * sizeof(double) / 4;
606
607 return rb;
608 }
609
610 //
611 // Delete an instance
612 //
613
614 void
fid_run_freebuf(void * runbuf)615 fid_run_freebuf(void *runbuf) {
616 free(runbuf);
617 }
618
619 //
620 // Delete the filter
621 //
622
623 void
fid_run_free(void * run)624 fid_run_free(void *run) {
625 Routine *rout= ((Run*)run)->rout;
626 rout->ref--;
627 if (!rout->ref) {
628 // Delete the routine out of the cache
629 Routine *p, **prvp;
630 for (prvp= &r_list; (p= *prvp); prvp= &p->nxt)
631 if (p == rout) {
632 *prvp= p->nxt;
633 break;
634 }
635 free(rout);
636 }
637 free(run);
638 }
639
640 //
641 // Dump all the routines in memory
642 //
643
644 void
fid_run_dump(FILE * out)645 fid_run_dump(FILE *out) {
646 Routine *rr;
647 int a, cnt= 0;
648 fprintf(out,
649 " .file \"fid_run_dump.s\"\n"
650 " .version \"01.01\"\n"
651 ".text\n"
652 " .align 4\n");
653 for (rr= r_list; rr; rr= rr->nxt, cnt++) {
654 fprintf(out,
655 ".globl process_%d\n"
656 " .type process_%d,@function\n"
657 "process_%d:\n",
658 cnt, cnt, cnt);
659 for (a= 0; a<rr->len; a++)
660 fprintf(out, " .byte 0x%02X\n", 255&rr->code[a]);
661 fprintf(out,
662 ".Lfe1%d:\n"
663 " .size process_%d,.Lfe1%d-process_%d\n",
664 cnt, cnt, cnt, cnt);
665 }
666 }
667
668
669 //
670 // Hashing function. Overkill for this job, but might as well
671 // use a good one as it's available. See below for credits.
672 //
673
674 typedef unsigned long int ub4; /* unsigned 4-byte quantities */
675 typedef unsigned char ub1; /* unsigned 1-byte quantities */
676
677 #define hashsize(n) ((ub4)1<<(n))
678 #define hashmask(n) (hashsize(n)-1)
679
680 /*
681 --------------------------------------------------------------------
682 mix -- mix 3 32-bit values reversibly.
683 For every delta with one or two bits set, and the deltas of all three
684 high bits or all three low bits, whether the original value of a,b,c
685 is almost all zero or is uniformly distributed,
686 * If mix() is run forward or backward, at least 32 bits in a,b,c
687 have at least 1/4 probability of changing.
688 * If mix() is run forward, every bit of c will change between 1/3 and
689 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
690 mix() was built out of 36 single-cycle latency instructions in a
691 structure that could supported 2x parallelism, like so:
692 a -= b;
693 a -= c; x = (c>>13);
694 b -= c; a ^= x;
695 b -= a; x = (a<<8);
696 c -= a; b ^= x;
697 c -= b; x = (b>>13);
698 ...
699 Unfortunately, superscalar Pentiums and Sparcs can't take advantage
700 of that parallelism. They've also turned some of those single-cycle
701 latency instructions into multi-cycle latency instructions. Still,
702 this is the fastest good hash I could find. There were about 2^^68
703 to choose from. I only looked at a billion or so.
704 --------------------------------------------------------------------
705 */
706 #define mix(a,b,c) \
707 { \
708 a -= b; a -= c; a ^= (c>>13); \
709 b -= c; b -= a; b ^= (a<<8); \
710 c -= a; c -= b; c ^= (b>>13); \
711 a -= b; a -= c; a ^= (c>>12); \
712 b -= c; b -= a; b ^= (a<<16); \
713 c -= a; c -= b; c ^= (b>>5); \
714 a -= b; a -= c; a ^= (c>>3); \
715 b -= c; b -= a; b ^= (a<<10); \
716 c -= a; c -= b; c ^= (b>>15); \
717 }
718
719 /*
720 --------------------------------------------------------------------
721 hash() -- hash a variable-length key into a 32-bit value
722 k : the key (the unaligned variable-length array of bytes)
723 len : the length of the key, counting by bytes
724 initval : can be any 4-byte value
725 Returns a 32-bit value. Every bit of the key affects every bit of
726 the return value. Every 1-bit and 2-bit delta achieves avalanche.
727 About 6*len+35 instructions.
728
729 The best hash table sizes are powers of 2. There is no need to do
730 mod a prime (mod is sooo slow!). If you need less than 32 bits,
731 use a bitmask. For example, if you need only 10 bits, do
732 h = (h & hashmask(10));
733 In which case, the hash table should have hashsize(10) elements.
734
735 If you are hashing n strings (ub1 **)k, do it like this:
736 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
737
738 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
739 code any way you wish, private, educational, or commercial. It's free.
740
741 See http://burtleburtle.net/bob/hash/evahash.html
742 Use for hash table lookup, or anything where one collision in 2^^32 is
743 acceptable. Do NOT use for cryptographic purposes.
744 --------------------------------------------------------------------
745 */
746
747 static ub4
do_hash(register ub1 * k,register ub4 length,register ub4 initval)748 do_hash(register ub1 *k, /* the key */
749 register ub4 length, /* the length of the key */
750 register ub4 initval) /* the previous hash, or an arbitrary value */
751 {
752 register ub4 a,b,c,len;
753
754 /* Set up the internal state */
755 len = length;
756 a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
757 c = initval; /* the previous hash value */
758
759 /*---------------------------------------- handle most of the key */
760 while (len >= 12)
761 {
762 a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
763 b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
764 c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
765 mix(a,b,c);
766 k += 12; len -= 12;
767 }
768
769 /*------------------------------------- handle the last 11 bytes */
770 c += length;
771 switch(len) /* all the case statements fall through */
772 {
773 case 11: c+=((ub4)k[10]<<24);
774 case 10: c+=((ub4)k[9]<<16);
775 case 9 : c+=((ub4)k[8]<<8);
776 /* the first byte of c is reserved for the length */
777 case 8 : b+=((ub4)k[7]<<24);
778 case 7 : b+=((ub4)k[6]<<16);
779 case 6 : b+=((ub4)k[5]<<8);
780 case 5 : b+=k[4];
781 case 4 : a+=((ub4)k[3]<<24);
782 case 3 : a+=((ub4)k[2]<<16);
783 case 2 : a+=((ub4)k[1]<<8);
784 case 1 : a+=k[0];
785 /* case 0: nothing left to add */
786 }
787 mix(a,b,c);
788 /*-------------------------------------------- report the result */
789 return c;
790 }
791
792 // END //
793