1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #include "mpi.h"
6 #include "prtypes.h"
7 
8 /*
9  * This file implements a single function: s_mpi_getProcessorLineSize();
10  * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line
11  * if a cache exists, or zero if there is no cache. If more than one
12  * cache line exists, it should return the smallest line size (which is
13  * usually the L1 cache).
14  *
15  * mp_modexp uses this information to make sure that private key information
16  * isn't being leaked through the cache.
17  *
18  * Currently the file returns good data for most modern x86 processors, and
19  * reasonable data on 64-bit ppc processors. All other processors are assumed
20  * to have a cache line size of 32 bytes unless modified by target.mk.
21  *
22  */
23 
24 #if defined(i386) || defined(__i386) || defined(__X86__) || defined(_M_IX86) || defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
25 /* X86 processors have special instructions that tell us about the cache */
26 #include "string.h"
27 
28 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
29 #define AMD_64 1
30 #endif
31 
32 /* Generic CPUID function */
33 #if defined(AMD_64)
34 
35 #if defined(__GNUC__)
36 
/*
 * AMD64 (GCC): execute the CPUID instruction for leaf 'op' and store the
 * values it leaves in EAX, EBX, ECX and EDX through the matching pointers.
 *
 * ECX is loaded with zero before the instruction runs, exactly as the
 * 32-bit variant of this function does. CPUID leaves that are indexed by a
 * sub-leaf read ECX as an input, so leaving it uninitialized would select an
 * arbitrary sub-leaf and could report features that are not present.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    __asm__("cpuid\n\t"
            : "=a"(*eax),
              "=b"(*ebx),
              "=c"(*ecx),
              "=d"(*edx)
            : "0"(op),    /* leaf number goes in through EAX */
              "2"(0UL));  /* sub-leaf 0 goes in through ECX */
}
49 
50 #elif defined(_MSC_VER)
51 
52 #include <intrin.h>
53 
/*
 * AMD64 (MSVC): run CPUID for leaf 'op' through the __cpuid intrinsic and
 * hand the four result registers back through eax, ebx, ecx and edx.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    /* the intrinsic fills this as { EAX, EBX, ECX, EDX } */
    int regs[4];

    __cpuid(regs, op);

    *eax = regs[0];
    *ebx = regs[1];
    *ecx = regs[2];
    *edx = regs[3];
}
67 
68 #endif
69 
70 #else /* !defined(AMD_64) */
71 
72 /* x86 */
73 
74 #if defined(__GNUC__)
/*
 * 32-bit x86 (GCC): execute the CPUID instruction for leaf 'op' and store
 * the resulting register values through eax, ebx, ecx and edx.
 */
void
freebl_cpuid(unsigned long op, unsigned long *eax,
             unsigned long *ebx, unsigned long *ecx,
             unsigned long *edx)
{
    /* Some older processors don't fill the ecx register with cpuid, so clobber it
     * before calling cpuid, so that there's no risk of picking random bits that
     * erroneously indicate that absent CPU features are present.
     * Also, GCC isn't smart enough to save the ebx PIC register on its own
     * in this case, so do it by hand. Use edi to store ebx and pass the
     * value returned in ebx from cpuid through edi. */
    __asm__("xor %%ecx, %%ecx\n\t"      /* zero ecx before cpuid */
            "mov %%ebx,%%edi\n\t"       /* stash the caller's ebx in edi */
            "cpuid\n\t"
            "xchgl %%ebx,%%edi\n\t"     /* restore ebx; cpuid's ebx is now in edi */
            : "=a"(*eax),
              "=D"(*ebx), /* read from edi, see above */
              "=c"(*ecx),
              "=d"(*edx)
            : "0"(op));
}
96 
/*
 * try flipping a processor flag to determine CPU type
 *
 * 'flag' is the mask of flag-register bit(s) to toggle. The routine reads
 * the flags, writes them back with those bits inverted, reads them again,
 * and finally restores the original flags.
 *
 * Returns the XOR of the flags read before and after the attempt, i.e. the
 * bits that actually changed; 0 means the processor refused to flip them.
 */
static unsigned long
changeFlag(unsigned long flag)
{
    unsigned long changedFlags, originalFlags;
    __asm__("pushfl\n\t" /* get the flags */
            "popl %0\n\t"
            "movl %0,%1\n\t" /* save the original flags */
            "xorl %2,%0\n\t" /* flip the bit */
            "pushl %0\n\t"   /* set the flags */
            "popfl\n\t"
            "pushfl\n\t" /* get the flags again (for return) */
            "popl %0\n\t"
            "pushl %1\n\t" /* restore the original flags */
            "popfl\n\t"
            : "=r"(changedFlags),
              "=r"(originalFlags),
              "=r"(flag)
            : "2"(flag));
    return changedFlags ^ originalFlags;
}
120 
121 #elif defined(_MSC_VER)
122 
123 /*
124  * windows versions of the above assembler
125  */
/* emits the two opcode bytes 0fh, 0a2h directly (the cpuid instruction,
 * going by the macro's name) instead of using a mnemonic */
#define wcpuid __asm __emit 0fh __asm __emit 0a2h
/*
 * 32-bit x86 (MSVC): execute cpuid for leaf 'op' and store the resulting
 * register values through Reax, Rebx, Recx and Redx.
 */
void
freebl_cpuid(unsigned long op, unsigned long *Reax,
             unsigned long *Rebx, unsigned long *Recx, unsigned long *Redx)
{
    /* results are captured in locals inside the asm block and copied out
     * through the pointers afterwards, in C */
    unsigned long Leax, Lebx, Lecx, Ledx;
    __asm {
        pushad
        xor     ecx,ecx
        mov     eax,op
        wcpuid
        mov     Leax,eax
        mov     Lebx,ebx
        mov     Lecx,ecx
        mov     Ledx,edx
        popad
    }
    *Reax = Leax;
    *Rebx = Lebx;
    *Recx = Lecx;
    *Redx = Ledx;
}
148 
/*
 * MSVC version of changeFlag: try to toggle the flag-register bit(s) given
 * in 'flag', then restore the original flags.
 *
 * Returns the XOR of the flags read before and after the attempt, i.e. the
 * bits that actually changed; 0 means the bit(s) could not be flipped.
 */
static unsigned long
changeFlag(unsigned long flag)
{
    unsigned long changedFlags, originalFlags;
    __asm {
        push eax
        push ebx
        pushfd /* get the flags */
            pop  eax
        push eax /* save the flags on the stack */
            mov  originalFlags,eax /* save the original flags */
        mov  ebx,flag
            xor  eax,ebx /* flip the bit */
        push eax /* set the flags */
            popfd
        pushfd /* get the flags again (for return) */
        pop  eax
        popfd /* restore the original flags */
        mov changedFlags,eax
        pop ebx
        pop eax
    }
    return changedFlags ^ originalFlags;
}
173 #endif
174 
175 #endif
176 
177 #if !defined(AMD_64)
#define AC_FLAG 0x40000
#define ID_FLAG 0x200000

/*
 * Intel AP Note AP-485: a 386 cannot toggle AC_FLAG.
 * Returns 1 when the bit could not be flipped, 0 otherwise.
 */
static int
is386()
{
    unsigned long flipped = changeFlag(AC_FLAG);

    return flipped ? 0 : 1;
}

/*
 * Intel AP Note AP-485: a 486 cannot toggle ID_FLAG.
 * Returns 1 when the bit could not be flipped, 0 otherwise.
 */
static int
is486()
{
    unsigned long flipped = changeFlag(ID_FLAG);

    return flipped ? 0 : 1;
}
194 #endif
195 
196 /*
197  * table for Intel Cache.
198  * See Intel Application Note AP-485 for more information
199  */
200 
/* storage type for a CacheType value inside the descriptor table */
typedef unsigned char CacheTypeEntry;

/* classification of a cache descriptor; values are consecutive from 0 */
typedef enum {
    Cache_NONE = 0,
    Cache_UNKNOWN, /* 1 */
    Cache_TLB,     /* 2 */
    Cache_TLBi,    /* 3 */
    Cache_TLBd,    /* 4 */
    Cache_Trace,   /* 5 */
    Cache_L1,      /* 6 */
    Cache_L1i,     /* 7 */
    Cache_L1d,     /* 8 */
    Cache_L2,      /* 9 */
    Cache_L2i,     /* 10 */
    Cache_L2d,     /* 11 */
    Cache_L3,      /* 12 */
    Cache_L3i,     /* 13 */
    Cache_L3d      /* 14 */
} CacheType;

/* one descriptor table entry */
struct _cache {
    CacheTypeEntry type;    /* a CacheType value */
    unsigned char lineSize; /* line size; 0 makes the lookup skip the entry */
};
225 static const struct _cache CacheMap[256] = {
226     /* 00 */ { Cache_NONE, 0 },
227     /* 01 */ { Cache_TLBi, 0 },
228     /* 02 */ { Cache_TLBi, 0 },
229     /* 03 */ { Cache_TLBd, 0 },
230     /* 04 */ {
231         Cache_TLBd,
232     },
233     /* 05 */ { Cache_UNKNOWN, 0 },
234     /* 06 */ { Cache_L1i, 32 },
235     /* 07 */ { Cache_UNKNOWN, 0 },
236     /* 08 */ { Cache_L1i, 32 },
237     /* 09 */ { Cache_UNKNOWN, 0 },
238     /* 0a */ { Cache_L1d, 32 },
239     /* 0b */ { Cache_UNKNOWN, 0 },
240     /* 0c */ { Cache_L1d, 32 },
241     /* 0d */ { Cache_UNKNOWN, 0 },
242     /* 0e */ { Cache_UNKNOWN, 0 },
243     /* 0f */ { Cache_UNKNOWN, 0 },
244     /* 10 */ { Cache_UNKNOWN, 0 },
245     /* 11 */ { Cache_UNKNOWN, 0 },
246     /* 12 */ { Cache_UNKNOWN, 0 },
247     /* 13 */ { Cache_UNKNOWN, 0 },
248     /* 14 */ { Cache_UNKNOWN, 0 },
249     /* 15 */ { Cache_UNKNOWN, 0 },
250     /* 16 */ { Cache_UNKNOWN, 0 },
251     /* 17 */ { Cache_UNKNOWN, 0 },
252     /* 18 */ { Cache_UNKNOWN, 0 },
253     /* 19 */ { Cache_UNKNOWN, 0 },
254     /* 1a */ { Cache_UNKNOWN, 0 },
255     /* 1b */ { Cache_UNKNOWN, 0 },
256     /* 1c */ { Cache_UNKNOWN, 0 },
257     /* 1d */ { Cache_UNKNOWN, 0 },
258     /* 1e */ { Cache_UNKNOWN, 0 },
259     /* 1f */ { Cache_UNKNOWN, 0 },
260     /* 20 */ { Cache_UNKNOWN, 0 },
261     /* 21 */ { Cache_UNKNOWN, 0 },
262     /* 22 */ { Cache_L3, 64 },
263     /* 23 */ { Cache_L3, 64 },
264     /* 24 */ { Cache_UNKNOWN, 0 },
265     /* 25 */ { Cache_L3, 64 },
266     /* 26 */ { Cache_UNKNOWN, 0 },
267     /* 27 */ { Cache_UNKNOWN, 0 },
268     /* 28 */ { Cache_UNKNOWN, 0 },
269     /* 29 */ { Cache_L3, 64 },
270     /* 2a */ { Cache_UNKNOWN, 0 },
271     /* 2b */ { Cache_UNKNOWN, 0 },
272     /* 2c */ { Cache_L1d, 64 },
273     /* 2d */ { Cache_UNKNOWN, 0 },
274     /* 2e */ { Cache_UNKNOWN, 0 },
275     /* 2f */ { Cache_UNKNOWN, 0 },
276     /* 30 */ { Cache_L1i, 64 },
277     /* 31 */ { Cache_UNKNOWN, 0 },
278     /* 32 */ { Cache_UNKNOWN, 0 },
279     /* 33 */ { Cache_UNKNOWN, 0 },
280     /* 34 */ { Cache_UNKNOWN, 0 },
281     /* 35 */ { Cache_UNKNOWN, 0 },
282     /* 36 */ { Cache_UNKNOWN, 0 },
283     /* 37 */ { Cache_UNKNOWN, 0 },
284     /* 38 */ { Cache_UNKNOWN, 0 },
285     /* 39 */ { Cache_L2, 64 },
286     /* 3a */ { Cache_UNKNOWN, 0 },
287     /* 3b */ { Cache_L2, 64 },
288     /* 3c */ { Cache_L2, 64 },
289     /* 3d */ { Cache_UNKNOWN, 0 },
290     /* 3e */ { Cache_UNKNOWN, 0 },
291     /* 3f */ { Cache_UNKNOWN, 0 },
292     /* 40 */ { Cache_L2, 0 },
293     /* 41 */ { Cache_L2, 32 },
294     /* 42 */ { Cache_L2, 32 },
295     /* 43 */ { Cache_L2, 32 },
296     /* 44 */ { Cache_L2, 32 },
297     /* 45 */ { Cache_L2, 32 },
298     /* 46 */ { Cache_UNKNOWN, 0 },
299     /* 47 */ { Cache_UNKNOWN, 0 },
300     /* 48 */ { Cache_UNKNOWN, 0 },
301     /* 49 */ { Cache_UNKNOWN, 0 },
302     /* 4a */ { Cache_UNKNOWN, 0 },
303     /* 4b */ { Cache_UNKNOWN, 0 },
304     /* 4c */ { Cache_UNKNOWN, 0 },
305     /* 4d */ { Cache_UNKNOWN, 0 },
306     /* 4e */ { Cache_UNKNOWN, 0 },
307     /* 4f */ { Cache_UNKNOWN, 0 },
308     /* 50 */ { Cache_TLBi, 0 },
309     /* 51 */ { Cache_TLBi, 0 },
310     /* 52 */ { Cache_TLBi, 0 },
311     /* 53 */ { Cache_UNKNOWN, 0 },
312     /* 54 */ { Cache_UNKNOWN, 0 },
313     /* 55 */ { Cache_UNKNOWN, 0 },
314     /* 56 */ { Cache_UNKNOWN, 0 },
315     /* 57 */ { Cache_UNKNOWN, 0 },
316     /* 58 */ { Cache_UNKNOWN, 0 },
317     /* 59 */ { Cache_UNKNOWN, 0 },
318     /* 5a */ { Cache_UNKNOWN, 0 },
319     /* 5b */ { Cache_TLBd, 0 },
320     /* 5c */ { Cache_TLBd, 0 },
321     /* 5d */ { Cache_TLBd, 0 },
322     /* 5e */ { Cache_UNKNOWN, 0 },
323     /* 5f */ { Cache_UNKNOWN, 0 },
324     /* 60 */ { Cache_UNKNOWN, 0 },
325     /* 61 */ { Cache_UNKNOWN, 0 },
326     /* 62 */ { Cache_UNKNOWN, 0 },
327     /* 63 */ { Cache_UNKNOWN, 0 },
328     /* 64 */ { Cache_UNKNOWN, 0 },
329     /* 65 */ { Cache_UNKNOWN, 0 },
330     /* 66 */ { Cache_L1d, 64 },
331     /* 67 */ { Cache_L1d, 64 },
332     /* 68 */ { Cache_L1d, 64 },
333     /* 69 */ { Cache_UNKNOWN, 0 },
334     /* 6a */ { Cache_UNKNOWN, 0 },
335     /* 6b */ { Cache_UNKNOWN, 0 },
336     /* 6c */ { Cache_UNKNOWN, 0 },
337     /* 6d */ { Cache_UNKNOWN, 0 },
338     /* 6e */ { Cache_UNKNOWN, 0 },
339     /* 6f */ { Cache_UNKNOWN, 0 },
340     /* 70 */ { Cache_Trace, 1 },
341     /* 71 */ { Cache_Trace, 1 },
342     /* 72 */ { Cache_Trace, 1 },
343     /* 73 */ { Cache_UNKNOWN, 0 },
344     /* 74 */ { Cache_UNKNOWN, 0 },
345     /* 75 */ { Cache_UNKNOWN, 0 },
346     /* 76 */ { Cache_UNKNOWN, 0 },
347     /* 77 */ { Cache_UNKNOWN, 0 },
348     /* 78 */ { Cache_UNKNOWN, 0 },
349     /* 79 */ { Cache_L2, 64 },
350     /* 7a */ { Cache_L2, 64 },
351     /* 7b */ { Cache_L2, 64 },
352     /* 7c */ { Cache_L2, 64 },
353     /* 7d */ { Cache_UNKNOWN, 0 },
354     /* 7e */ { Cache_UNKNOWN, 0 },
355     /* 7f */ { Cache_UNKNOWN, 0 },
356     /* 80 */ { Cache_UNKNOWN, 0 },
357     /* 81 */ { Cache_UNKNOWN, 0 },
358     /* 82 */ { Cache_L2, 32 },
359     /* 83 */ { Cache_L2, 32 },
360     /* 84 */ { Cache_L2, 32 },
361     /* 85 */ { Cache_L2, 32 },
362     /* 86 */ { Cache_L2, 64 },
363     /* 87 */ { Cache_L2, 64 },
364     /* 88 */ { Cache_UNKNOWN, 0 },
365     /* 89 */ { Cache_UNKNOWN, 0 },
366     /* 8a */ { Cache_UNKNOWN, 0 },
367     /* 8b */ { Cache_UNKNOWN, 0 },
368     /* 8c */ { Cache_UNKNOWN, 0 },
369     /* 8d */ { Cache_UNKNOWN, 0 },
370     /* 8e */ { Cache_UNKNOWN, 0 },
371     /* 8f */ { Cache_UNKNOWN, 0 },
372     /* 90 */ { Cache_UNKNOWN, 0 },
373     /* 91 */ { Cache_UNKNOWN, 0 },
374     /* 92 */ { Cache_UNKNOWN, 0 },
375     /* 93 */ { Cache_UNKNOWN, 0 },
376     /* 94 */ { Cache_UNKNOWN, 0 },
377     /* 95 */ { Cache_UNKNOWN, 0 },
378     /* 96 */ { Cache_UNKNOWN, 0 },
379     /* 97 */ { Cache_UNKNOWN, 0 },
380     /* 98 */ { Cache_UNKNOWN, 0 },
381     /* 99 */ { Cache_UNKNOWN, 0 },
382     /* 9a */ { Cache_UNKNOWN, 0 },
383     /* 9b */ { Cache_UNKNOWN, 0 },
384     /* 9c */ { Cache_UNKNOWN, 0 },
385     /* 9d */ { Cache_UNKNOWN, 0 },
386     /* 9e */ { Cache_UNKNOWN, 0 },
387     /* 9f */ { Cache_UNKNOWN, 0 },
388     /* a0 */ { Cache_UNKNOWN, 0 },
389     /* a1 */ { Cache_UNKNOWN, 0 },
390     /* a2 */ { Cache_UNKNOWN, 0 },
391     /* a3 */ { Cache_UNKNOWN, 0 },
392     /* a4 */ { Cache_UNKNOWN, 0 },
393     /* a5 */ { Cache_UNKNOWN, 0 },
394     /* a6 */ { Cache_UNKNOWN, 0 },
395     /* a7 */ { Cache_UNKNOWN, 0 },
396     /* a8 */ { Cache_UNKNOWN, 0 },
397     /* a9 */ { Cache_UNKNOWN, 0 },
398     /* aa */ { Cache_UNKNOWN, 0 },
399     /* ab */ { Cache_UNKNOWN, 0 },
400     /* ac */ { Cache_UNKNOWN, 0 },
401     /* ad */ { Cache_UNKNOWN, 0 },
402     /* ae */ { Cache_UNKNOWN, 0 },
403     /* af */ { Cache_UNKNOWN, 0 },
404     /* b0 */ { Cache_TLBi, 0 },
405     /* b1 */ { Cache_UNKNOWN, 0 },
406     /* b2 */ { Cache_UNKNOWN, 0 },
407     /* b3 */ { Cache_TLBd, 0 },
408     /* b4 */ { Cache_UNKNOWN, 0 },
409     /* b5 */ { Cache_UNKNOWN, 0 },
410     /* b6 */ { Cache_UNKNOWN, 0 },
411     /* b7 */ { Cache_UNKNOWN, 0 },
412     /* b8 */ { Cache_UNKNOWN, 0 },
413     /* b9 */ { Cache_UNKNOWN, 0 },
414     /* ba */ { Cache_UNKNOWN, 0 },
415     /* bb */ { Cache_UNKNOWN, 0 },
416     /* bc */ { Cache_UNKNOWN, 0 },
417     /* bd */ { Cache_UNKNOWN, 0 },
418     /* be */ { Cache_UNKNOWN, 0 },
419     /* bf */ { Cache_UNKNOWN, 0 },
420     /* c0 */ { Cache_UNKNOWN, 0 },
421     /* c1 */ { Cache_UNKNOWN, 0 },
422     /* c2 */ { Cache_UNKNOWN, 0 },
423     /* c3 */ { Cache_UNKNOWN, 0 },
424     /* c4 */ { Cache_UNKNOWN, 0 },
425     /* c5 */ { Cache_UNKNOWN, 0 },
426     /* c6 */ { Cache_UNKNOWN, 0 },
427     /* c7 */ { Cache_UNKNOWN, 0 },
428     /* c8 */ { Cache_UNKNOWN, 0 },
429     /* c9 */ { Cache_UNKNOWN, 0 },
430     /* ca */ { Cache_UNKNOWN, 0 },
431     /* cb */ { Cache_UNKNOWN, 0 },
432     /* cc */ { Cache_UNKNOWN, 0 },
433     /* cd */ { Cache_UNKNOWN, 0 },
434     /* ce */ { Cache_UNKNOWN, 0 },
435     /* cf */ { Cache_UNKNOWN, 0 },
436     /* d0 */ { Cache_UNKNOWN, 0 },
437     /* d1 */ { Cache_UNKNOWN, 0 },
438     /* d2 */ { Cache_UNKNOWN, 0 },
439     /* d3 */ { Cache_UNKNOWN, 0 },
440     /* d4 */ { Cache_UNKNOWN, 0 },
441     /* d5 */ { Cache_UNKNOWN, 0 },
442     /* d6 */ { Cache_UNKNOWN, 0 },
443     /* d7 */ { Cache_UNKNOWN, 0 },
444     /* d8 */ { Cache_UNKNOWN, 0 },
445     /* d9 */ { Cache_UNKNOWN, 0 },
446     /* da */ { Cache_UNKNOWN, 0 },
447     /* db */ { Cache_UNKNOWN, 0 },
448     /* dc */ { Cache_UNKNOWN, 0 },
449     /* dd */ { Cache_UNKNOWN, 0 },
450     /* de */ { Cache_UNKNOWN, 0 },
451     /* df */ { Cache_UNKNOWN, 0 },
452     /* e0 */ { Cache_UNKNOWN, 0 },
453     /* e1 */ { Cache_UNKNOWN, 0 },
454     /* e2 */ { Cache_UNKNOWN, 0 },
455     /* e3 */ { Cache_UNKNOWN, 0 },
456     /* e4 */ { Cache_UNKNOWN, 0 },
457     /* e5 */ { Cache_UNKNOWN, 0 },
458     /* e6 */ { Cache_UNKNOWN, 0 },
459     /* e7 */ { Cache_UNKNOWN, 0 },
460     /* e8 */ { Cache_UNKNOWN, 0 },
461     /* e9 */ { Cache_UNKNOWN, 0 },
462     /* ea */ { Cache_UNKNOWN, 0 },
463     /* eb */ { Cache_UNKNOWN, 0 },
464     /* ec */ { Cache_UNKNOWN, 0 },
465     /* ed */ { Cache_UNKNOWN, 0 },
466     /* ee */ { Cache_UNKNOWN, 0 },
467     /* ef */ { Cache_UNKNOWN, 0 },
468     /* f0 */ { Cache_UNKNOWN, 0 },
469     /* f1 */ { Cache_UNKNOWN, 0 },
470     /* f2 */ { Cache_UNKNOWN, 0 },
471     /* f3 */ { Cache_UNKNOWN, 0 },
472     /* f4 */ { Cache_UNKNOWN, 0 },
473     /* f5 */ { Cache_UNKNOWN, 0 },
474     /* f6 */ { Cache_UNKNOWN, 0 },
475     /* f7 */ { Cache_UNKNOWN, 0 },
476     /* f8 */ { Cache_UNKNOWN, 0 },
477     /* f9 */ { Cache_UNKNOWN, 0 },
478     /* fa */ { Cache_UNKNOWN, 0 },
479     /* fb */ { Cache_UNKNOWN, 0 },
480     /* fc */ { Cache_UNKNOWN, 0 },
481     /* fd */ { Cache_UNKNOWN, 0 },
482     /* fe */ { Cache_UNKNOWN, 0 },
483     /* ff */ { Cache_UNKNOWN, 0 }
484 };
485 
486 /*
487  * use the above table to determine the CacheEntryLineSize.
488  */
489 static void
getIntelCacheEntryLineSize(unsigned long val,int * level,unsigned long * lineSize)490 getIntelCacheEntryLineSize(unsigned long val, int *level,
491                            unsigned long *lineSize)
492 {
493     CacheType type;
494 
495     type = CacheMap[val].type;
496     /* only interested in data caches */
497     /* NOTE val = 0x40 is a special value that means no L2 or L3 cache.
498      * this data check has the side effect of rejecting that entry. If
499      * that wasn't the case, we could have to reject it explicitly */
500     if (CacheMap[val].lineSize == 0) {
501         return;
502     }
503     /* look at the caches, skip types we aren't interested in.
504      * if we already have a value for a lower level cache, skip the
505      * current entry */
506     if ((type == Cache_L1) || (type == Cache_L1d)) {
507         *level = 1;
508         *lineSize = CacheMap[val].lineSize;
509     } else if ((*level >= 2) && ((type == Cache_L2) || (type == Cache_L2d))) {
510         *level = 2;
511         *lineSize = CacheMap[val].lineSize;
512     } else if ((*level >= 3) && ((type == Cache_L3) || (type == Cache_L3d))) {
513         *level = 3;
514         *lineSize = CacheMap[val].lineSize;
515     }
516     return;
517 }
518 
/*
 * Split a 32-bit register value into its four descriptor bytes and feed
 * each one, most significant byte first, to getIntelCacheEntryLineSize.
 */
static void
getIntelRegisterCacheLineSize(unsigned long val,
                              int *level, unsigned long *lineSize)
{
    int shift;

    for (shift = 24; shift >= 0; shift -= 8) {
        getIntelCacheEntryLineSize((val >> shift) & 0xff, level, lineSize);
    }
}
528 
/*
 * Returns the line size of the lowest level unified/data cache described by
 * cpuid command '2'.
 *
 * returns '0' if no recognized cache is found, or if the cache
 * information is not supported by this processor (cpuidLevel < 2).
 */
static unsigned long
getIntelCacheLineSize(int cpuidLevel)
{
    int level = 4; /* above every real cache level: nothing recorded yet */
    unsigned long lineSize = 0;
    unsigned long eax, ebx, ecx, edx;
    int repeat, count;

    if (cpuidLevel < 2) {
        return 0;
    }

    /* command '2' of the cpuid is intel's cache info call. Each byte of the
     * 4 registers contain a potential descriptor for the cache. The CacheMap
     * table maps the cache entry with the processor cache. Register 'al'
     * contains a count value that cpuid '2' needs to be called in order to
     * find all the cache descriptors. Only registers with the high bit set
     * to 'zero' have valid descriptors. This code loops through all the
     * required calls to cpuid '2' and passes any valid descriptors it finds
     * to the getIntelRegisterCacheLineSize code, which breaks the registers
     * down into their component descriptors. In the end the lineSize of the
     * lowest level cache data cache is returned. */
    freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
    /* 'al' is the full low byte of eax, so keep all eight bits of the count */
    repeat = eax & 0xff;
    for (count = 0; count < repeat; count++) {
        if ((eax & 0x80000000) == 0) {
            /* the low byte of eax is the count, not a descriptor: mask it */
            getIntelRegisterCacheLineSize(eax & 0xffffff00, &level, &lineSize);
        }
        if ((ebx & 0x80000000) == 0) {
            getIntelRegisterCacheLineSize(ebx, &level, &lineSize);
        }
        if ((ecx & 0x80000000) == 0) {
            getIntelRegisterCacheLineSize(ecx, &level, &lineSize);
        }
        if ((edx & 0x80000000) == 0) {
            getIntelRegisterCacheLineSize(edx, &level, &lineSize);
        }
        /* fetch the next set of descriptors unless this was the last pass */
        if (count + 1 != repeat) {
            freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
        }
    }
    return lineSize;
}
576 
/*
 * Line size lookup through the AMD-style extended cpuid commands
 * (see "AMD Processor Recognition Application Note" Publication 20734);
 * some other processors use the identical scheme
 * (see "Processor Recognition, Transmeta Corporation").
 *
 * The incoming cpuidLevel value is not consulted: it is replaced by the
 * extended cpuid level before use.
 *
 * returns '0' if the cache info is not supported by this processor.
 */
static unsigned long
getOtherCacheLineSize(unsigned long cpuidLevel)
{
    unsigned long eax, ebx, ecx, edx;

    /* ask for the highest extended cpuid command available */
    freebl_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
    cpuidLevel = eax;

    if (cpuidLevel < 0x80000005) {
        return 0;
    }

    /* low byte of ecx: line Size, L1 Data Cache */
    freebl_cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
    return ecx & 0xff;
}
600 
/*
 * Known CPU vendor identification strings. Each macro is the index of the
 * string that follows it, so manMap[CENTAUR] is "CentaurHauls" and so on.
 * Every index must be unique and match the entry's position in the array.
 */
static const char *const manMap[] = {
#define INTEL 0
    "GenuineIntel",
#define AMD 1
    "AuthenticAMD",
#define CYRIX 2
    "CyrixInstead",
#define CENTAUR 3
    "CentaurHauls",
#define NEXGEN 4
    "NexGenDriven",
#define TRANSMETA 5
    "GenuineTMx86",
#define RISE 6
    "RiseRiseRise",
#define UMC 7
    "UMC UMC UMC ",
#define SIS 8
    "SiS SiS SiS ",
#define NATIONAL 9
    "Geode by NSC",
};

static const int n_manufacturers = sizeof(manMap) / sizeof(manMap[0]);

/* must not collide with any valid index into manMap */
#define MAN_UNKNOWN 10
627 
628 #if !defined(AMD_64)
#define SSE2_FLAG (1 << 26)
/*
 * Returns 1 if cpuid command '1' reports SSE2_FLAG in edx, 0 otherwise
 * (including on processors that is386()/is486() identify, where cpuid is
 * not attempted at all).
 */
unsigned long
s_mpi_is_sse2()
{
    unsigned long eax, ebx, ecx, edx;

    if (is386()) {
        return 0;
    }
    if (is486()) {
        return 0;
    }

    /* a maximum cpuid level of 0 means command '1' is unavailable:
     * has no SSE2 extensions */
    freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
    if (eax == 0) {
        return 0;
    }

    freebl_cpuid(1, &eax, &ebx, &ecx, &edx);
    if (edx & SSE2_FLAG) {
        return 1;
    }
    return 0;
}
648 #endif
649 
650 unsigned long
s_mpi_getProcessorLineSize()651 s_mpi_getProcessorLineSize()
652 {
653     unsigned long eax, ebx, ecx, edx;
654     PRUint32 cpuid[3];
655     unsigned long cpuidLevel;
656     unsigned long cacheLineSize = 0;
657     int manufacturer = MAN_UNKNOWN;
658     int i;
659     char string[13];
660 
661 #if !defined(AMD_64)
662     if (is386()) {
663         return 0; /* 386 had no cache */
664     }
665     if (is486()) {
666         return 32; /* really? need more info */
667     }
668 #endif
669 
670     /* Pentium, cpuid command is available */
671     freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
672     cpuidLevel = eax;
673     /* string holds the CPU's manufacturer ID string - a twelve
674      * character ASCII string stored in ebx, edx, ecx, and
675      * the 32-bit extended feature flags are in edx, ecx.
676      */
677     cpuid[0] = ebx;
678     cpuid[1] = ecx;
679     cpuid[2] = edx;
680     memcpy(string, cpuid, sizeof(cpuid));
681     string[12] = 0;
682 
683     manufacturer = MAN_UNKNOWN;
684     for (i = 0; i < n_manufacturers; i++) {
685         if (strcmp(manMap[i], string) == 0) {
686             manufacturer = i;
687         }
688     }
689 
690     if (manufacturer == INTEL) {
691         cacheLineSize = getIntelCacheLineSize(cpuidLevel);
692     } else {
693         cacheLineSize = getOtherCacheLineSize(cpuidLevel);
694     }
695     /* doesn't support cache info based on cpuid. This means
696      * an old pentium class processor, which have cache lines of
697      * 32. If we learn differently, we can use a switch based on
698      * the Manufacturer id  */
699     if (cacheLineSize == 0) {
700         cacheLineSize = 32;
701     }
702     return cacheLineSize;
703 }
704 #define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
705 #endif
706 
707 #if defined(__ppc64__)
708 /*
709  *  Sigh, The PPC has some really nice features to help us determine cache
710  *  size, since it had lots of direct control functions to do so. The POWER
711  *  processor even has an instruction to do this, but it was dropped in
712  *  PowerPC. Unfortunately most of them are not available in user mode.
713  *
714  *  The dcbz function would be a great way to determine cache line size except
715  *  1) it only works on write-back memory (it throws an exception otherwise),
716  *  and 2) because so many mac programs 'knew' the processor cache size was
717  *  32 bytes, they used this instruction as a fast 'zero 32 bytes'. Now the new
718  *  G5 processor has 128 byte cache, but dcbz only clears 32 bytes to keep
 *  these programs happy. dcbzl works if 64 bit instructions are supported.
720  *  If you know 64 bit instructions are supported, and that stack is
721  *  write-back, you can use this code.
722  */
723 #include "memory.h"
724 
/* clear the cache line that contains 'array'
 *
 * Issues a single dcbzl instruction on the address in 'array'. The caller
 * below uses the number of bytes that end up zeroed as the line size. */
static inline void
dcbzl(char *array)
{
    /* explicit register variable: the address is pinned in r2 */
    register char *a asm("r2") = array;
    /* volatile so the instruction is kept even though its only listed
     * output is the unchanged pointer */
    __asm__ __volatile__("dcbzl %0,r0"
                         : "=r"(a)
                         : "0"(a));
}
734 
/* round address x up to the next multiple of y (y must be a power of two) */
#define PPC_DO_ALIGN(x, y) ((char *)((((long long)(x)) + ((y)-1)) & ~((y)-1)))

/* largest line size this probe can detect */
#define PPC_MAX_LINE_SIZE 256
/*
 * ppc64 implementation: fill an aligned buffer with 0xff, clear one cache
 * block at its start with dcbzl, and report how many bytes were zeroed.
 * Returns that size (a power of two up to PPC_MAX_LINE_SIZE), or 0 if
 * nothing was cleared.
 */
unsigned long
s_mpi_getProcessorLineSize()
{
    /* twice the maximum plus one, so an aligned window of
     * PPC_MAX_LINE_SIZE bytes always fits inside */
    char testArray[2 * PPC_MAX_LINE_SIZE + 1];
    char *test;
    int i;

    /* start the probe on a maximum-line-size boundary so the cleared
     * block is known to begin at test[0] */
    test = PPC_DO_ALIGN(testArray, PPC_MAX_LINE_SIZE);
    /* mark every byte as non-zero ... */
    memset(test, 0xff, PPC_MAX_LINE_SIZE);
    /* ... then clear one cache block starting at 'test' */
    dcbzl(test);

    /* the largest power of two whose last byte got cleared is the
     * block size */
    i = PPC_MAX_LINE_SIZE;
    while (i != 0) {
        if (test[i - 1] == 0) {
            return i;
        }
        i = i / 2;
    }
    return 0;
}
761 
762 #define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
763 #endif
764 
765 /*
766  * put other processor and platform specific cache code here
767  * return the smallest cache line size in bytes on the processor
768  * (usually the L1 cache). If the OS has a call, this would be
 * a great place to put it.
770  *
771  * If there is no cache, return 0;
772  *
773  * define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED so the generic functions
774  * below aren't compiled.
775  *
776  */
777 
778 /* target.mk can define MPI_CACHE_LINE_SIZE if it's common for the family or
779  * OS */
780 #if defined(MPI_CACHE_LINE_SIZE) && !defined(MPI_GET_PROCESSOR_LINE_SIZE_DEFINED)
781 
782 unsigned long
s_mpi_getProcessorLineSize()783 s_mpi_getProcessorLineSize()
784 {
785     return MPI_CACHE_LINE_SIZE;
786 }
787 #define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
788 #endif
789 
790 /* If no way to get the processor cache line size has been defined, assume
791  * it's 32 bytes (most common value, does not significantly impact performance)
792  */
793 #ifndef MPI_GET_PROCESSOR_LINE_SIZE_DEFINED
/*
 * Generic fallback used when no other implementation was compiled in:
 * always reports a 32 byte line.
 */
unsigned long
s_mpi_getProcessorLineSize()
{
    const unsigned long assumedLineSize = 32;

    return assumedLineSize;
}
799 #endif
800 
801 #ifdef TEST_IT
802 #include <stdio.h>
803 
/*
 * Stand-alone driver, built only when TEST_IT is defined: prints the line
 * size reported by s_mpi_getProcessorLineSize().
 */
int
main(void)
{
    /* the function returns unsigned long, so it must be printed with %lu */
    printf("line size = %lu\n", s_mpi_getProcessorLineSize());
    return 0;
}
808 #endif
809