/*
  aclib - advanced C library ;)
  This file contains functions which improve on and extend the standard C library.
*/

#ifndef HAVE_SSE2
/*
  The P3 processor has only one SSE decoder, so it can execute only one SSE
  instruction per clock, but it has three MMX decoders (including the
  load/store unit) and can execute three MMX instructions per clock.
  The P4 processor stands a better chance, but after reading
  http://www.emulators.com/pentium4.htm
  I have doubts. In any case, the SSE2 version of this code could be written
  better.
*/
#undef HAVE_SSE
#endif


/*
  This part of the code was taken from Linux-2.4.3 and slightly modified for
  the MMX, MMX2 and SSE instruction sets. I did this because Linux copies
  page-aligned blocks, while mplayer works with weakly ordered data, which the
  original sources cannot speed up. Only using PREFETCHNTA and MOVNTQ together
  has an effect!

  From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
  Order Number 245470:
  "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

  Data referenced by a program can be temporal (data will be used again) or
  non-temporal (data will be referenced once and not reused in the immediate
  future). To make efficient use of the processor's caches, it is generally
  desirable to cache temporal data and not cache non-temporal data. Overloading
  the processor's caches with non-temporal data is sometimes referred to as
  "polluting the caches".
  Non-temporal data is written to memory with write-combining semantics.

  The PREFETCHh instructions permit a program to load data into the processor
  at a suggested cache level, so that it is closer to the processor's load and
  store unit when it is needed. If the data is already present in a level of
  the cache hierarchy that is closer to the processor, the PREFETCHh
  instruction will not result in any data movement.
  But we should use PREFETCHNTA, the non-temporal variant: it fetches data
  into a location close to the processor while minimizing cache pollution.

  The MOVNTQ (store quadword using non-temporal hint) instruction stores
  packed integer data from an MMX register to memory, using a non-temporal
  hint. The MOVNTPS (store packed single-precision floating-point values using
  non-temporal hint) instruction stores packed floating-point data from an
  XMM register to memory, using a non-temporal hint.

  The SFENCE (Store Fence) instruction controls write ordering by creating a
  fence for memory store operations. This instruction guarantees that the
  results of every store instruction that precedes the store fence in program
  order are globally visible before any store instruction that follows the
  fence. The SFENCE instruction provides an efficient way of ensuring ordering
  between procedures that produce weakly-ordered data and procedures that
  consume that data.

  If you have questions, please contact me: nickols_k@mail.ru.
*/
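
/*
  For reference only: the prefetch / non-temporal store / fence pattern
  described above, expressed with SSE2 intrinsics instead of inline assembly.
  This sketch is not used by the rest of this file; it assumes a length that
  is a multiple of 16 bytes and a 16-byte-aligned destination.
*/
#include <emmintrin.h> /* SSE2 intrinsics; also pulls in _mm_prefetch/_mm_sfence */

static inline void RENAME(stream_copy_sketch)(void *dst, const void *src, size_t n)
{
    char *d = (char *)dst;
    const char *s = (const char *)src;
    size_t i;
    for (i = 0; i < n; i += 16) {
        /* PREFETCHNTA: pull data close to the CPU with minimal cache pollution */
        _mm_prefetch(s + i + 256, _MM_HINT_NTA);
        /* MOVNTDQ: store 16 bytes directly to memory, bypassing the caches */
        _mm_stream_si128((__m128i *)(d + i),
                         _mm_loadu_si128((const __m128i *)(s + i)));
    }
    _mm_sfence(); /* make the weakly-ordered stores globally visible */
}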

/* For small memory blocks (< 256 bytes) this version is faster.
   Note: the asm advances the 'to' and 'from' pointer arguments and clobbers
   'n', so the macro must be used with that in mind. */
#define small_memcpy(to,from,n)\
{\
register unsigned long int siz;\
register unsigned long int dummy;\
    siz=n&0x7; n>>=3;\
    if(siz)\
        __asm__ __volatile__(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        :"0" (to), "1" (from),"2" (siz)\
        : "memory","cc");\
    if(n)\
        __asm__ __volatile__(\
        "rep; movsq"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        :"0" (to), "1" (from),"2" (n)\
        : "memory","cc");\
}
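
/*
  Illustration only: in plain C the macro above behaves roughly like the
  hypothetical helper below, which makes the pointer advancement explicit by
  taking the pointers by address. It is not used elsewhere in this file.
*/
static inline void RENAME(small_memcpy_sketch)(unsigned char **to, const unsigned char **from, size_t n)
{
    size_t rem = n & 0x7;   /* remainder bytes, copied first ("rep; movsb") */
    size_t qw  = n >> 3;    /* then 8-byte chunks ("rep; movsq") */
    size_t k;
    while (rem--) *(*to)++ = *(*from)++;
    for (; qw; qw--)        /* byte loop here; the asm moves 8 bytes at once */
        for (k = 0; k < 8; k++) *(*to)++ = *(*from)++;
}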


#define MMREG_SIZE 16ULL
#define MIN_LEN 257ULL
#define CL_SIZE 256ULL /* always align on a 256-byte boundary */

static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
    void *retval;
    const unsigned char *cfrom=from;
    unsigned char *tto=to;
    size_t i=0;
    retval = to;
    if(!len) return retval;
    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    __asm__ __volatile__ (
        "prefetcht0 (%0)\n"
        "prefetcht0 64(%0)\n"
        "prefetcht0 128(%0)\n"
        "prefetcht0 192(%0)\n"
        :: "r" (cfrom));
    if(len >= MIN_LEN)
    {
        register unsigned long int delta;
        /* Align the destination to a cache-line-size boundary */
        delta = ((unsigned long long int)tto)&(CL_SIZE-1ULL);
        if(delta)
        {
            delta=CL_SIZE-delta;
            len -= delta;
            small_memcpy(tto, cfrom, delta);
        }
        i = len>>8; /* len/256 */
        len=len-(i<<8);
    }
    if(i) {
        /*
          This algorithm is most effective when the code consecutively reads
          and writes blocks the size of one cache line. The cache-line size
          is processor-dependent, but it is at least 32 bytes on any
          processor. Ideally the number of read and write instructions would
          be a multiple of the number of the processor's decoders, but that
          is not always possible.
        */
        if(((unsigned long long)cfrom) & 15)
        /* if SRC is misaligned */
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
            "prefetcht0 256(%0)\n"
            "prefetcht0 320(%0)\n"
            "movdqu (%0), %%xmm0\n"
            "movdqu 16(%0), %%xmm1\n"
            "movdqu 32(%0), %%xmm2\n"
            "movdqu 48(%0), %%xmm3\n"
            "movdqu 64(%0), %%xmm4\n"
            "movdqu 80(%0), %%xmm5\n"
            "movdqu 96(%0), %%xmm6\n"
            "movdqu 112(%0), %%xmm7\n"
            "prefetcht0 384(%0)\n"
            "prefetcht0 448(%0)\n"
            "movdqu 128(%0), %%xmm8\n"
            "movdqu 144(%0), %%xmm9\n"
            "movdqu 160(%0), %%xmm10\n"
            "movdqu 176(%0), %%xmm11\n"
            "movdqu 192(%0), %%xmm12\n"
            "movdqu 208(%0), %%xmm13\n"
            "movdqu 224(%0), %%xmm14\n"
            "movdqu 240(%0), %%xmm15\n"
            "movntdq %%xmm0, (%1)\n"
            "movntdq %%xmm1, 16(%1)\n"
            "movntdq %%xmm2, 32(%1)\n"
            "movntdq %%xmm3, 48(%1)\n"
            "movntdq %%xmm4, 64(%1)\n"
            "movntdq %%xmm5, 80(%1)\n"
            "movntdq %%xmm6, 96(%1)\n"
            "movntdq %%xmm7, 112(%1)\n"
            "movntdq %%xmm8, 128(%1)\n"
            "movntdq %%xmm9, 144(%1)\n"
            "movntdq %%xmm10, 160(%1)\n"
            "movntdq %%xmm11, 176(%1)\n"
            "movntdq %%xmm12, 192(%1)\n"
            "movntdq %%xmm13, 208(%1)\n"
            "movntdq %%xmm14, 224(%1)\n"
            "movntdq %%xmm15, 240(%1)\n"
            :: "r" (cfrom), "r" (tto):
            "memory"
            ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
            );
            cfrom+=256ULL;
            tto+=256ULL;
        }
        else
        /*
          Only if SRC is aligned on a 16-byte boundary. This allows the use
          of movdqa instead of movdqu; movdqa requires its data to be
          aligned, otherwise a general-protection exception (#GP) is
          generated.
        */
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
            "prefetcht0 256(%0)\n"
            "prefetcht0 320(%0)\n"
            "movdqa (%0), %%xmm0\n"
            "movdqa 16(%0), %%xmm1\n"
            "movdqa 32(%0), %%xmm2\n"
            "movdqa 48(%0), %%xmm3\n"
            "movdqa 64(%0), %%xmm4\n"
            "movdqa 80(%0), %%xmm5\n"
            "movdqa 96(%0), %%xmm6\n"
            "movdqa 112(%0), %%xmm7\n"
            "prefetcht0 384(%0)\n"
            "prefetcht0 448(%0)\n"
            "movdqa 128(%0), %%xmm8\n"
            "movdqa 144(%0), %%xmm9\n"
            "movdqa 160(%0), %%xmm10\n"
            "movdqa 176(%0), %%xmm11\n"
            "movdqa 192(%0), %%xmm12\n"
            "movdqa 208(%0), %%xmm13\n"
            "movdqa 224(%0), %%xmm14\n"
            "movdqa 240(%0), %%xmm15\n"
            "movntdq %%xmm0, (%1)\n"
            "movntdq %%xmm1, 16(%1)\n"
            "movntdq %%xmm2, 32(%1)\n"
            "movntdq %%xmm3, 48(%1)\n"
            "movntdq %%xmm4, 64(%1)\n"
            "movntdq %%xmm5, 80(%1)\n"
            "movntdq %%xmm6, 96(%1)\n"
            "movntdq %%xmm7, 112(%1)\n"
            "movntdq %%xmm8, 128(%1)\n"
            "movntdq %%xmm9, 144(%1)\n"
            "movntdq %%xmm10, 160(%1)\n"
            "movntdq %%xmm11, 176(%1)\n"
            "movntdq %%xmm12, 192(%1)\n"
            "movntdq %%xmm13, 208(%1)\n"
            "movntdq %%xmm14, 224(%1)\n"
            "movntdq %%xmm15, 240(%1)\n"
            :: "r" (cfrom), "r" (tto):
            "memory"
            ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
            );
            cfrom+=256ULL;
            tto+=256ULL;
        }
        __asm__ __volatile__ ("sfence":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if(len) small_memcpy(tto, cfrom, len);
    return retval;
}
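
/*
  Usage sketch (hypothetical): with the RENAME() convention this file is
  typically included once per CPU flavour, e.g.

      #define RENAME(x) x ## _SSE2
      #include "aclib_template.c"

  and a caller then picks a variant at run time:

      void *(*do_memcpy)(void *, const void *, size_t) = fast_memcpy_SSE2;
      do_memcpy(dst, src, n);

  The names above are illustrative; the actual macro values are supplied by
  the file that includes this one.
*/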

/* Note: like small_memcpy above, this advances its 'to' argument. */
#define small_memset(to,val,n)\
{\
register unsigned long int dummy;\
    if(n)\
        __asm__ __volatile__(\
        "rep; stosb"\
        :"=&D"(to), "=&c"(dummy)\
        :"0" (to), "1" (n), "a" ((char)val)\
        : "memory","cc");\
}

#define XMMREG_SIZE 16
/* Fast memory set. See the comments for fast_memcpy. */
static void * RENAME(fast_memset)(void * to, int val, size_t len)
{
    void *retval;
    size_t i;
    unsigned char mm_reg[XMMREG_SIZE], *pmm_reg;
    unsigned char *tto=to;

    retval = tto;
    if(len >= MIN_LEN)
    {
        register unsigned long int delta;
        delta = ((unsigned long long int)tto)&(XMMREG_SIZE-1);
        if(delta)
        {
            delta=XMMREG_SIZE-delta;
            len -= delta;
            small_memset(tto, val, delta);
        }
        i = len >> 7; /* len/128 */
        len&=127;
        /* Splat the byte value into %xmm0 through a scratch buffer. The
           buffer is a plain char array with no guaranteed 16-byte alignment,
           so load it with movdqu rather than movdqa. */
        pmm_reg = mm_reg;
        small_memset(pmm_reg,val,sizeof(mm_reg));
        __asm__ __volatile__(
            "movdqu (%0), %%xmm0\n"
            :: "r"(mm_reg):"memory","xmm0");
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
            "movntdq %%xmm0, (%0)\n"
            "movntdq %%xmm0, 16(%0)\n"
            "movntdq %%xmm0, 32(%0)\n"
            "movntdq %%xmm0, 48(%0)\n"
            "movntdq %%xmm0, 64(%0)\n"
            "movntdq %%xmm0, 80(%0)\n"
            "movntdq %%xmm0, 96(%0)\n"
            "movntdq %%xmm0, 112(%0)\n"
            :: "r" (tto) : "memory");
            tto+=128ULL;
        }
        __asm__ __volatile__ ("sfence":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if(len) small_memset(tto, val, len);
    return retval;
}
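
/*
  For reference only: the core of the loop above with SSE2 intrinsics (from
  <emmintrin.h>, included earlier). _mm_set1_epi8 replaces the scratch-buffer
  trick used above to splat the byte value into an XMM register. Illustrative
  helper, not used by this file; assumes a 16-byte-aligned destination and a
  length that is a multiple of 16.
*/
static inline void RENAME(stream_set_sketch)(void *dst, int val, size_t n)
{
    __m128i v = _mm_set1_epi8((char)val); /* broadcast the byte 16 times */
    char *d = (char *)dst;
    size_t i;
    for (i = 0; i < n; i += 16)
        _mm_stream_si128((__m128i *)(d + i), v); /* MOVNTDQ */
    _mm_sfence();
}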

#ifdef REGMM_SIZE
#undef REGMM_SIZE
#endif
#define REGMM_SIZE 16
static void __FASTCALL__ RENAME(InterleaveBuffers)(tUInt32 limit,
                    void *destbuffer,
                    const void *evenbuffer,
                    const void *oddbuffer)
{
    register char *destbuffptr;
    register const char *oddptr, *evenptr;
    register tUInt32 freq;
    destbuffptr = (char *)destbuffer;
    evenptr = (const char *)evenbuffer;
    oddptr = (const char *)oddbuffer;
    freq = 0;
    if(limit>REGMM_SIZE*4-1)
    {
        register tUInt64 delta, nlimit, step;
        /* Each iteration consumes one XMM register's worth of bytes from
           each source buffer, so the step is REGMM_SIZE, not REGMM_SIZE*2
           (the old step skipped half of the input). */
        step = REGMM_SIZE;
        /* Try to align the even buffer on a REGMM_SIZE boundary */
        delta = ((tUInt64)evenptr)&(REGMM_SIZE-1);
        if(delta) delta=REGMM_SIZE-delta;
        nlimit=(limit-delta)/step;
        freq=delta+(nlimit*step);
        while(delta)
        {
            *destbuffptr++ = *evenptr++;
            *destbuffptr++ = *oddptr++;
            delta--;
        }
        /* Perform the SSE2-optimized interleaving */
        while(nlimit)
        {
            /* Interleave SIMD and ordinary CPU instructions */
            __asm __volatile("movdqa (%0), %%xmm0\n\t"
                    ::"r"(evenptr):"memory","xmm0");
            evenptr+=step;
            /* The odd buffer may be misaligned, so load it with movdqu
               instead of using it as a punpck memory operand (which must
               be 16-byte aligned). */
            __asm __volatile("movdqu (%0), %%xmm2\n\t"
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "punpckhbw %%xmm2, %%xmm0\n\t"
                    ::"r"(oddptr):"memory","xmm0","xmm1","xmm2");
            nlimit--;
            __asm __volatile("punpcklbw %%xmm2, %%xmm1\n\t"
                    :::"memory","xmm1");
            oddptr+=step;
            /* The low interleave comes first in memory, then the high one */
            __asm __volatile("movdqu %%xmm1, (%0)\n\t"
                    "movdqu %%xmm0, 16(%0)\n\t"
                    ::"r"(destbuffptr):"memory");
            destbuffptr+=step*2;
        }
    }
    /* If a tail exists then finish it */
    while(freq<limit)
    {
        *destbuffptr++ = *evenptr++;
        *destbuffptr++ = *oddptr++;
        freq++;
    }
}
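
/*
  For reference only: one iteration of the interleave loop above, expressed
  with SSE2 intrinsics. punpcklbw/punpckhbw correspond to
  _mm_unpacklo_epi8/_mm_unpackhi_epi8. Hypothetical helper, not used by this
  file; it interleaves exactly 16 bytes from each source.
*/
static inline void RENAME(interleave16_sketch)(void *dst, const void *even, const void *odd)
{
    __m128i e = _mm_loadu_si128((const __m128i *)even);
    __m128i o = _mm_loadu_si128((const __m128i *)odd);
    /* output is e0,o0,e1,o1,...: the low half first, then the high half */
    _mm_storeu_si128((__m128i *)dst,     _mm_unpacklo_epi8(e, o));
    _mm_storeu_si128((__m128i *)dst + 1, _mm_unpackhi_epi8(e, o));
}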

static void __FASTCALL__ RENAME(CharsToShorts)(tUInt32 limit,
                    void *destbuffer,
                    const void *evenbuffer)
{
    register char *destbuffptr;
    register const char *evenptr;
    register tUInt32 freq;
    destbuffptr = (char *)destbuffer;
    evenptr = (const char *)evenbuffer;
    freq = 0;
    if(limit>REGMM_SIZE*4-1)
    {
        register tUInt64 delta, nlimit, step;
        /* One XMM register's worth of chars is widened per iteration, so the
           step is REGMM_SIZE (the old step skipped half of the input). */
        step = REGMM_SIZE;
        /* Try to align the source buffer on a REGMM_SIZE boundary */
        delta = ((tUInt64)evenptr)&(REGMM_SIZE-1);
        if(delta) delta=REGMM_SIZE-delta;
        nlimit=(limit-delta)/step;
        freq=delta+(nlimit*step);
        while(delta)
        {
            *destbuffptr++ = *evenptr++;
            *destbuffptr++ = 0;
            delta--;
        }
        /* Perform the SSE2-optimized loop */
        __asm __volatile("pxor %%xmm7, %%xmm7":::"memory","xmm7");
        while(nlimit)
        {
            /* Interleave SIMD and ordinary CPU instructions */
            __asm __volatile("movdqa (%0),%%xmm0\n\t"
                    ::"r"(evenptr):"memory","xmm0");
            evenptr+=step;
            __asm __volatile("movdqa %%xmm0, %%xmm1\n\t"
                    "punpckhbw %%xmm7, %%xmm0\n\t"
                    :::"memory","xmm0","xmm1");
            nlimit--;
            __asm __volatile(
                    "punpcklbw %%xmm7, %%xmm1\n\t"
                    :::"memory","xmm1");
            /* The zero-extended low bytes come first in memory */
            __asm __volatile("movdqu %%xmm1, (%0)\n\t"
                    "movdqu %%xmm0, 16(%0)\n\t"
                    ::"r"(destbuffptr):"memory");
            destbuffptr+=step*2;
        }
    }
    /* If a tail exists then finish it */
    while(freq<limit)
    {
        *destbuffptr++ = *evenptr++;
        *destbuffptr++ = 0;
        freq++;
    }
}
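
/*
  For reference only: the widening step above in intrinsics form. Unpacking
  a vector against zero yields the zero-extension of each byte to a 16-bit
  word. Hypothetical helper, not used by this file; it widens exactly
  16 bytes.
*/
static inline void RENAME(chars_to_shorts16_sketch)(void *dst, const void *src)
{
    __m128i v = _mm_loadu_si128((const __m128i *)src);
    __m128i zero = _mm_setzero_si128();
    _mm_storeu_si128((__m128i *)dst,     _mm_unpacklo_epi8(v, zero));
    _mm_storeu_si128((__m128i *)dst + 1, _mm_unpackhi_epi8(v, zero));
}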

static void __FASTCALL__ RENAME(ShortsToChars)(tUInt32 limit,
                    void * destbuffer, const void * srcbuffer)
{
    register char *destbuffptr;
    register const char *srcptr;
    register tUInt32 freq;
    destbuffptr = (char *)destbuffer;
    srcptr = (const char *)srcbuffer;
    freq = 0;
    if(limit>REGMM_SIZE*4-1)
    {
        tUInt64 delta, nlimit, step;
        /* One XMM register's worth of chars is produced per iteration, i.e.
           step shorts (step*2 bytes) are consumed, so the step is
           REGMM_SIZE (the old step skipped half of the input). */
        step = REGMM_SIZE;
        /* Try to align the destination buffer on a REGMM_SIZE boundary */
        delta=((tUInt64)destbuffptr)&(REGMM_SIZE-1);
        if(delta) delta=REGMM_SIZE-delta;
        nlimit=(limit-delta)/step;
        freq=delta+(nlimit*step);
        while(delta)
        {
            *destbuffptr++ = *srcptr;
            srcptr+=2;
            delta--;
        }
        /* Perform the SSE2-optimized loop */
        while(nlimit)
        {
            /* The source may be misaligned, so load both halves with movdqu
               rather than using a pack memory operand (which must be
               16-byte aligned) */
            __asm __volatile("movdqu (%0), %%xmm0\n\t"
                    "movdqu 16(%0), %%xmm1\n\t"
                    ::"r"(srcptr):"memory","xmm0","xmm1");
            nlimit--;
            /* packuswb saturates to 0..255 whereas the byte loop below
               truncates; the results agree when the source words already
               fit into a byte */
            __asm __volatile("packuswb %%xmm1, %%xmm0\n\t"
                    :::"memory","xmm0");
            srcptr+=step*2;
            __asm __volatile("movdqa %%xmm0, (%0)\n\t"
                    ::"r"(destbuffptr):"memory");
            destbuffptr+=step;
        }
    }
    /* If a tail exists then finish it */
    while(freq<limit)
    {
        *destbuffptr++ = *srcptr;
        srcptr+=2;
        freq++;
    }
}
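
/*
  For reference only: the narrowing step above in intrinsics form.
  _mm_packus_epi16 is packuswb, i.e. a saturating (not truncating) pack, so
  like the loop above it assumes source words in the 0..255 range.
  Hypothetical helper, not used by this file; it narrows exactly 16 words.
*/
static inline void RENAME(shorts_to_chars16_sketch)(void *dst, const void *src)
{
    __m128i lo = _mm_loadu_si128((const __m128i *)src);
    __m128i hi = _mm_loadu_si128((const __m128i *)src + 1);
    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
}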