/*
  aclib - advanced C library ;)
  This file contains functions which improve on and expand the standard C library.
*/

#ifndef HAVE_SSE2
/*
   The P3 processor has only one SSE decoder, so it can execute only one SSE
   instruction per clock cycle, but it has three MMX decoders (including the
   load/store unit) and can execute three MMX instructions per clock cycle.
   The P4 processor may do better, but after reading
   http://www.emulators.com/pentium4.htm
   I have doubts. In any case, an SSE2 version of this code could be written better.
*/
#undef HAVE_SSE
#endif


/*
 This part of the code was taken by me from Linux-2.4.3 and slightly modified
for the MMX, MMX2 and SSE instruction sets. I did this because Linux uses
page-aligned blocks, while mplayer uses weakly ordered data, and the original
sources could not speed it up. Only using PREFETCHNTA and MOVNTQ together has
an effect!

From the IA-32 Intel Architecture Software Developer's Manual, Volume 1,
Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
Non-temporal data is written to memory with write-combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly ordered data and procedures that consume that
data.

If you have questions please contact me: nickols_k@mail.ru.
*/
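/*
 * Illustrative sketch only (kept inside "#if 0" so it is never compiled): the
 * same prefetch / non-temporal store / SFENCE pattern described above,
 * expressed with SSE2 intrinsics rather than inline assembly. It assumes
 * <emmintrin.h> and <xmmintrin.h> are available, that both pointers are
 * 16-byte aligned and that len is a multiple of 64; the real fast_memcpy
 * below handles misalignment and tails itself.
 */
#if 0
#include <stddef.h>
#include <xmmintrin.h>
#include <emmintrin.h>
static void nt_copy_sketch(void *to, const void *from, size_t len)
{
    const char *s = from;
    char *d = to;
    size_t i;
    for (i = 0; i < len; i += 64) {
        /* Hint the next chunk into a non-temporal cache location. */
        _mm_prefetch(s + i + 64, _MM_HINT_NTA);
        /* Load four 16-byte blocks, then store them with MOVNTDQ
           (non-temporal hint), bypassing the caches. */
        __m128i a = _mm_load_si128((const __m128i *)(s + i));
        __m128i b = _mm_load_si128((const __m128i *)(s + i + 16));
        __m128i c = _mm_load_si128((const __m128i *)(s + i + 32));
        __m128i e = _mm_load_si128((const __m128i *)(s + i + 48));
        _mm_stream_si128((__m128i *)(d + i),      a);
        _mm_stream_si128((__m128i *)(d + i + 16), b);
        _mm_stream_si128((__m128i *)(d + i + 32), c);
        _mm_stream_si128((__m128i *)(d + i + 48), e);
    }
    /* SFENCE: make the weakly ordered stores globally visible. */
    _mm_sfence();
}
#endif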

/* For small memory blocks (< 256 bytes) this version is faster.
   Note: the rep-string output constraints advance `to` and `from`, which the
   callers below rely on after an alignment copy. */
#define small_memcpy(to,from,n)\
{\
register unsigned long int siz;\
register unsigned long int dummy;\
    siz=n&0x7;  n>>=3;\
    if(siz)\
__asm__ __volatile__(\
	"rep; movsb"\
	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
	:"0" (to), "1" (from),"2" (siz)\
	: "memory","cc");\
    if(n)\
__asm__ __volatile__(\
	"rep; movsq"\
	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
	:"0" (to), "1" (from),"2" (n)\
	: "memory","cc");\
}
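/*
 * Illustrative sketch only (never compiled): a plain-C rendering of what the
 * small_memcpy macro does, using pointer-to-pointer arguments to mirror the
 * fact that the macro advances `to` and `from` as a side effect. The helper
 * name is hypothetical.
 */
#if 0
#include <string.h>
static void small_memcpy_sketch(unsigned char **to, const unsigned char **from,
                                unsigned long n)
{
    unsigned long rem    = n & 0x7;   /* bytes that do not fill a quadword */
    unsigned long qwords = n >> 3;    /* whole 8-byte words */
    while (rem--)                      /* the "rep; movsb" part */
        *(*to)++ = *(*from)++;
    while (qwords--) {                 /* the "rep; movsq" part */
        memcpy(*to, *from, 8);
        *to += 8;
        *from += 8;
    }
}
#endif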


#define MMREG_SIZE 16ULL
#define MIN_LEN 257ULL
#define CL_SIZE 256ULL /* always align on a 256-byte boundary */
static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
	void *retval;
	const unsigned char *cfrom=from;
	unsigned char *tto=to;
	size_t i=0;
	retval = to;
	if(!len) return retval;
	/* PREFETCH has an effect even for the MOVSB instruction ;) */
	__asm__ __volatile__ (
		"prefetcht0 (%0)\n"
		"prefetcht0 64(%0)\n"
		"prefetcht0 128(%0)\n"
		"prefetcht0 192(%0)\n"
		:: "r" (cfrom));
	if(len >= MIN_LEN)
	{
	  register unsigned long int delta;
	  /* Align the destination to a cache-line (CL_SIZE) boundary */
	  delta = ((unsigned long long int)tto)&(CL_SIZE-1ULL);
	  if(delta)
	  {
	    delta=CL_SIZE-delta;
	    len -= delta;
	    small_memcpy(tto, cfrom, delta);
	  }
	  i = len>>8; /* len/256 */
	  len=len-(i<<8);
	}
	if(i) {
	/*
	   This algorithm is most effective when the code reads and writes
	   consecutive blocks the size of a cache line. The cache-line size is
	   processor-dependent, but it is at least 32 bytes on any processor.
	   Ideally the number of load and store instructions would be a
	   multiple of the number of the processor's decoders, but that is not
	   always possible.
	*/
	if(((unsigned long long)cfrom) & 15)
	/* if SRC is misaligned */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"prefetcht0 256(%0)\n"
		"prefetcht0 320(%0)\n"
		"movdqu (%0), %%xmm0\n"
		"movdqu 16(%0), %%xmm1\n"
		"movdqu 32(%0), %%xmm2\n"
		"movdqu 48(%0), %%xmm3\n"
		"movdqu 64(%0), %%xmm4\n"
		"movdqu 80(%0), %%xmm5\n"
		"movdqu 96(%0), %%xmm6\n"
		"movdqu 112(%0), %%xmm7\n"
		"prefetcht0 384(%0)\n"
		"prefetcht0 448(%0)\n"
		"movdqu 128(%0), %%xmm8\n"
		"movdqu 144(%0), %%xmm9\n"
		"movdqu 160(%0), %%xmm10\n"
		"movdqu 176(%0), %%xmm11\n"
		"movdqu 192(%0), %%xmm12\n"
		"movdqu 208(%0), %%xmm13\n"
		"movdqu 224(%0), %%xmm14\n"
		"movdqu 240(%0), %%xmm15\n"
		"movntdq %%xmm0, (%1)\n"
		"movntdq %%xmm1, 16(%1)\n"
		"movntdq %%xmm2, 32(%1)\n"
		"movntdq %%xmm3, 48(%1)\n"
		"movntdq %%xmm4, 64(%1)\n"
		"movntdq %%xmm5, 80(%1)\n"
		"movntdq %%xmm6, 96(%1)\n"
		"movntdq %%xmm7, 112(%1)\n"
		"movntdq %%xmm8, 128(%1)\n"
		"movntdq %%xmm9, 144(%1)\n"
		"movntdq %%xmm10, 160(%1)\n"
		"movntdq %%xmm11, 176(%1)\n"
		"movntdq %%xmm12, 192(%1)\n"
		"movntdq %%xmm13, 208(%1)\n"
		"movntdq %%xmm14, 224(%1)\n"
		"movntdq %%xmm15, 240(%1)\n"
		:: "r" (cfrom), "r" (tto):
		"memory"
		,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
		);
		cfrom+=256ULL;
		tto+=256ULL;
	}
	else
	/*
	   Only if SRC is aligned on a 16-byte boundary.
	   This allows using movdqa instead of movdqu; movdqa requires the data
	   to be aligned, otherwise a general-protection exception (#GP) is
	   generated.
	*/
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"prefetcht0 256(%0)\n"
		"prefetcht0 320(%0)\n"
		"movdqa (%0), %%xmm0\n"
		"movdqa 16(%0), %%xmm1\n"
		"movdqa 32(%0), %%xmm2\n"
		"movdqa 48(%0), %%xmm3\n"
		"movdqa 64(%0), %%xmm4\n"
		"movdqa 80(%0), %%xmm5\n"
		"movdqa 96(%0), %%xmm6\n"
		"movdqa 112(%0), %%xmm7\n"
		"prefetcht0 384(%0)\n"
		"prefetcht0 448(%0)\n"
		"movdqa 128(%0), %%xmm8\n"
		"movdqa 144(%0), %%xmm9\n"
		"movdqa 160(%0), %%xmm10\n"
		"movdqa 176(%0), %%xmm11\n"
		"movdqa 192(%0), %%xmm12\n"
		"movdqa 208(%0), %%xmm13\n"
		"movdqa 224(%0), %%xmm14\n"
		"movdqa 240(%0), %%xmm15\n"
		"movntdq %%xmm0, (%1)\n"
		"movntdq %%xmm1, 16(%1)\n"
		"movntdq %%xmm2, 32(%1)\n"
		"movntdq %%xmm3, 48(%1)\n"
		"movntdq %%xmm4, 64(%1)\n"
		"movntdq %%xmm5, 80(%1)\n"
		"movntdq %%xmm6, 96(%1)\n"
		"movntdq %%xmm7, 112(%1)\n"
		"movntdq %%xmm8, 128(%1)\n"
		"movntdq %%xmm9, 144(%1)\n"
		"movntdq %%xmm10, 160(%1)\n"
		"movntdq %%xmm11, 176(%1)\n"
		"movntdq %%xmm12, 192(%1)\n"
		"movntdq %%xmm13, 208(%1)\n"
		"movntdq %%xmm14, 224(%1)\n"
		"movntdq %%xmm15, 240(%1)\n"
		:: "r" (cfrom), "r" (tto):
		"memory"
		,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
		);
		cfrom+=256ULL;
		tto+=256ULL;
	  }
	__asm__ __volatile__ ("sfence":::"memory");
	}
	/*
	 *	Now do the tail of the block
	 */
	if(len) small_memcpy(tto, cfrom, len);
	return retval;
}
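/*
 * Hypothetical usage sketch (never compiled): the RENAME()d copy above is
 * intended for weakly ordered destinations such as a video framebuffer.
 * fb_dst, frame_src and frame_size are placeholder names.
 */
#if 0
	RENAME(fast_memcpy)(fb_dst, frame_src, frame_size);
#endif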

#define small_memset(to,val,n)\
{\
register unsigned long int dummy;\
    if(n)\
__asm__ __volatile__(\
	"rep; stosb"\
	:"=&D"(to), "=&c"(dummy)\
	:"0" (to), "1" (n), "a" ((char)val)\
	: "memory","cc");\
}

#define XMMREG_SIZE 16
/* Fast memory set. See the comments for fast_memcpy. */
static void * RENAME(fast_memset)(void * to, int val, size_t len)
{
	void *retval;
	size_t i;
	unsigned char mm_reg[XMMREG_SIZE], *pmm_reg;
	unsigned char *tto=to;

	retval = tto;
	if(len >= MIN_LEN)
	{
	  register unsigned long int delta;
	  delta = ((unsigned long long int)tto)&(XMMREG_SIZE-1);
	  if(delta)
	  {
	    delta=XMMREG_SIZE-delta;
	    len -= delta;
	    small_memset(tto, val, delta);
	  }
	  i = len >> 7; /* len/128 */
	  len&=127;
	  pmm_reg = mm_reg;
	  small_memset(pmm_reg,val,sizeof(mm_reg));
	/* movdqu here: the on-stack mm_reg buffer is not guaranteed to be
	   16-byte aligned, so movdqa could raise #GP. */
	__asm__ __volatile__(
		"movdqu (%0), %%xmm0\n"
		:: "r"(mm_reg):"memory");
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"movntdq %%xmm0, (%0)\n"
		"movntdq %%xmm0, 16(%0)\n"
		"movntdq %%xmm0, 32(%0)\n"
		"movntdq %%xmm0, 48(%0)\n"
		"movntdq %%xmm0, 64(%0)\n"
		"movntdq %%xmm0, 80(%0)\n"
		"movntdq %%xmm0, 96(%0)\n"
		"movntdq %%xmm0, 112(%0)\n"
		:: "r" (tto) : "memory");
		tto+=128ULL;
	}
	__asm__ __volatile__ ("sfence":::"memory");
	}
	/*
	 *	Now do the tail of the block
	 */
	if(len) small_memset(tto, val, len);
	return retval;
}
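/*
 * Illustrative sketch only (never compiled): the core of the streaming memset
 * above, expressed with SSE2 intrinsics. It assumes <emmintrin.h> is
 * available, that `to` is 16-byte aligned and that len is a multiple of 128;
 * the real fast_memset handles alignment and the tail itself.
 */
#if 0
#include <stddef.h>
#include <emmintrin.h>
static void nt_memset_sketch(void *to, int val, size_t len)
{
    __m128i v = _mm_set1_epi8((char)val);   /* replicate the byte 16 times */
    char *d = to;
    size_t i;
    for (i = 0; i < len; i += 128) {
        /* Eight non-temporal 16-byte stores per iteration (MOVNTDQ). */
        _mm_stream_si128((__m128i *)(d + i),       v);
        _mm_stream_si128((__m128i *)(d + i +  16), v);
        _mm_stream_si128((__m128i *)(d + i +  32), v);
        _mm_stream_si128((__m128i *)(d + i +  48), v);
        _mm_stream_si128((__m128i *)(d + i +  64), v);
        _mm_stream_si128((__m128i *)(d + i +  80), v);
        _mm_stream_si128((__m128i *)(d + i +  96), v);
        _mm_stream_si128((__m128i *)(d + i + 112), v);
    }
    _mm_sfence();   /* order the weakly ordered stores */
}
#endif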

#ifdef REGMM_SIZE
#undef REGMM_SIZE
#endif
#define REGMM_SIZE 16
static void __FASTCALL__ RENAME(InterleaveBuffers)(tUInt32 limit,
				    void *destbuffer,
				    const void *evenbuffer,
				    const void *oddbuffer)
{
  register char *destbuffptr;
  register const char *oddptr, *evenptr;
  register tUInt32 freq;
  destbuffptr = (char *)destbuffer;
  evenptr = (const char *)evenbuffer;
  oddptr = (const char *)oddbuffer;
  freq = 0;
  if(limit>REGMM_SIZE*4-1)
  {
      register tUInt64 delta, nlimit, step;
      /* Each SSE iteration below consumes REGMM_SIZE even bytes and
	 REGMM_SIZE odd bytes, i.e. REGMM_SIZE units of `limit`. */
      step = REGMM_SIZE;
      /* Try to align the even buffer on a REGMM_SIZE boundary */
      delta = ((tUInt64)evenptr)&(REGMM_SIZE-1);
      if(delta) delta=REGMM_SIZE-delta;
      nlimit=(limit-delta)/step;
      freq=delta+(nlimit*step);
      while(delta)
      {
	*destbuffptr++ = *evenptr++;
	*destbuffptr++ = *oddptr++;
	delta--;
      }
      /* Perform the SSE2-optimized interleaving */
      while(nlimit)
      {
	 /* Interleave SSE and CPU instructions */
	 __asm __volatile("movdqa	(%0), %%xmm0\n\t"
	       ::"r"(evenptr):"memory");
	 evenptr+=step;
	 /* Load the odd bytes with movdqu: oddptr is not necessarily 16-byte
	    aligned, and a memory operand to punpckhbw/punpcklbw would have
	    to be. */
	 __asm __volatile("movdqu	(%0), %%xmm2\n\t"
	       "movdqa	%%xmm0, %%xmm1\n\t"
	       "punpckhbw %%xmm2, %%xmm0\n\t"
	      ::"r"(oddptr):"memory");
	 nlimit--;
	 __asm __volatile("punpcklbw %%xmm2, %%xmm1\n\t"
	       :::"memory");
	 oddptr+=step;
	 /* The low-unpacked half (xmm1) holds the first 16 output bytes. */
	 __asm __volatile("movdqu	%%xmm1, (%0)\n\t"
	       "movdqu	%%xmm0, 16(%0)\n\t"
	      ::"r"(destbuffptr):"memory");
	 destbuffptr+=step*2;
      }
  }
  /* If a tail exists then finish it */
  while(freq<limit)
  {
    *destbuffptr++ = *evenptr++;
    *destbuffptr++ = *oddptr++;
    freq++;
  }
}
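/*
 * Illustrative sketch only (never compiled): one interleave step of the
 * function above written with SSE2 intrinsics. _mm_unpacklo_epi8 and
 * _mm_unpackhi_epi8 alternate bytes from the "even" and "odd" registers,
 * which is exactly what the scalar tail loop does one byte at a time.
 */
#if 0
#include <emmintrin.h>
static void interleave16_sketch(unsigned char *dst,
                                const unsigned char *even,
                                const unsigned char *odd)
{
    __m128i e = _mm_loadu_si128((const __m128i *)even);
    __m128i o = _mm_loadu_si128((const __m128i *)odd);
    /* dst[0..31] = even[0], odd[0], even[1], odd[1], ... */
    _mm_storeu_si128((__m128i *)dst,        _mm_unpacklo_epi8(e, o));
    _mm_storeu_si128((__m128i *)(dst + 16), _mm_unpackhi_epi8(e, o));
}
#endif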

static void __FASTCALL__ RENAME(CharsToShorts)(tUInt32 limit,
					     void *destbuffer,
					     const void *evenbuffer)
{
  register char *destbuffptr;
  register const char *evenptr;
  register tUInt32 freq;
  destbuffptr = (char *)destbuffer;
  evenptr = (const char *)evenbuffer;
  freq = 0;
  if(limit>REGMM_SIZE*4-1)
  {
      register tUInt64 delta, nlimit, step;
      /* Each SSE iteration below consumes REGMM_SIZE source bytes, i.e.
	 REGMM_SIZE units of `limit`. */
      step = REGMM_SIZE;
      /* Try to align the source buffer on a REGMM_SIZE boundary */
      delta = ((tUInt64)evenptr)&(REGMM_SIZE-1);
      if(delta) delta=REGMM_SIZE-delta;
      nlimit=(limit-delta)/step;
      freq=delta+(nlimit*step);
      while(delta)
      {
	*destbuffptr++ = *evenptr++;
	*destbuffptr++ = 0;
	delta--;
      }
      /* Perform the SSE2-optimized loop */
      __asm __volatile("pxor	%%xmm7, %%xmm7":::"memory");
      while(nlimit)
      {
	 /* Interleave SSE and CPU instructions */
	 __asm __volatile("movdqa	(%0),%%xmm0\n\t"
	       ::"r"(evenptr):"memory");
	 evenptr+=step;
	 __asm __volatile("movdqa	%%xmm0, %%xmm1\n\t"
	       "punpckhbw %%xmm7, %%xmm0\n\t"
	      :::"memory");
	 nlimit--;
	 __asm __volatile(
	       "punpcklbw %%xmm7, %%xmm1\n\t"
	       :::"memory");
	 /* The low-unpacked half (xmm1) holds the first 16 output bytes. */
	 __asm __volatile("movdqu	%%xmm1, (%0)\n\t"
	       "movdqu	%%xmm0, 16(%0)\n\t"
	       ::"r"(destbuffptr):"memory");
	 destbuffptr+=step*2;
      }
  }
  /* If a tail exists then finish it */
  while(freq<limit)
  {
    *destbuffptr++ = *evenptr++;
    *destbuffptr++ = 0;
    freq++;
  }
}
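/*
 * Illustrative sketch only (never compiled): the per-iteration operation of
 * CharsToShorts with SSE2 intrinsics, zero-extending 16 bytes to 16
 * little-endian 16-bit values by unpacking against a zero register.
 */
#if 0
#include <emmintrin.h>
static void chars_to_shorts16_sketch(unsigned short *dst, const unsigned char *src)
{
    __m128i zero = _mm_setzero_si128();
    __m128i b = _mm_loadu_si128((const __m128i *)src);
    /* Low eight bytes become dst[0..7], high eight become dst[8..15]. */
    _mm_storeu_si128((__m128i *)dst,       _mm_unpacklo_epi8(b, zero));
    _mm_storeu_si128((__m128i *)(dst + 8), _mm_unpackhi_epi8(b, zero));
}
#endif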

static void __FASTCALL__ RENAME(ShortsToChars)(tUInt32 limit,
				     void * destbuffer, const void * srcbuffer)
{
  register char *destbuffptr;
  register const char *srcptr;
  register tUInt32 freq;
  destbuffptr = (char *)destbuffer;
  srcptr = (const char *)srcbuffer;
  freq = 0;
  if(limit>REGMM_SIZE*4-1)
  {
      tUInt64 delta, nlimit, step;
      /* Each SSE iteration below produces REGMM_SIZE destination bytes, i.e.
	 REGMM_SIZE units of `limit`. */
      step = REGMM_SIZE;
      /* Try to align the destination buffer on a REGMM_SIZE boundary */
      delta=((tUInt64)destbuffptr)&(REGMM_SIZE-1);
      if(delta) delta=REGMM_SIZE-delta;
      nlimit=(limit-delta)/step;
      freq=delta+(nlimit*step);
      while(delta)
      {
	*destbuffptr++ = *srcptr;
	srcptr+=2;
	delta--;
      }
      /* Perform the SSE2-optimized loop */
      while(nlimit)
      {
	 /* Interleave SSE and CPU instructions */
	 __asm __volatile("movdqu	(%0), %%xmm0\n\t"
	       ::"r"(srcptr):"memory");
	 nlimit--;
	 /* Load the second 16 bytes with movdqu: a memory operand to
	    packuswb would require 16-byte alignment which srcptr may lack. */
	 __asm __volatile("movdqu	(%0), %%xmm1\n\t"
	       "packuswb %%xmm1, %%xmm0\n\t"
	       ::"r"(&srcptr[REGMM_SIZE]):"memory");
	 srcptr+=step*2;
	 __asm __volatile("movdqa	%%xmm0, (%0)\n\t"
	       ::"r"(destbuffptr):"memory");
	 destbuffptr+=step;
      }
  }
  /* If a tail exists then finish it */
  while(freq<limit)
  {
    *destbuffptr++ = *srcptr;
    srcptr+=2;
    freq++;
  }
}
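/*
 * Illustrative sketch only (never compiled): the per-iteration operation of
 * ShortsToChars with SSE2 intrinsics. Note that packuswb, like the assembly
 * above, saturates values outside 0..255 instead of simply taking the low
 * byte as the scalar tail loop does.
 */
#if 0
#include <emmintrin.h>
static void shorts_to_chars16_sketch(unsigned char *dst, const unsigned short *src)
{
    __m128i lo = _mm_loadu_si128((const __m128i *)src);
    __m128i hi = _mm_loadu_si128((const __m128i *)(src + 8));
    /* Pack 16 shorts into 16 bytes with unsigned saturation. */
    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
}
#endif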