1 /*
2  * This file is part of the Scale2x project.
3  *
4  * Copyright (C) 2001-2003 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
/*
 * This file contains a fast C and MMX implementation of the Scale2x effect.
 *
 * You can find a high-level description of the effect at:
 *
 * http://scale2x.sourceforge.net/
 *
 * As an alternative to the license terms above, you are allowed to use this
 * code in your program under these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 * - derivative works of the program are allowed.
 */
34 
35 #ifndef __SCALE2X_H
36 #define __SCALE2X_H
37 
38 #include <assert.h>
39 
/***************************************************************************/
/* Basic types */

/*
 * Pixel storage types used by the scalers, one for each supported pixel
 * width. They are plain aliases of the native unsigned integer types.
 */
typedef unsigned char scale2x_uint8;       /* pixel type of the *_8_* functions */
typedef unsigned short int scale2x_uint16; /* pixel type of the *_16_* functions */
typedef unsigned int scale2x_uint32;       /* pixel type of the *_32_* functions */
46 
47 /***************************************************************************/
48 /* Scale2x C implementation */
49 
/*
 * Produce one destination row of the Scale2x effect for 8 bit pixels.
 *
 * Two output pixels are written for every pixel E of the current row (src1).
 * Calling B the pixel of src0 and H the pixel of src2 in the same column,
 * and D and F the pixels of src1 at the left and at the right of E:
 *   - the left output pixel is B if D == B, H != B and F != B, otherwise E;
 *   - the right output pixel is B if F == B, H != B and D != B, otherwise E.
 * On the first column D is taken equal to E and on the last column F is
 * taken equal to E, which gives the shorter tests used for the two border
 * pixels.
 *
 * dst must have room for 2*count pixels; count must be at least 2.
 */
static void scale2x_8_def_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	const scale2x_uint8* up = src0;
	const scale2x_uint8* cur = src1;
	const scale2x_uint8* down = src2;
	scale2x_uint8* out = dst;
	unsigned inner;

	assert(count >= 2);

	/* leftmost pixel: no left neighbour, the left half is a plain copy */
	out[0] = cur[0];
	out[1] = (cur[1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];

	/* inner pixels: everything except the first and the last column */
	for (inner = count - 2; inner != 0; --inner) {
		++up;
		++cur;
		++down;
		out += 2;

		out[0] = (cur[-1] == up[0] && down[0] != up[0] && cur[1] != up[0]) ? up[0] : cur[0];
		out[1] = (cur[1] == up[0] && down[0] != up[0] && cur[-1] != up[0]) ? up[0] : cur[0];
	}

	/* rightmost pixel: no right neighbour, the right half is a plain copy */
	++up;
	++cur;
	++down;
	out += 2;

	out[0] = (cur[-1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];
	out[1] = cur[0];
}
91 
/*
 * Produce one destination row of the Scale2x effect for 16 bit pixels.
 *
 * Two output pixels are written for every pixel E of the current row (src1).
 * Calling B the pixel of src0 and H the pixel of src2 in the same column,
 * and D and F the pixels of src1 at the left and at the right of E:
 *   - the left output pixel is B if D == B, H != B and F != B, otherwise E;
 *   - the right output pixel is B if F == B, H != B and D != B, otherwise E.
 * On the first column D is taken equal to E and on the last column F is
 * taken equal to E, which gives the shorter tests used for the two border
 * pixels.
 *
 * dst must have room for 2*count pixels; count must be at least 2.
 */
static void scale2x_16_def_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	const scale2x_uint16* up = src0;
	const scale2x_uint16* cur = src1;
	const scale2x_uint16* down = src2;
	scale2x_uint16* out = dst;
	unsigned inner;

	assert(count >= 2);

	/* leftmost pixel: no left neighbour, the left half is a plain copy */
	out[0] = cur[0];
	out[1] = (cur[1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];

	/* inner pixels: everything except the first and the last column */
	for (inner = count - 2; inner != 0; --inner) {
		++up;
		++cur;
		++down;
		out += 2;

		out[0] = (cur[-1] == up[0] && down[0] != up[0] && cur[1] != up[0]) ? up[0] : cur[0];
		out[1] = (cur[1] == up[0] && down[0] != up[0] && cur[-1] != up[0]) ? up[0] : cur[0];
	}

	/* rightmost pixel: no right neighbour, the right half is a plain copy */
	++up;
	++cur;
	++down;
	out += 2;

	out[0] = (cur[-1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];
	out[1] = cur[0];
}
133 
/*
 * Produce one destination row of the Scale2x effect for 32 bit pixels.
 *
 * Two output pixels are written for every pixel E of the current row (src1).
 * Calling B the pixel of src0 and H the pixel of src2 in the same column,
 * and D and F the pixels of src1 at the left and at the right of E:
 *   - the left output pixel is B if D == B, H != B and F != B, otherwise E;
 *   - the right output pixel is B if F == B, H != B and D != B, otherwise E.
 * On the first column D is taken equal to E and on the last column F is
 * taken equal to E, which gives the shorter tests used for the two border
 * pixels.
 *
 * dst must have room for 2*count pixels; count must be at least 2.
 */
static void scale2x_32_def_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	const scale2x_uint32* up = src0;
	const scale2x_uint32* cur = src1;
	const scale2x_uint32* down = src2;
	scale2x_uint32* out = dst;
	unsigned inner;

	assert(count >= 2);

	/* leftmost pixel: no left neighbour, the left half is a plain copy */
	out[0] = cur[0];
	out[1] = (cur[1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];

	/* inner pixels: everything except the first and the last column */
	for (inner = count - 2; inner != 0; --inner) {
		++up;
		++cur;
		++down;
		out += 2;

		out[0] = (cur[-1] == up[0] && down[0] != up[0] && cur[1] != up[0]) ? up[0] : cur[0];
		out[1] = (cur[1] == up[0] && down[0] != up[0] && cur[-1] != up[0]) ? up[0] : cur[0];
	}

	/* rightmost pixel: no right neighbour, the right half is a plain copy */
	++up;
	++cur;
	++down;
	out += 2;

	out[0] = (cur[-1] == up[0] && down[0] != up[0]) ? up[0] : cur[0];
	out[1] = cur[0];
}
175 
176 /**
177  * Scale by a factor of 2 a row of pixels of 8 bits.
178  * The function is implemented in C.
179  * The pixels over the left and right borders are assumed of the same color of
180  * the pixels on the border.
181  * \param src0 Pointer at the first pixel of the previous row.
182  * \param src1 Pointer at the first pixel of the current row.
183  * \param src2 Pointer at the first pixel of the next row.
184  * \param count Length in pixels of the src0, src1 and src2 rows.
185  * It must be at least 2.
186  * \param dst0 First destination row, double length in pixels.
187  * \param dst1 Second destination row, double length in pixels.
188  */
189 //static inline void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)190 static void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
191 {
192 	assert(count >= 2);
193 
194 	scale2x_8_def_single(dst0, src0, src1, src2, count);
195 	scale2x_8_def_single(dst1, src2, src1, src0, count);
196 }
197 
198 /**
199  * Scale by a factor of 2 a row of pixels of 16 bits.
200  * This function operates like scale2x_8_def() but for 16 bits pixels.
201  * \param src0 Pointer at the first pixel of the previous row.
202  * \param src1 Pointer at the first pixel of the current row.
203  * \param src2 Pointer at the first pixel of the next row.
204  * \param count Length in pixels of the src0, src1 and src2 rows.
205  * It must be at least 2.
206  * \param dst0 First destination row, double length in pixels.
207  * \param dst1 Second destination row, double length in pixels.
208  */
209 //static inline void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)210 static void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
211 {
212 	assert(count >= 2);
213 
214 	scale2x_16_def_single(dst0, src0, src1, src2, count);
215 	scale2x_16_def_single(dst1, src2, src1, src0, count);
216 }
217 
218 /**
219  * Scale by a factor of 2 a row of pixels of 32 bits.
220  * This function operates like scale2x_8_def() but for 32 bits pixels.
221  * \param src0 Pointer at the first pixel of the previous row.
222  * \param src1 Pointer at the first pixel of the current row.
223  * \param src2 Pointer at the first pixel of the next row.
224  * \param count Length in pixels of the src0, src1 and src2 rows.
225  * It must be at least 2.
226  * \param dst0 First destination row, double length in pixels.
227  * \param dst1 Second destination row, double length in pixels.
228  */
229 //static inline void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)230 static void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
231 {
232 	assert(count >= 2);
233 
234 	scale2x_32_def_single(dst0, src0, src1, src2, count);
235 	scale2x_32_def_single(dst1, src2, src1, src0, count);
236 }
237 
238 /***************************************************************************/
239 /* Scale2x MMX implementation */
240 
241 #if defined(__GNUC__) && defined(__i386__)
242 
243 /*
244  * Apply the Scale2x effect at a single row.
245  * This function must be called only by the other scale2x functions.
246  *
247  * Considering the pixel map :
248  *
249  *      ABC (src0)
250  *      DEF (src1)
251  *      GHI (src2)
252  *
253  * this functions compute 2 new pixels in substitution of the source pixel E
254  * like this map :
255  *
256  *      ab (dst)
257  *
258  * with these variables :
259  *
260  *      &current -> E
261  *      &current_left -> D
262  *      &current_right -> F
263  *      &current_upper -> B
264  *      &current_lower -> H
265  *
266  *      %0 -> current_upper
267  *      %1 -> current
268  *      %2 -> current_lower
269  *      %3 -> dst
270  *      %4 -> counter
271  *
272  *      %mm0 -> *current_left
273  *      %mm1 -> *current_next
274  *      %mm2 -> tmp0
275  *      %mm3 -> tmp1
276  *      %mm4 -> tmp2
277  *      %mm5 -> tmp3
278  *      %mm6 -> *current_upper
279  *      %mm7 -> *current
280  */
scale2x_8_mmx_single(scale2x_uint8 * dst,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)281 static inline void scale2x_8_mmx_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
282 {
283 	assert(count >= 16);
284 	assert(count % 8 == 0);
285 
286 	/* always do the first and last run */
287 	count -= 2*8;
288 
289 	__asm__ __volatile__(
290 /* first run */
291 		/* set the current, current_pre, current_next registers */
292 		"movq 0(%1), %%mm0\n"
293 		"movq 0(%1), %%mm7\n"
294 		"movq 8(%1), %%mm1\n"
295 		"psllq $56, %%mm0\n"
296 		"psllq $56, %%mm1\n"
297 		"psrlq $56, %%mm0\n"
298 		"movq %%mm7, %%mm2\n"
299 		"movq %%mm7, %%mm3\n"
300 		"psllq $8, %%mm2\n"
301 		"psrlq $8, %%mm3\n"
302 		"por %%mm2, %%mm0\n"
303 		"por %%mm3, %%mm1\n"
304 
305 		/* current_upper */
306 		"movq (%0), %%mm6\n"
307 
308 		/* compute the upper-left pixel for dst on %%mm2 */
309 		/* compute the upper-right pixel for dst on %%mm4 */
310 		"movq %%mm0, %%mm2\n"
311 		"movq %%mm1, %%mm4\n"
312 		"movq %%mm0, %%mm3\n"
313 		"movq %%mm1, %%mm5\n"
314 		"pcmpeqb %%mm6, %%mm2\n"
315 		"pcmpeqb %%mm6, %%mm4\n"
316 		"pcmpeqb (%2), %%mm3\n"
317 		"pcmpeqb (%2), %%mm5\n"
318 		"pandn %%mm2, %%mm3\n"
319 		"pandn %%mm4, %%mm5\n"
320 		"movq %%mm0, %%mm2\n"
321 		"movq %%mm1, %%mm4\n"
322 		"pcmpeqb %%mm1, %%mm2\n"
323 		"pcmpeqb %%mm0, %%mm4\n"
324 		"pandn %%mm3, %%mm2\n"
325 		"pandn %%mm5, %%mm4\n"
326 		"movq %%mm2, %%mm3\n"
327 		"movq %%mm4, %%mm5\n"
328 		"pand %%mm6, %%mm2\n"
329 		"pand %%mm6, %%mm4\n"
330 		"pandn %%mm7, %%mm3\n"
331 		"pandn %%mm7, %%mm5\n"
332 		"por %%mm3, %%mm2\n"
333 		"por %%mm5, %%mm4\n"
334 
335 		/* set *dst */
336 		"movq %%mm2, %%mm3\n"
337 		"punpcklbw %%mm4, %%mm2\n"
338 		"punpckhbw %%mm4, %%mm3\n"
339 		"movq %%mm2, (%3)\n"
340 		"movq %%mm3, 8(%3)\n"
341 
342 		/* next */
343 		"addl $8, %0\n"
344 		"addl $8, %1\n"
345 		"addl $8, %2\n"
346 		"addl $16, %3\n"
347 
348 /* central runs */
349 		"shrl $3, %4\n"
350 		"jz 1f\n"
351 
352 		"0:\n"
353 
354 		/* set the current, current_pre, current_next registers */
355 		"movq -8(%1), %%mm0\n"
356 		"movq (%1), %%mm7\n"
357 		"movq 8(%1), %%mm1\n"
358 		"psrlq $56, %%mm0\n"
359 		"psllq $56, %%mm1\n"
360 		"movq %%mm7, %%mm2\n"
361 		"movq %%mm7, %%mm3\n"
362 		"psllq $8, %%mm2\n"
363 		"psrlq $8, %%mm3\n"
364 		"por %%mm2, %%mm0\n"
365 		"por %%mm3, %%mm1\n"
366 
367 		/* current_upper */
368 		"movq (%0), %%mm6\n"
369 
370 		/* compute the upper-left pixel for dst on %%mm2 */
371 		/* compute the upper-right pixel for dst on %%mm4 */
372 		"movq %%mm0, %%mm2\n"
373 		"movq %%mm1, %%mm4\n"
374 		"movq %%mm0, %%mm3\n"
375 		"movq %%mm1, %%mm5\n"
376 		"pcmpeqb %%mm6, %%mm2\n"
377 		"pcmpeqb %%mm6, %%mm4\n"
378 		"pcmpeqb (%2), %%mm3\n"
379 		"pcmpeqb (%2), %%mm5\n"
380 		"pandn %%mm2, %%mm3\n"
381 		"pandn %%mm4, %%mm5\n"
382 		"movq %%mm0, %%mm2\n"
383 		"movq %%mm1, %%mm4\n"
384 		"pcmpeqb %%mm1, %%mm2\n"
385 		"pcmpeqb %%mm0, %%mm4\n"
386 		"pandn %%mm3, %%mm2\n"
387 		"pandn %%mm5, %%mm4\n"
388 		"movq %%mm2, %%mm3\n"
389 		"movq %%mm4, %%mm5\n"
390 		"pand %%mm6, %%mm2\n"
391 		"pand %%mm6, %%mm4\n"
392 		"pandn %%mm7, %%mm3\n"
393 		"pandn %%mm7, %%mm5\n"
394 		"por %%mm3, %%mm2\n"
395 		"por %%mm5, %%mm4\n"
396 
397 		/* set *dst */
398 		"movq %%mm2, %%mm3\n"
399 		"punpcklbw %%mm4, %%mm2\n"
400 		"punpckhbw %%mm4, %%mm3\n"
401 		"movq %%mm2, (%3)\n"
402 		"movq %%mm3, 8(%3)\n"
403 
404 		/* next */
405 		"addl $8, %0\n"
406 		"addl $8, %1\n"
407 		"addl $8, %2\n"
408 		"addl $16, %3\n"
409 
410 		"decl %4\n"
411 		"jnz 0b\n"
412 		"1:\n"
413 
414 /* final run */
415 		/* set the current, current_pre, current_next registers */
416 		"movq (%1), %%mm1\n"
417 		"movq (%1), %%mm7\n"
418 		"movq -8(%1), %%mm0\n"
419 		"psrlq $56, %%mm1\n"
420 		"psrlq $56, %%mm0\n"
421 		"psllq $56, %%mm1\n"
422 		"movq %%mm7, %%mm2\n"
423 		"movq %%mm7, %%mm3\n"
424 		"psllq $8, %%mm2\n"
425 		"psrlq $8, %%mm3\n"
426 		"por %%mm2, %%mm0\n"
427 		"por %%mm3, %%mm1\n"
428 
429 		/* current_upper */
430 		"movq (%0), %%mm6\n"
431 
432 		/* compute the upper-left pixel for dst on %%mm2 */
433 		/* compute the upper-right pixel for dst on %%mm4 */
434 		"movq %%mm0, %%mm2\n"
435 		"movq %%mm1, %%mm4\n"
436 		"movq %%mm0, %%mm3\n"
437 		"movq %%mm1, %%mm5\n"
438 		"pcmpeqb %%mm6, %%mm2\n"
439 		"pcmpeqb %%mm6, %%mm4\n"
440 		"pcmpeqb (%2), %%mm3\n"
441 		"pcmpeqb (%2), %%mm5\n"
442 		"pandn %%mm2, %%mm3\n"
443 		"pandn %%mm4, %%mm5\n"
444 		"movq %%mm0, %%mm2\n"
445 		"movq %%mm1, %%mm4\n"
446 		"pcmpeqb %%mm1, %%mm2\n"
447 		"pcmpeqb %%mm0, %%mm4\n"
448 		"pandn %%mm3, %%mm2\n"
449 		"pandn %%mm5, %%mm4\n"
450 		"movq %%mm2, %%mm3\n"
451 		"movq %%mm4, %%mm5\n"
452 		"pand %%mm6, %%mm2\n"
453 		"pand %%mm6, %%mm4\n"
454 		"pandn %%mm7, %%mm3\n"
455 		"pandn %%mm7, %%mm5\n"
456 		"por %%mm3, %%mm2\n"
457 		"por %%mm5, %%mm4\n"
458 
459 		/* set *dst */
460 		"movq %%mm2, %%mm3\n"
461 		"punpcklbw %%mm4, %%mm2\n"
462 		"punpckhbw %%mm4, %%mm3\n"
463 		"movq %%mm2, (%3)\n"
464 		"movq %%mm3, 8(%3)\n"
465 
466 		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
467 		:
468 		: "cc"
469 	);
470 }
471 
/*
 * Apply the Scale2x effect to a single row of 16 bit pixels.
 * This function must be called only by the other scale2x functions.
 *
 * It uses the same scheme as scale2x_8_mmx_single(), working on words:
 * each run handles 8 source bytes (4 pixels) and stores 16 destination
 * bytes. The row is processed as a first run, count/4 - 2 central runs and
 * a final run; on the borders the missing neighbour is the border pixel
 * itself.
 *
 *      %0 -> current_upper (src0)
 *      %1 -> current (src1)
 *      %2 -> current_lower (src2)
 *      %3 -> dst
 *      %4 -> counter
 *
 *      %mm0 -> *current_left
 *      %mm1 -> *current_right
 *      %mm2..%mm5 -> temporaries
 *      %mm6 -> *current_upper
 *      %mm7 -> *current
 *
 * The asm statement reads the source rows and writes the destination row
 * through pointers held in registers, so it declares a "memory" clobber to
 * keep the compiler from caching or reordering memory accesses around it.
 * All the MMX registers %mm0-%mm7 are overwritten.
 */
static inline void scale2x_16_mmx_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* the first and the last run are always done; only the central ones are counted */
	count -= 2*4;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_left, current_right registers */
		/* the left neighbour of pixel 0 is pixel 0 itself */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = (side == upper) & ~(side == lower) & ~(side == other side) */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* result = (mask & upper) | (~mask & current) */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

/* central runs */
		/* convert the remaining pixel count into a run count */
		"shrl $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_left, current_right registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_left, current_right registers */
		/* the right neighbour of the last pixel is the last pixel itself */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
662 
/*
 * Apply the Scale2x effect to a single row of 32 bit pixels.
 * This function must be called only by the other scale2x functions.
 *
 * It uses the same scheme as scale2x_8_mmx_single(), working on double
 * words: each run handles 8 source bytes (2 pixels) and stores 16
 * destination bytes. The row is processed as a first run, count/2 - 2
 * central runs and a final run; on the borders the missing neighbour is the
 * border pixel itself.
 *
 *      %0 -> current_upper (src0)
 *      %1 -> current (src1)
 *      %2 -> current_lower (src2)
 *      %3 -> dst
 *      %4 -> counter
 *
 *      %mm0 -> *current_left
 *      %mm1 -> *current_right
 *      %mm2..%mm5 -> temporaries
 *      %mm6 -> *current_upper
 *      %mm7 -> *current
 *
 * The asm statement reads the source rows and writes the destination row
 * through pointers held in registers, so it declares a "memory" clobber to
 * keep the compiler from caching or reordering memory accesses around it.
 * All the MMX registers %mm0-%mm7 are overwritten.
 */
static inline void scale2x_32_mmx_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* the first and the last run are always done; only the central ones are counted */
	count -= 2*2;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_left, current_right registers */
		/* the left neighbour of pixel 0 is pixel 0 itself */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = (side == upper) & ~(side == lower) & ~(side == other side) */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* result = (mask & upper) | (~mask & current) */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

/* central runs */
		/* convert the remaining pixel count into a run count */
		"shrl $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_left, current_right registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_left, current_right registers */
		/* the right neighbour of the last pixel is the last pixel itself */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
853 
854 /**
855  * Scale by a factor of 2 a row of pixels of 8 bits.
856  * This is a very fast MMX implementation.
857  * The implementation uses a combination of cmp/and/not operations to
858  * completly remove the need of conditional jumps. This trick give the
859  * major speed improvement.
860  * Also, using the 8 bytes MMX registers more than one pixel are computed
861  * at the same time.
862  * Before calling this function you must ensure that the currenct CPU supports
863  * the MMX instruction set. After calling it you must be sure to call the EMMS
864  * instruction before any floating-point operation.
865  * The pixels over the left and right borders are assumed of the same color of
866  * the pixels on the border.
867  * \param src0 Pointer at the first pixel of the previous row.
868  * \param src1 Pointer at the first pixel of the current row.
869  * \param src2 Pointer at the first pixel of the next row.
870  * \param count Length in pixels of the src0, src1 and src2 rows. It must
871  * be at least 16 and a multiple of 8.
872  * \param dst0 First destination row, double length in pixels.
873  * \param dst1 Second destination row, double length in pixels.
874  */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)875 static void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
876 {
877 	assert(count >= 16);
878 	assert(count % 8 == 0);
879 
880 	scale2x_8_mmx_single(dst0, src0, src1, src2, count);
881 	scale2x_8_mmx_single(dst1, src2, src1, src0, count);
882 }
883 
884 /**
885  * Scale by a factor of 2 a row of pixels of 16 bits.
886  * This function operates like scale2x_8_mmx() but for 16 bits pixels.
887  * \param src0 Pointer at the first pixel of the previous row.
888  * \param src1 Pointer at the first pixel of the current row.
889  * \param src2 Pointer at the first pixel of the next row.
890  * \param count Length in pixels of the src0, src1 and src2 rows. It must
891  * be at least 8 and a multiple of 4.
892  * \param dst0 First destination row, double length in pixels.
893  * \param dst1 Second destination row, double length in pixels.
894  */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)895 static void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
896 {
897 	assert(count >= 8);
898 	assert(count % 4 == 0);
899 
900 	scale2x_16_mmx_single(dst0, src0, src1, src2, count);
901 	scale2x_16_mmx_single(dst1, src2, src1, src0, count);
902 }
903 
904 /**
905  * Scale by a factor of 2 a row of pixels of 32 bits.
906  * This function operates like scale2x_8_mmx() but for 32 bits pixels.
907  * \param src0 Pointer at the first pixel of the previous row.
908  * \param src1 Pointer at the first pixel of the current row.
909  * \param src2 Pointer at the first pixel of the next row.
910  * \param count Length in pixels of the src0, src1 and src2 rows. It must
911  * be at least 4 and a multiple of 2.
912  * \param dst0 First destination row, double length in pixels.
913  * \param dst1 Second destination row, double length in pixels.
914  */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)915 static void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
916 {
917 	assert(count >= 4);
918 	assert(count % 2 == 0);
919 
920 	scale2x_32_mmx_single(dst0, src0, src1, src2, count);
921 	scale2x_32_mmx_single(dst1, src2, src1, src0, count);
922 }
923 
/**
 * End the use of the MMX instructions.
 * It executes the EMMS instruction. Call it after the last scale2x_*_mmx()
 * call and before using any floating-point operation.
 */
static inline void scale2x_mmx_emms(void)
{
	__asm__ __volatile__("emms");
}
934 
935 #endif
936 
937 #endif
938