1 /*
2  * This file is part of the Scale2x project.
3  *
4  * Copyright (C) 2001-2003 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
21 /*
22  * This file contains a C and MMX implementation of the Scale2x effect.
23  *
 * You can find a high-level description of the effect at:
25  *
26  * http://scale2x.sourceforge.net/
27  *
28  * Alternatively at the previous license terms, you are allowed to use this
29  * code in your program with these conditions:
30  * - the program is not used in commercial activities.
31  * - the whole source code of the program is released with the binary.
32  * - derivative works of the program are allowed.
33  */
34 
35 #ifndef __SCALE2X_H
36 #define __SCALE2X_H
37 
38 #include <assert.h>
39 
40 /***************************************************************************/
41 /* Basic types */
42 
/* Pixel type used by the 8 bit per pixel scalers. */
typedef unsigned char scale2x_uint8;
/*
 * Pixel type used by the 16 bit per pixel scalers.
 * NOTE(review): assumes unsigned short is exactly 16 bits wide (the MMX code
 * compares these pixels as 16 bit words) - confirm for the target ABI.
 */
typedef unsigned short scale2x_uint16;
/*
 * Pixel type used by the 32 bit per pixel scalers.
 * NOTE(review): assumes unsigned int is exactly 32 bits wide (the MMX code
 * compares these pixels as 32 bit dwords) - confirm for the target ABI.
 */
typedef unsigned scale2x_uint32;
46 
47 /***************************************************************************/
48 /* Scale2x C implementation */
49 
/*
 * Compute one doubled output row of 8 bit pixels (C implementation).
 *
 * For every pixel E of src1, with B the pixel above it (src0), H the pixel
 * below it (src2), D its left and F its right neighbour, two pixels are
 * written to dst:
 *
 *      left  = (D == B && H != B && F != B) ? B : E
 *      right = (F == B && H != B && D != B) ? B : E
 *
 * The pixels beyond the left and right borders are taken equal to the border
 * pixel itself, which reduces the rule for the first and the last pixel to
 * the shorter forms used below.
 *
 * dst receives 2 * count pixels; src0, src1 and src2 hold count pixels each.
 * count must be at least 2.
 *
 * Declared "static inline", like the other helpers of this header, so that a
 * source file including the header without using this function does not get
 * an unused function warning.
 */
static inline void scale2x_8_def_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	unsigned inner;

	assert(count >= 2);

	/* first pixel: there is no left neighbour, only the right output can change */
	dst[0] = src1[0];
	dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];

	/* central pixels: everything but the two border pixels */
	for (inner = count - 2; inner != 0; --inner) {
		++src0;
		++src1;
		++src2;
		dst += 2;

		dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];
	}

	/* last pixel: there is no right neighbour, only the left output can change */
	++src0;
	++src1;
	++src2;
	dst += 2;

	dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst[1] = src1[0];
}
91 
/*
 * Compute one doubled output row of 16 bit pixels (C implementation).
 *
 * For every pixel E of src1, with B the pixel above it (src0), H the pixel
 * below it (src2), D its left and F its right neighbour, two pixels are
 * written to dst:
 *
 *      left  = (D == B && H != B && F != B) ? B : E
 *      right = (F == B && H != B && D != B) ? B : E
 *
 * The pixels beyond the left and right borders are taken equal to the border
 * pixel itself, which reduces the rule for the first and the last pixel to
 * the shorter forms used below.
 *
 * dst receives 2 * count pixels; src0, src1 and src2 hold count pixels each.
 * count must be at least 2.
 *
 * Declared "static inline", like the other helpers of this header, so that a
 * source file including the header without using this function does not get
 * an unused function warning.
 */
static inline void scale2x_16_def_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	unsigned inner;

	assert(count >= 2);

	/* first pixel: there is no left neighbour, only the right output can change */
	dst[0] = src1[0];
	dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];

	/* central pixels: everything but the two border pixels */
	for (inner = count - 2; inner != 0; --inner) {
		++src0;
		++src1;
		++src2;
		dst += 2;

		dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];
	}

	/* last pixel: there is no right neighbour, only the left output can change */
	++src0;
	++src1;
	++src2;
	dst += 2;

	dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst[1] = src1[0];
}
133 
/*
 * Compute one doubled output row of 32 bit pixels (C implementation).
 *
 * For every pixel E of src1, with B the pixel above it (src0), H the pixel
 * below it (src2), D its left and F its right neighbour, two pixels are
 * written to dst:
 *
 *      left  = (D == B && H != B && F != B) ? B : E
 *      right = (F == B && H != B && D != B) ? B : E
 *
 * The pixels beyond the left and right borders are taken equal to the border
 * pixel itself, which reduces the rule for the first and the last pixel to
 * the shorter forms used below.
 *
 * dst receives 2 * count pixels; src0, src1 and src2 hold count pixels each.
 * count must be at least 2.
 *
 * Declared "static inline", like the other helpers of this header, so that a
 * source file including the header without using this function does not get
 * an unused function warning.
 */
static inline void scale2x_32_def_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	unsigned inner;

	assert(count >= 2);

	/* first pixel: there is no left neighbour, only the right output can change */
	dst[0] = src1[0];
	dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];

	/* central pixels: everything but the two border pixels */
	for (inner = count - 2; inner != 0; --inner) {
		++src0;
		++src1;
		++src2;
		dst += 2;

		dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];
	}

	/* last pixel: there is no right neighbour, only the left output can change */
	++src0;
	++src1;
	++src2;
	dst += 2;

	dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst[1] = src1[0];
}
175 
176 /**
177  * Scale by a factor of 2 a row of pixels of 8 bits.
178  * The function is implemented in C.
179  * The pixels over the left and right borders are assumed of the same color of
180  * the pixels on the border.
181  * \param src0 Pointer at the first pixel of the previous row.
182  * \param src1 Pointer at the first pixel of the current row.
183  * \param src2 Pointer at the first pixel of the next row.
184  * \param count Length in pixels of the src0, src1 and src2 rows.
185  * It must be at least 2.
186  * \param dst0 First destination row, double length in pixels.
187  * \param dst1 Second destination row, double length in pixels.
188  */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)189 static inline void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
190 {
191 	assert(count >= 2);
192 
193 	scale2x_8_def_single(dst0, src0, src1, src2, count);
194 	scale2x_8_def_single(dst1, src2, src1, src0, count);
195 }
196 
197 /**
198  * Scale by a factor of 2 a row of pixels of 16 bits.
199  * This function operates like scale2x_8_def() but for 16 bits pixels.
200  * \param src0 Pointer at the first pixel of the previous row.
201  * \param src1 Pointer at the first pixel of the current row.
202  * \param src2 Pointer at the first pixel of the next row.
203  * \param count Length in pixels of the src0, src1 and src2 rows.
204  * It must be at least 2.
205  * \param dst0 First destination row, double length in pixels.
206  * \param dst1 Second destination row, double length in pixels.
207  */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)208 static inline void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
209 {
210 	assert(count >= 2);
211 
212 	scale2x_16_def_single(dst0, src0, src1, src2, count);
213 	scale2x_16_def_single(dst1, src2, src1, src0, count);
214 }
215 
216 /**
217  * Scale by a factor of 2 a row of pixels of 32 bits.
218  * This function operates like scale2x_8_def() but for 32 bits pixels.
219  * \param src0 Pointer at the first pixel of the previous row.
220  * \param src1 Pointer at the first pixel of the current row.
221  * \param src2 Pointer at the first pixel of the next row.
222  * \param count Length in pixels of the src0, src1 and src2 rows.
223  * It must be at least 2.
224  * \param dst0 First destination row, double length in pixels.
225  * \param dst1 Second destination row, double length in pixels.
226  */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)227 static inline void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
228 {
229 	assert(count >= 2);
230 
231 	scale2x_32_def_single(dst0, src0, src1, src2, count);
232 	scale2x_32_def_single(dst1, src2, src1, src0, count);
233 }
234 
235 /***************************************************************************/
236 /* Scale2x MMX implementation */
237 
238 #if defined(__GNUC__) && defined(__i386__)
239 
/*
 * Apply the Scale2x effect to a single row of 8 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * Considering the pixel map :
 *
 *      ABC (src0)
 *      DEF (src1)
 *      GHI (src2)
 *
 * this function computes 2 new pixels in substitution of the source pixel E
 * like this map :
 *
 *      ab (dst)
 *
 * where
 *
 *      a = (D == B && D != H && D != F) ? B : E
 *      b = (F == B && F != H && F != D) ? B : E
 *
 * The selection is done with compare masks (pcmpeq/pand/pandn/por), without
 * conditional jumps, on 8 pixels at a time. The row is processed in runs of
 * 8 pixels: a first run (the left border pixel is used as its own left
 * neighbour), count/8 - 2 central runs and a final run (the right border
 * pixel is used as its own right neighbour).
 *
 * Operands and registers :
 *
 *      %0 -> current_upper (src0)
 *      %1 -> current (src1)
 *      %2 -> current_lower (src2)
 *      %3 -> dst
 *      %4 -> counter
 *
 *      %mm0 -> left neighbours (D) of the 8 current pixels
 *      %mm1 -> right neighbours (F) of the 8 current pixels
 *      %mm2 -> tmp0
 *      %mm3 -> tmp1
 *      %mm4 -> tmp2
 *      %mm5 -> tmp3
 *      %mm6 -> *current_upper (B)
 *      %mm7 -> *current (E)
 *
 * All the registers %mm0-%mm7 are overwritten and no EMMS is executed here.
 *
 * count is the row length in pixels; it must be at least 16 and a multiple
 * of 8. dst receives 2 * count pixels.
 *
 * The pointer and counter updates use mnemonics without a size suffix, so
 * the operand width follows the register chosen by the compiler, and the
 * "memory" clobber tells the compiler that the statement reads the source
 * rows and writes the destination row through the pointer operands.
 */
static inline void scale2x_8_mmx_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* always do the first and last run */
	count -= 2*8;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* convert the remaining pixels in a number of runs */
		"shr $3, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
468 
/*
 * Apply the Scale2x effect to a single row of 16 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * With B the pixel above the current pixel E (src0), H the pixel below it
 * (src2), D its left and F its right neighbour (src1), the two pixels
 * written to dst for E are :
 *
 *      a = (D == B && D != H && D != F) ? B : E
 *      b = (F == B && F != H && F != D) ? B : E
 *
 * The selection is done with compare masks, without conditional jumps, on
 * 4 pixels at a time. The row is processed in runs of 4 pixels: a first run
 * (the left border pixel is used as its own left neighbour), count/4 - 2
 * central runs and a final run (the right border pixel is used as its own
 * right neighbour).
 *
 * Operands and registers :
 *
 *      %0 -> src0, %1 -> src1, %2 -> src2, %3 -> dst, %4 -> counter
 *      %mm0 -> left neighbours (D), %mm1 -> right neighbours (F)
 *      %mm2-%mm5 -> temporaries
 *      %mm6 -> upper pixels (B), %mm7 -> current pixels (E)
 *
 * All the registers %mm0-%mm7 are overwritten and no EMMS is executed here.
 *
 * count is the row length in pixels; it must be at least 8 and a multiple
 * of 4. dst receives 2 * count pixels.
 *
 * The pointer and counter updates use mnemonics without a size suffix, so
 * the operand width follows the register chosen by the compiler, and the
 * "memory" clobber tells the compiler that the statement reads the source
 * rows and writes the destination row through the pointer operands.
 */
static inline void scale2x_16_mmx_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* convert the remaining pixels in a number of runs */
		"shr $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
659 
/*
 * Apply the Scale2x effect to a single row of 32 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * With B the pixel above the current pixel E (src0), H the pixel below it
 * (src2), D its left and F its right neighbour (src1), the two pixels
 * written to dst for E are :
 *
 *      a = (D == B && D != H && D != F) ? B : E
 *      b = (F == B && F != H && F != D) ? B : E
 *
 * The selection is done with compare masks, without conditional jumps, on
 * 2 pixels at a time. The row is processed in runs of 2 pixels: a first run
 * (the left border pixel is used as its own left neighbour), count/2 - 2
 * central runs and a final run (the right border pixel is used as its own
 * right neighbour).
 *
 * Operands and registers :
 *
 *      %0 -> src0, %1 -> src1, %2 -> src2, %3 -> dst, %4 -> counter
 *      %mm0 -> left neighbours (D), %mm1 -> right neighbours (F)
 *      %mm2-%mm5 -> temporaries
 *      %mm6 -> upper pixels (B), %mm7 -> current pixels (E)
 *
 * All the registers %mm0-%mm7 are overwritten and no EMMS is executed here.
 *
 * count is the row length in pixels; it must be at least 4 and a multiple
 * of 2. dst receives 2 * count pixels.
 *
 * The pointer and counter updates use mnemonics without a size suffix, so
 * the operand width follows the register chosen by the compiler, and the
 * "memory" clobber tells the compiler that the statement reads the source
 * rows and writes the destination row through the pointer operands.
 */
static inline void scale2x_32_mmx_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* convert the remaining pixels in a number of runs */
		"shr $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst: interleave the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
850 
/**
 * Scale by a factor of 2 a row of pixels of 8 bits.
 * This is a very fast MMX implementation.
 * The implementation uses a combination of cmp/and/not operations to
 * completely remove the need for conditional jumps. This trick gives the
 * major speed improvement.
 * Also, using the 8 byte MMX registers, more than one pixel is computed
 * at the same time.
 * Before calling this function you must ensure that the current CPU supports
 * the MMX instruction set. After calling it you must be sure to call the EMMS
 * instruction before any floating-point operation.
 * The pixels over the left and right borders are assumed to be of the same
 * color as the pixels on the border.
 * \param src0 Pointer at the first pixel of the previous row.
 * \param src1 Pointer at the first pixel of the current row.
 * \param src2 Pointer at the first pixel of the next row.
 * \param count Length in pixels of the src0, src1 and src2 rows. It must
 * be at least 16 and a multiple of 8.
 * \param dst0 First destination row, double length in pixels.
 * \param dst1 Second destination row, double length in pixels.
 */
/*
 * The 8 bit MMX entry point is compiled out; only the 16 and 32 bit MMX
 * entry points are available.
 * NOTE(review): presumably disabled because nothing in this project calls it
 * and a plain static function would trigger an unused function warning -
 * confirm before enabling it again.
 */
#if 0
static void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* upper output row */
	scale2x_8_mmx_single(dst0, src0, src1, src2, count);
	/* lower output row: same kernel with the previous and next rows swapped */
	scale2x_8_mmx_single(dst1, src2, src1, src0, count);
}
#endif
882 /**
883  * Scale by a factor of 2 a row of pixels of 16 bits.
884  * This function operates like scale2x_8_mmx() but for 16 bits pixels.
885  * \param src0 Pointer at the first pixel of the previous row.
886  * \param src1 Pointer at the first pixel of the current row.
887  * \param src2 Pointer at the first pixel of the next row.
888  * \param count Length in pixels of the src0, src1 and src2 rows. It must
889  * be at least 8 and a multiple of 4.
890  * \param dst0 First destination row, double length in pixels.
891  * \param dst1 Second destination row, double length in pixels.
892  */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)893 static void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
894 {
895 	assert(count >= 8);
896 	assert(count % 4 == 0);
897 
898 	scale2x_16_mmx_single(dst0, src0, src1, src2, count);
899 	scale2x_16_mmx_single(dst1, src2, src1, src0, count);
900 }
901 
902 /**
903  * Scale by a factor of 2 a row of pixels of 32 bits.
904  * This function operates like scale2x_8_mmx() but for 32 bits pixels.
905  * \param src0 Pointer at the first pixel of the previous row.
906  * \param src1 Pointer at the first pixel of the current row.
907  * \param src2 Pointer at the first pixel of the next row.
908  * \param count Length in pixels of the src0, src1 and src2 rows. It must
909  * be at least 4 and a multiple of 2.
910  * \param dst0 First destination row, double length in pixels.
911  * \param dst1 Second destination row, double length in pixels.
912  */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)913 static void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
914 {
915 	assert(count >= 4);
916 	assert(count % 2 == 0);
917 
918 	scale2x_32_mmx_single(dst0, src0, src1, src2, count);
919 	scale2x_32_mmx_single(dst1, src2, src1, src0, count);
920 }
921 
922 #endif
923 
924 #endif
925