1 /*
2 * This file is part of the Scale2x project.
3 *
4 * Copyright (C) 2001-2003 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
/*
 * This file contains a fast C and MMX implementation of the Scale2x effect.
 *
 * You can find a high-level description of the effect at:
 *
 * http://scale2x.sourceforge.net/
 *
 * Alternatively at the previous license terms, you are allowed to use this
 * code in your program with these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 * - derivative works of the program are allowed.
 */
34
35 #ifndef __SCALE2X_H
36 #define __SCALE2X_H
37
38 #include <assert.h>
39
40 /***************************************************************************/
41 /* Basic types */
42
/*
 * Pixel types used by the row functions below; the suffix is the pixel
 * width in bits that the 8, 16 and 32 bit variants are written for.
 * NOTE(review): nothing here enforces that unsigned short is 16 bits wide
 * and unsigned is 32 bits wide - confirm for every target platform.
 */
typedef unsigned char scale2x_uint8;
typedef unsigned short scale2x_uint16;
typedef unsigned scale2x_uint32;
46
47 /***************************************************************************/
48 /* Scale2x C implementation */
49
/**
 * Compute one destination row of the Scale2x effect for pixels of 8 bits.
 * Every pixel of the current row (src1) produces two destination pixels.
 * A destination pixel takes the color of the pixel in src0 when the
 * horizontal neighbour on its own side has that color while the pixel in
 * src2 and the horizontal neighbour on the other side do not; otherwise it
 * keeps the color of the current pixel.
 * The pixels over the left and right borders are assumed of the same color of
 * the pixels on the border.
 * \param dst Destination row, 2*count pixels are written.
 * \param src0 Pointer at the first pixel of the row compared as "upper" row.
 * \param src1 Pointer at the first pixel of the current row.
 * \param src2 Pointer at the first pixel of the row compared as "lower" row.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * It must be at least 2.
 */
static void scale2x_8_def_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 2);

	/*
	 * With NDEBUG the assert above is compiled out and the "count -= 2"
	 * below would wrap around for count < 2, walking far outside the rows.
	 * In a row of one pixel both horizontal neighbours are the border
	 * pixel itself, so the pixel is simply duplicated; an empty row
	 * produces nothing.
	 */
	if (count < 2) {
		if (count == 1) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		}
		return;
	}

	/* first pixel: no left neighbour is read, the left half is a copy */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both horizontal neighbours exist */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: no right neighbour is read, the right half is a copy */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
91
/**
 * Compute one destination row of the Scale2x effect for pixels of 16 bits.
 * Every pixel of the current row (src1) produces two destination pixels.
 * A destination pixel takes the color of the pixel in src0 when the
 * horizontal neighbour on its own side has that color while the pixel in
 * src2 and the horizontal neighbour on the other side do not; otherwise it
 * keeps the color of the current pixel.
 * The pixels over the left and right borders are assumed of the same color of
 * the pixels on the border.
 * \param dst Destination row, 2*count pixels are written.
 * \param src0 Pointer at the first pixel of the row compared as "upper" row.
 * \param src1 Pointer at the first pixel of the current row.
 * \param src2 Pointer at the first pixel of the row compared as "lower" row.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * It must be at least 2.
 */
static void scale2x_16_def_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 2);

	/*
	 * With NDEBUG the assert above is compiled out and the "count -= 2"
	 * below would wrap around for count < 2, walking far outside the rows.
	 * In a row of one pixel both horizontal neighbours are the border
	 * pixel itself, so the pixel is simply duplicated; an empty row
	 * produces nothing.
	 */
	if (count < 2) {
		if (count == 1) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		}
		return;
	}

	/* first pixel: no left neighbour is read, the left half is a copy */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both horizontal neighbours exist */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: no right neighbour is read, the right half is a copy */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
133
/**
 * Compute one destination row of the Scale2x effect for pixels of 32 bits.
 * Every pixel of the current row (src1) produces two destination pixels.
 * A destination pixel takes the color of the pixel in src0 when the
 * horizontal neighbour on its own side has that color while the pixel in
 * src2 and the horizontal neighbour on the other side do not; otherwise it
 * keeps the color of the current pixel.
 * The pixels over the left and right borders are assumed of the same color of
 * the pixels on the border.
 * \param dst Destination row, 2*count pixels are written.
 * \param src0 Pointer at the first pixel of the row compared as "upper" row.
 * \param src1 Pointer at the first pixel of the current row.
 * \param src2 Pointer at the first pixel of the row compared as "lower" row.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * It must be at least 2.
 */
static void scale2x_32_def_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 2);

	/*
	 * With NDEBUG the assert above is compiled out and the "count -= 2"
	 * below would wrap around for count < 2, walking far outside the rows.
	 * In a row of one pixel both horizontal neighbours are the border
	 * pixel itself, so the pixel is simply duplicated; an empty row
	 * produces nothing.
	 */
	if (count < 2) {
		if (count == 1) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		}
		return;
	}

	/* first pixel: no left neighbour is read, the left half is a copy */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both horizontal neighbours exist */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: no right neighbour is read, the right half is a copy */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
175
176 /**
177 * Scale by a factor of 2 a row of pixels of 8 bits.
178 * The function is implemented in C.
179 * The pixels over the left and right borders are assumed of the same color of
180 * the pixels on the border.
181 * \param src0 Pointer at the first pixel of the previous row.
182 * \param src1 Pointer at the first pixel of the current row.
183 * \param src2 Pointer at the first pixel of the next row.
184 * \param count Length in pixels of the src0, src1 and src2 rows.
185 * It must be at least 2.
186 * \param dst0 First destination row, double length in pixels.
187 * \param dst1 Second destination row, double length in pixels.
188 */
189 //static inline void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)190 static void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
191 {
192 assert(count >= 2);
193
194 scale2x_8_def_single(dst0, src0, src1, src2, count);
195 scale2x_8_def_single(dst1, src2, src1, src0, count);
196 }
197
198 /**
199 * Scale by a factor of 2 a row of pixels of 16 bits.
200 * This function operates like scale2x_8_def() but for 16 bits pixels.
201 * \param src0 Pointer at the first pixel of the previous row.
202 * \param src1 Pointer at the first pixel of the current row.
203 * \param src2 Pointer at the first pixel of the next row.
204 * \param count Length in pixels of the src0, src1 and src2 rows.
205 * It must be at least 2.
206 * \param dst0 First destination row, double length in pixels.
207 * \param dst1 Second destination row, double length in pixels.
208 */
209 //static inline void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)210 static void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
211 {
212 assert(count >= 2);
213
214 scale2x_16_def_single(dst0, src0, src1, src2, count);
215 scale2x_16_def_single(dst1, src2, src1, src0, count);
216 }
217
218 /**
219 * Scale by a factor of 2 a row of pixels of 32 bits.
220 * This function operates like scale2x_8_def() but for 32 bits pixels.
221 * \param src0 Pointer at the first pixel of the previous row.
222 * \param src1 Pointer at the first pixel of the current row.
223 * \param src2 Pointer at the first pixel of the next row.
224 * \param count Length in pixels of the src0, src1 and src2 rows.
225 * It must be at least 2.
226 * \param dst0 First destination row, double length in pixels.
227 * \param dst1 Second destination row, double length in pixels.
228 */
229 //static inline void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)230 static void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
231 {
232 assert(count >= 2);
233
234 scale2x_32_def_single(dst0, src0, src1, src2, count);
235 scale2x_32_def_single(dst1, src2, src1, src0, count);
236 }
237
238 /***************************************************************************/
239 /* Scale2x MMX implementation */
240
241 #if defined(__GNUC__) && defined(__i386__)
242
243 /*
244 * Apply the Scale2x effect at a single row.
245 * This function must be called only by the other scale2x functions.
246 *
247 * Considering the pixel map :
248 *
249 * ABC (src0)
250 * DEF (src1)
251 * GHI (src2)
252 *
253 * this functions compute 2 new pixels in substitution of the source pixel E
254 * like this map :
255 *
256 * ab (dst)
257 *
258 * with these variables :
259 *
260 * ¤t -> E
261 * ¤t_left -> D
262 * ¤t_right -> F
263 * ¤t_upper -> B
264 * ¤t_lower -> H
265 *
266 * %0 -> current_upper
267 * %1 -> current
268 * %2 -> current_lower
269 * %3 -> dst
270 * %4 -> counter
271 *
272 * %mm0 -> *current_left
273 * %mm1 -> *current_next
274 * %mm2 -> tmp0
275 * %mm3 -> tmp1
276 * %mm4 -> tmp2
277 * %mm5 -> tmp3
278 * %mm6 -> *current_upper
279 * %mm7 -> *current
280 */
scale2x_8_mmx_single(scale2x_uint8 * dst,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)281 static inline void scale2x_8_mmx_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
282 {
283 assert(count >= 16);
284 assert(count % 8 == 0);
285
286 /* always do the first and last run */
287 count -= 2*8;
288
289 __asm__ __volatile__(
290 /* first run */
291 /* set the current, current_pre, current_next registers */
292 "movq 0(%1), %%mm0\n"
293 "movq 0(%1), %%mm7\n"
294 "movq 8(%1), %%mm1\n"
295 "psllq $56, %%mm0\n"
296 "psllq $56, %%mm1\n"
297 "psrlq $56, %%mm0\n"
298 "movq %%mm7, %%mm2\n"
299 "movq %%mm7, %%mm3\n"
300 "psllq $8, %%mm2\n"
301 "psrlq $8, %%mm3\n"
302 "por %%mm2, %%mm0\n"
303 "por %%mm3, %%mm1\n"
304
305 /* current_upper */
306 "movq (%0), %%mm6\n"
307
308 /* compute the upper-left pixel for dst on %%mm2 */
309 /* compute the upper-right pixel for dst on %%mm4 */
310 "movq %%mm0, %%mm2\n"
311 "movq %%mm1, %%mm4\n"
312 "movq %%mm0, %%mm3\n"
313 "movq %%mm1, %%mm5\n"
314 "pcmpeqb %%mm6, %%mm2\n"
315 "pcmpeqb %%mm6, %%mm4\n"
316 "pcmpeqb (%2), %%mm3\n"
317 "pcmpeqb (%2), %%mm5\n"
318 "pandn %%mm2, %%mm3\n"
319 "pandn %%mm4, %%mm5\n"
320 "movq %%mm0, %%mm2\n"
321 "movq %%mm1, %%mm4\n"
322 "pcmpeqb %%mm1, %%mm2\n"
323 "pcmpeqb %%mm0, %%mm4\n"
324 "pandn %%mm3, %%mm2\n"
325 "pandn %%mm5, %%mm4\n"
326 "movq %%mm2, %%mm3\n"
327 "movq %%mm4, %%mm5\n"
328 "pand %%mm6, %%mm2\n"
329 "pand %%mm6, %%mm4\n"
330 "pandn %%mm7, %%mm3\n"
331 "pandn %%mm7, %%mm5\n"
332 "por %%mm3, %%mm2\n"
333 "por %%mm5, %%mm4\n"
334
335 /* set *dst */
336 "movq %%mm2, %%mm3\n"
337 "punpcklbw %%mm4, %%mm2\n"
338 "punpckhbw %%mm4, %%mm3\n"
339 "movq %%mm2, (%3)\n"
340 "movq %%mm3, 8(%3)\n"
341
342 /* next */
343 "addl $8, %0\n"
344 "addl $8, %1\n"
345 "addl $8, %2\n"
346 "addl $16, %3\n"
347
348 /* central runs */
349 "shrl $3, %4\n"
350 "jz 1f\n"
351
352 "0:\n"
353
354 /* set the current, current_pre, current_next registers */
355 "movq -8(%1), %%mm0\n"
356 "movq (%1), %%mm7\n"
357 "movq 8(%1), %%mm1\n"
358 "psrlq $56, %%mm0\n"
359 "psllq $56, %%mm1\n"
360 "movq %%mm7, %%mm2\n"
361 "movq %%mm7, %%mm3\n"
362 "psllq $8, %%mm2\n"
363 "psrlq $8, %%mm3\n"
364 "por %%mm2, %%mm0\n"
365 "por %%mm3, %%mm1\n"
366
367 /* current_upper */
368 "movq (%0), %%mm6\n"
369
370 /* compute the upper-left pixel for dst on %%mm2 */
371 /* compute the upper-right pixel for dst on %%mm4 */
372 "movq %%mm0, %%mm2\n"
373 "movq %%mm1, %%mm4\n"
374 "movq %%mm0, %%mm3\n"
375 "movq %%mm1, %%mm5\n"
376 "pcmpeqb %%mm6, %%mm2\n"
377 "pcmpeqb %%mm6, %%mm4\n"
378 "pcmpeqb (%2), %%mm3\n"
379 "pcmpeqb (%2), %%mm5\n"
380 "pandn %%mm2, %%mm3\n"
381 "pandn %%mm4, %%mm5\n"
382 "movq %%mm0, %%mm2\n"
383 "movq %%mm1, %%mm4\n"
384 "pcmpeqb %%mm1, %%mm2\n"
385 "pcmpeqb %%mm0, %%mm4\n"
386 "pandn %%mm3, %%mm2\n"
387 "pandn %%mm5, %%mm4\n"
388 "movq %%mm2, %%mm3\n"
389 "movq %%mm4, %%mm5\n"
390 "pand %%mm6, %%mm2\n"
391 "pand %%mm6, %%mm4\n"
392 "pandn %%mm7, %%mm3\n"
393 "pandn %%mm7, %%mm5\n"
394 "por %%mm3, %%mm2\n"
395 "por %%mm5, %%mm4\n"
396
397 /* set *dst */
398 "movq %%mm2, %%mm3\n"
399 "punpcklbw %%mm4, %%mm2\n"
400 "punpckhbw %%mm4, %%mm3\n"
401 "movq %%mm2, (%3)\n"
402 "movq %%mm3, 8(%3)\n"
403
404 /* next */
405 "addl $8, %0\n"
406 "addl $8, %1\n"
407 "addl $8, %2\n"
408 "addl $16, %3\n"
409
410 "decl %4\n"
411 "jnz 0b\n"
412 "1:\n"
413
414 /* final run */
415 /* set the current, current_pre, current_next registers */
416 "movq (%1), %%mm1\n"
417 "movq (%1), %%mm7\n"
418 "movq -8(%1), %%mm0\n"
419 "psrlq $56, %%mm1\n"
420 "psrlq $56, %%mm0\n"
421 "psllq $56, %%mm1\n"
422 "movq %%mm7, %%mm2\n"
423 "movq %%mm7, %%mm3\n"
424 "psllq $8, %%mm2\n"
425 "psrlq $8, %%mm3\n"
426 "por %%mm2, %%mm0\n"
427 "por %%mm3, %%mm1\n"
428
429 /* current_upper */
430 "movq (%0), %%mm6\n"
431
432 /* compute the upper-left pixel for dst on %%mm2 */
433 /* compute the upper-right pixel for dst on %%mm4 */
434 "movq %%mm0, %%mm2\n"
435 "movq %%mm1, %%mm4\n"
436 "movq %%mm0, %%mm3\n"
437 "movq %%mm1, %%mm5\n"
438 "pcmpeqb %%mm6, %%mm2\n"
439 "pcmpeqb %%mm6, %%mm4\n"
440 "pcmpeqb (%2), %%mm3\n"
441 "pcmpeqb (%2), %%mm5\n"
442 "pandn %%mm2, %%mm3\n"
443 "pandn %%mm4, %%mm5\n"
444 "movq %%mm0, %%mm2\n"
445 "movq %%mm1, %%mm4\n"
446 "pcmpeqb %%mm1, %%mm2\n"
447 "pcmpeqb %%mm0, %%mm4\n"
448 "pandn %%mm3, %%mm2\n"
449 "pandn %%mm5, %%mm4\n"
450 "movq %%mm2, %%mm3\n"
451 "movq %%mm4, %%mm5\n"
452 "pand %%mm6, %%mm2\n"
453 "pand %%mm6, %%mm4\n"
454 "pandn %%mm7, %%mm3\n"
455 "pandn %%mm7, %%mm5\n"
456 "por %%mm3, %%mm2\n"
457 "por %%mm5, %%mm4\n"
458
459 /* set *dst */
460 "movq %%mm2, %%mm3\n"
461 "punpcklbw %%mm4, %%mm2\n"
462 "punpckhbw %%mm4, %%mm3\n"
463 "movq %%mm2, (%3)\n"
464 "movq %%mm3, 8(%3)\n"
465
466 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
467 :
468 : "cc"
469 );
470 }
471
/*
 * Apply the Scale2x effect to a single row of pixels of 16 bits.
 * This function must be called only by the other scale2x functions.
 *
 * The operands are: %0 -> upper row (src0), %1 -> current row (src1),
 * %2 -> lower row (src2), %3 -> dst, %4 -> counter.
 * The registers are: %mm0 -> left neighbours, %mm1 -> right neighbours,
 * %mm6 -> upper pixels, %mm7 -> current pixels, %mm2-%mm5 -> temporaries.
 *
 * Each run handles 4 source pixels (one 64 bit MMX register) and stores 8
 * destination pixels. The first and the last run are separate because the
 * missing neighbour over the border is replaced with the border pixel itself.
 *
 * count is the length in pixels of the source rows; it must be at least 8
 * and a multiple of 4.
 */
static inline void scale2x_16_mmx_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* word 0 of %mm0 is the border pixel itself (no left neighbour) */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = side == upper, side != lower, side != other side */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* select upper where the mask is set, current elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results pixel by pixel */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

/* central runs */
		/* number of central runs = remaining pixels / 4 */
		"shrl $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* word 3 of %mm1 is the border pixel itself (no right neighbour) */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory": the rows are read and dst is written only through
		 * the pointer registers, never through memory operands, so the
		 * compiler has to be told that memory is accessed.
		 * NOTE(review): %mm0-%mm7 are modified but not listed as
		 * clobbers - confirm no caller keeps live MMX values.
		 */
		: "cc", "memory"
	);
}
662
/*
 * Apply the Scale2x effect to a single row of pixels of 32 bits.
 * This function must be called only by the other scale2x functions.
 *
 * The operands are: %0 -> upper row (src0), %1 -> current row (src1),
 * %2 -> lower row (src2), %3 -> dst, %4 -> counter.
 * The registers are: %mm0 -> left neighbours, %mm1 -> right neighbours,
 * %mm6 -> upper pixels, %mm7 -> current pixels, %mm2-%mm5 -> temporaries.
 *
 * Each run handles 2 source pixels (one 64 bit MMX register) and stores 4
 * destination pixels. The first and the last run are separate because the
 * missing neighbour over the border is replaced with the border pixel itself.
 *
 * count is the length in pixels of the source rows; it must be at least 4
 * and a multiple of 2.
 */
static inline void scale2x_32_mmx_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* dword 0 of %mm0 is the border pixel itself (no left neighbour) */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = side == upper, side != lower, side != other side */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* select upper where the mask is set, current elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results pixel by pixel */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

/* central runs */
		/* number of central runs = remaining pixels / 2 */
		"shrl $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* dword 1 of %mm1 is the border pixel itself (no right neighbour) */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory": the rows are read and dst is written only through
		 * the pointer registers, never through memory operands, so the
		 * compiler has to be told that memory is accessed.
		 * NOTE(review): %mm0-%mm7 are modified but not listed as
		 * clobbers - confirm no caller keeps live MMX values.
		 */
		: "cc", "memory"
	);
}
853
854 /**
855 * Scale by a factor of 2 a row of pixels of 8 bits.
856 * This is a very fast MMX implementation.
857 * The implementation uses a combination of cmp/and/not operations to
858 * completly remove the need of conditional jumps. This trick give the
859 * major speed improvement.
860 * Also, using the 8 bytes MMX registers more than one pixel are computed
861 * at the same time.
862 * Before calling this function you must ensure that the currenct CPU supports
863 * the MMX instruction set. After calling it you must be sure to call the EMMS
864 * instruction before any floating-point operation.
865 * The pixels over the left and right borders are assumed of the same color of
866 * the pixels on the border.
867 * \param src0 Pointer at the first pixel of the previous row.
868 * \param src1 Pointer at the first pixel of the current row.
869 * \param src2 Pointer at the first pixel of the next row.
870 * \param count Length in pixels of the src0, src1 and src2 rows. It must
871 * be at least 16 and a multiple of 8.
872 * \param dst0 First destination row, double length in pixels.
873 * \param dst1 Second destination row, double length in pixels.
874 */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)875 static void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
876 {
877 assert(count >= 16);
878 assert(count % 8 == 0);
879
880 scale2x_8_mmx_single(dst0, src0, src1, src2, count);
881 scale2x_8_mmx_single(dst1, src2, src1, src0, count);
882 }
883
884 /**
885 * Scale by a factor of 2 a row of pixels of 16 bits.
886 * This function operates like scale2x_8_mmx() but for 16 bits pixels.
887 * \param src0 Pointer at the first pixel of the previous row.
888 * \param src1 Pointer at the first pixel of the current row.
889 * \param src2 Pointer at the first pixel of the next row.
890 * \param count Length in pixels of the src0, src1 and src2 rows. It must
891 * be at least 8 and a multiple of 4.
892 * \param dst0 First destination row, double length in pixels.
893 * \param dst1 Second destination row, double length in pixels.
894 */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)895 static void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
896 {
897 assert(count >= 8);
898 assert(count % 4 == 0);
899
900 scale2x_16_mmx_single(dst0, src0, src1, src2, count);
901 scale2x_16_mmx_single(dst1, src2, src1, src0, count);
902 }
903
904 /**
905 * Scale by a factor of 2 a row of pixels of 32 bits.
906 * This function operates like scale2x_8_mmx() but for 32 bits pixels.
907 * \param src0 Pointer at the first pixel of the previous row.
908 * \param src1 Pointer at the first pixel of the current row.
909 * \param src2 Pointer at the first pixel of the next row.
910 * \param count Length in pixels of the src0, src1 and src2 rows. It must
911 * be at least 4 and a multiple of 2.
912 * \param dst0 First destination row, double length in pixels.
913 * \param dst1 Second destination row, double length in pixels.
914 */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)915 static void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
916 {
917 assert(count >= 4);
918 assert(count % 2 == 0);
919
920 scale2x_32_mmx_single(dst0, src0, src1, src2, count);
921 scale2x_32_mmx_single(dst1, src2, src1, src0, count);
922 }
923
/**
 * End the use of the MMX instructions.
 * This function must be called before using any floating-point operations.
 * It issues a single EMMS instruction and takes no arguments.
 */
static inline void scale2x_mmx_emms(void)
{
	__asm__ __volatile__ (
		"emms"
	);
}
934
935 #endif
936
937 #endif
938