1 /*
2 * This file is part of the Scale2x project.
3 *
4 * Copyright (C) 2001-2003 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /*
22 * This file contains a C and MMX implementation of the Scale2x effect.
23 *
 * You can find a high-level description of the effect at:
25 *
26 * http://scale2x.sourceforge.net/
27 *
28 * Alternatively at the previous license terms, you are allowed to use this
29 * code in your program with these conditions:
30 * - the program is not used in commercial activities.
31 * - the whole source code of the program is released with the binary.
32 * - derivative works of the program are allowed.
33 */
34
35 #ifndef __SCALE2X_H
36 #define __SCALE2X_H
37
38 #include <assert.h>
39
40 /***************************************************************************/
41 /* Basic types */
42
/*
 * Pixel storage types, one per supported pixel depth.
 * NOTE(review): the names promise widths of exactly 8, 16 and 32 bits, while
 * the underlying C types only guarantee minimum widths. The MMX code in this
 * file packs 8, 4 and 2 pixels of these types into one 8 byte register, so it
 * depends on the exact widths - confirm them for every target platform.
 */
typedef unsigned char scale2x_uint8;
typedef unsigned short scale2x_uint16;
typedef unsigned scale2x_uint32;
46
47 /***************************************************************************/
48 /* Scale2x C implementation */
49
/**
 * Expand one row of 8 bit pixels to double width with the Scale2x rule.
 * Every pixel E of src1 produces two horizontally adjacent output pixels.
 * The left one becomes the src0 pixel above E when that pixel equals the
 * left neighbour of E and differs from both the src2 pixel below E and the
 * right neighbour of E; otherwise it stays E. The right one is computed in
 * the same way with the roles of the two neighbours exchanged.
 * The pixels over the left and right borders are assumed of the same color
 * of the pixels on the border, so the outer half of the first and of the
 * last pixel is always a plain copy.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row whose pixels can be propagated into the output.
 * \param src1 Row being expanded.
 * \param src2 Row on the other side of src1.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * Rows shorter than 2 pixels are accepted: an empty row writes nothing and
 * a single pixel is simply duplicated.
 */
static void scale2x_8_def_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	/*
	 * Rows shorter than two pixels must not reach the generic path:
	 * "count -= 2" on an unsigned value wraps around and the loop below
	 * would then run far beyond the end of the rows.
	 */
	if (count == 0)
		return;

	if (count == 1) {
		/* both neighbours replicate the pixel, no rule can match */
		dst[0] = src1[0];
		dst[1] = src1[0];
		return;
	}

	/* first pixel: the left neighbour is the pixel itself */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both neighbours are real pixels of the row */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: the right neighbour is the pixel itself */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
91
/**
 * Expand one row of 16 bit pixels to double width with the Scale2x rule.
 * Every pixel E of src1 produces two horizontally adjacent output pixels.
 * The left one becomes the src0 pixel above E when that pixel equals the
 * left neighbour of E and differs from both the src2 pixel below E and the
 * right neighbour of E; otherwise it stays E. The right one is computed in
 * the same way with the roles of the two neighbours exchanged.
 * The pixels over the left and right borders are assumed of the same color
 * of the pixels on the border, so the outer half of the first and of the
 * last pixel is always a plain copy.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row whose pixels can be propagated into the output.
 * \param src1 Row being expanded.
 * \param src2 Row on the other side of src1.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * Rows shorter than 2 pixels are accepted: an empty row writes nothing and
 * a single pixel is simply duplicated.
 */
static void scale2x_16_def_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	/*
	 * Rows shorter than two pixels must not reach the generic path:
	 * "count -= 2" on an unsigned value wraps around and the loop below
	 * would then run far beyond the end of the rows.
	 */
	if (count == 0)
		return;

	if (count == 1) {
		/* both neighbours replicate the pixel, no rule can match */
		dst[0] = src1[0];
		dst[1] = src1[0];
		return;
	}

	/* first pixel: the left neighbour is the pixel itself */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both neighbours are real pixels of the row */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: the right neighbour is the pixel itself */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
133
/**
 * Expand one row of 32 bit pixels to double width with the Scale2x rule.
 * Every pixel E of src1 produces two horizontally adjacent output pixels.
 * The left one becomes the src0 pixel above E when that pixel equals the
 * left neighbour of E and differs from both the src2 pixel below E and the
 * right neighbour of E; otherwise it stays E. The right one is computed in
 * the same way with the roles of the two neighbours exchanged.
 * The pixels over the left and right borders are assumed of the same color
 * of the pixels on the border, so the outer half of the first and of the
 * last pixel is always a plain copy.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row whose pixels can be propagated into the output.
 * \param src1 Row being expanded.
 * \param src2 Row on the other side of src1.
 * \param count Length in pixels of the src0, src1 and src2 rows.
 * Rows shorter than 2 pixels are accepted: an empty row writes nothing and
 * a single pixel is simply duplicated.
 */
static void scale2x_32_def_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	/*
	 * Rows shorter than two pixels must not reach the generic path:
	 * "count -= 2" on an unsigned value wraps around and the loop below
	 * would then run far beyond the end of the rows.
	 */
	if (count == 0)
		return;

	if (count == 1) {
		/* both neighbours replicate the pixel, no rule can match */
		dst[0] = src1[0];
		dst[1] = src1[0];
		return;
	}

	/* first pixel: the left neighbour is the pixel itself */
	dst[0] = src1[0];
	if (src1[1] == src0[0] && src2[0] != src0[0])
		dst[1] = src0[0];
	else
		dst[1] = src1[0];
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* central pixels: both neighbours are real pixels of the row */
	count -= 2;
	while (count) {
		if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
			dst[0] = src0[0];
		else
			dst[0] = src1[0];
		if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
			dst[1] = src0[0];
		else
			dst[1] = src1[0];

		++src0;
		++src1;
		++src2;
		dst += 2;
		--count;
	}

	/* last pixel: the right neighbour is the pixel itself */
	if (src1[-1] == src0[0] && src2[0] != src0[0])
		dst[0] = src0[0];
	else
		dst[0] = src1[0];
	dst[1] = src1[0];
}
175
176 /**
177 * Scale by a factor of 2 a row of pixels of 8 bits.
178 * The function is implemented in C.
179 * The pixels over the left and right borders are assumed of the same color of
180 * the pixels on the border.
181 * \param src0 Pointer at the first pixel of the previous row.
182 * \param src1 Pointer at the first pixel of the current row.
183 * \param src2 Pointer at the first pixel of the next row.
184 * \param count Length in pixels of the src0, src1 and src2 rows.
185 * It must be at least 2.
186 * \param dst0 First destination row, double length in pixels.
187 * \param dst1 Second destination row, double length in pixels.
188 */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)189 static inline void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
190 {
191 assert(count >= 2);
192
193 scale2x_8_def_single(dst0, src0, src1, src2, count);
194 scale2x_8_def_single(dst1, src2, src1, src0, count);
195 }
196
197 /**
198 * Scale by a factor of 2 a row of pixels of 16 bits.
199 * This function operates like scale2x_8_def() but for 16 bits pixels.
200 * \param src0 Pointer at the first pixel of the previous row.
201 * \param src1 Pointer at the first pixel of the current row.
202 * \param src2 Pointer at the first pixel of the next row.
203 * \param count Length in pixels of the src0, src1 and src2 rows.
204 * It must be at least 2.
205 * \param dst0 First destination row, double length in pixels.
206 * \param dst1 Second destination row, double length in pixels.
207 */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)208 static inline void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
209 {
210 assert(count >= 2);
211
212 scale2x_16_def_single(dst0, src0, src1, src2, count);
213 scale2x_16_def_single(dst1, src2, src1, src0, count);
214 }
215
216 /**
217 * Scale by a factor of 2 a row of pixels of 32 bits.
218 * This function operates like scale2x_8_def() but for 32 bits pixels.
219 * \param src0 Pointer at the first pixel of the previous row.
220 * \param src1 Pointer at the first pixel of the current row.
221 * \param src2 Pointer at the first pixel of the next row.
222 * \param count Length in pixels of the src0, src1 and src2 rows.
223 * It must be at least 2.
224 * \param dst0 First destination row, double length in pixels.
225 * \param dst1 Second destination row, double length in pixels.
226 */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)227 static inline void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
228 {
229 assert(count >= 2);
230
231 scale2x_32_def_single(dst0, src0, src1, src2, count);
232 scale2x_32_def_single(dst1, src2, src1, src0, count);
233 }
234
235 /***************************************************************************/
236 /* Scale2x MMX implementation */
237
238 #if defined(__GNUC__) && defined(__i386__)
239
/*
 * Apply the Scale2x effect at a single row of 8 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * Considering the pixel map :
 *
 *	ABC (src0)
 *	DEF (src1)
 *	GHI (src2)
 *
 * this function computes 2 new pixels in substitution of the source pixel E
 * like this map :
 *
 *	ab (dst)
 *
 * with these variables :
 *
 *	&current -> E
 *	&current_left -> D
 *	&current_right -> F
 *	&current_upper -> B
 *	&current_lower -> H
 *
 *	%0 -> current_upper
 *	%1 -> current
 *	%2 -> current_lower
 *	%3 -> dst
 *	%4 -> counter
 *
 *	%mm0 -> *current_left
 *	%mm1 -> *current_right (named current_next in the comments below)
 *	%mm2 -> tmp0
 *	%mm3 -> tmp1
 *	%mm4 -> tmp2
 *	%mm5 -> tmp3
 *	%mm6 -> *current_upper
 *	%mm7 -> *current
 *
 * The row is processed in runs of 8 pixels (one 8 byte register). The first
 * and the last run are handled apart because the missing left/right
 * neighbour is replaced by the border pixel itself; all the other runs take
 * the neighbours from the adjacent runs.
 *
 * No EMMS instruction is executed here, it is up to the caller.
 *
 * The pointer and counter arithmetic uses mnemonics without an operand size
 * suffix: the assembler takes the size from the register, so the generated
 * code is unchanged on i386 and the block also assembles when pointers are
 * 64 bits wide.
 *
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row of the B pixels.
 * \param src1 Row of the D, E, F pixels.
 * \param src2 Row of the H pixels.
 * \param count Length in pixels of the source rows. It must be at least 16
 * and a multiple of 8.
 */
static inline void scale2x_8_mmx_single(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* always do the first and last run */
	count -= 2*8;

	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs, the counter becomes the number of runs */
		"shr $3, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory": the rows are read and dst is written through
		 * pointers that are passed only as register operands, so the
		 * compiler must be told that memory is accessed.
		 */
		: "cc", "memory"
	);
}
468
/*
 * Apply the Scale2x effect at a single row of 16 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * The row is processed in runs of 4 pixels (one 8 byte register). For every
 * run %mm7 holds the current pixels, %mm0 the pixels at their left, %mm1 the
 * pixels at their right and %mm6 the pixels of src0; the pixels of src2 are
 * compared directly from memory. The first and the last run are handled
 * apart because the missing left/right neighbour is replaced by the border
 * pixel itself.
 *
 * No EMMS instruction is executed here, it is up to the caller.
 *
 * The pointer and counter arithmetic uses mnemonics without an operand size
 * suffix: the assembler takes the size from the register, so the generated
 * code is unchanged on i386 and the block also assembles when pointers are
 * 64 bits wide.
 *
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row whose pixels can be propagated into the output.
 * \param src1 Row being expanded.
 * \param src2 Row on the other side of src1.
 * \param count Length in pixels of the source rows. It must be at least 8
 * and a multiple of 4.
 */
static inline void scale2x_16_mmx_single(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs, the counter becomes the number of runs */
		"shr $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory": the rows are read and dst is written through
		 * pointers that are passed only as register operands, so the
		 * compiler must be told that memory is accessed.
		 */
		: "cc", "memory"
	);
}
659
/*
 * Apply the Scale2x effect at a single row of 32 bit pixels using MMX.
 * This function must be called only by the other scale2x functions.
 *
 * The row is processed in runs of 2 pixels (one 8 byte register). For every
 * run %mm7 holds the current pixels, %mm0 the pixels at their left, %mm1 the
 * pixels at their right and %mm6 the pixels of src0; the pixels of src2 are
 * compared directly from memory. The first and the last run are handled
 * apart because the missing left/right neighbour is replaced by the border
 * pixel itself.
 *
 * No EMMS instruction is executed here, it is up to the caller.
 *
 * The pointer and counter arithmetic uses mnemonics without an operand size
 * suffix: the assembler takes the size from the register, so the generated
 * code is unchanged on i386 and the block also assembles when pointers are
 * 64 bits wide.
 *
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row whose pixels can be propagated into the output.
 * \param src1 Row being expanded.
 * \param src2 Row on the other side of src1.
 * \param count Length in pixels of the source rows. It must be at least 4
 * and a multiple of 2.
 */
static inline void scale2x_32_mmx_single(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs, the counter becomes the number of runs */
		"shr $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst, interleaving the left and right results */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory": the rows are read and dst is written through
		 * pointers that are passed only as register operands, so the
		 * compiler must be told that memory is accessed.
		 */
		: "cc", "memory"
	);
}
850
/**
 * Scale by a factor of 2 a row of pixels of 8 bits.
 * This is a very fast MMX implementation.
 * The implementation uses a combination of cmp/and/not operations to
 * completely remove the need for conditional jumps. This trick gives the
 * major speed improvement.
 * Also, using the 8 byte MMX registers, more than one pixel is computed
 * at the same time.
 * Before calling this function you must ensure that the current CPU supports
 * the MMX instruction set. After calling it you must be sure to call the EMMS
 * instruction before any floating-point operation.
 * The pixels over the left and right borders are assumed of the same color of
 * the pixels on the border.
 * \param src0 Pointer at the first pixel of the previous row.
 * \param src1 Pointer at the first pixel of the current row.
 * \param src2 Pointer at the first pixel of the next row.
 * \param count Length in pixels of the src0, src1 and src2 rows. It must
 * be at least 16 and a multiple of 8.
 * \param dst0 First destination row, double length in pixels.
 * \param dst1 Second destination row, double length in pixels.
 */
/*
 * NOTE(review): this entry point is compiled out. Presumably it has no
 * caller and, being a static function that is not inline, it would raise an
 * unused-function warning in every file including this header - confirm
 * before enabling it again.
 */
#if 0
static void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* first destination row, then the second one with src0/src2 exchanged */
	scale2x_8_mmx_single(dst0, src0, src1, src2, count);
	scale2x_8_mmx_single(dst1, src2, src1, src0, count);
}
#endif
882 /**
883 * Scale by a factor of 2 a row of pixels of 16 bits.
884 * This function operates like scale2x_8_mmx() but for 16 bits pixels.
885 * \param src0 Pointer at the first pixel of the previous row.
886 * \param src1 Pointer at the first pixel of the current row.
887 * \param src2 Pointer at the first pixel of the next row.
888 * \param count Length in pixels of the src0, src1 and src2 rows. It must
889 * be at least 8 and a multiple of 4.
890 * \param dst0 First destination row, double length in pixels.
891 * \param dst1 Second destination row, double length in pixels.
892 */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)893 static void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
894 {
895 assert(count >= 8);
896 assert(count % 4 == 0);
897
898 scale2x_16_mmx_single(dst0, src0, src1, src2, count);
899 scale2x_16_mmx_single(dst1, src2, src1, src0, count);
900 }
901
902 /**
903 * Scale by a factor of 2 a row of pixels of 32 bits.
904 * This function operates like scale2x_8_mmx() but for 32 bits pixels.
905 * \param src0 Pointer at the first pixel of the previous row.
906 * \param src1 Pointer at the first pixel of the current row.
907 * \param src2 Pointer at the first pixel of the next row.
908 * \param count Length in pixels of the src0, src1 and src2 rows. It must
909 * be at least 4 and a multiple of 2.
910 * \param dst0 First destination row, double length in pixels.
911 * \param dst1 Second destination row, double length in pixels.
912 */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)913 static void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
914 {
915 assert(count >= 4);
916 assert(count % 2 == 0);
917
918 scale2x_32_mmx_single(dst0, src0, src1, src2, count);
919 scale2x_32_mmx_single(dst1, src2, src1, src0, count);
920 }
921
922 #endif
923
924 #endif
925