1 /*
2 * This file is part of the Scale2x project.
3 *
4 * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /*
22 * This file contains a C and MMX implementation of the Scale2x effect.
23 *
 * You can find a high-level description of the effect at:
25 *
26 * http://scale2x.sourceforge.net/
27 *
28 * Alternatively at the previous license terms, you are allowed to use this
29 * code in your program with these conditions:
30 * - the program is not used in commercial activities.
31 * - the whole source code of the program is released with the binary.
32 * - derivative works of the program are allowed.
33 */
34
35 #include "scale2x.h"
36
37 #include <assert.h>
38
39 /***************************************************************************/
40 /* Scale2x C implementation */
41
42 /**
43 * Define the macro USE_SCALE_RANDOMWRITE to enable
44 * an optimized version which writes memory in random order.
45 * This version is a little faster if you write in system memory.
46 * But it's a lot slower if you write in video memory.
47 * So, enable it only if you are sure to never write directly in video memory.
48 */
49 /* #define USE_SCALE_RANDOMWRITE */
50
51 #ifdef USE_SCALE_RANDOMWRITE
52
/**
 * Scale2x of one row of 8 bit pixels, producing both destination rows in a
 * single pass.
 * Every source pixel yields two pixels in dst0 (chosen against the src0 row)
 * and two pixels in dst1 (chosen against the src2 row).
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst0 First destination row, 2*count pixels are written.
 * \param dst1 Second destination row, 2*count pixels are written.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_whole(scale2x_uint8* restrict dst0, scale2x_uint8* restrict dst1, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint8 tl, tr, bl, br;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so only the two */
	/* right-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			tr = src0[0];
		}
		if (src1[1] == src2[0]) {
			br = src2[0];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
	dst0 += 2;
	dst1 += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		tl = tr = bl = br = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				tl = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				tr = src0[x];
			}
			if (src1[x - 1] == src2[x]) {
				bl = src2[x];
			}
			if (src1[x + 1] == src2[x]) {
				br = src2[x];
			}
		}
		dst0[0] = tl;
		dst0[1] = tr;
		dst1[0] = bl;
		dst1[1] = br;
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so only the two */
	/* left-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			tl = src0[last];
		}
		if (src1[last - 1] == src2[last]) {
			bl = src2[last];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
}
111
112 #endif
113
/**
 * Compute one outer destination row of the Scale2x effect for 8 bit pixels.
 * Each source pixel of src1 yields two pixels in dst. When the pixels above
 * (src0) and below (src2) differ and the left and right neighbours differ,
 * an output pixel takes the src0 color if the neighbour on its side equals
 * it; otherwise the source pixel is copied.
 * Calling it with src0 and src2 swapped gives the opposite outer row.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row the output is matched against.
 * \param src1 Current row.
 * \param src2 Row on the opposite side of src0.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_border(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint8 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so the left output */
	/* is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			hi = src0[0];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				lo = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				hi = src0[x];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so the right output */
	/* is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			lo = src0[last];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
158
/**
 * Compute a middle destination row for 8 bit pixels.
 * The scale2x3 and scale2x4 row functions in this file use it for the rows
 * between the first and the last destination row.
 * When the pixels above and below differ and the left and right neighbours
 * differ, an output pixel takes the color of the neighbour on its side if
 * that neighbour equals the pixel above while the current pixel differs from
 * the lower diagonal on that side, or equals the pixel below while the
 * current pixel differs from the upper diagonal on that side. Otherwise the
 * source pixel is copied.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_center(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint8 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the left output is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if ((src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1])) {
			hi = src1[1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if ((src1[x - 1] == src0[x] && src1[x] != src2[x - 1]) || (src1[x - 1] == src2[x] && src1[x] != src0[x - 1])) {
				lo = src1[x - 1];
			}
			if ((src1[x + 1] == src0[x] && src1[x] != src2[x + 1]) || (src1[x + 1] == src2[x] && src1[x] != src0[x + 1])) {
				hi = src1[x + 1];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: the right output is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if ((src1[last - 1] == src0[last] && src1[last] != src2[last - 1]) || (src1[last - 1] == src2[last] && src1[last] != src0[last - 1])) {
			lo = src1[last - 1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
203
204 #ifdef USE_SCALE_RANDOMWRITE
205
/**
 * Scale2x of one row of 16 bit pixels, producing both destination rows in a
 * single pass.
 * Every source pixel yields two pixels in dst0 (chosen against the src0 row)
 * and two pixels in dst1 (chosen against the src2 row).
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst0 First destination row, 2*count pixels are written.
 * \param dst1 Second destination row, 2*count pixels are written.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_whole(scale2x_uint16* restrict dst0, scale2x_uint16* restrict dst1, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint16 tl, tr, bl, br;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so only the two */
	/* right-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			tr = src0[0];
		}
		if (src1[1] == src2[0]) {
			br = src2[0];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
	dst0 += 2;
	dst1 += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		tl = tr = bl = br = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				tl = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				tr = src0[x];
			}
			if (src1[x - 1] == src2[x]) {
				bl = src2[x];
			}
			if (src1[x + 1] == src2[x]) {
				br = src2[x];
			}
		}
		dst0[0] = tl;
		dst0[1] = tr;
		dst1[0] = bl;
		dst1[1] = br;
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so only the two */
	/* left-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			tl = src0[last];
		}
		if (src1[last - 1] == src2[last]) {
			bl = src2[last];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
}
264
265 #endif
266
/**
 * Compute one outer destination row of the Scale2x effect for 16 bit pixels.
 * Each source pixel of src1 yields two pixels in dst. When the pixels above
 * (src0) and below (src2) differ and the left and right neighbours differ,
 * an output pixel takes the src0 color if the neighbour on its side equals
 * it; otherwise the source pixel is copied.
 * Calling it with src0 and src2 swapped gives the opposite outer row.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row the output is matched against.
 * \param src1 Current row.
 * \param src2 Row on the opposite side of src0.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_border(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint16 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so the left output */
	/* is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			hi = src0[0];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				lo = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				hi = src0[x];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so the right output */
	/* is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			lo = src0[last];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
311
/**
 * Compute a middle destination row for 16 bit pixels.
 * The scale2x3 and scale2x4 row functions in this file use it for the rows
 * between the first and the last destination row.
 * When the pixels above and below differ and the left and right neighbours
 * differ, an output pixel takes the color of the neighbour on its side if
 * that neighbour equals the pixel above while the current pixel differs from
 * the lower diagonal on that side, or equals the pixel below while the
 * current pixel differs from the upper diagonal on that side. Otherwise the
 * source pixel is copied.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_center(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint16 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the left output is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if ((src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1])) {
			hi = src1[1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if ((src1[x - 1] == src0[x] && src1[x] != src2[x - 1]) || (src1[x - 1] == src2[x] && src1[x] != src0[x - 1])) {
				lo = src1[x - 1];
			}
			if ((src1[x + 1] == src0[x] && src1[x] != src2[x + 1]) || (src1[x + 1] == src2[x] && src1[x] != src0[x + 1])) {
				hi = src1[x + 1];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: the right output is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if ((src1[last - 1] == src0[last] && src1[last] != src2[last - 1]) || (src1[last - 1] == src2[last] && src1[last] != src0[last - 1])) {
			lo = src1[last - 1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
356
357 #ifdef USE_SCALE_RANDOMWRITE
358
/**
 * Scale2x of one row of 32 bit pixels, producing both destination rows in a
 * single pass.
 * Every source pixel yields two pixels in dst0 (chosen against the src0 row)
 * and two pixels in dst1 (chosen against the src2 row).
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst0 First destination row, 2*count pixels are written.
 * \param dst1 Second destination row, 2*count pixels are written.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_whole(scale2x_uint32* restrict dst0, scale2x_uint32* restrict dst1, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint32 tl, tr, bl, br;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so only the two */
	/* right-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			tr = src0[0];
		}
		if (src1[1] == src2[0]) {
			br = src2[0];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
	dst0 += 2;
	dst1 += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		tl = tr = bl = br = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				tl = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				tr = src0[x];
			}
			if (src1[x - 1] == src2[x]) {
				bl = src2[x];
			}
			if (src1[x + 1] == src2[x]) {
				br = src2[x];
			}
		}
		dst0[0] = tl;
		dst0[1] = tr;
		dst1[0] = bl;
		dst1[1] = br;
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so only the two */
	/* left-hand outputs can take a color other than the pixel itself */
	tl = tr = bl = br = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			tl = src0[last];
		}
		if (src1[last - 1] == src2[last]) {
			bl = src2[last];
		}
	}
	dst0[0] = tl;
	dst0[1] = tr;
	dst1[0] = bl;
	dst1[1] = br;
}
417
418 #endif
419
/**
 * Compute one outer destination row of the Scale2x effect for 32 bit pixels.
 * Each source pixel of src1 yields two pixels in dst. When the pixels above
 * (src0) and below (src2) differ and the left and right neighbours differ,
 * an output pixel takes the src0 color if the neighbour on its side equals
 * it; otherwise the source pixel is copied.
 * Calling it with src0 and src2 swapped gives the opposite outer row.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row the output is matched against.
 * \param src1 Current row.
 * \param src2 Row on the opposite side of src0.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_border(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint32 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: it is its own left neighbour, so the left output */
	/* is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if (src1[1] == src0[0]) {
			hi = src0[0];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if (src1[x - 1] == src0[x]) {
				lo = src0[x];
			}
			if (src1[x + 1] == src0[x]) {
				hi = src0[x];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: it is its own right neighbour, so the right output */
	/* is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if (src1[last - 1] == src0[last]) {
			lo = src0[last];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
464
/**
 * Compute a middle destination row for 32 bit pixels.
 * The scale2x3 and scale2x4 row functions in this file use it for the rows
 * between the first and the last destination row.
 * When the pixels above and below differ and the left and right neighbours
 * differ, an output pixel takes the color of the neighbour on its side if
 * that neighbour equals the pixel above while the current pixel differs from
 * the lower diagonal on that side, or equals the pixel below while the
 * current pixel differs from the upper diagonal on that side. Otherwise the
 * source pixel is copied.
 * At the left and right edges the edge pixel stands in for the missing
 * neighbour.
 * \param dst Destination row, 2*count pixels are written sequentially.
 * \param src0 Row above the current one.
 * \param src1 Current row.
 * \param src2 Row below the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_center(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned last;
	unsigned x;
	scale2x_uint32 lo, hi;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the left output is always the pixel itself */
	lo = hi = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1]) {
		if ((src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1])) {
			hi = src1[1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
	dst += 2;

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		lo = hi = src1[x];
		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			if ((src1[x - 1] == src0[x] && src1[x] != src2[x - 1]) || (src1[x - 1] == src2[x] && src1[x] != src0[x - 1])) {
				lo = src1[x - 1];
			}
			if ((src1[x + 1] == src0[x] && src1[x] != src2[x + 1]) || (src1[x + 1] == src2[x] && src1[x] != src0[x + 1])) {
				hi = src1[x + 1];
			}
		}
		dst[0] = lo;
		dst[1] = hi;
		dst += 2;
	}

	/* rightmost pixel: the right output is always the pixel itself */
	lo = hi = src1[last];
	if (src0[last] != src2[last] && src1[last - 1] != src1[last]) {
		if ((src1[last - 1] == src0[last] && src1[last] != src2[last - 1]) || (src1[last - 1] == src2[last] && src1[last] != src0[last - 1])) {
			lo = src1[last - 1];
		}
	}
	dst[0] = lo;
	dst[1] = hi;
}
509
510 /**
511 * Scale by a factor of 2 a row of pixels of 8 bits.
512 * The function is implemented in C.
513 * The pixels over the left and right borders are assumed of the same color of
514 * the pixels on the border.
515 * Note that the implementation is optimized to write data sequentially to
516 * maximize the bandwidth on video memory.
517 * \param src0 Pointer at the first pixel of the previous row.
518 * \param src1 Pointer at the first pixel of the current row.
519 * \param src2 Pointer at the first pixel of the next row.
520 * \param count Length in pixels of the src0, src1 and src2 rows.
521 * It must be at least 2.
522 * \param dst0 First destination row, double length in pixels.
523 * \param dst1 Second destination row, double length in pixels.
524 */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)525 void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
526 {
527 #ifdef USE_SCALE_RANDOMWRITE
528 scale2x_8_def_whole(dst0, dst1, src0, src1, src2, count);
529 #else
530 scale2x_8_def_border(dst0, src0, src1, src2, count);
531 scale2x_8_def_border(dst1, src2, src1, src0, count);
532 #endif
533 }
534
535 /**
536 * Scale by a factor of 2 a row of pixels of 16 bits.
537 * This function operates like scale2x_8_def() but for 16 bits pixels.
538 * \param src0 Pointer at the first pixel of the previous row.
539 * \param src1 Pointer at the first pixel of the current row.
540 * \param src2 Pointer at the first pixel of the next row.
541 * \param count Length in pixels of the src0, src1 and src2 rows.
542 * It must be at least 2.
543 * \param dst0 First destination row, double length in pixels.
544 * \param dst1 Second destination row, double length in pixels.
545 */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)546 void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
547 {
548 #ifdef USE_SCALE_RANDOMWRITE
549 scale2x_16_def_whole(dst0, dst1, src0, src1, src2, count);
550 #else
551 scale2x_16_def_border(dst0, src0, src1, src2, count);
552 scale2x_16_def_border(dst1, src2, src1, src0, count);
553 #endif
554 }
555
556 /**
557 * Scale by a factor of 2 a row of pixels of 32 bits.
558 * This function operates like scale2x_8_def() but for 32 bits pixels.
559 * \param src0 Pointer at the first pixel of the previous row.
560 * \param src1 Pointer at the first pixel of the current row.
561 * \param src2 Pointer at the first pixel of the next row.
562 * \param count Length in pixels of the src0, src1 and src2 rows.
563 * It must be at least 2.
564 * \param dst0 First destination row, double length in pixels.
565 * \param dst1 Second destination row, double length in pixels.
566 */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)567 void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
568 {
569 #ifdef USE_SCALE_RANDOMWRITE
570 scale2x_32_def_whole(dst0, dst1, src0, src1, src2, count);
571 #else
572 scale2x_32_def_border(dst0, src0, src1, src2, count);
573 scale2x_32_def_border(dst1, src2, src1, src0, count);
574 #endif
575 }
576
577 /**
578 * Scale by a factor of 2x3 a row of pixels of 8 bits.
579 * \note Like scale2x_8_def();
580 */
scale2x3_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)581 void scale2x3_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
582 {
583 #ifdef USE_SCALE_RANDOMWRITE
584 scale2x_8_def_whole(dst0, dst2, src0, src1, src2, count);
585 scale2x_8_def_center(dst1, src0, src1, src2, count);
586 #else
587 scale2x_8_def_border(dst0, src0, src1, src2, count);
588 scale2x_8_def_center(dst1, src0, src1, src2, count);
589 scale2x_8_def_border(dst2, src2, src1, src0, count);
590 #endif
591 }
592
593 /**
594 * Scale by a factor of 2x3 a row of pixels of 16 bits.
595 * \note Like scale2x_16_def();
596 */
scale2x3_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)597 void scale2x3_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
598 {
599 #ifdef USE_SCALE_RANDOMWRITE
600 scale2x_16_def_whole(dst0, dst2, src0, src1, src2, count);
601 scale2x_16_def_center(dst1, src0, src1, src2, count);
602 #else
603 scale2x_16_def_border(dst0, src0, src1, src2, count);
604 scale2x_16_def_center(dst1, src0, src1, src2, count);
605 scale2x_16_def_border(dst2, src2, src1, src0, count);
606 #endif
607 }
608
609 /**
610 * Scale by a factor of 2x3 a row of pixels of 32 bits.
611 * \note Like scale2x_32_def();
612 */
scale2x3_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)613 void scale2x3_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
614 {
615 #ifdef USE_SCALE_RANDOMWRITE
616 scale2x_32_def_whole(dst0, dst2, src0, src1, src2, count);
617 scale2x_32_def_center(dst1, src0, src1, src2, count);
618 #else
619 scale2x_32_def_border(dst0, src0, src1, src2, count);
620 scale2x_32_def_center(dst1, src0, src1, src2, count);
621 scale2x_32_def_border(dst2, src2, src1, src0, count);
622 #endif
623 }
624
625 /**
626 * Scale by a factor of 2x4 a row of pixels of 8 bits.
627 * \note Like scale2x_8_def();
628 */
scale2x4_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)629 void scale2x4_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
630 {
631 #ifdef USE_SCALE_RANDOMWRITE
632 scale2x_8_def_whole(dst0, dst3, src0, src1, src2, count);
633 scale2x_8_def_center(dst1, src0, src1, src2, count);
634 scale2x_8_def_center(dst2, src0, src1, src2, count);
635 #else
636 scale2x_8_def_border(dst0, src0, src1, src2, count);
637 scale2x_8_def_center(dst1, src0, src1, src2, count);
638 scale2x_8_def_center(dst2, src0, src1, src2, count);
639 scale2x_8_def_border(dst3, src2, src1, src0, count);
640 #endif
641 }
642
643 /**
644 * Scale by a factor of 2x4 a row of pixels of 16 bits.
645 * \note Like scale2x_16_def();
646 */
scale2x4_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)647 void scale2x4_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
648 {
649 #ifdef USE_SCALE_RANDOMWRITE
650 scale2x_16_def_whole(dst0, dst3, src0, src1, src2, count);
651 scale2x_16_def_center(dst1, src0, src1, src2, count);
652 scale2x_16_def_center(dst2, src0, src1, src2, count);
653 #else
654 scale2x_16_def_border(dst0, src0, src1, src2, count);
655 scale2x_16_def_center(dst1, src0, src1, src2, count);
656 scale2x_16_def_center(dst2, src0, src1, src2, count);
657 scale2x_16_def_border(dst3, src2, src1, src0, count);
658 #endif
659 }
660
661 /**
662 * Scale by a factor of 2x4 a row of pixels of 32 bits.
663 * \note Like scale2x_32_def();
664 */
scale2x4_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)665 void scale2x4_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
666 {
667 #ifdef USE_SCALE_RANDOMWRITE
668 scale2x_32_def_whole(dst0, dst3, src0, src1, src2, count);
669 scale2x_32_def_center(dst1, src0, src1, src2, count);
670 scale2x_32_def_center(dst2, src0, src1, src2, count);
671 #else
672 scale2x_32_def_border(dst0, src0, src1, src2, count);
673 scale2x_32_def_center(dst1, src0, src1, src2, count);
674 scale2x_32_def_center(dst2, src0, src1, src2, count);
675 scale2x_32_def_border(dst3, src2, src1, src0, count);
676 #endif
677 }
678
679 /***************************************************************************/
680 /* Scale2x MMX implementation */
681
682 #if defined(__GNUC__) && defined(__i386__)
683
684 /*
 * Apply the Scale2x effect to a single row.
686 * This function must be called only by the other scale2x functions.
687 *
688 * Considering the pixel map :
689 *
690 * ABC (src0)
691 * DEF (src1)
692 * GHI (src2)
693 *
 * this function computes 2 new pixels that replace the source pixel E,
 * laid out as in this map :
696 *
697 * ab (dst)
698 *
699 * with these variables :
700 *
 * &current -> E
 * &current_left -> D
 * &current_right -> F
 * &current_upper -> B
 * &current_lower -> H
706 *
707 * %0 -> current_upper
708 * %1 -> current
709 * %2 -> current_lower
710 * %3 -> dst
711 * %4 -> counter
712 *
713 * %mm0 -> *current_left
714 * %mm1 -> *current_next
715 * %mm2 -> tmp0
716 * %mm3 -> tmp1
717 * %mm4 -> tmp2
718 * %mm5 -> tmp3
719 * %mm6 -> *current_upper
720 * %mm7 -> *current
721 */
/**
 * MMX version of the outer-row computation for 8 bit pixels.
 * The row is processed in runs of 8 pixels (one 64 bit MMX register); each
 * run stores 16 bytes in dst. The first and the last run are handled apart
 * so that the first pixel acts as its own left neighbour and the last pixel
 * as its own right neighbour.
 * The asm uses %mm0-%mm7 and does not execute emms; presumably the caller
 * restores the FPU state afterwards - verify against the callers.
 * \param dst Destination row, 2*count bytes are written.
 * \param src0 Row the output is matched against (current_upper).
 * \param src1 Current row.
 * \param src2 Row on the opposite side of src0 (current_lower).
 * \param count Number of pixels in each source row. It must be at least 16
 * and a multiple of 8.
 */
static inline void scale2x_8_mmx_border(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* always do the first and last run */
	count -= 2*8;

	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		/* central runs */
		"shrl $3, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "memory": the asm loads the source rows and stores to dst through */
		/* pointers held in registers, so the compiler must not cache or */
		/* reorder memory accesses across this statement */
		: "cc", "memory"
	);
}
912
/**
 * Compute one destination row of the Scale2x effect for 16 bits pixels
 * using the MMX instruction set.
 * Every run loads 8 bytes (4 pixels) of each source row and stores
 * 16 bytes (8 pixels) in the destination row.
 * The first run uses the first pixel of src1 as its own left neighbour and
 * the final run uses the last pixel of src1 as its own right neighbour.
 * The MMX registers mm0-mm7 are modified and no EMMS instruction is
 * executed here.
 * \param dst Destination row, double length in pixels.
 * \param src0 Row compared as the "upper" row (asm operand %0).
 * \param src1 Current row (asm operand %1).
 * \param src2 Row compared as the "lower" row (asm operand %2).
 * \param count Length in pixels of the source rows. It must be at least 8
 * and a multiple of 4.
 */
static inline void scale2x_16_mmx_border(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * The pointer increments use the suffix-less "add" so the assembler
	 * takes the operand size from the register and the same text assembles
	 * with both 32 and 64 bits pointers. The counter is an unsigned and
	 * keeps the explicit 32 bits "shrl"/"decl".
	 */
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current, mm0 = left neighbours, mm1 = right neighbours */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		/* keep only the first pixel: it is its own left neighbour */
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		/* mm2 = (left == upper), mm4 = (right == upper) */
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		/* mm3 = (left == lower), mm5 = (right == lower) */
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		/* mm3 = (left == upper) & (left != lower), same for right on mm5 */
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		/* additionally require left != right */
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* select upper where the mask is set, current elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results and store 16 bytes */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs */
		/* count / 4 runs, skipped if there are none */
		"shrl $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		/* last pixel of the previous run and first pixel of the next one */
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		/* keep only the last pixel: it is its own right neighbour */
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "memory": the code reads the source rows and writes the */
		/* destination row through pointers the compiler cannot see */
		: "cc", "memory"
	);
}
1103
/**
 * Compute one destination row of the Scale2x effect for 32 bits pixels
 * using the MMX instruction set.
 * Every run loads 8 bytes (2 pixels) of each source row and stores
 * 16 bytes (4 pixels) in the destination row.
 * The first run uses the first pixel of src1 as its own left neighbour and
 * the final run uses the last pixel of src1 as its own right neighbour.
 * The MMX registers mm0-mm7 are modified and no EMMS instruction is
 * executed here.
 * \param dst Destination row, double length in pixels.
 * \param src0 Row compared as the "upper" row (asm operand %0).
 * \param src1 Current row (asm operand %1).
 * \param src2 Row compared as the "lower" row (asm operand %2).
 * \param count Length in pixels of the source rows. It must be at least 4
 * and a multiple of 2.
 */
static inline void scale2x_32_mmx_border(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * The pointer increments use the suffix-less "add" so the assembler
	 * takes the operand size from the register and the same text assembles
	 * with both 32 and 64 bits pointers. The counter is an unsigned and
	 * keeps the explicit 32 bits "shrl"/"decl".
	 */
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current, mm0 = left neighbours, mm1 = right neighbours */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		/* keep only the first pixel: it is its own left neighbour */
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		/* mm2 = (left == upper), mm4 = (right == upper) */
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		/* mm3 = (left == lower), mm5 = (right == lower) */
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		/* mm3 = (left == upper) & (left != lower), same for right on mm5 */
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		/* additionally require left != right */
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* select upper where the mask is set, current elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results and store 16 bytes */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs */
		/* count / 2 runs, skipped if there are none */
		"shrl $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		/* last pixel of the previous run and first pixel of the next one */
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		/* keep only the last pixel: it is its own right neighbour */
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "memory": the code reads the source rows and writes the */
		/* destination row through pointers the compiler cannot see */
		: "cc", "memory"
	);
}
1294
1295 /**
1296 * Scale by a factor of 2 a row of pixels of 8 bits.
1297 * This is a very fast MMX implementation.
1298 * The implementation uses a combination of cmp/and/not operations to
1299 * completly remove the need of conditional jumps. This trick give the
1300 * major speed improvement.
1301 * Also, using the 8 bytes MMX registers more than one pixel are computed
1302 * at the same time.
1303 * Before calling this function you must ensure that the currenct CPU supports
1304 * the MMX instruction set. After calling it you must be sure to call the EMMS
1305 * instruction before any floating-point operation.
1306 * The pixels over the left and right borders are assumed of the same color of
1307 * the pixels on the border.
1308 * Note that the implementation is optimized to write data sequentially to
1309 * maximize the bandwidth on video memory.
1310 * \param src0 Pointer at the first pixel of the previous row.
1311 * \param src1 Pointer at the first pixel of the current row.
1312 * \param src2 Pointer at the first pixel of the next row.
1313 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1314 * be at least 16 and a multiple of 8.
1315 * \param dst0 First destination row, double length in pixels.
1316 * \param dst1 Second destination row, double length in pixels.
1317 */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1318 void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1319 {
1320 if (count % 8 != 0 || count < 16) {
1321 scale2x_8_def(dst0, dst1, src0, src1, src2, count);
1322 } else {
1323 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1324 scale2x_8_mmx_border(dst1, src2, src1, src0, count);
1325 }
1326 }
1327
1328 /**
1329 * Scale by a factor of 2 a row of pixels of 16 bits.
1330 * This function operates like scale2x_8_mmx() but for 16 bits pixels.
1331 * \param src0 Pointer at the first pixel of the previous row.
1332 * \param src1 Pointer at the first pixel of the current row.
1333 * \param src2 Pointer at the first pixel of the next row.
1334 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1335 * be at least 8 and a multiple of 4.
1336 * \param dst0 First destination row, double length in pixels.
1337 * \param dst1 Second destination row, double length in pixels.
1338 */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1339 void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1340 {
1341 if (count % 4 != 0 || count < 8) {
1342 scale2x_16_def(dst0, dst1, src0, src1, src2, count);
1343 } else {
1344 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1345 scale2x_16_mmx_border(dst1, src2, src1, src0, count);
1346 }
1347 }
1348
1349 /**
1350 * Scale by a factor of 2 a row of pixels of 32 bits.
1351 * This function operates like scale2x_8_mmx() but for 32 bits pixels.
1352 * \param src0 Pointer at the first pixel of the previous row.
1353 * \param src1 Pointer at the first pixel of the current row.
1354 * \param src2 Pointer at the first pixel of the next row.
1355 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1356 * be at least 4 and a multiple of 2.
1357 * \param dst0 First destination row, double length in pixels.
1358 * \param dst1 Second destination row, double length in pixels.
1359 */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1360 void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1361 {
1362 if (count % 2 != 0 || count < 4) {
1363 scale2x_32_def(dst0, dst1, src0, src1, src2, count);
1364 } else {
1365 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1366 scale2x_32_mmx_border(dst1, src2, src1, src0, count);
1367 }
1368 }
1369
1370 /**
1371 * Scale by a factor of 2x3 a row of pixels of 8 bits.
1372 * This function operates like scale2x_8_mmx() but with an expansion
1373 * factor of 2x3 instead of 2x2.
1374 */
scale2x3_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1375 void scale2x3_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1376 {
1377 if (count % 8 != 0 || count < 16) {
1378 scale2x3_8_def(dst0, dst1, dst2, src0, src1, src2, count);
1379 } else {
1380 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1381 scale2x_8_def_center(dst1, src0, src1, src2, count);
1382 scale2x_8_mmx_border(dst2, src2, src1, src0, count);
1383 }
1384 }
1385
1386 /**
1387 * Scale by a factor of 2x3 a row of pixels of 16 bits.
1388 * This function operates like scale2x_16_mmx() but with an expansion
1389 * factor of 2x3 instead of 2x2.
1390 */
scale2x3_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1391 void scale2x3_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1392 {
1393 if (count % 4 != 0 || count < 8) {
1394 scale2x3_16_def(dst0, dst1, dst2, src0, src1, src2, count);
1395 } else {
1396 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1397 scale2x_16_def_center(dst1, src0, src1, src2, count);
1398 scale2x_16_mmx_border(dst2, src2, src1, src0, count);
1399 }
1400 }
1401
1402 /**
1403 * Scale by a factor of 2x3 a row of pixels of 32 bits.
1404 * This function operates like scale2x_32_mmx() but with an expansion
1405 * factor of 2x3 instead of 2x2.
1406 */
scale2x3_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1407 void scale2x3_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1408 {
1409 if (count % 2 != 0 || count < 4) {
1410 scale2x3_32_def(dst0, dst1, dst2, src0, src1, src2, count);
1411 } else {
1412 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1413 scale2x_32_def_center(dst1, src0, src1, src2, count);
1414 scale2x_32_mmx_border(dst2, src2, src1, src0, count);
1415 }
1416 }
1417
1418 /**
1419 * Scale by a factor of 2x4 a row of pixels of 8 bits.
1420 * This function operates like scale2x_8_mmx() but with an expansion
1421 * factor of 2x4 instead of 2x2.
1422 */
scale2x4_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1423 void scale2x4_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1424 {
1425 if (count % 8 != 0 || count < 16) {
1426 scale2x4_8_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1427 } else {
1428 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1429 scale2x_8_def_center(dst1, src0, src1, src2, count);
1430 scale2x_8_def_center(dst2, src0, src1, src2, count);
1431 scale2x_8_mmx_border(dst3, src2, src1, src0, count);
1432 }
1433 }
1434
1435 /**
1436 * Scale by a factor of 2x4 a row of pixels of 16 bits.
1437 * This function operates like scale2x_16_mmx() but with an expansion
1438 * factor of 2x4 instead of 2x2.
1439 */
scale2x4_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1440 void scale2x4_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1441 {
1442 if (count % 4 != 0 || count < 8) {
1443 scale2x4_16_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1444 } else {
1445 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1446 scale2x_16_def_center(dst1, src0, src1, src2, count);
1447 scale2x_16_def_center(dst2, src0, src1, src2, count);
1448 scale2x_16_mmx_border(dst3, src2, src1, src0, count);
1449 }
1450 }
1451
1452 /**
1453 * Scale by a factor of 2x4 a row of pixels of 32 bits.
1454 * This function operates like scale2x_32_mmx() but with an expansion
1455 * factor of 2x4 instead of 2x2.
1456 */
scale2x4_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1457 void scale2x4_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1458 {
1459 if (count % 2 != 0 || count < 4) {
1460 scale2x4_32_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1461 } else {
1462 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1463 scale2x_32_def_center(dst1, src0, src1, src2, count);
1464 scale2x_32_def_center(dst2, src0, src1, src2, count);
1465 scale2x_32_mmx_border(dst3, src2, src1, src0, count);
1466 }
1467 }
1468
1469 #endif
1470
1471