1 /*
2 * This file is part of the Scale2x project.
3 *
4 * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /*
22 * This file contains a C and MMX implementation of the Scale2x effect.
23 *
24 * You can find a high-level description of the effect at:
25 *
26 * http://scale2x.sourceforge.net/
27 *
28 * Alternatively at the previous license terms, you are allowed to use this
29 * code in your program with these conditions:
30 * - the program is not used in commercial activities.
31 * - the whole source code of the program is released with the binary.
32 * - derivative works of the program are allowed.
33 */
34
35 #if HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38
39 #include "scale2x.h"
40
41 #include <assert.h>
42
43 /***************************************************************************/
44 /* Scale2x C implementation */
45
46 /**
47 * Define the macro USE_SCALE_RANDOMWRITE to enable
48 * an optimized version which writes memory in random order.
49 * This version is a little faster if you write in system memory.
50 * But it's a lot slower if you write in video memory.
51 * So, enable it only if you are sure to never write directly in video memory.
52 */
53 /* #define USE_SCALE_RANDOMWRITE */
54
/*
 * Write the 2x2 output cell generated by one 8 bit source pixel.
 * left/right are the horizontal neighbours of mid, up/down the vertical ones.
 * All four outputs default to mid. Only when up != down and left != right
 * does a corner take the vertical neighbour that equals its horizontal one.
 */
static inline void scale2x_8_def_whole_cell(scale2x_uint8* top, scale2x_uint8* bottom, scale2x_uint8 left, scale2x_uint8 mid, scale2x_uint8 right, scale2x_uint8 up, scale2x_uint8 down)
{
	scale2x_uint8 tl = mid;
	scale2x_uint8 tr = mid;
	scale2x_uint8 bl = mid;
	scale2x_uint8 br = mid;

	if (up != down && left != right) {
		if (left == up)
			tl = up;
		if (right == up)
			tr = up;
		if (left == down)
			bl = down;
		if (right == down)
			br = down;
	}

	top[0] = tl;
	top[1] = tr;
	bottom[0] = bl;
	bottom[1] = br;
}

/*
 * Scale one row of 8 bit pixels into both destination rows in a single pass.
 * src0/src1/src2 are the previous, current and next source rows of `count`
 * pixels; dst0/dst1 receive 2*count pixels each.
 * count must be at least 2 (asserted).
 */
static inline void scale2x_8_def_whole(scale2x_uint8* restrict dst0, scale2x_uint8* restrict dst1, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_8_def_whole_cell(dst0, dst1, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst0 += 2;
	dst1 += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_8_def_whole_cell(dst0, dst1, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_8_def_whole_cell(dst0, dst1, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
113
/*
 * Write the two output pixels generated by one 8 bit source pixel for the
 * output row facing `near`.
 * Both outputs default to mid. Only when near != far and left != right does
 * an output take `near`, if the horizontal neighbour on its side equals it.
 */
static inline void scale2x_8_def_border_pair(scale2x_uint8* out, scale2x_uint8 left, scale2x_uint8 mid, scale2x_uint8 right, scale2x_uint8 near, scale2x_uint8 far)
{
	scale2x_uint8 first = mid;
	scale2x_uint8 second = mid;

	if (near != far && left != right) {
		if (left == near)
			first = near;
		if (right == near)
			second = near;
	}

	out[0] = first;
	out[1] = second;
}

/*
 * Scale one row of 8 bit pixels into a single destination row, the one on
 * the src0 side. Calling it with src0 and src2 exchanged yields the row on
 * the other side.
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_8_def_border(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_8_def_border_pair(dst, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_8_def_border_pair(dst, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_8_def_border_pair(dst, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
158
/*
 * Choose one middle-row output pixel (8 bit).
 * side is the horizontal neighbour on the output's side, up/down the
 * vertical neighbours of mid, and up_side/down_side the diagonal neighbours
 * on that same side. Returns side when it matches one vertical neighbour and
 * mid differs from the opposite diagonal; otherwise returns mid.
 */
static inline scale2x_uint8 scale2x_8_def_center_pick(scale2x_uint8 side, scale2x_uint8 mid, scale2x_uint8 up, scale2x_uint8 down, scale2x_uint8 up_side, scale2x_uint8 down_side)
{
	if (side == up && mid != down_side)
		return side;
	if (side == down && mid != up_side)
		return side;
	return mid;
}

/*
 * Scale one row of 8 bit pixels into a middle destination row (used by the
 * 2x3 and 2x4 variants).
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_8_def_center(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: the left output is always a plain copy */
	dst[0] = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1])
		dst[1] = scale2x_8_def_center_pick(src1[1], src1[0], src0[0], src2[0], src0[1], src2[1]);
	else
		dst[1] = src1[0];
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		const scale2x_uint8 mid = src1[x];

		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			dst[0] = scale2x_8_def_center_pick(src1[x - 1], mid, src0[x], src2[x], src0[x - 1], src2[x - 1]);
			dst[1] = scale2x_8_def_center_pick(src1[x + 1], mid, src0[x], src2[x], src0[x + 1], src2[x + 1]);
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
		dst += 2;
	}

	/* rightmost pixel: the right output is always a plain copy */
	if (src0[last] != src2[last] && src1[last - 1] != src1[last])
		dst[0] = scale2x_8_def_center_pick(src1[last - 1], src1[last], src0[last], src2[last], src0[last - 1], src2[last - 1]);
	else
		dst[0] = src1[last];
	dst[1] = src1[last];
}
203
/*
 * Write the 2x2 output cell generated by one 16 bit source pixel.
 * left/right are the horizontal neighbours of mid, up/down the vertical ones.
 * All four outputs default to mid. Only when up != down and left != right
 * does a corner take the vertical neighbour that equals its horizontal one.
 */
static inline void scale2x_16_def_whole_cell(scale2x_uint16* top, scale2x_uint16* bottom, scale2x_uint16 left, scale2x_uint16 mid, scale2x_uint16 right, scale2x_uint16 up, scale2x_uint16 down)
{
	scale2x_uint16 tl = mid;
	scale2x_uint16 tr = mid;
	scale2x_uint16 bl = mid;
	scale2x_uint16 br = mid;

	if (up != down && left != right) {
		if (left == up)
			tl = up;
		if (right == up)
			tr = up;
		if (left == down)
			bl = down;
		if (right == down)
			br = down;
	}

	top[0] = tl;
	top[1] = tr;
	bottom[0] = bl;
	bottom[1] = br;
}

/*
 * Scale one row of 16 bit pixels into both destination rows in a single pass.
 * src0/src1/src2 are the previous, current and next source rows of `count`
 * pixels; dst0/dst1 receive 2*count pixels each.
 * count must be at least 2 (asserted).
 */
static inline void scale2x_16_def_whole(scale2x_uint16* restrict dst0, scale2x_uint16* restrict dst1, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_16_def_whole_cell(dst0, dst1, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst0 += 2;
	dst1 += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_16_def_whole_cell(dst0, dst1, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_16_def_whole_cell(dst0, dst1, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
262
/*
 * Write the two output pixels generated by one 16 bit source pixel for the
 * output row facing `near`.
 * Both outputs default to mid. Only when near != far and left != right does
 * an output take `near`, if the horizontal neighbour on its side equals it.
 */
static inline void scale2x_16_def_border_pair(scale2x_uint16* out, scale2x_uint16 left, scale2x_uint16 mid, scale2x_uint16 right, scale2x_uint16 near, scale2x_uint16 far)
{
	scale2x_uint16 first = mid;
	scale2x_uint16 second = mid;

	if (near != far && left != right) {
		if (left == near)
			first = near;
		if (right == near)
			second = near;
	}

	out[0] = first;
	out[1] = second;
}

/*
 * Scale one row of 16 bit pixels into a single destination row, the one on
 * the src0 side. Calling it with src0 and src2 exchanged yields the row on
 * the other side.
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_16_def_border(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_16_def_border_pair(dst, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_16_def_border_pair(dst, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_16_def_border_pair(dst, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
307
/*
 * Choose one middle-row output pixel (16 bit).
 * side is the horizontal neighbour on the output's side, up/down the
 * vertical neighbours of mid, and up_side/down_side the diagonal neighbours
 * on that same side. Returns side when it matches one vertical neighbour and
 * mid differs from the opposite diagonal; otherwise returns mid.
 */
static inline scale2x_uint16 scale2x_16_def_center_pick(scale2x_uint16 side, scale2x_uint16 mid, scale2x_uint16 up, scale2x_uint16 down, scale2x_uint16 up_side, scale2x_uint16 down_side)
{
	if (side == up && mid != down_side)
		return side;
	if (side == down && mid != up_side)
		return side;
	return mid;
}

/*
 * Scale one row of 16 bit pixels into a middle destination row (used by the
 * 2x3 and 2x4 variants).
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_16_def_center(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: the left output is always a plain copy */
	dst[0] = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1])
		dst[1] = scale2x_16_def_center_pick(src1[1], src1[0], src0[0], src2[0], src0[1], src2[1]);
	else
		dst[1] = src1[0];
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		const scale2x_uint16 mid = src1[x];

		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			dst[0] = scale2x_16_def_center_pick(src1[x - 1], mid, src0[x], src2[x], src0[x - 1], src2[x - 1]);
			dst[1] = scale2x_16_def_center_pick(src1[x + 1], mid, src0[x], src2[x], src0[x + 1], src2[x + 1]);
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
		dst += 2;
	}

	/* rightmost pixel: the right output is always a plain copy */
	if (src0[last] != src2[last] && src1[last - 1] != src1[last])
		dst[0] = scale2x_16_def_center_pick(src1[last - 1], src1[last], src0[last], src2[last], src0[last - 1], src2[last - 1]);
	else
		dst[0] = src1[last];
	dst[1] = src1[last];
}
352
/*
 * Write the 2x2 output cell generated by one 32 bit source pixel.
 * left/right are the horizontal neighbours of mid, up/down the vertical ones.
 * All four outputs default to mid. Only when up != down and left != right
 * does a corner take the vertical neighbour that equals its horizontal one.
 */
static inline void scale2x_32_def_whole_cell(scale2x_uint32* top, scale2x_uint32* bottom, scale2x_uint32 left, scale2x_uint32 mid, scale2x_uint32 right, scale2x_uint32 up, scale2x_uint32 down)
{
	scale2x_uint32 tl = mid;
	scale2x_uint32 tr = mid;
	scale2x_uint32 bl = mid;
	scale2x_uint32 br = mid;

	if (up != down && left != right) {
		if (left == up)
			tl = up;
		if (right == up)
			tr = up;
		if (left == down)
			bl = down;
		if (right == down)
			br = down;
	}

	top[0] = tl;
	top[1] = tr;
	bottom[0] = bl;
	bottom[1] = br;
}

/*
 * Scale one row of 32 bit pixels into both destination rows in a single pass.
 * src0/src1/src2 are the previous, current and next source rows of `count`
 * pixels; dst0/dst1 receive 2*count pixels each.
 * count must be at least 2 (asserted).
 */
static inline void scale2x_32_def_whole(scale2x_uint32* restrict dst0, scale2x_uint32* restrict dst1, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_32_def_whole_cell(dst0, dst1, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst0 += 2;
	dst1 += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_32_def_whole_cell(dst0, dst1, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst0 += 2;
		dst1 += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_32_def_whole_cell(dst0, dst1, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
411
/*
 * Write the two output pixels generated by one 32 bit source pixel for the
 * output row facing `near`.
 * Both outputs default to mid. Only when near != far and left != right does
 * an output take `near`, if the horizontal neighbour on its side equals it.
 */
static inline void scale2x_32_def_border_pair(scale2x_uint32* out, scale2x_uint32 left, scale2x_uint32 mid, scale2x_uint32 right, scale2x_uint32 near, scale2x_uint32 far)
{
	scale2x_uint32 first = mid;
	scale2x_uint32 second = mid;

	if (near != far && left != right) {
		if (left == near)
			first = near;
		if (right == near)
			second = near;
	}

	out[0] = first;
	out[1] = second;
}

/*
 * Scale one row of 32 bit pixels into a single destination row, the one on
 * the src0 side. Calling it with src0 and src2 exchanged yields the row on
 * the other side.
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_32_def_border(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: it stands in for its own left neighbour */
	scale2x_32_def_border_pair(dst, src1[0], src1[0], src1[1], src0[0], src2[0]);
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		scale2x_32_def_border_pair(dst, src1[x - 1], src1[x], src1[x + 1], src0[x], src2[x]);
		dst += 2;
	}

	/* rightmost pixel: it stands in for its own right neighbour */
	scale2x_32_def_border_pair(dst, src1[last - 1], src1[last], src1[last], src0[last], src2[last]);
}
456
/*
 * Choose one middle-row output pixel (32 bit).
 * side is the horizontal neighbour on the output's side, up/down the
 * vertical neighbours of mid, and up_side/down_side the diagonal neighbours
 * on that same side. Returns side when it matches one vertical neighbour and
 * mid differs from the opposite diagonal; otherwise returns mid.
 */
static inline scale2x_uint32 scale2x_32_def_center_pick(scale2x_uint32 side, scale2x_uint32 mid, scale2x_uint32 up, scale2x_uint32 down, scale2x_uint32 up_side, scale2x_uint32 down_side)
{
	if (side == up && mid != down_side)
		return side;
	if (side == down && mid != up_side)
		return side;
	return mid;
}

/*
 * Scale one row of 32 bit pixels into a middle destination row (used by the
 * 2x3 and 2x4 variants).
 * dst receives 2*count pixels; count must be at least 2 (asserted).
 */
static inline void scale2x_32_def_center(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	const unsigned last = count - 1;
	unsigned x;

	assert(count >= 2);

	/* leftmost pixel: the left output is always a plain copy */
	dst[0] = src1[0];
	if (src0[0] != src2[0] && src1[0] != src1[1])
		dst[1] = scale2x_32_def_center_pick(src1[1], src1[0], src0[0], src2[0], src0[1], src2[1]);
	else
		dst[1] = src1[0];
	dst += 2;

	/* interior pixels */
	for (x = 1; x != last; ++x) {
		const scale2x_uint32 mid = src1[x];

		if (src0[x] != src2[x] && src1[x - 1] != src1[x + 1]) {
			dst[0] = scale2x_32_def_center_pick(src1[x - 1], mid, src0[x], src2[x], src0[x - 1], src2[x - 1]);
			dst[1] = scale2x_32_def_center_pick(src1[x + 1], mid, src0[x], src2[x], src0[x + 1], src2[x + 1]);
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
		dst += 2;
	}

	/* rightmost pixel: the right output is always a plain copy */
	if (src0[last] != src2[last] && src1[last - 1] != src1[last])
		dst[0] = scale2x_32_def_center_pick(src1[last - 1], src1[last], src0[last], src2[last], src0[last - 1], src2[last - 1]);
	else
		dst[0] = src1[last];
	dst[1] = src1[last];
}
501
502 /**
503 * Scale by a factor of 2 a row of pixels of 8 bits.
504 * The function is implemented in C.
505 * The pixels over the left and right borders are assumed of the same color of
506 * the pixels on the border.
507 * Note that the implementation is optimized to write data sequentially to
508 * maximize the bandwidth on video memory.
509 * \param src0 Pointer at the first pixel of the previous row.
510 * \param src1 Pointer at the first pixel of the current row.
511 * \param src2 Pointer at the first pixel of the next row.
512 * \param count Length in pixels of the src0, src1 and src2 rows.
513 * It must be at least 2.
514 * \param dst0 First destination row, double length in pixels.
515 * \param dst1 Second destination row, double length in pixels.
516 */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)517 void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
518 {
519 #ifdef USE_SCALE_RANDOMWRITE
520 scale2x_8_def_whole(dst0, dst1, src0, src1, src2, count);
521 #else
522 scale2x_8_def_border(dst0, src0, src1, src2, count);
523 scale2x_8_def_border(dst1, src2, src1, src0, count);
524 #endif
525 }
526
527 /**
528 * Scale by a factor of 2 a row of pixels of 16 bits.
529 * This function operates like scale2x_8_def() but for 16 bits pixels.
530 * \param src0 Pointer at the first pixel of the previous row.
531 * \param src1 Pointer at the first pixel of the current row.
532 * \param src2 Pointer at the first pixel of the next row.
533 * \param count Length in pixels of the src0, src1 and src2 rows.
534 * It must be at least 2.
535 * \param dst0 First destination row, double length in pixels.
536 * \param dst1 Second destination row, double length in pixels.
537 */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)538 void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
539 {
540 #ifdef USE_SCALE_RANDOMWRITE
541 scale2x_16_def_whole(dst0, dst1, src0, src1, src2, count);
542 #else
543 scale2x_16_def_border(dst0, src0, src1, src2, count);
544 scale2x_16_def_border(dst1, src2, src1, src0, count);
545 #endif
546 }
547
548 /**
549 * Scale by a factor of 2 a row of pixels of 32 bits.
550 * This function operates like scale2x_8_def() but for 32 bits pixels.
551 * \param src0 Pointer at the first pixel of the previous row.
552 * \param src1 Pointer at the first pixel of the current row.
553 * \param src2 Pointer at the first pixel of the next row.
554 * \param count Length in pixels of the src0, src1 and src2 rows.
555 * It must be at least 2.
556 * \param dst0 First destination row, double length in pixels.
557 * \param dst1 Second destination row, double length in pixels.
558 */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)559 void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
560 {
561 #ifdef USE_SCALE_RANDOMWRITE
562 scale2x_32_def_whole(dst0, dst1, src0, src1, src2, count);
563 #else
564 scale2x_32_def_border(dst0, src0, src1, src2, count);
565 scale2x_32_def_border(dst1, src2, src1, src0, count);
566 #endif
567 }
568
569 /**
570 * Scale by a factor of 2x3 a row of pixels of 8 bits.
571 * \note Like scale2x_8_def();
572 */
scale2x3_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)573 void scale2x3_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
574 {
575 #ifdef USE_SCALE_RANDOMWRITE
576 scale2x_8_def_whole(dst0, dst2, src0, src1, src2, count);
577 scale2x_8_def_center(dst1, src0, src1, src2, count);
578 #else
579 scale2x_8_def_border(dst0, src0, src1, src2, count);
580 scale2x_8_def_center(dst1, src0, src1, src2, count);
581 scale2x_8_def_border(dst2, src2, src1, src0, count);
582 #endif
583 }
584
585 /**
586 * Scale by a factor of 2x3 a row of pixels of 16 bits.
587 * \note Like scale2x_16_def();
588 */
scale2x3_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)589 void scale2x3_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
590 {
591 #ifdef USE_SCALE_RANDOMWRITE
592 scale2x_16_def_whole(dst0, dst2, src0, src1, src2, count);
593 scale2x_16_def_center(dst1, src0, src1, src2, count);
594 #else
595 scale2x_16_def_border(dst0, src0, src1, src2, count);
596 scale2x_16_def_center(dst1, src0, src1, src2, count);
597 scale2x_16_def_border(dst2, src2, src1, src0, count);
598 #endif
599 }
600
601 /**
602 * Scale by a factor of 2x3 a row of pixels of 32 bits.
603 * \note Like scale2x_32_def();
604 */
scale2x3_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)605 void scale2x3_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
606 {
607 #ifdef USE_SCALE_RANDOMWRITE
608 scale2x_32_def_whole(dst0, dst2, src0, src1, src2, count);
609 scale2x_32_def_center(dst1, src0, src1, src2, count);
610 #else
611 scale2x_32_def_border(dst0, src0, src1, src2, count);
612 scale2x_32_def_center(dst1, src0, src1, src2, count);
613 scale2x_32_def_border(dst2, src2, src1, src0, count);
614 #endif
615 }
616
617 /**
618 * Scale by a factor of 2x4 a row of pixels of 8 bits.
619 * \note Like scale2x_8_def();
620 */
scale2x4_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)621 void scale2x4_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
622 {
623 #ifdef USE_SCALE_RANDOMWRITE
624 scale2x_8_def_whole(dst0, dst3, src0, src1, src2, count);
625 scale2x_8_def_center(dst1, src0, src1, src2, count);
626 scale2x_8_def_center(dst2, src0, src1, src2, count);
627 #else
628 scale2x_8_def_border(dst0, src0, src1, src2, count);
629 scale2x_8_def_center(dst1, src0, src1, src2, count);
630 scale2x_8_def_center(dst2, src0, src1, src2, count);
631 scale2x_8_def_border(dst3, src2, src1, src0, count);
632 #endif
633 }
634
635 /**
636 * Scale by a factor of 2x4 a row of pixels of 16 bits.
637 * \note Like scale2x_16_def();
638 */
scale2x4_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)639 void scale2x4_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
640 {
641 #ifdef USE_SCALE_RANDOMWRITE
642 scale2x_16_def_whole(dst0, dst3, src0, src1, src2, count);
643 scale2x_16_def_center(dst1, src0, src1, src2, count);
644 scale2x_16_def_center(dst2, src0, src1, src2, count);
645 #else
646 scale2x_16_def_border(dst0, src0, src1, src2, count);
647 scale2x_16_def_center(dst1, src0, src1, src2, count);
648 scale2x_16_def_center(dst2, src0, src1, src2, count);
649 scale2x_16_def_border(dst3, src2, src1, src0, count);
650 #endif
651 }
652
653 /**
654 * Scale by a factor of 2x4 a row of pixels of 32 bits.
655 * \note Like scale2x_32_def();
656 */
scale2x4_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)657 void scale2x4_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
658 {
659 #ifdef USE_SCALE_RANDOMWRITE
660 scale2x_32_def_whole(dst0, dst3, src0, src1, src2, count);
661 scale2x_32_def_center(dst1, src0, src1, src2, count);
662 scale2x_32_def_center(dst2, src0, src1, src2, count);
663 #else
664 scale2x_32_def_border(dst0, src0, src1, src2, count);
665 scale2x_32_def_center(dst1, src0, src1, src2, count);
666 scale2x_32_def_center(dst2, src0, src1, src2, count);
667 scale2x_32_def_border(dst3, src2, src1, src0, count);
668 #endif
669 }
670
671 /***************************************************************************/
672 /* Scale2x MMX implementation */
673
674 #if defined(__GNUC__) && (defined(HAVE_MMX) || defined(__amd64__))
675
676 /*
677 * Apply the Scale2x effect at a single row.
678 * This function must be called only by the other scale2x functions.
679 *
680 * Considering the pixel map :
681 *
682 * ABC (src0)
683 * DEF (src1)
684 * GHI (src2)
685 *
686 * this function computes 2 new pixels in substitution of the source pixel E
687 * like this map :
688 *
689 * ab (dst)
690 *
691 * with these variables :
692 *
693 * &current -> E
694 * &current_left -> D
695 * &current_right -> F
696 * &current_upper -> B
697 * &current_lower -> H
698 *
699 * %0 -> current_upper
700 * %1 -> current
701 * %2 -> current_lower
702 * %3 -> dst
703 * %4 -> counter
704 *
705 * %mm0 -> *current_left
706 * %mm1 -> *current_next
707 * %mm2 -> tmp0
708 * %mm3 -> tmp1
709 * %mm4 -> tmp2
710 * %mm5 -> tmp3
711 * %mm6 -> *current_upper
712 * %mm7 -> *current
713 */
/*
 * MMX version of the 8 bit border row: writes one destination row of
 * 2*count pixels from the three source rows.
 *
 * The row is processed in runs of 8 pixels (one 64 bit load per source row,
 * two 64 bit stores to dst). The first and the final run are always
 * executed; count/8 - 2 central runs are executed in between.
 * In the first run the left neighbour of pixel 0 is pixel 0 itself, and in
 * the final run the right neighbour of the last pixel is that pixel itself.
 *
 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = run counter.
 *
 * count must be a multiple of 8 and at least 16 (both asserted).
 *
 * The asm reads the source rows and writes dst through pointer operands, so
 * it declares a "memory" clobber; without it the compiler is free to cache
 * or reorder accesses to those buffers around the asm statement.
 *
 * NOTE(review): registers %mm0-%mm7 are used and no emms is issued here;
 * presumably the caller resets the MMX state - confirm against callers.
 */
static inline void scale2x_8_mmx_border(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* always do the first and last run */
	count -= 2*8;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* turn the remaining pixel count into a run count; skip if none */
		"shr $3, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "memory": the source rows are read and dst is written via pointers */
		: "cc", "memory"
	);
}
904
/**
 * Compute one destination row of the Scale2x effect for 16 bits pixels
 * using MMX instructions.
 * Every run loads 8 bytes (4 pixels) of each source row and stores 16 bytes
 * (8 pixels) at dst. For each source pixel two destination pixels are
 * produced:
 *   left  = (pre == upper && pre != lower && pre != next) ? upper : current
 *   right = (next == upper && next != lower && next != pre) ? upper : current
 * where pre/next are the horizontal neighbours on the src1 row, upper is the
 * pixel on src0 and lower is the pixel on src2.
 * The first pixel of the row is used as its own left neighbour and the last
 * pixel as its own right neighbour.
 * The MMX registers mm0-mm7 are overwritten and no EMMS is executed here.
 * \param dst Destination row, 2*count pixels are written.
 * \param src0 Row compared as "upper".
 * \param src1 Current row.
 * \param src2 Row compared as "lower".
 * \param count Length in pixels of the source rows. It must be at least 8
 * and a multiple of 4 (checked with assert).
 */
static inline void scale2x_16_mmx_border(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * Register use in every run:
	 *   mm7 = current pixels, mm0 = current_pre, mm1 = current_next,
	 *   mm6 = current_upper, (%2) = lower pixels read from memory.
	 */
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		/* the shift pair on mm0 isolates the first pixel, used as its own left neighbour */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* a mask is built with compare/and-not, then used to select upper or current */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, pixel by pixel */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs */
		/* convert the remaining pixel count in a run count, skip the loop if zero */
		"shr $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours at the edges come from the previous and next 8 bytes */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		/* the shift pair on mm1 isolates the last pixel, used as its own right neighbour */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory" is required because the source rows are read and the
		 * destination row is written through the pointer operands, and
		 * these accesses are not described by any memory operand.
		 */
		: "cc", "memory"
	);
}
1095
/**
 * Compute one destination row of the Scale2x effect for 32 bits pixels
 * using MMX instructions.
 * Every run loads 8 bytes (2 pixels) of each source row and stores 16 bytes
 * (4 pixels) at dst. For each source pixel two destination pixels are
 * produced:
 *   left  = (pre == upper && pre != lower && pre != next) ? upper : current
 *   right = (next == upper && next != lower && next != pre) ? upper : current
 * where pre/next are the horizontal neighbours on the src1 row, upper is the
 * pixel on src0 and lower is the pixel on src2.
 * The first pixel of the row is used as its own left neighbour and the last
 * pixel as its own right neighbour.
 * The MMX registers mm0-mm7 are overwritten and no EMMS is executed here.
 * \param dst Destination row, 2*count pixels are written.
 * \param src0 Row compared as "upper".
 * \param src1 Current row.
 * \param src2 Row compared as "lower".
 * \param count Length in pixels of the source rows. It must be at least 4
 * and a multiple of 2 (checked with assert).
 */
static inline void scale2x_32_mmx_border(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * Register use in every run:
	 *   mm7 = current pixels, mm0 = current_pre, mm1 = current_next,
	 *   mm6 = current_upper, (%2) = lower pixels read from memory.
	 */
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		/* the shift pair on mm0 isolates the first pixel, used as its own left neighbour */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* a mask is built with compare/and-not, then used to select upper or current */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, pixel by pixel */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		/* central runs */
		/* convert the remaining pixel count in a run count, skip the loop if zero */
		"shr $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours at the edges come from the previous and next 8 bytes */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		/* the shift pair on mm1 isolates the last pixel, used as its own right neighbour */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/*
		 * "memory" is required because the source rows are read and the
		 * destination row is written through the pointer operands, and
		 * these accesses are not described by any memory operand.
		 */
		: "cc", "memory"
	);
}
1286
1287 /**
1288 * Scale by a factor of 2 a row of pixels of 8 bits.
1289 * This is a very fast MMX implementation.
1290 * The implementation uses a combination of cmp/and/not operations to
1291 * completly remove the need of conditional jumps. This trick give the
1292 * major speed improvement.
1293 * Also, using the 8 bytes MMX registers more than one pixel are computed
1294 * at the same time.
1295 * Before calling this function you must ensure that the currenct CPU supports
1296 * the MMX instruction set. After calling it you must be sure to call the EMMS
1297 * instruction before any floating-point operation.
1298 * The pixels over the left and right borders are assumed of the same color of
1299 * the pixels on the border.
1300 * Note that the implementation is optimized to write data sequentially to
1301 * maximize the bandwidth on video memory.
1302 * \param src0 Pointer at the first pixel of the previous row.
1303 * \param src1 Pointer at the first pixel of the current row.
1304 * \param src2 Pointer at the first pixel of the next row.
1305 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1306 * be at least 16 and a multiple of 8.
1307 * \param dst0 First destination row, double length in pixels.
1308 * \param dst1 Second destination row, double length in pixels.
1309 */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1310 void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1311 {
1312 if (count % 8 != 0 || count < 16) {
1313 scale2x_8_def(dst0, dst1, src0, src1, src2, count);
1314 } else {
1315 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1316 scale2x_8_mmx_border(dst1, src2, src1, src0, count);
1317 }
1318 }
1319
1320 /**
1321 * Scale by a factor of 2 a row of pixels of 16 bits.
1322 * This function operates like scale2x_8_mmx() but for 16 bits pixels.
1323 * \param src0 Pointer at the first pixel of the previous row.
1324 * \param src1 Pointer at the first pixel of the current row.
1325 * \param src2 Pointer at the first pixel of the next row.
1326 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1327 * be at least 8 and a multiple of 4.
1328 * \param dst0 First destination row, double length in pixels.
1329 * \param dst1 Second destination row, double length in pixels.
1330 */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1331 void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1332 {
1333 if (count % 4 != 0 || count < 8) {
1334 scale2x_16_def(dst0, dst1, src0, src1, src2, count);
1335 } else {
1336 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1337 scale2x_16_mmx_border(dst1, src2, src1, src0, count);
1338 }
1339 }
1340
1341 /**
1342 * Scale by a factor of 2 a row of pixels of 32 bits.
1343 * This function operates like scale2x_8_mmx() but for 32 bits pixels.
1344 * \param src0 Pointer at the first pixel of the previous row.
1345 * \param src1 Pointer at the first pixel of the current row.
1346 * \param src2 Pointer at the first pixel of the next row.
1347 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1348 * be at least 4 and a multiple of 2.
1349 * \param dst0 First destination row, double length in pixels.
1350 * \param dst1 Second destination row, double length in pixels.
1351 */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1352 void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1353 {
1354 if (count % 2 != 0 || count < 4) {
1355 scale2x_32_def(dst0, dst1, src0, src1, src2, count);
1356 } else {
1357 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1358 scale2x_32_mmx_border(dst1, src2, src1, src0, count);
1359 }
1360 }
1361
1362 /**
1363 * Scale by a factor of 2x3 a row of pixels of 8 bits.
1364 * This function operates like scale2x_8_mmx() but with an expansion
1365 * factor of 2x3 instead of 2x2.
1366 */
scale2x3_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1367 void scale2x3_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1368 {
1369 if (count % 8 != 0 || count < 16) {
1370 scale2x3_8_def(dst0, dst1, dst2, src0, src1, src2, count);
1371 } else {
1372 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1373 scale2x_8_def_center(dst1, src0, src1, src2, count);
1374 scale2x_8_mmx_border(dst2, src2, src1, src0, count);
1375 }
1376 }
1377
1378 /**
1379 * Scale by a factor of 2x3 a row of pixels of 16 bits.
1380 * This function operates like scale2x_16_mmx() but with an expansion
1381 * factor of 2x3 instead of 2x2.
1382 */
scale2x3_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1383 void scale2x3_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1384 {
1385 if (count % 4 != 0 || count < 8) {
1386 scale2x3_16_def(dst0, dst1, dst2, src0, src1, src2, count);
1387 } else {
1388 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1389 scale2x_16_def_center(dst1, src0, src1, src2, count);
1390 scale2x_16_mmx_border(dst2, src2, src1, src0, count);
1391 }
1392 }
1393
1394 /**
1395 * Scale by a factor of 2x3 a row of pixels of 32 bits.
1396 * This function operates like scale2x_32_mmx() but with an expansion
1397 * factor of 2x3 instead of 2x2.
1398 */
scale2x3_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1399 void scale2x3_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1400 {
1401 if (count % 2 != 0 || count < 4) {
1402 scale2x3_32_def(dst0, dst1, dst2, src0, src1, src2, count);
1403 } else {
1404 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1405 scale2x_32_def_center(dst1, src0, src1, src2, count);
1406 scale2x_32_mmx_border(dst2, src2, src1, src0, count);
1407 }
1408 }
1409
1410 /**
1411 * Scale by a factor of 2x4 a row of pixels of 8 bits.
1412 * This function operates like scale2x_8_mmx() but with an expansion
1413 * factor of 2x4 instead of 2x2.
1414 */
scale2x4_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1415 void scale2x4_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1416 {
1417 if (count % 8 != 0 || count < 16) {
1418 scale2x4_8_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1419 } else {
1420 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1421 scale2x_8_def_center(dst1, src0, src1, src2, count);
1422 scale2x_8_def_center(dst2, src0, src1, src2, count);
1423 scale2x_8_mmx_border(dst3, src2, src1, src0, count);
1424 }
1425 }
1426
1427 /**
1428 * Scale by a factor of 2x4 a row of pixels of 16 bits.
1429 * This function operates like scale2x_16_mmx() but with an expansion
1430 * factor of 2x4 instead of 2x2.
1431 */
scale2x4_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1432 void scale2x4_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1433 {
1434 if (count % 4 != 0 || count < 8) {
1435 scale2x4_16_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1436 } else {
1437 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1438 scale2x_16_def_center(dst1, src0, src1, src2, count);
1439 scale2x_16_def_center(dst2, src0, src1, src2, count);
1440 scale2x_16_mmx_border(dst3, src2, src1, src0, count);
1441 }
1442 }
1443
1444 /**
1445 * Scale by a factor of 2x4 a row of pixels of 32 bits.
1446 * This function operates like scale2x_32_mmx() but with an expansion
1447 * factor of 2x4 instead of 2x2.
1448 */
scale2x4_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1449 void scale2x4_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1450 {
1451 if (count % 2 != 0 || count < 4) {
1452 scale2x4_32_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1453 } else {
1454 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1455 scale2x_32_def_center(dst1, src0, src1, src2, count);
1456 scale2x_32_def_center(dst2, src0, src1, src2, count);
1457 scale2x_32_mmx_border(dst3, src2, src1, src0, count);
1458 }
1459 }
1460
1461 #endif
1462
1463