1 /*
2  * This file is part of the Scale2x project.
3  *
4  * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
21 /*
22  * This file contains a C and MMX implementation of the Scale2x effect.
23  *
 * You can find a high-level description of the effect at:
25  *
26  * http://scale2x.sourceforge.net/
27  *
28  * Alternatively at the previous license terms, you are allowed to use this
29  * code in your program with these conditions:
30  * - the program is not used in commercial activities.
31  * - the whole source code of the program is released with the binary.
32  * - derivative works of the program are allowed.
33  */
34 
35 #include "scale2x.h"
36 
37 #include <assert.h>
38 
39 /***************************************************************************/
40 /* Scale2x C implementation */
41 
42 /**
43  * Define the macro USE_SCALE_RANDOMWRITE to enable
44  * an optimized version which writes memory in random order.
45  * This version is a little faster if you write in system memory.
46  * But it's a lot slower if you write in video memory.
47  * So, enable it only if you are sure to never write directly in video memory.
48  */
49 /* #define USE_SCALE_RANDOMWRITE */
50 
51 #ifdef USE_SCALE_RANDOMWRITE
52 
/*
 * Expand one row of 8 bit pixels into both destination rows in one pass.
 *
 * For every source pixel two pixels are stored in dst0 and two in dst1.
 * When the pixel above (src0) differs from the pixel below (src2) and the
 * left neighbour differs from the right neighbour, an output pixel takes
 * the color of the vertical neighbour on its side (src0 for dst0, src2 for
 * dst1) if that color equals the horizontal neighbour on its side.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; each
 * destination row receives 2 * count pixels.
 */
static inline void scale2x_8_def_whole(scale2x_uint8* restrict dst0, scale2x_uint8* restrict dst1, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint8 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	if (up != down && mid != right) {
		dst0[0] = mid;
		dst0[1] = right == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = right == down ? down : mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst0 += 2;
		dst1 += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != down && left != right) {
			dst0[0] = left == up ? up : mid;
			dst0[1] = right == up ? up : mid;
			dst1[0] = left == down ? down : mid;
			dst1[1] = right == down ? down : mid;
		} else {
			dst0[0] = mid;
			dst0[1] = mid;
			dst1[0] = mid;
			dst1[1] = mid;
		}
	}

	dst0 += 2;
	dst1 += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != down && left != mid) {
		dst0[0] = left == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = left == down ? down : mid;
		dst1[1] = mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}
}
111 
112 #endif
113 
/*
 * Expand one row of 8 bit pixels into a single destination row of double
 * length.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * When the pixel in src0 differs from the pixel in src2 and the left
 * neighbour differs from the right neighbour, an output pixel takes the
 * color of the src0 pixel if that color equals the horizontal neighbour on
 * its side. In every other case it is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_8_def_border(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint8 up, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	mid = src1[0];
	right = src1[1];
	if (up != src2[0] && mid != right) {
		dst[0] = mid;
		dst[1] = right == up ? up : mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != src2[x] && left != right) {
			dst[0] = left == up ? up : mid;
			dst[1] = right == up ? up : mid;
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
	}

	dst += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != src2[last] && left != mid) {
		dst[0] = left == up ? up : mid;
		dst[1] = mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}
}
158 
/*
 * Expand one row of 8 bit pixels into a single destination row of double
 * length, using the rule for a row that has no vertical bias.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * An output pixel takes the color of the horizontal neighbour on its side
 * only when all of these hold:
 *  - the pixel in src0 differs from the pixel in src2,
 *  - the left neighbour differs from the right neighbour,
 *  - the horizontal neighbour equals the src0 pixel while the source pixel
 *    differs from the src2 pixel on the neighbour's column, or the
 *    horizontal neighbour equals the src2 pixel while the source pixel
 *    differs from the src0 pixel on the neighbour's column.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the outward half is always a plain copy.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_8_def_center(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	int edge;
	int take_left;
	int take_right;
	scale2x_uint8 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: only the right half can differ from the source */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	edge = up != down && mid != right;
	take_right = edge && ((right == up && mid != src2[1]) || (right == down && mid != src0[1]));
	dst[0] = mid;
	dst[1] = take_right ? right : mid;

	/* inner pixels: both halves are evaluated */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		edge = up != down && left != right;
		take_left = edge && ((left == up && mid != src2[x - 1]) || (left == down && mid != src0[x - 1]));
		take_right = edge && ((right == up && mid != src2[x + 1]) || (right == down && mid != src0[x + 1]));

		dst[0] = take_left ? left : mid;
		dst[1] = take_right ? right : mid;
	}

	dst += 2;

	/* rightmost pixel: only the left half can differ from the source */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	edge = up != down && left != mid;
	take_left = edge && ((left == up && mid != src2[last - 1]) || (left == down && mid != src0[last - 1]));
	dst[0] = take_left ? left : mid;
	dst[1] = mid;
}
203 
204 #ifdef USE_SCALE_RANDOMWRITE
205 
/*
 * Expand one row of 16 bit pixels into both destination rows in one pass.
 *
 * For every source pixel two pixels are stored in dst0 and two in dst1.
 * When the pixel above (src0) differs from the pixel below (src2) and the
 * left neighbour differs from the right neighbour, an output pixel takes
 * the color of the vertical neighbour on its side (src0 for dst0, src2 for
 * dst1) if that color equals the horizontal neighbour on its side.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; each
 * destination row receives 2 * count pixels.
 */
static inline void scale2x_16_def_whole(scale2x_uint16* restrict dst0, scale2x_uint16* restrict dst1, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint16 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	if (up != down && mid != right) {
		dst0[0] = mid;
		dst0[1] = right == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = right == down ? down : mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst0 += 2;
		dst1 += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != down && left != right) {
			dst0[0] = left == up ? up : mid;
			dst0[1] = right == up ? up : mid;
			dst1[0] = left == down ? down : mid;
			dst1[1] = right == down ? down : mid;
		} else {
			dst0[0] = mid;
			dst0[1] = mid;
			dst1[0] = mid;
			dst1[1] = mid;
		}
	}

	dst0 += 2;
	dst1 += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != down && left != mid) {
		dst0[0] = left == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = left == down ? down : mid;
		dst1[1] = mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}
}
264 
265 #endif
266 
/*
 * Expand one row of 16 bit pixels into a single destination row of double
 * length.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * When the pixel in src0 differs from the pixel in src2 and the left
 * neighbour differs from the right neighbour, an output pixel takes the
 * color of the src0 pixel if that color equals the horizontal neighbour on
 * its side. In every other case it is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_16_def_border(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint16 up, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	mid = src1[0];
	right = src1[1];
	if (up != src2[0] && mid != right) {
		dst[0] = mid;
		dst[1] = right == up ? up : mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != src2[x] && left != right) {
			dst[0] = left == up ? up : mid;
			dst[1] = right == up ? up : mid;
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
	}

	dst += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != src2[last] && left != mid) {
		dst[0] = left == up ? up : mid;
		dst[1] = mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}
}
311 
/*
 * Expand one row of 16 bit pixels into a single destination row of double
 * length, using the rule for a row that has no vertical bias.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * An output pixel takes the color of the horizontal neighbour on its side
 * only when all of these hold:
 *  - the pixel in src0 differs from the pixel in src2,
 *  - the left neighbour differs from the right neighbour,
 *  - the horizontal neighbour equals the src0 pixel while the source pixel
 *    differs from the src2 pixel on the neighbour's column, or the
 *    horizontal neighbour equals the src2 pixel while the source pixel
 *    differs from the src0 pixel on the neighbour's column.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the outward half is always a plain copy.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_16_def_center(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	int edge;
	int take_left;
	int take_right;
	scale2x_uint16 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: only the right half can differ from the source */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	edge = up != down && mid != right;
	take_right = edge && ((right == up && mid != src2[1]) || (right == down && mid != src0[1]));
	dst[0] = mid;
	dst[1] = take_right ? right : mid;

	/* inner pixels: both halves are evaluated */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		edge = up != down && left != right;
		take_left = edge && ((left == up && mid != src2[x - 1]) || (left == down && mid != src0[x - 1]));
		take_right = edge && ((right == up && mid != src2[x + 1]) || (right == down && mid != src0[x + 1]));

		dst[0] = take_left ? left : mid;
		dst[1] = take_right ? right : mid;
	}

	dst += 2;

	/* rightmost pixel: only the left half can differ from the source */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	edge = up != down && left != mid;
	take_left = edge && ((left == up && mid != src2[last - 1]) || (left == down && mid != src0[last - 1]));
	dst[0] = take_left ? left : mid;
	dst[1] = mid;
}
356 
357 #ifdef USE_SCALE_RANDOMWRITE
358 
/*
 * Expand one row of 32 bit pixels into both destination rows in one pass.
 *
 * For every source pixel two pixels are stored in dst0 and two in dst1.
 * When the pixel above (src0) differs from the pixel below (src2) and the
 * left neighbour differs from the right neighbour, an output pixel takes
 * the color of the vertical neighbour on its side (src0 for dst0, src2 for
 * dst1) if that color equals the horizontal neighbour on its side.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; each
 * destination row receives 2 * count pixels.
 */
static inline void scale2x_32_def_whole(scale2x_uint32* restrict dst0, scale2x_uint32* restrict dst1, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint32 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	if (up != down && mid != right) {
		dst0[0] = mid;
		dst0[1] = right == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = right == down ? down : mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst0 += 2;
		dst1 += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != down && left != right) {
			dst0[0] = left == up ? up : mid;
			dst0[1] = right == up ? up : mid;
			dst1[0] = left == down ? down : mid;
			dst1[1] = right == down ? down : mid;
		} else {
			dst0[0] = mid;
			dst0[1] = mid;
			dst1[0] = mid;
			dst1[1] = mid;
		}
	}

	dst0 += 2;
	dst1 += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != down && left != mid) {
		dst0[0] = left == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = left == down ? down : mid;
		dst1[1] = mid;
	} else {
		dst0[0] = mid;
		dst0[1] = mid;
		dst1[0] = mid;
		dst1[1] = mid;
	}
}
417 
418 #endif
419 
/*
 * Expand one row of 32 bit pixels into a single destination row of double
 * length.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * When the pixel in src0 differs from the pixel in src2 and the left
 * neighbour differs from the right neighbour, an output pixel takes the
 * color of the src0 pixel if that color equals the horizontal neighbour on
 * its side. In every other case it is a copy of the source pixel.
 * At the two ends of the row the missing horizontal neighbour is replaced
 * by the pixel itself.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_32_def_border(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	scale2x_uint32 up, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: the pixel itself acts as its own left neighbour, */
	/* so the left half of the output is always a plain copy */
	up = src0[0];
	mid = src1[0];
	right = src1[1];
	if (up != src2[0] && mid != right) {
		dst[0] = mid;
		dst[1] = right == up ? up : mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}

	/* inner pixels: both horizontal neighbours exist */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		if (up != src2[x] && left != right) {
			dst[0] = left == up ? up : mid;
			dst[1] = right == up ? up : mid;
		} else {
			dst[0] = mid;
			dst[1] = mid;
		}
	}

	dst += 2;

	/* rightmost pixel: the pixel itself acts as its own right neighbour, */
	/* so the right half of the output is always a plain copy */
	up = src0[last];
	left = src1[last - 1];
	mid = src1[last];
	if (up != src2[last] && left != mid) {
		dst[0] = left == up ? up : mid;
		dst[1] = mid;
	} else {
		dst[0] = mid;
		dst[1] = mid;
	}
}
464 
/*
 * Expand one row of 32 bit pixels into a single destination row of double
 * length, using the rule for a row that has no vertical bias.
 *
 * For every source pixel of src1 two pixels are stored in dst.
 * An output pixel takes the color of the horizontal neighbour on its side
 * only when all of these hold:
 *  - the pixel in src0 differs from the pixel in src2,
 *  - the left neighbour differs from the right neighbour,
 *  - the horizontal neighbour equals the src0 pixel while the source pixel
 *    differs from the src2 pixel on the neighbour's column, or the
 *    horizontal neighbour equals the src2 pixel while the source pixel
 *    differs from the src0 pixel on the neighbour's column.
 * In every other case the output pixel is a copy of the source pixel.
 * At the two ends of the row the outward half is always a plain copy.
 *
 * count is the number of source pixels and must be at least 2; dst
 * receives 2 * count pixels, written in increasing address order.
 */
static inline void scale2x_32_def_center(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned x;
	unsigned last;
	int edge;
	int take_left;
	int take_right;
	scale2x_uint32 up, down, left, mid, right;

	assert(count >= 2);

	last = count - 1;

	/* leftmost pixel: only the right half can differ from the source */
	up = src0[0];
	down = src2[0];
	mid = src1[0];
	right = src1[1];
	edge = up != down && mid != right;
	take_right = edge && ((right == up && mid != src2[1]) || (right == down && mid != src0[1]));
	dst[0] = mid;
	dst[1] = take_right ? right : mid;

	/* inner pixels: both halves are evaluated */
	for (x = 1; x < last; ++x) {
		dst += 2;

		up = src0[x];
		down = src2[x];
		left = src1[x - 1];
		mid = src1[x];
		right = src1[x + 1];

		edge = up != down && left != right;
		take_left = edge && ((left == up && mid != src2[x - 1]) || (left == down && mid != src0[x - 1]));
		take_right = edge && ((right == up && mid != src2[x + 1]) || (right == down && mid != src0[x + 1]));

		dst[0] = take_left ? left : mid;
		dst[1] = take_right ? right : mid;
	}

	dst += 2;

	/* rightmost pixel: only the left half can differ from the source */
	up = src0[last];
	down = src2[last];
	left = src1[last - 1];
	mid = src1[last];
	edge = up != down && left != mid;
	take_left = edge && ((left == up && mid != src2[last - 1]) || (left == down && mid != src0[last - 1]));
	dst[0] = take_left ? left : mid;
	dst[1] = mid;
}
509 
510 /**
511  * Scale by a factor of 2 a row of pixels of 8 bits.
512  * The function is implemented in C.
513  * The pixels over the left and right borders are assumed of the same color of
514  * the pixels on the border.
515  * Note that the implementation is optimized to write data sequentially to
516  * maximize the bandwidth on video memory.
517  * \param src0 Pointer at the first pixel of the previous row.
518  * \param src1 Pointer at the first pixel of the current row.
519  * \param src2 Pointer at the first pixel of the next row.
520  * \param count Length in pixels of the src0, src1 and src2 rows.
521  * It must be at least 2.
522  * \param dst0 First destination row, double length in pixels.
523  * \param dst1 Second destination row, double length in pixels.
524  */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)525 void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
526 {
527 #ifdef USE_SCALE_RANDOMWRITE
528 	scale2x_8_def_whole(dst0, dst1, src0, src1, src2, count);
529 #else
530 	scale2x_8_def_border(dst0, src0, src1, src2, count);
531 	scale2x_8_def_border(dst1, src2, src1, src0, count);
532 #endif
533 }
534 
535 /**
536  * Scale by a factor of 2 a row of pixels of 16 bits.
537  * This function operates like scale2x_8_def() but for 16 bits pixels.
538  * \param src0 Pointer at the first pixel of the previous row.
539  * \param src1 Pointer at the first pixel of the current row.
540  * \param src2 Pointer at the first pixel of the next row.
541  * \param count Length in pixels of the src0, src1 and src2 rows.
542  * It must be at least 2.
543  * \param dst0 First destination row, double length in pixels.
544  * \param dst1 Second destination row, double length in pixels.
545  */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)546 void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
547 {
548 #ifdef USE_SCALE_RANDOMWRITE
549 	scale2x_16_def_whole(dst0, dst1, src0, src1, src2, count);
550 #else
551 	scale2x_16_def_border(dst0, src0, src1, src2, count);
552 	scale2x_16_def_border(dst1, src2, src1, src0, count);
553 #endif
554 }
555 
556 /**
557  * Scale by a factor of 2 a row of pixels of 32 bits.
558  * This function operates like scale2x_8_def() but for 32 bits pixels.
559  * \param src0 Pointer at the first pixel of the previous row.
560  * \param src1 Pointer at the first pixel of the current row.
561  * \param src2 Pointer at the first pixel of the next row.
562  * \param count Length in pixels of the src0, src1 and src2 rows.
563  * It must be at least 2.
564  * \param dst0 First destination row, double length in pixels.
565  * \param dst1 Second destination row, double length in pixels.
566  */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)567 void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
568 {
569 #ifdef USE_SCALE_RANDOMWRITE
570 	scale2x_32_def_whole(dst0, dst1, src0, src1, src2, count);
571 #else
572 	scale2x_32_def_border(dst0, src0, src1, src2, count);
573 	scale2x_32_def_border(dst1, src2, src1, src0, count);
574 #endif
575 }
576 
577 /**
578  * Scale by a factor of 2x3 a row of pixels of 8 bits.
579  * \note Like scale2x_8_def();
580  */
scale2x3_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)581 void scale2x3_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
582 {
583 #ifdef USE_SCALE_RANDOMWRITE
584 	scale2x_8_def_whole(dst0, dst2, src0, src1, src2, count);
585 	scale2x_8_def_center(dst1, src0, src1, src2, count);
586 #else
587 	scale2x_8_def_border(dst0, src0, src1, src2, count);
588 	scale2x_8_def_center(dst1, src0, src1, src2, count);
589 	scale2x_8_def_border(dst2, src2, src1, src0, count);
590 #endif
591 }
592 
593 /**
594  * Scale by a factor of 2x3 a row of pixels of 16 bits.
595  * \note Like scale2x_16_def();
596  */
scale2x3_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)597 void scale2x3_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
598 {
599 #ifdef USE_SCALE_RANDOMWRITE
600 	scale2x_16_def_whole(dst0, dst2, src0, src1, src2, count);
601 	scale2x_16_def_center(dst1, src0, src1, src2, count);
602 #else
603 	scale2x_16_def_border(dst0, src0, src1, src2, count);
604 	scale2x_16_def_center(dst1, src0, src1, src2, count);
605 	scale2x_16_def_border(dst2, src2, src1, src0, count);
606 #endif
607 }
608 
609 /**
610  * Scale by a factor of 2x3 a row of pixels of 32 bits.
611  * \note Like scale2x_32_def();
612  */
scale2x3_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)613 void scale2x3_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
614 {
615 #ifdef USE_SCALE_RANDOMWRITE
616 	scale2x_32_def_whole(dst0, dst2, src0, src1, src2, count);
617 	scale2x_32_def_center(dst1, src0, src1, src2, count);
618 #else
619 	scale2x_32_def_border(dst0, src0, src1, src2, count);
620 	scale2x_32_def_center(dst1, src0, src1, src2, count);
621 	scale2x_32_def_border(dst2, src2, src1, src0, count);
622 #endif
623 }
624 
625 /**
626  * Scale by a factor of 2x4 a row of pixels of 8 bits.
627  * \note Like scale2x_8_def();
628  */
scale2x4_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)629 void scale2x4_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
630 {
631 #ifdef USE_SCALE_RANDOMWRITE
632 	scale2x_8_def_whole(dst0, dst3, src0, src1, src2, count);
633 	scale2x_8_def_center(dst1, src0, src1, src2, count);
634 	scale2x_8_def_center(dst2, src0, src1, src2, count);
635 #else
636 	scale2x_8_def_border(dst0, src0, src1, src2, count);
637 	scale2x_8_def_center(dst1, src0, src1, src2, count);
638 	scale2x_8_def_center(dst2, src0, src1, src2, count);
639 	scale2x_8_def_border(dst3, src2, src1, src0, count);
640 #endif
641 }
642 
643 /**
644  * Scale by a factor of 2x4 a row of pixels of 16 bits.
645  * \note Like scale2x_16_def();
646  */
scale2x4_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)647 void scale2x4_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
648 {
649 #ifdef USE_SCALE_RANDOMWRITE
650 	scale2x_16_def_whole(dst0, dst3, src0, src1, src2, count);
651 	scale2x_16_def_center(dst1, src0, src1, src2, count);
652 	scale2x_16_def_center(dst2, src0, src1, src2, count);
653 #else
654 	scale2x_16_def_border(dst0, src0, src1, src2, count);
655 	scale2x_16_def_center(dst1, src0, src1, src2, count);
656 	scale2x_16_def_center(dst2, src0, src1, src2, count);
657 	scale2x_16_def_border(dst3, src2, src1, src0, count);
658 #endif
659 }
660 
661 /**
662  * Scale by a factor of 2x4 a row of pixels of 32 bits.
663  * \note Like scale2x_32_def();
664  */
scale2x4_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)665 void scale2x4_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
666 {
667 #ifdef USE_SCALE_RANDOMWRITE
668 	scale2x_32_def_whole(dst0, dst3, src0, src1, src2, count);
669 	scale2x_32_def_center(dst1, src0, src1, src2, count);
670 	scale2x_32_def_center(dst2, src0, src1, src2, count);
671 #else
672 	scale2x_32_def_border(dst0, src0, src1, src2, count);
673 	scale2x_32_def_center(dst1, src0, src1, src2, count);
674 	scale2x_32_def_center(dst2, src0, src1, src2, count);
675 	scale2x_32_def_border(dst3, src2, src1, src0, count);
676 #endif
677 }
678 
679 /***************************************************************************/
680 /* Scale2x MMX implementation */
681 
682 #if defined(__GNUC__) && defined(__i386__)
683 
684 /*
 * Apply the Scale2x effect to a single row.
686  * This function must be called only by the other scale2x functions.
687  *
688  * Considering the pixel map :
689  *
690  *      ABC (src0)
691  *      DEF (src1)
692  *      GHI (src2)
693  *
 * this function computes 2 new pixels that replace the source pixel E,
 * as in this map :
696  *
697  *      ab (dst)
698  *
699  * with these variables :
700  *
701  *      &current -> E
702  *      &current_left -> D
703  *      &current_right -> F
704  *      &current_upper -> B
705  *      &current_lower -> H
706  *
707  *      %0 -> current_upper
708  *      %1 -> current
709  *      %2 -> current_lower
710  *      %3 -> dst
711  *      %4 -> counter
712  *
713  *      %mm0 -> *current_left
714  *      %mm1 -> *current_next
715  *      %mm2 -> tmp0
716  *      %mm3 -> tmp1
717  *      %mm4 -> tmp2
718  *      %mm5 -> tmp3
719  *      %mm6 -> *current_upper
720  *      %mm7 -> *current
721  */
/*
 * MMX version of the border row computation for 8 bit pixels.
 *
 * The row is processed in runs of 8 pixels (one 64 bit MMX register):
 * a first run, zero or more central runs and a final run. Each run reads
 * 8 bytes from each of src0, src1 and src2 and stores 16 bytes in dst.
 * The first run duplicates the first pixel as its own left neighbour and
 * the final run duplicates the last pixel as its own right neighbour.
 * The first and central runs also read the 8 bytes that follow the current
 * run in src1; the central and final runs also read the 8 bytes that
 * precede it.
 *
 * count is the row length in pixels; it must be at least 16 and a
 * multiple of 8.
 *
 * The asm statement declares a "memory" clobber because it reads the
 * source rows and writes the destination row through pointers that are
 * passed only as register operands; without it the compiler is free to
 * reorder or cache ordinary memory accesses around the statement.
 *
 * NOTE(review): no emms instruction is executed here; presumably the
 * caller restores the FPU state after the MMX code - confirm.
 */
static inline void scale2x_8_mmx_border(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
{
	assert(count >= 16);
	assert(count % 8 == 0);

	/* always do the first and last run */
	count -= 2*8;

	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm0 keeps only the first pixel in its lowest byte, so that */
		/* the first pixel becomes its own left neighbour */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results: 8 pixels become 16 */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

/* central runs */
		/* convert the remaining pixel count into a run count; */
		/* skip the loop when there are no central runs */
		"shrl $3, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"addl $8, %0\n"
		"addl $8, %1\n"
		"addl $8, %2\n"
		"addl $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* mm1 keeps only the last pixel in its highest byte, so that */
		/* the last pixel becomes its own right neighbour */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $56, %%mm1\n"
		"psrlq $56, %%mm0\n"
		"psllq $56, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $8, %%mm2\n"
		"psrlq $8, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqb %%mm6, %%mm2\n"
		"pcmpeqb %%mm6, %%mm4\n"
		"pcmpeqb (%2), %%mm3\n"
		"pcmpeqb (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqb %%mm1, %%mm2\n"
		"pcmpeqb %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklbw %%mm4, %%mm2\n"
		"punpckhbw %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "memory": the rows are read and written behind the compiler's back */
		: "cc", "memory"
	);
}
912 
/**
 * Compute one destination row of the Scale2x effect for 16 bit pixels (MMX).
 *
 * Four source pixels are processed per step. For every source pixel E with
 * left neighbour D, right neighbour F, upper pixel B (from src0) and lower
 * pixel H (from src2), two destination pixels are produced:
 *   left  = (D == B && D != H && D != F) ? B : E
 *   right = (F == B && F != H && F != D) ? B : E
 * The selection is done with compare/and/andn masks, without branches.
 * At the start and at the end of the row the missing neighbour is replaced
 * by the border pixel itself.
 *
 * The MMX registers mm0-mm7 are overwritten and no EMMS is executed here.
 *
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row used as "upper" row in the formula above.
 * \param src1 Row being expanded.
 * \param src2 Row used as "lower" row in the formula above.
 * \param count Length in pixels of the source rows. It must be at least 8
 * and a multiple of 4 (checked with assert()).
 */
static inline void scale2x_16_mmx_border(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * The pointer increments use the suffix-less "add" so that the
	 * assembler derives the operand size from the register: an explicit
	 * "addl" is rejected when the pointer lives in a 64 bit register.
	 * The "memory" clobber is required because the rows are read and
	 * written through the pointers and are not listed as memory operands.
	 */
	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current pixels, mm0 = left neighbours (first pixel */
		/* duplicated at the border), mm1 = right neighbours */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* mm2/mm4 are now the selection masks: take the upper pixel */
		/* where the mask is set, the current pixel elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, 8 pixels are stored */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* number of remaining groups of 4 pixels, skip the loop if none */
		"shrl $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours at the group edges come from the adjacent groups */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* the last pixel is duplicated as its own right neighbour */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
1103 
/**
 * Compute one destination row of the Scale2x effect for 32 bit pixels (MMX).
 *
 * Two source pixels are processed per step. For every source pixel E with
 * left neighbour D, right neighbour F, upper pixel B (from src0) and lower
 * pixel H (from src2), two destination pixels are produced:
 *   left  = (D == B && D != H && D != F) ? B : E
 *   right = (F == B && F != H && F != D) ? B : E
 * The selection is done with compare/and/andn masks, without branches.
 * At the start and at the end of the row the missing neighbour is replaced
 * by the border pixel itself.
 *
 * The MMX registers mm0-mm7 are overwritten and no EMMS is executed here.
 *
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row used as "upper" row in the formula above.
 * \param src1 Row being expanded.
 * \param src2 Row used as "lower" row in the formula above.
 * \param count Length in pixels of the source rows. It must be at least 4
 * and a multiple of 2 (checked with assert()).
 */
static inline void scale2x_32_mmx_border(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	/*
	 * Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count.
	 * The pointer increments use the suffix-less "add" so that the
	 * assembler derives the operand size from the register: an explicit
	 * "addl" is rejected when the pointer lives in a 64 bit register.
	 * The "memory" clobber is required because the rows are read and
	 * written through the pointers and are not listed as memory operands.
	 */
	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current pixels, mm0 = left neighbours (first pixel */
		/* duplicated at the border), mm1 = right neighbours */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		/* mm2/mm4 are now the selection masks: take the upper pixel */
		/* where the mask is set, the current pixel elsewhere */
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, 4 pixels are stored */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* number of remaining groups of 2 pixels, skip the loop if none */
		"shrl $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours at the group edges come from the adjacent groups */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* the last pixel is duplicated as its own right neighbour */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
}
1294 
1295 /**
1296  * Scale by a factor of 2 a row of pixels of 8 bits.
1297  * This is a very fast MMX implementation.
1298  * The implementation uses a combination of cmp/and/not operations to
1299  * completly remove the need of conditional jumps. This trick give the
1300  * major speed improvement.
1301  * Also, using the 8 bytes MMX registers more than one pixel are computed
1302  * at the same time.
1303  * Before calling this function you must ensure that the currenct CPU supports
1304  * the MMX instruction set. After calling it you must be sure to call the EMMS
1305  * instruction before any floating-point operation.
1306  * The pixels over the left and right borders are assumed of the same color of
1307  * the pixels on the border.
1308  * Note that the implementation is optimized to write data sequentially to
1309  * maximize the bandwidth on video memory.
1310  * \param src0 Pointer at the first pixel of the previous row.
1311  * \param src1 Pointer at the first pixel of the current row.
1312  * \param src2 Pointer at the first pixel of the next row.
1313  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1314  * be at least 16 and a multiple of 8.
1315  * \param dst0 First destination row, double length in pixels.
1316  * \param dst1 Second destination row, double length in pixels.
1317  */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1318 void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1319 {
1320 	if (count % 8 != 0 || count < 16) {
1321 		scale2x_8_def(dst0, dst1, src0, src1, src2, count);
1322 	} else {
1323 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1324 		scale2x_8_mmx_border(dst1, src2, src1, src0, count);
1325 	}
1326 }
1327 
1328 /**
1329  * Scale by a factor of 2 a row of pixels of 16 bits.
1330  * This function operates like scale2x_8_mmx() but for 16 bits pixels.
1331  * \param src0 Pointer at the first pixel of the previous row.
1332  * \param src1 Pointer at the first pixel of the current row.
1333  * \param src2 Pointer at the first pixel of the next row.
1334  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1335  * be at least 8 and a multiple of 4.
1336  * \param dst0 First destination row, double length in pixels.
1337  * \param dst1 Second destination row, double length in pixels.
1338  */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1339 void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1340 {
1341 	if (count % 4 != 0 || count < 8) {
1342 		scale2x_16_def(dst0, dst1, src0, src1, src2, count);
1343 	} else {
1344 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1345 		scale2x_16_mmx_border(dst1, src2, src1, src0, count);
1346 	}
1347 }
1348 
1349 /**
1350  * Scale by a factor of 2 a row of pixels of 32 bits.
1351  * This function operates like scale2x_8_mmx() but for 32 bits pixels.
1352  * \param src0 Pointer at the first pixel of the previous row.
1353  * \param src1 Pointer at the first pixel of the current row.
1354  * \param src2 Pointer at the first pixel of the next row.
1355  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1356  * be at least 4 and a multiple of 2.
1357  * \param dst0 First destination row, double length in pixels.
1358  * \param dst1 Second destination row, double length in pixels.
1359  */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1360 void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1361 {
1362 	if (count % 2 != 0 || count < 4) {
1363 		scale2x_32_def(dst0, dst1, src0, src1, src2, count);
1364 	} else {
1365 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1366 		scale2x_32_mmx_border(dst1, src2, src1, src0, count);
1367 	}
1368 }
1369 
1370 /**
1371  * Scale by a factor of 2x3 a row of pixels of 8 bits.
1372  * This function operates like scale2x_8_mmx() but with an expansion
1373  * factor of 2x3 instead of 2x2.
1374  */
scale2x3_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1375 void scale2x3_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1376 {
1377 	if (count % 8 != 0 || count < 16) {
1378 		scale2x3_8_def(dst0, dst1, dst2, src0, src1, src2, count);
1379 	} else {
1380 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1381 		scale2x_8_def_center(dst1, src0, src1, src2, count);
1382 		scale2x_8_mmx_border(dst2, src2, src1, src0, count);
1383 	}
1384 }
1385 
1386 /**
1387  * Scale by a factor of 2x3 a row of pixels of 16 bits.
1388  * This function operates like scale2x_16_mmx() but with an expansion
1389  * factor of 2x3 instead of 2x2.
1390  */
scale2x3_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1391 void scale2x3_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1392 {
1393 	if (count % 4 != 0 || count < 8) {
1394 		scale2x3_16_def(dst0, dst1, dst2, src0, src1, src2, count);
1395 	} else {
1396 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1397 		scale2x_16_def_center(dst1, src0, src1, src2, count);
1398 		scale2x_16_mmx_border(dst2, src2, src1, src0, count);
1399 	}
1400 }
1401 
1402 /**
1403  * Scale by a factor of 2x3 a row of pixels of 32 bits.
1404  * This function operates like scale2x_32_mmx() but with an expansion
1405  * factor of 2x3 instead of 2x2.
1406  */
scale2x3_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1407 void scale2x3_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1408 {
1409 	if (count % 2 != 0 || count < 4) {
1410 		scale2x3_32_def(dst0, dst1, dst2, src0, src1, src2, count);
1411 	} else {
1412 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1413 		scale2x_32_def_center(dst1, src0, src1, src2, count);
1414 		scale2x_32_mmx_border(dst2, src2, src1, src0, count);
1415 	}
1416 }
1417 
1418 /**
1419  * Scale by a factor of 2x4 a row of pixels of 8 bits.
1420  * This function operates like scale2x_8_mmx() but with an expansion
1421  * factor of 2x4 instead of 2x2.
1422  */
scale2x4_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1423 void scale2x4_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1424 {
1425 	if (count % 8 != 0 || count < 16) {
1426 		scale2x4_8_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1427 	} else {
1428 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1429 		scale2x_8_def_center(dst1, src0, src1, src2, count);
1430 		scale2x_8_def_center(dst2, src0, src1, src2, count);
1431 		scale2x_8_mmx_border(dst3, src2, src1, src0, count);
1432 	}
1433 }
1434 
1435 /**
1436  * Scale by a factor of 2x4 a row of pixels of 16 bits.
1437  * This function operates like scale2x_16_mmx() but with an expansion
1438  * factor of 2x4 instead of 2x2.
1439  */
scale2x4_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1440 void scale2x4_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1441 {
1442 	if (count % 4 != 0 || count < 8) {
1443 		scale2x4_16_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1444 	} else {
1445 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1446 		scale2x_16_def_center(dst1, src0, src1, src2, count);
1447 		scale2x_16_def_center(dst2, src0, src1, src2, count);
1448 		scale2x_16_mmx_border(dst3, src2, src1, src0, count);
1449 	}
1450 }
1451 
1452 /**
1453  * Scale by a factor of 2x4 a row of pixels of 32 bits.
1454  * This function operates like scale2x_32_mmx() but with an expansion
1455  * factor of 2x4 instead of 2x2.
1456  */
scale2x4_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1457 void scale2x4_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1458 {
1459 	if (count % 2 != 0 || count < 4) {
1460 		scale2x4_32_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1461 	} else {
1462 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1463 		scale2x_32_def_center(dst1, src0, src1, src2, count);
1464 		scale2x_32_def_center(dst2, src0, src1, src2, count);
1465 		scale2x_32_mmx_border(dst3, src2, src1, src0, count);
1466 	}
1467 }
1468 
1469 #endif
1470 
1471